diff --git a/cmake/kernel_headers.cmake b/cmake/kernel_headers.cmake index 376dc2ba07493f721de1273b7ced06d4bd22223d..195863d2b194fbc88002b2a8b5e07634b7638856 100644 --- a/cmake/kernel_headers.cmake +++ b/cmake/kernel_headers.cmake @@ -196,3 +196,7 @@ file(CREATE_LINK ../activation/geglu_tiling_intf.h ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/math/geglu_tiling_intf.h SYMBOLIC) file(CREATE_LINK ../activation/geglu_tiling.h ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/math/geglu_tiling.h SYMBOLIC) +#initglobalmemory +file(MAKE_DIRECTORY ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/init_global_memory) +file(CREATE_LINK ../utils/init_global_memory.h + ${ASCENDC_INSTALL_BASE_PATH}/ascendc/include/highlevel_api/lib/init_global_memory/init_global_memory.h SYMBOLIC) diff --git a/examples/matrix/basic_block_matmul/kernel_impl/basic_block_matmul_custom_impl.h b/examples/matrix/basic_block_matmul/kernel_impl/basic_block_matmul_custom_impl.h index 11ce1407789a550c5dbf9e3eeaaffe493fd8a3bb..aaf66a76acd7e2b8db4a3b8a6d68263300194fdb 100644 --- a/examples/matrix/basic_block_matmul/kernel_impl/basic_block_matmul_custom_impl.h +++ b/examples/matrix/basic_block_matmul/kernel_impl/basic_block_matmul_custom_impl.h @@ -29,10 +29,10 @@ class BasicBlockMatmulKernel { __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template __aicore__ inline void Process(AscendC::TPipe* pipe); - matmul::Matmul, - matmul::MatmulType, - matmul::MatmulType, - matmul::MatmulType, MM_CFG> matmulObj; + AscendC::Matmul, + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, MM_CFG> matmulObj; private: __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, BasicBlockMatrixOffset& matrixOffset, diff --git a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h index 
065f151cbf3218348ab4d7dcc40ec59f5a3aada0..8b98232b0fe9107d7ce8e469e2b91fbfd1114d5b 100644 --- a/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h +++ b/examples/matrix/batch_matmul/kernel_impl/batch_matmul_custom_impl.h @@ -22,7 +22,7 @@ class BatchMatmulKernel { __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template __aicore__ inline void Process(AscendC::TPipe* pipe, int32_t batchA, int32_t batchB); - matmul::Matmul matmulObj; + AscendC::Matmul matmulObj; private: __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, int32_t& offsetC, int32_t& offsetBias); diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp index 94206b3efc982492ca4661f50a8f7e299de3197e..6efea394e89347ac1870681718e1e72efb29776d 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_direct/batch_matmul_custom.cpp @@ -31,10 +31,10 @@ extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, { TCubeTiling tiling; CopyTiling(&tiling, tilingGm); - typedef matmul::MatmulType A_TYPE; - typedef matmul::MatmulType B_TYPE; - typedef matmul::MatmulType C_TYPE; - typedef matmul::MatmulType BIAS_TYPE; + typedef AscendC::MatmulType A_TYPE; + typedef AscendC::MatmulType B_TYPE; + typedef AscendC::MatmulType C_TYPE; + typedef AscendC::MatmulType BIAS_TYPE; BatchMatmulKernel batchMatmulKernel; AscendC::TPipe pipe; tiling.shareMode = 0; // 0, share mode diff --git a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp index 
c55caba16e0cc356523fc44b7dc8c61a0f27c201..056cd2f5b70b90060169ba6e04d5711b6f870a22 100644 --- a/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp +++ b/examples/matrix/batch_matmul/kernel_launch_method_by_framework/op_kernel/batch_matmul_custom.cpp @@ -17,10 +17,10 @@ constexpr int32_t FULL_L0C_SIZE = 128 * 1024; extern "C" __global__ __aicore__ void batch_matmul_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, GM_ADDR tiling) { GET_TILING_DATA(tilingData, tiling); - typedef matmul::MatmulType A_TYPE; - typedef matmul::MatmulType B_TYPE; - typedef matmul::MatmulType C_TYPE; - typedef matmul::MatmulType BIAS_TYPE; + typedef AscendC::MatmulType A_TYPE; + typedef AscendC::MatmulType B_TYPE; + typedef AscendC::MatmulType C_TYPE; + typedef AscendC::MatmulType BIAS_TYPE; BatchMatmulKernel batchMatmulKernel; AscendC::TPipe pipe; tilingData.cubeTilingData.shareMode = 0; // 0, share mode diff --git a/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h b/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h index 6f85560549f5a1a7274538d03d0d32ba95aae833..a182606025c370d650ef8ff45d4a75c13a425622 100644 --- a/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h +++ b/examples/matrix/matmul/kernel_impl/matmul_custom_impl.h @@ -20,10 +20,10 @@ class MatmulKernel { __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, const TCubeTiling& tiling); template __aicore__ inline void Process(AscendC::TPipe* pipe); - matmul::Matmul, - matmul::MatmulType, - matmul::MatmulType, - matmul::MatmulType, CFG_MDL> matmulObj; + AscendC::Matmul, + AscendC::MatmulType, + AscendC::MatmulType, + AscendC::MatmulType, CFG_MDL> matmulObj; private: __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling& tiling, int32_t& offsetA, int32_t& offsetB, diff --git a/examples/normalization/welford_update/README.md b/examples/normalization/welford_update/README.md new file mode 
100644 index 0000000000000000000000000000000000000000..eaa60bc71fe98647905e2eb6b5a7a3e15e1983a8 --- /dev/null +++ b/examples/normalization/welford_update/README.md @@ -0,0 +1,70 @@ + + +## 概述 + +本样例介绍了调用WelfordUpdate高阶API实现welford_update单算子,并按照不同的算子调用方式分别给出了对应的端到端实现。 + +- 直调:使用核函数直调WelfordUpdate自定义算子。 + + 核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。 + +- 框架调用:使用框架调用welford_update自定义算子。 + + 按照工程创建->算子实现->编译部署>算子调用的流程完成算子开发。整个过程都依赖于算子工程:基于工程代码框架完成算子核函数的开发和Tiling实现,通过工程编译脚本完成算子的编译部署,继而实现单算子调用或第三方框架中的算子调用。 + +本样例中包含如下调用方式: + +| 调用方式 | 目录 | **描述** | +| --------- | ------------------------------------------------------------ | ---------------------------------------------------------- | +| 直调 | [kernel_launch_method_by_direct](./kernel_launch_method_by_direct) | host侧的核函数调用程序,包含CPU侧、NPU侧、仿真侧三种运行验证方法。 | +| 框架调用 | [kernel_launch_method_by_framework](./kernel_launch_method_by_framework) | 通过aclnn调用的方式调用welford_update算子。 | + +## 样例支持的产品型号为: +- Atlas A2训练系列产品/Atlas 800I A2推理产品 +- Atlas推理系列产品AI Core +- Atlas A3 训练系列产品 + +## 目录结构 + +| 目录 | 描述 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [kernel_launch_method_by_direct](./kernel_launch_method_by_direct) | 通过kernel直调的方式调用自定义算子工程样例目录 | +| [kernel_launch_method_by_framework](./kernel_launch_method_by_framework) | 通过aclnn调用的方式调用自定义算子工程样例目录 | +| [host_tiling](./host_tiling) | 本样例tiling代码实现 | +| [kernel_impl](./kernel_impl) | 本样例kernel侧代码实现 | + +## 算子描述 + +welford_update单算子,对输入tensor做WelfordUpdate计算。Welford是一种在线计算均值和方差的方法。一方面,它可以在不存储所有样本的情况下,逐步计算所有样本的均值和方差,更适合处理海量数据;另一方面,它只需要对数据进行一次遍历,能减少访存次数,提高计算性能。WelfordUpdate接口为Welford算法的前处理。 + +welford_update算子规格: + + + + + + + + + + + + + + + +
<tr><td align="center">算子类型(OpType)</td><td colspan="4" align="center">WelfordUpdateCustom</td></tr>
<tr><td rowspan="4" align="center">算子输入</td><td align="center">name</td><td align="center">shape</td><td align="center">data type</td><td align="center">format</td></tr>
<tr><td align="center">srcGm</td><td align="center">1*64</td><td align="center">half</td><td align="center">ND</td></tr>
<tr><td align="center">inMeanGm</td><td align="center">1*64</td><td align="center">half</td><td align="center">ND</td></tr>
<tr><td align="center">inVarGm</td><td align="center">1*64</td><td align="center">half</td><td align="center">ND</td></tr>
<tr><td rowspan="2" align="center">算子输出</td><td align="center">outMeanGm</td><td align="center">1*64</td><td align="center">float</td><td align="center">ND</td></tr>
<tr><td align="center">outVarGm</td><td align="center">1*64</td><td align="center">float</td><td align="center">ND</td></tr>
<tr><td align="center">核函数名</td><td colspan="4" align="center">welford_update_custom</td></tr>
+ +## 算子实现介绍 + +本样例实现了welford_update算子。 + +- kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用WelfordUpdate高阶API接口完成welford_update计算,得到最终结果,再搬出到外部存储上。 + + welford_update算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行welford_update计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- tiling实现 + + welford_update算子的tiling实现流程如下:首先获取welford_update接口能完成计算所需最大/最小临时空间大小,根据该范围结合实际的内存使用情况设置合适的空间大小,然后根据输入长度dataLength确定所需tiling参数。 \ No newline at end of file diff --git a/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h b/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..ef4335a3f3c62e6f66b142877586f84db95a22fc --- /dev/null +++ b/examples/normalization/welford_update/host_tiling/welford_update_custom_tiling.h @@ -0,0 +1,53 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_TILING_H +#define EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" +#include "tiling/platform/platform_ascendc.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(WelfordUpdateCustomTilingData) + TILING_DATA_FIELD_DEF(bool, inplace); + TILING_DATA_FIELD_DEF(uint32_t, nLength); + TILING_DATA_FIELD_DEF(uint32_t, rLength); + TILING_DATA_FIELD_DEF(uint32_t, abComputeLength); + TILING_DATA_FIELD_DEF(float, nRec); + TILING_DATA_FIELD_DEF(uint32_t, tmpLocalSize); +END_TILING_DATA_DEF; +REGISTER_TILING_DATA_CLASS(WelfordUpdateCustom, WelfordUpdateCustomTilingData) +} // namespace optiling + +constexpr bool ISREUSESOURCE = false; +constexpr bool ISINPLACE = true; + +void ComputeTiling(bool inplace, uint32_t nLength, uint32_t rLength, uint32_t abComputeLength, + float nRec, optiling::WelfordUpdateCustomTilingData &tiling) +{ + std::vector shapeVec = {nLength, rLength}; + ge::Shape srcShape(shapeVec); + uint32_t maxsize = 0; + uint32_t minsize = 0; + uint32_t dtypesizeT = 2; // half类型 + uint32_t dtypesizeU = 4; // float类型 + + tiling.set_inplace(inplace); + tiling.set_nLength(nLength); + tiling.set_rLength(rLength); + tiling.set_abComputeLength(abComputeLength); + tiling.set_nRec(nRec); + + AscendC::GetWelfordUpdateMaxMinTmpSize(srcShape, dtypesizeT, dtypesizeU, ISREUSESOURCE, ISINPLACE, maxsize, + minsize); + tiling.set_tmpLocalSize(minsize); +} + +#endif // EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_TILING_H diff --git a/examples/normalization/welford_update/kernel_impl/welford_update_custom.h b/examples/normalization/welford_update/kernel_impl/welford_update_custom.h new file mode 100644 index 0000000000000000000000000000000000000000..19028d91e778fad692d03a58a99741e8417275c3 --- /dev/null +++ b/examples/normalization/welford_update/kernel_impl/welford_update_custom.h @@ -0,0 +1,154 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. 
+ * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the + * "License"). Please refer to the License for details. You may not use this + * file except in compliance with the License. THIS SOFTWARE IS PROVIDED ON AN + * "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS + * FOR A PARTICULAR PURPOSE. See LICENSE in the root of the software repository + * for the full text of the License. + */ + +#ifndef EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_H +#define EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_H +#include "kernel_operator.h" + +namespace MyCustomKernel { +struct VecTiling { + bool inplace; + uint32_t nLength; + uint32_t rLength; + uint32_t abComputeLength; + float nRec; + uint32_t tmpLocalSize; +}; + +constexpr AscendC::WelfordUpdateConfig WELFORD_UPDATE_ENABLE_INPLACE_CFG = {true}; +constexpr AscendC::WelfordUpdateConfig WELFORD_UPDATE_UNENABLE_INPLACE_CFG = {false}; +constexpr uint8_t LOCAL_BYTES = 32; + +template +class KernelWelfordUpdate { +public: + __aicore__ inline KernelWelfordUpdate() {} + __aicore__ inline void Init(GM_ADDR inputX_gm, GM_ADDR inputMean_gm, GM_ADDR inputVar_gm, GM_ADDR outputMean_gm, + GM_ADDR outputVar_gm, VecTiling tilingData) { + nLength = tilingData.nLength; + rLength = tilingData.rLength; + abComputeLength = tilingData.abComputeLength; + nRec = tilingData.nRec; + bshLength = tilingData.nLength * tilingData.rLength; + inplace = tilingData.inplace; + tmpLocalBytes = tilingData.tmpLocalSize; + + inputX_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(inputX_gm), bshLength); + inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputMean_gm), bshLength); + inputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputVar_gm), bshLength); + + outputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputMean_gm), bshLength); + 
outputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputVar_gm), bshLength); + + pipe.InitBuffer(inQueueX, 1, sizeof(T) * bshLength); + pipe.InitBuffer(inQueueMean, 1, sizeof(U) * bshLength); + pipe.InitBuffer(inQueueVar, 1, sizeof(U) * bshLength); + pipe.InitBuffer(outQueueMean, 1, sizeof(U) * bshLength); + pipe.InitBuffer(outQueueVar, 1, sizeof(U) * bshLength); + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() { + AscendC::LocalTensor inputXLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor inMeanLocal = inQueueMean.AllocTensor(); + AscendC::LocalTensor inVarLocal = inQueueVar.AllocTensor(); + + AscendC::DataCopy(inputXLocal, inputX_global, bshLength); + AscendC::DataCopy(inMeanLocal, inputMean_global, bshLength); + AscendC::DataCopy(inVarLocal, inputVar_global, bshLength); + + inQueueX.EnQue(inputXLocal); + inQueueMean.EnQue(inMeanLocal); + inQueueVar.EnQue(inVarLocal); + } + __aicore__ inline void Compute() { + AscendC::LocalTensor inputXLocal = inQueueX.DeQue(); + AscendC::LocalTensor inMeanLocal = inQueueMean.DeQue(); + AscendC::LocalTensor inVarLocal = inQueueVar.DeQue(); + + AscendC::LocalTensor outMeanLocal = outQueueMean.AllocTensor(); + AscendC::LocalTensor outVarLocal = outQueueVar.AllocTensor(); + + struct AscendC::WelfordUpdateParam para = {nLength, rLength, abComputeLength, nRec}; + if (!tmpLocal) { + if (inplace) { + AscendC::WelfordUpdate(outMeanLocal, + outVarLocal, inMeanLocal, inVarLocal, inputXLocal, para); + } else { + AscendC::WelfordUpdate(outMeanLocal, + outVarLocal, inMeanLocal, inVarLocal, inputXLocal, para); + } + } else { + if (tmpLocalBytes % LOCAL_BYTES != 0) { + tmpLocalBytes = (tmpLocalBytes + LOCAL_BYTES - 1) / LOCAL_BYTES * LOCAL_BYTES; + } + pipe.InitBuffer(tmpLocalBuf, tmpLocalBytes); + AscendC::LocalTensor tmpLocalTensor = tmpLocalBuf.Get(); + if (inplace) { + AscendC::WelfordUpdate(outMeanLocal, + outVarLocal, inMeanLocal, 
inVarLocal, inputXLocal, tmpLocalTensor, para); + } else { + AscendC::WelfordUpdate(outMeanLocal, + outVarLocal, inMeanLocal, inVarLocal, inputXLocal, tmpLocalTensor, para); + } + } + + inQueueX.FreeTensor(inputXLocal); + inQueueMean.FreeTensor(inMeanLocal); + inQueueVar.FreeTensor(inVarLocal); + + outQueueMean.EnQue(outMeanLocal); + outQueueVar.EnQue(outVarLocal); + } + __aicore__ inline void CopyOut() { + AscendC::LocalTensor outMeanLocal = outQueueMean.DeQue(); + AscendC::LocalTensor outVarLocal = outQueueVar.DeQue(); + + AscendC::DataCopy(outputMean_global, outMeanLocal, bshLength); + AscendC::DataCopy(outputVar_global, outVarLocal, bshLength); + + outQueueMean.FreeTensor(outMeanLocal); + outQueueVar.FreeTensor(outVarLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueMean; + AscendC::TQue inQueueVar; + AscendC::TQue outQueueMean; + AscendC::TQue outQueueVar; + + AscendC::GlobalTensor inputX_global; + AscendC::GlobalTensor inputMean_global; + AscendC::GlobalTensor inputVar_global; + AscendC::GlobalTensor outputMean_global; + AscendC::GlobalTensor outputVar_global; + AscendC::TBuf tmpLocalBuf; + + uint32_t tmpLocalBytes = 0; + uint32_t nLength; + uint32_t rLength; + uint32_t abComputeLength; + float nRec; + uint32_t bshLength; + bool inplace; +}; + +} // namespace MyCustomKernel + +#endif // EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_H diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/CMakeLists.txt b/examples/normalization/welford_update/kernel_launch_method_by_direct/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..239d5bf2b6d5d779ece664faac2a592861eaf25f --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/CMakeLists.txt @@ -0,0 +1,73 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION 
"Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/welford_update_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(welford_update_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/welford_update_custom_tiling.cpp +) + +target_compile_options(welford_update_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(welford_update_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(welford_update_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(welford_update_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS welford_update_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/README.md 
b/examples/normalization/welford_update/kernel_launch_method_by_direct/README.md new file mode 100644 index 0000000000000000000000000000000000000000..2cf1f879714cbab08baed33b2971565159c977cd --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/README.md @@ -0,0 +1,50 @@ + + +## 概述 + +本样例基于Kernel直调算子工程,介绍了调用WelfordUpdate高阶API实现welford_update单算子,主要演示WelfordUpdate高阶API在Kernel直调工程中的调用。 + +## 目录结构介绍 +| 目录及文件 | 描述 | +|---------------------|----------------------| +| [cmake](./cmake) | 编译工程文件 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| welford_update_custom.cpp | 算子kernel实现 | +| welford_update_custom_tiling.cpp | 算子tiling实现 | +| run.sh | 编译执行脚本 | +| CMakeLists.txt | 编译工程文件 | + + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN包的存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + ``` + 若执行sim仿真,可自行配置仿真日志文件目录,默认仿真日志会在build目录下生成。若需要详细了解sim仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的 工具使用 章节。 + ``` + # 设置仿真模式日志生成目录(可选),需要自行确保设置的目录已存在。若设置为相对路径下的目录,则以程序执行时的目录作为当前目录。例如,执行如下设置时,需要确保./目录下存在xxx目录 + export CAMODEL_LOG_PATH=./xxx + ``` + + - 生成输入和真值 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + 其中cmake参数说明如下: + - RUN_MODE :编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu / sim/ npu] + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。 + + 示例如下: + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/cmake/cpu_lib.cmake b/examples/normalization/welford_update/kernel_launch_method_by_direct/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..693f15ac115d655aacd3218bc5b14060c0a3de2f --- /dev/null +++ 
b/examples/normalization/welford_update/kernel_launch_method_by_direct/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/cmake/npu_lib.cmake b/examples/normalization/welford_update/kernel_launch_method_by_direct/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..98413a61adc75e01ac5967f9c61d66e174777237 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/cmake/npu_lib.cmake @@ -0,0 +1,19 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING + ) diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/main.cpp 
b/examples/normalization/welford_update/kernel_launch_method_by_direct/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a2dd7870535cedbf94f3a550453c5383154d71bb --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/main.cpp @@ -0,0 +1,208 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../../common/data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void welford_update_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *srcGm, uint8_t *inMeanGm, + uint8_t *inVarGm, uint8_t *outMeanGm, uint8_t *outVarGm, uint8_t *workspace, uint8_t *tiling); +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void welford_update_custom(GM_ADDR srcGm, GM_ADDR inMeanGm, GM_ADDR inVarGm, + GM_ADDR outMeanGm, GM_ADDR outVarGm, GM_ADDR workspace, GM_ADDR tiling); +#endif +constexpr uint8_t BLOCK_DIM = 1; +constexpr uint32_t TILINGDATA_SIZE = 6; +constexpr uint32_t WORKSPACE_SIZE = 1024 * 1024; + +constexpr bool ISINPLACE = true; +constexpr uint8_t RN_SIZE = 1; +constexpr uint32_t AB_SIZE = 64; +constexpr uint32_t AB_LENGTH = 35; +constexpr float NREC = 1.0 / 8; + +extern uint8_t *GenerateTiling(bool inplace, uint32_t nLength, uint32_t rLength, uint32_t abComputeLength, + float nRec); + +static bool CompareResult(const void *outputData, int64_t outSize, std::string goldenName) +{ + void *goldenData; +#ifdef 
ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden_" + goldenName + ".bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden_%s.bin success!\n", goldenName.c_str()); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden_%s.bin failed output is %lf, golden is %lf\n", goldenName.c_str(), a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + return false; + } else { + printf("CompareResult golden_%s.bin success!\n", goldenName.c_str()); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) +{ + uint32_t blockDim = BLOCK_DIM; + size_t inputSrcSize = RN_SIZE * AB_SIZE * sizeof(float); + size_t inputMeanSize = RN_SIZE * AB_SIZE * sizeof(float); + size_t inputVarSize = RN_SIZE * AB_SIZE * sizeof(float); + size_t outputMeanSize = RN_SIZE * AB_SIZE * sizeof(float); + size_t outputVarSize = RN_SIZE * AB_SIZE * sizeof(float); + + size_t workspaceSize = WORKSPACE_SIZE; + size_t tilingFileSize = TILINGDATA_SIZE * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *inputSrc = (uint8_t *)AscendC::GmAlloc(inputSrcSize); + uint8_t *inputMean = (uint8_t *)AscendC::GmAlloc(inputMeanSize); + uint8_t *inputVar = (uint8_t *)AscendC::GmAlloc(inputVarSize); + uint8_t *outputMean = (uint8_t *)AscendC::GmAlloc(outputMeanSize); + uint8_t *outputVar = (uint8_t *)AscendC::GmAlloc(outputVarSize); + + uint8_t *workspace = (uint8_t 
*)AscendC::GmAlloc(workspaceSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingFileSize); + + ReadFile("../input/input_srcGm.bin", inputSrcSize, inputSrc, inputSrcSize); + ReadFile("../input/input_inMeanGm.bin", inputMeanSize, inputMean, inputMeanSize); + ReadFile("../input/input_inVarGm.bin", inputVarSize, inputVar, inputVarSize); + + memcpy_s(tiling, tilingFileSize, GenerateTiling(ISINPLACE, RN_SIZE, AB_SIZE, AB_LENGTH, NREC), tilingFileSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); + ICPU_RUN_KF(welford_update_custom, blockDim, inputSrc, inputMean, inputVar, outputMean, outputVar, workspace, + tiling); + + WriteFile("../output/output_outMeanGm.bin", outputMean, outputMeanSize); + WriteFile("../output/output_outVarGm.bin", outputVar, outputVarSize); + + bool goldenResult = true; + goldenResult &= CompareResult(outputMean, outputMeanSize, "outMeanGm"); + goldenResult &= CompareResult(outputVar, outputVarSize, "outVarGm"); + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + + AscendC::GmFree((void *)inputSrc); + AscendC::GmFree((void *)inputMean); + AscendC::GmFree((void *)inputVar); + AscendC::GmFree((void *)outputMean); + AscendC::GmFree((void *)outputVar); + AscendC::GmFree((void *)workspace); + AscendC::GmFree((void *)tiling); + +#else + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *srcHost, *inMeanHost, *inVarHost, *outMeanHost, *outVarHost, *workspaceHost; + uint8_t *srcDevice, *inMeanDevice, *inVarDevice, *outMeanDevice, *outVarDevice, *workspaceDevice, *tilingDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&srcHost), inputSrcSize)); + CHECK_ACL(aclrtMallocHost((void **)(&inMeanHost), inputMeanSize)); + CHECK_ACL(aclrtMallocHost((void **)(&inVarHost), inputVarSize)); + 
CHECK_ACL(aclrtMallocHost((void **)(&outMeanHost), outputMeanSize)); + CHECK_ACL(aclrtMallocHost((void **)(&outVarHost), outputVarSize)); + CHECK_ACL(aclrtMallocHost((void **)(&workspaceHost), workspaceSize)); + + CHECK_ACL(aclrtMalloc((void **)&srcDevice, inputSrcSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&inMeanDevice, inputMeanSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&inVarDevice, inputVarSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&outMeanDevice, outputMeanSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&outVarDevice, outputVarSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_srcGm.bin", inputSrcSize, srcHost, inputSrcSize); + ReadFile("../input/input_inMeanGm.bin", inputMeanSize, inMeanHost, inputMeanSize); + ReadFile("../input/input_inVarGm.bin", inputVarSize, inVarHost, inputVarSize); + + CHECK_ACL(aclrtMemcpy(workspaceDevice, workspaceSize, workspaceHost, workspaceSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingFileSize, GenerateTiling(ISINPLACE, RN_SIZE, AB_SIZE, AB_LENGTH, NREC), + tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + CHECK_ACL(aclrtMemcpy(srcDevice, inputMeanSize, srcHost, inputMeanSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(inMeanDevice, inputMeanSize, inMeanHost, inputMeanSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(inVarDevice, inputVarSize, inVarHost, inputVarSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + welford_update_custom_do(blockDim, nullptr, stream, srcDevice, inMeanDevice, inVarDevice, outMeanDevice, + outVarDevice, workspaceDevice, tilingDevice); + + CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtMemcpy(outMeanHost, outputMeanSize, outMeanDevice, outputMeanSize, 
ACL_MEMCPY_DEVICE_TO_HOST)); + CHECK_ACL(aclrtMemcpy(outVarHost, outputVarSize, outVarDevice, outputVarSize, ACL_MEMCPY_DEVICE_TO_HOST)); + + WriteFile("../output/output_outMeanGm.bin", outMeanHost, outputMeanSize); + WriteFile("../output/output_outVarGm.bin", outVarHost, outputVarSize); + + bool goldenResult = true; + goldenResult &= CompareResult(outMeanHost, outputMeanSize, "outMeanGm"); + goldenResult &= CompareResult(outVarHost, outputVarSize, "outVarGm"); + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + + CHECK_ACL(aclrtFree(srcDevice)); + CHECK_ACL(aclrtFree(inMeanDevice)); + CHECK_ACL(aclrtFree(inVarDevice)); + CHECK_ACL(aclrtFree(outMeanDevice)); + CHECK_ACL(aclrtFree(outVarDevice)); + CHECK_ACL(aclrtFree(workspaceDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); + + CHECK_ACL(aclrtFreeHost(srcHost)); + CHECK_ACL(aclrtFreeHost(inMeanHost)); + CHECK_ACL(aclrtFreeHost(inVarHost)); + CHECK_ACL(aclrtFreeHost(outMeanHost)); + CHECK_ACL(aclrtFreeHost(outVarHost)); + CHECK_ACL(aclrtFreeHost(workspaceHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + return 0; +} diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/run.sh b/examples/normalization/welford_update/kernel_launch_method_by_direct/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..f8305ad320d1d68e09a8a1d825808e077c4d06cb --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/run.sh @@ -0,0 +1,52 @@ +#!/bin/bash + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac 
+done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH + + if [ ! $CAMODEL_LOG_PATH ]; then + export CAMODEL_LOG_PATH=./ # default log save in build dir + else + export CAMODEL_LOG_PATH=../$CAMODEL_LOG_PATH + rm -rf $CAMODEL_LOG_PATH + mkdir -p $CAMODEL_LOG_PATH + fi +fi + +if [ "${RUN_MODE}" = "cpu" ]; then + export CAMODEL_LOG_PATH=./ # cpu run mode set fixed log path +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 +./welford_update_direct_kernel_op \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/scripts/gen_data.py b/examples/normalization/welford_update/kernel_launch_method_by_direct/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..f65af46cbbeeeb50bf8f8c9e0a727761512c7a31 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/scripts/gen_data.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +import os +import numpy as np + +RN_SIZE = 1 +AB_SIZE = 64 +AB_LENGTH = 35 +NREC = 1.0 / 8 + +def gen_golden_data_simple(): + x1 = np.random.uniform(1, 100, [RN_SIZE * AB_SIZE]).astype(np.float16) + x2 = np.random.uniform(-60000, 60000, [RN_SIZE * AB_SIZE]).astype(np.float32) + x3 = np.random.uniform(0, 60000, [RN_SIZE * AB_SIZE]).astype(np.float32) + golden1 = x2.copy() + golden2 = x3.copy() + + for i in range(AB_LENGTH): + n = np.float32(NREC) + golden1[i] = x2[i] + (x1[i] - x2[i]) * n + golden2[i] = x3[i] + (x1[i] - x2[i]) * (x1[i] - golden1[i]) + + os.system("mkdir -p ./input") + x1.tofile("./input/input_srcGm.bin") + x2.tofile("./input/input_inMeanGm.bin") + x3.tofile("./input/input_inVarGm.bin") + os.system("mkdir -p ./output") + golden1.tofile("./output/golden_outMeanGm.bin") + golden2.tofile("./output/golden_outVarGm.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/welford_update_custom.cpp b/examples/normalization/welford_update/kernel_launch_method_by_direct/welford_update_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..a486b65e87cc2394444321514e765aee5fdc9a82 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/welford_update_custom.cpp @@ -0,0 +1,45 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "kernel_operator.h" +#include "../kernel_impl/welford_update_custom.h" + +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling* tiling, GM_ADDR tilingGM) +{ + uint32_t* ptr = reinterpret_cast<uint32_t*>(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t*>(tilingGM); + + for (uint32_t i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +extern "C" __global__ __aicore__ void welford_update_custom(GM_ADDR srcGm, GM_ADDR inMeanGm, GM_ADDR inVarGm, + GM_ADDR outMeanGm, GM_ADDR outVarGm, GM_ADDR workspace, GM_ADDR tiling) +{ + if ASCEND_IS_AIC { + return; + } + MyCustomKernel::KernelWelfordUpdate op; + MyCustomKernel::VecTiling tilingData; + CopyTiling(&tilingData, tiling); + op.Init(srcGm, inMeanGm, inVarGm, outMeanGm, outVarGm, tilingData); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +// call of kernel function +void welford_update_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *srcGm, uint8_t *inMeanGm, + uint8_t *inVarGm, uint8_t *outMeanGm, uint8_t *outVarGm, uint8_t *workspace, uint8_t *tiling) +{ + welford_update_custom<<<blockDim, l2ctrl, stream>>>(srcGm, inMeanGm, inVarGm, outMeanGm, outVarGm, workspace, + tiling); +} +#endif diff --git a/examples/normalization/welford_update/kernel_launch_method_by_direct/welford_update_custom_tiling.cpp b/examples/normalization/welford_update/kernel_launch_method_by_direct/welford_update_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9ff63ec66c63c1b5f9cef530f248211bcbed455b --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_direct/welford_update_custom_tiling.cpp @@ -0,0
+1,30 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include +#include +#include +#include +#include +#include "tiling/tiling_api.h" +#include "../host_tiling/welford_update_custom_tiling.h" + +uint8_t *GetTilingBuf(optiling::WelfordUpdateCustomTilingData *tilingData) { + uint32_t tilingSize = sizeof(optiling::WelfordUpdateCustomTilingData); + uint8_t *buf = (uint8_t *)malloc(tilingSize); + tilingData->SaveToBuffer(buf, tilingSize); + return buf; +} +uint8_t* GenerateTiling(bool inplace, uint32_t nLength, uint32_t rLength, uint32_t abComputeLength, float nRec) +{ + optiling::WelfordUpdateCustomTilingData tiling; + ComputeTiling(inplace, nLength, rLength, abComputeLength, nRec, tiling); + return GetTilingBuf(&tiling); +} \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/CMakeLists.txt b/examples/normalization/welford_update/kernel_launch_method_by_framework/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..584132d80993d309434fb1303de83910a1989aba --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/CMakeLists.txt @@ -0,0 +1,69 @@ +cmake_minimum_required(VERSION 3.16.0) +project(opp) +if(ENABLE_CROSS_COMPILE) + if(${CMAKE_SYSTEM_PROCESSOR} STREQUAL x86_64) + set(CROSS_COMPILE_PLATFORM aarch64) + else() + set(CROSS_COMPILE_PLATFORM x86_64) + endif() + set(PLATFORM 
${CMAKE_SYSTEM_PROCESSOR}) + set(CMAKE_COMPILE_COMPILER_LIBRARY ${ASCEND_CANN_PACKAGE_PATH}/${PLATFORM}-linux/devlib/linux/${CROSS_COMPILE_PLATFORM}/) + set(CMAKE_COMPILE_RUNTIME_LIBRARY ${ASCEND_CANN_PACKAGE_PATH}/${PLATFORM}-linux/devlib/${CROSS_COMPILE_PLATFORM}/) + set(CMAKE_SYSTEM_PROCESSOR ${CROSS_COMPILE_PLATFORM}) + set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) + set(CMAKE_CXX_COMPILER ${CMAKE_CROSS_PLATFORM_COMPILER}) +else() + set(CMAKE_COMPILE ${CMAKE_CXX_COMPILER}) +endif() + +include(cmake/config.cmake) +include(cmake/func.cmake) +include(cmake/intf.cmake) + +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/framework) + add_subdirectory(framework) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_host) + add_subdirectory(op_host) +endif() +if(EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel) + add_subdirectory(op_kernel) +endif() +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +# modify vendor_name in install.sh and upgrade.sh +add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh + COMMAND mkdir -p ${CMAKE_BINARY_DIR}/scripts + COMMAND cp -r ${CMAKE_SOURCE_DIR}/scripts/* ${CMAKE_BINARY_DIR}/scripts/ + COMMAND sed -i "s/vendor_name=customize/vendor_name=${vendor_name}/g" ${CMAKE_BINARY_DIR}/scripts/* +) +add_custom_target(modify_vendor ALL DEPENDS ${CMAKE_BINARY_DIR}/scripts/install.sh ${CMAKE_BINARY_DIR}/scripts/upgrade.sh) +install(DIRECTORY ${CMAKE_BINARY_DIR}/scripts/ DESTINATION . 
FILE_PERMISSIONS OWNER_EXECUTE OWNER_READ GROUP_READ) + +install(FILES ${CMAKE_SOURCE_DIR}/custom.proto DESTINATION packages OPTIONAL) + +get_system_info(SYSTEM_INFO) + +# gen version.info +add_custom_target(gen_version_info ALL + COMMAND bash ${CMAKE_CURRENT_SOURCE_DIR}/cmake/util/gen_version_info.sh ${ASCEND_CANN_PACKAGE_PATH} ${CMAKE_CURRENT_BINARY_DIR} +) + +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/version.info + DESTINATION packages/vendors/${vendor_name}/) + +# CPack config +set(CPACK_PACKAGE_NAME ${CMAKE_PROJECT_NAME}) +set(CPACK_PACKAGE_VERSION ${CMAKE_PROJECT_VERSION}) +set(CPACK_PACKAGE_DESCRIPTION "CPack opp project") +set(CPACK_PACKAGE_DESCRIPTION_SUMMARY "CPack opp project") +set(CPACK_PACKAGE_DIRECTORY ${CMAKE_INSTALL_PREFIX}) +set(CPACK_PACKAGE_FILE_NAME "custom_opp_${SYSTEM_INFO}.run") +set(CPACK_GENERATOR External) +set(CPACK_CMAKE_GENERATOR "Unix Makefiles") +set(CPACK_EXTERNAL_ENABLE_STAGING TRUE) +set(CPACK_EXTERNAL_PACKAGE_SCRIPT ${CMAKE_SOURCE_DIR}/cmake/makeself.cmake) +set(CPACK_EXTERNAL_BUILT_PACKAGES ${CPACK_PACKAGE_DIRECTORY}/_CPack_Packages/Linux/External/${CPACK_PACKAGE_FILE_NAME}/${CPACK_PACKAGE_FILE_NAME}) +include(CPack) diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/CMakePresets.json b/examples/normalization/welford_update/kernel_launch_method_by_framework/CMakePresets.json new file mode 100644 index 0000000000000000000000000000000000000000..e56e9011dff02062a1fe85fc32c85e0205c65b24 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/CMakePresets.json @@ -0,0 +1,63 @@ +{ + "version": 1, + "cmakeMinimumRequired": { + "major": 3, + "minor": 19, + "patch": 0 + }, + "configurePresets": [ + { + "name": "default", + "displayName": "Default Config", + "description": "Default build using Unix Makefiles generator", + "generator": "Unix Makefiles", + "binaryDir": "${sourceDir}/build_out", + "cacheVariables": { + "CMAKE_BUILD_TYPE": { + "type": "STRING", + "value": 
"Release" + }, + "ENABLE_SOURCE_PACKAGE": { + "type": "BOOL", + "value": "True" + }, + "ENABLE_BINARY_PACKAGE": { + "type": "BOOL", + "value": "True" + }, + "ASCEND_COMPUTE_UNIT": { + "type": "STRING", + "value": "ascend310p;ascend910b" + }, + "ENABLE_TEST": { + "type": "BOOL", + "value": "True" + }, + "vendor_name": { + "type": "STRING", + "value": "customize" + }, + "ASCEND_CANN_PACKAGE_PATH": { + "type": "PATH", + "value": "~/Ascend/ascend-toolkit/latest" + }, + "ASCEND_PYTHON_EXECUTABLE": { + "type": "STRING", + "value": "python3" + }, + "CMAKE_INSTALL_PREFIX": { + "type": "PATH", + "value": "${sourceDir}/build_out" + }, + "ENABLE_CROSS_COMPILE": { + "type": "BOOL", + "value": "False" + }, + "CMAKE_CROSS_PLATFORM_COMPILER": { + "type": "PATH", + "value": "/usr/bin/aarch64-linux-gnu-g++" + } + } + } + ] +} \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/README.md b/examples/normalization/welford_update/kernel_launch_method_by_framework/README.md new file mode 100644 index 0000000000000000000000000000000000000000..8b189f7f96961df1c95bd72f02cda3107b5774bc --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/README.md @@ -0,0 +1,80 @@ + + +## 概述 + +本样例基于自定义算子工程,介绍了调用WelfordUpdate高阶API实现welford_update单算子,主要演示WelfordUpdate高阶API在自定义算子工程中的调用。 + +## 目录结构 +| 目录 | 描述 | +|---------------------|----------------------| +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | host侧实现文件 | +| [op_kernel](./op_kernel) | kernel侧实现文件 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| [testcases](./testcases) | 包含cpu域以及npu域的用例主函数,以及真值校验函数 | +| build.sh | 编译运行算子的脚本 | +| CMakeLists.txt | 编译工程文件 | +| CMakePresets.json | 编译工程配置文件 | + +## 编译运行样例 + +## 1.配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN包的存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` +### 2.生成输入和真值 + ``` + python3 
scripts/gen_data.py + ``` + +### 3.编译算子工程 + + - 修改CMakePresets.json中ASCEND_CANN_PACKAGE_PATH为CANN软件包安装后的实际路径。 + + + ``` + { + …… + "configurePresets": [ + { + …… + "ASCEND_CANN_PACKAGE_PATH": { + "type": "PATH", + "value": "~/Ascend/ascend-toolkit/latest" //请替换为CANN软件包安装后的实际路径。eg:/home/HwHiAiUser/Ascend/ascend-toolkit/latest + }, + …… + } + ] + } + ``` + - 在当前算子工程目录下执行如下命令,进行算子工程编译。 + + ``` + bash build.sh + ``` + 编译成功后,会在当前目录下创建build_out目录,并在build_out目录下生成自定义算子安装包custom_opp_\_\.run,例如“custom_opp_ubuntu_x86_64.run”。 + + +### 4.部署算子包 + +执行如下命令,在自定义算子安装包所在路径下,安装自定义算子包。 + +``` +cd build_out +./custom_opp__.run +``` + +命令执行成功后,自定义算子包中的相关文件将部署至当前环境的OPP算子库的vendors/customize目录中。 + +### 5.执行样例 +在build_out目录下执行如下命令 + +``` +./welford_update_custom_npu +``` + +### 注意事项 +本样例工程会自动识别执行的硬件平台,无需单独设置SOC_VERSION \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/build.sh b/examples/normalization/welford_update/kernel_launch_method_by_framework/build.sh new file mode 100644 index 0000000000000000000000000000000000000000..6f3ab96208740ee1ddcd51c739a9adea2fe9bc52 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/build.sh @@ -0,0 +1,76 @@ +#!/bin/bash +script_path=$(realpath $(dirname $0)) + +source $ASCEND_HOME_DIR/bin/setenv.bash +cp -rf ../host_tiling/* op_host/ +ln -s $ASCEND_HOME_DIR/tools/op_project_templates/ascendc/customize/cmake/util/ ./cmake/util +mkdir -p build_out +rm -rf build_out/* +cd build_out + +opts=$(python3 $script_path/cmake/util/preset_parse.py $script_path/CMakePresets.json) +ENABLE_CROSS="-DENABLE_CROSS_COMPILE=True" +ENABLE_BINARY="-DENABLE_BINARY_PACKAGE=True" +cmake_version=$(cmake --version | grep "cmake version" | awk '{print $3}') + +cmake_run_package() +{ + target=$1 + cmake --build . --target $target -j16 + if [ $? -ne 0 ]; then exit 1; fi + + if [ $target = "package" ]; then + if test -d ./op_kernel/binary ; then + ./cust*.run + if [ $? 
-ne 0 ]; then exit 1; fi + cmake --build . --target binary -j16 + if [ $? -ne 0 ]; then exit 1; fi + cmake --build . --target $target -j16 + fi + fi +} + +if [[ $opts =~ $ENABLE_CROSS ]] && [[ $opts =~ $ENABLE_BINARY ]] +then + target=package + if [ "$1"x != ""x ]; then target=$1; fi + if [ "$cmake_version" \< "3.19.0" ] ; then + cmake .. $opts -DENABLE_CROSS_COMPILE=0 + else + cmake .. --preset=default -DENABLE_CROSS_COMPILE=0 + fi + cmake_run_package $target + cp -r kernel ../ + rm -rf * + if [ "$cmake_version" \< "3.19.0" ] ; then + cmake .. $opts + else + cmake .. --preset=default + fi + + cmake --build . --target $target -j16 + if [ $? -ne 0 ]; then exit 1; fi + if [ $target = "package" ]; then + if test -d ./op_kernel/binary ; then + ./cust*.run + fi + fi + rm -rf ../kernel + +else + target=package + if [ "$1"x != ""x ]; then target=$1; fi + if [ "$cmake_version" \< "3.19.0" ] ; then + cmake .. $opts + else + cmake .. --preset=default + fi + cmake_run_package $target +fi + + +# for debug +# cd build_out +# make +# cpack +# verbose append -v \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/config.cmake b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/config.cmake new file mode 100644 index 0000000000000000000000000000000000000000..886119daadd85495676c07dfb0b629e3deab8ccf --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/config.cmake @@ -0,0 +1,25 @@ + +set(CMAKE_CXX_FLAGS_DEBUG "") +set(CMAKE_CXX_FLAGS_RELEASE "") + +if (NOT DEFINED vendor_name) + set(vendor_name customize CACHE STRING "") +endif() +if (NOT DEFINED ASCEND_CANN_PACKAGE_PATH) + set(ASCEND_CANN_PACKAGE_PATH /usr/local/Ascend/latest CACHE PATH "") +endif() +if (NOT DEFINED ASCEND_PYTHON_EXECUTABLE) + set(ASCEND_PYTHON_EXECUTABLE python3 CACHE STRING "") +endif() +if (NOT DEFINED ASCEND_COMPUTE_UNIT) + message(FATAL_ERROR "ASCEND_COMPUTE_UNIT not set 
in CMakePreset.json ! +") +endif() +set(ASCEND_TENSOR_COMPILER_PATH ${ASCEND_CANN_PACKAGE_PATH}/compiler) +set(ASCEND_CCEC_COMPILER_PATH ${ASCEND_TENSOR_COMPILER_PATH}/ccec_compiler/bin) +set(ASCEND_AUTOGEN_PATH ${CMAKE_BINARY_DIR}/autogen) +set(ASCEND_FRAMEWORK_TYPE tensorflow) +file(MAKE_DIRECTORY ${ASCEND_AUTOGEN_PATH}) +set(CUSTOM_COMPILE_OPTIONS "custom_compile_options.ini") +execute_process(COMMAND rm -rf ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS} + COMMAND touch ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS}) diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/func.cmake b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/func.cmake new file mode 100644 index 0000000000000000000000000000000000000000..4179dfd25b41487d5aaf1ac95459543e26ab4fff --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/func.cmake @@ -0,0 +1,192 @@ + +function(get_system_info SYSTEM_INFO) + if (UNIX) + execute_process(COMMAND grep -i ^id= /etc/os-release OUTPUT_VARIABLE TEMP) + string(REGEX REPLACE "\n|id=|ID=|\"" "" SYSTEM_NAME ${TEMP}) + set(${SYSTEM_INFO} ${SYSTEM_NAME}_${CMAKE_SYSTEM_PROCESSOR} PARENT_SCOPE) + elseif (WIN32) + message(STATUS "System is Windows. 
Only for pre-build.") + else () + message(FATAL_ERROR "${CMAKE_SYSTEM_NAME} not support.") + endif () +endfunction() + +function(opbuild) + message(STATUS "Opbuild generating sources") + cmake_parse_arguments(OPBUILD "" "OUT_DIR;PROJECT_NAME;ACCESS_PREFIX" "OPS_SRC" ${ARGN}) + execute_process(COMMAND ${CMAKE_COMPILE} -g -fPIC -shared -std=c++11 ${OPBUILD_OPS_SRC} -D_GLIBCXX_USE_CXX11_ABI=0 + -I ${ASCEND_CANN_PACKAGE_PATH}/include -L ${ASCEND_CANN_PACKAGE_PATH}/lib64 -lexe_graph -lregister -ltiling_api + -o ${OPBUILD_OUT_DIR}/libascend_all_ops.so + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message("build ops lib info: ${EXEC_INFO}") + message("build ops lib error: ${EXEC_ERROR}") + message(FATAL_ERROR "opbuild run failed!") + endif() + set(proj_env "") + set(prefix_env "") + if (NOT "${OPBUILD_PROJECT_NAME}x" STREQUAL "x") + set(proj_env "OPS_PROJECT_NAME=${OPBUILD_PROJECT_NAME}") + endif() + if (NOT "${OPBUILD_ACCESS_PREFIX}x" STREQUAL "x") + set(prefix_env "OPS_DIRECT_ACCESS_PREFIX=${OPBUILD_ACCESS_PREFIX}") + endif() + execute_process(COMMAND ${proj_env} ${prefix_env} ${ASCEND_CANN_PACKAGE_PATH}/toolkit/tools/opbuild/op_build + ${OPBUILD_OUT_DIR}/libascend_all_ops.so ${OPBUILD_OUT_DIR} + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message("opbuild ops info: ${EXEC_INFO}") + message("opbuild ops error: ${EXEC_ERROR}") + endif() + message(STATUS "Opbuild generating sources - done") +endfunction() + +function(add_ops_info_target) + cmake_parse_arguments(OPINFO "" "TARGET;OPS_INFO;OUTPUT;INSTALL_DIR" "" ${ARGN}) + get_filename_component(opinfo_file_path "${OPINFO_OUTPUT}" DIRECTORY) + add_custom_command(OUTPUT ${OPINFO_OUTPUT} + COMMAND mkdir -p ${opinfo_file_path} + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/parse_ini_to_json.py + ${OPINFO_OPS_INFO} ${OPINFO_OUTPUT} + ) + add_custom_target(${OPINFO_TARGET} 
ALL + DEPENDS ${OPINFO_OUTPUT} + ) + install(FILES ${OPINFO_OUTPUT} + DESTINATION ${OPINFO_INSTALL_DIR} + ) +endfunction() + +function(add_ops_compile_options OP_TYPE) + cmake_parse_arguments(OP_COMPILE "" "OP_TYPE" "COMPUTE_UNIT;OPTIONS" ${ARGN}) + file(APPEND ${ASCEND_AUTOGEN_PATH}/${CUSTOM_COMPILE_OPTIONS} + "${OP_TYPE},${OP_COMPILE_COMPUTE_UNIT},${OP_COMPILE_OPTIONS}\n") +endfunction() + +function(add_ops_impl_target) + cmake_parse_arguments(OPIMPL "" "TARGET;OPS_INFO;IMPL_DIR;OUT_DIR;INSTALL_DIR" "OPS_BATCH;OPS_ITERATE" ${ARGN}) + add_custom_command(OUTPUT ${OPIMPL_OUT_DIR}/.impl_timestamp + COMMAND mkdir -m 700 -p ${OPIMPL_OUT_DIR}/dynamic + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_impl_build.py + ${OPIMPL_OPS_INFO} + \"${OPIMPL_OPS_BATCH}\" \"${OPIMPL_OPS_ITERATE}\" + ${OPIMPL_IMPL_DIR} + ${OPIMPL_OUT_DIR}/dynamic + ${ASCEND_AUTOGEN_PATH} + + COMMAND rm -rf ${OPIMPL_OUT_DIR}/.impl_timestamp + COMMAND touch ${OPIMPL_OUT_DIR}/.impl_timestamp + DEPENDS ${OPIMPL_OPS_INFO} + ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_impl_build.py + ) + add_custom_target(${OPIMPL_TARGET} ALL + DEPENDS ${OPIMPL_OUT_DIR}/.impl_timestamp) + if (${ENABLE_SOURCE_PACKAGE}) + install(DIRECTORY ${OPIMPL_OUT_DIR}/dynamic + DESTINATION ${OPIMPL_INSTALL_DIR} + ) + endif() +endfunction() + +function(add_npu_support_target) + cmake_parse_arguments(NPUSUP "" "TARGET;OPS_INFO_DIR;OUT_DIR;INSTALL_DIR" "" ${ARGN}) + get_filename_component(npu_sup_file_path "${NPUSUP_OUT_DIR}" DIRECTORY) + add_custom_command(OUTPUT ${NPUSUP_OUT_DIR}/npu_supported_ops.json + COMMAND mkdir -p ${NPUSUP_OUT_DIR} + COMMAND ${CMAKE_SOURCE_DIR}/cmake/util/gen_ops_filter.sh + ${NPUSUP_OPS_INFO_DIR} + ${NPUSUP_OUT_DIR} + ) + add_custom_target(npu_supported_ops ALL + DEPENDS ${NPUSUP_OUT_DIR}/npu_supported_ops.json + ) + install(FILES ${NPUSUP_OUT_DIR}/npu_supported_ops.json + DESTINATION ${NPUSUP_INSTALL_DIR} + ) +endfunction() + +function(add_bin_compile_target) + cmake_parse_arguments(BINCMP 
"" "TARGET;OPS_INFO;COMPUTE_UNIT;IMPL_DIR;ADP_DIR;OUT_DIR;INSTALL_DIR" "" ${ARGN}) + file(MAKE_DIRECTORY ${BINCMP_OUT_DIR}/src) + file(MAKE_DIRECTORY ${BINCMP_OUT_DIR}/bin) + file(MAKE_DIRECTORY ${BINCMP_OUT_DIR}/gen) + execute_process(COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_bin_param_build.py + ${BINCMP_OPS_INFO} ${BINCMP_OUT_DIR}/gen ${BINCMP_COMPUTE_UNIT} + RESULT_VARIABLE EXEC_RESULT + OUTPUT_VARIABLE EXEC_INFO + ERROR_VARIABLE EXEC_ERROR + ) + if (${EXEC_RESULT}) + message("ops binary compile scripts gen info: ${EXEC_INFO}") + message("ops binary compile scripts gen error: ${EXEC_ERROR}") + message(FATAL_ERROR "ops binary compile scripts gen failed!") + endif() + if (NOT TARGET binary) + add_custom_target(binary) + endif() + add_custom_target(${BINCMP_TARGET} + COMMAND cp -r ${BINCMP_IMPL_DIR}/*.* ${BINCMP_OUT_DIR}/src + ) + add_custom_target(${BINCMP_TARGET}_gen_ops_config + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/insert_simplified_keys.py -p ${BINCMP_OUT_DIR}/bin + COMMAND ${ASCEND_PYTHON_EXECUTABLE} ${CMAKE_SOURCE_DIR}/cmake/util/ascendc_ops_config.py -p ${BINCMP_OUT_DIR}/bin + -s ${BINCMP_COMPUTE_UNIT} + ) + add_dependencies(binary ${BINCMP_TARGET}_gen_ops_config) + file(GLOB bin_scripts ${BINCMP_OUT_DIR}/gen/*.sh) + foreach(bin_script ${bin_scripts}) + get_filename_component(bin_file ${bin_script} NAME_WE) + string(REPLACE "-" ";" bin_sep ${bin_file}) + list(GET bin_sep 0 op_type) + list(GET bin_sep 1 op_file) + list(GET bin_sep 2 op_index) + if (NOT TARGET ${BINCMP_TARGET}_${op_file}_copy) + file(MAKE_DIRECTORY ${BINCMP_OUT_DIR}/bin/${op_file}) + add_custom_target(${BINCMP_TARGET}_${op_file}_copy + COMMAND cp ${BINCMP_ADP_DIR}/${op_file}.py ${BINCMP_OUT_DIR}/src/${op_type}.py + ) + install(DIRECTORY ${BINCMP_OUT_DIR}/bin/${op_file} + DESTINATION ${BINCMP_INSTALL_DIR}/${BINCMP_COMPUTE_UNIT} OPTIONAL + ) + install(FILES ${BINCMP_OUT_DIR}/bin/${op_file}.json + DESTINATION 
${BINCMP_INSTALL_DIR}/config/${BINCMP_COMPUTE_UNIT}/ OPTIONAL + ) + endif() + add_custom_target(${BINCMP_TARGET}_${op_file}_${op_index} + COMMAND export HI_PYTHON=${ASCEND_PYTHON_EXECUTABLE} && bash ${bin_script} ${BINCMP_OUT_DIR}/src/${op_type}.py ${BINCMP_OUT_DIR}/bin/${op_file} && echo $(MAKE) + WORKING_DIRECTORY ${BINCMP_OUT_DIR} + ) + add_dependencies(${BINCMP_TARGET}_${op_file}_${op_index} ${BINCMP_TARGET} ${BINCMP_TARGET}_${op_file}_copy) + add_dependencies(${BINCMP_TARGET}_gen_ops_config ${BINCMP_TARGET}_${op_file}_${op_index}) + endforeach() + install(FILES ${BINCMP_OUT_DIR}/bin/binary_info_config.json + DESTINATION ${BINCMP_INSTALL_DIR}/config/${BINCMP_COMPUTE_UNIT} OPTIONAL + ) + + install(DIRECTORY ${BINCMP_OUT_DIR}/bin/${op_file} + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/../build_out/kernel/${BINCMP_COMPUTE_UNIT} OPTIONAL + ) + install(FILES ${BINCMP_OUT_DIR}/bin/binary_info_config.json + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/../build_out/kernel/config/${BINCMP_COMPUTE_UNIT} OPTIONAL + ) + install(FILES ${BINCMP_OUT_DIR}/bin/${op_file}.json + DESTINATION ${CMAKE_CURRENT_SOURCE_DIR}/../build_out/kernel/config/${BINCMP_COMPUTE_UNIT} OPTIONAL + ) + +endfunction() + +function(add_cross_compile_target) + cmake_parse_arguments(CROSSMP "" "TARGET;OUT_DIR;INSTALL_DIR" "" ${ARGN}) + add_custom_target(${CROSSMP_TARGET} ALL + DEPENDS ${CROSSMP_OUT_DIR} + ) + install(DIRECTORY ${CROSSMP_OUT_DIR} + DESTINATION ${CROSSMP_INSTALL_DIR} + ) +endfunction() diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/intf.cmake b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/intf.cmake new file mode 100644 index 0000000000000000000000000000000000000000..2f362c396622d66132f80f54492a8cc3204882fb --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/intf.cmake @@ -0,0 +1,26 @@ + +add_library(intf_pub INTERFACE) +target_compile_options(intf_pub INTERFACE + -fPIC + 
-fvisibility=hidden + -fvisibility-inlines-hidden + $<$:-O2> + $<$:-O0 -g> + $<$:-std=c++11> + $<$,$>:-ftrapv -fstack-check> + $<$:-pthread -Wfloat-equal -Wshadow -Wformat=2 -Wno-deprecated -Wextra> + $,-fstack-protector-strong,-fstack-protector-all> +) +target_compile_definitions(intf_pub INTERFACE + _GLIBCXX_USE_CXX11_ABI=0 + $<$:_FORTIFY_SOURCE=2> +) +target_include_directories(intf_pub INTERFACE ${ASCEND_CANN_PACKAGE_PATH}/include) +target_link_options(intf_pub INTERFACE + $<$,EXECUTABLE>:-pie> + $<$:-s> + -Wl,-z,relro + -Wl,-z,now + -Wl,-z,noexecstack +) +target_link_directories(intf_pub INTERFACE ${ASCEND_CANN_PACKAGE_PATH}/lib64) diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/makeself.cmake b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/makeself.cmake new file mode 100644 index 0000000000000000000000000000000000000000..48c565bfb4f2edc6534a81abaa8565c4cf2dfc30 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/cmake/makeself.cmake @@ -0,0 +1,17 @@ +execute_process(COMMAND chmod +x ${CMAKE_CURRENT_LIST_DIR}/util/makeself/makeself.sh) +execute_process(COMMAND ${CMAKE_CURRENT_LIST_DIR}/util/makeself/makeself.sh + --header ${CMAKE_CURRENT_LIST_DIR}/util/makeself/makeself-header.sh + --help-header ./help.info + --gzip --complevel 4 --nomd5 --sha256 + ./ ${CPACK_PACKAGE_FILE_NAME} "version:1.0" ./install.sh + WORKING_DIRECTORY ${CPACK_TEMPORARY_DIRECTORY} + RESULT_VARIABLE EXEC_RESULT + ERROR_VARIABLE EXEC_ERROR +) +if (NOT "${EXEC_RESULT}x" STREQUAL "0x") + message(FATAL_ERROR "CPack Command error: ${EXEC_RESULT}\n${EXEC_ERROR}") +endif() +execute_process(COMMAND cp ${CPACK_EXTERNAL_BUILT_PACKAGES} ${CPACK_PACKAGE_DIRECTORY}/ + COMMAND echo "Copy ${CPACK_EXTERNAL_BUILT_PACKAGES} to ${CPACK_PACKAGE_DIRECTORY}/" + WORKING_DIRECTORY ${CPACK_TEMPORARY_DIRECTORY} +) diff --git 
a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/CMakeLists.txt b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..40dd51cfac524b0a9607b7d8b2813edd2210c509 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/CMakeLists.txt @@ -0,0 +1,82 @@ + +aux_source_directory(${CMAKE_CURRENT_SOURCE_DIR} ops_srcs) + +opbuild(OPS_SRC ${ops_srcs} + OUT_DIR ${ASCEND_AUTOGEN_PATH} +) + +add_library(cust_op_proto SHARED ${ops_srcs} ${ASCEND_AUTOGEN_PATH}/op_proto.cc) +target_compile_definitions(cust_op_proto PRIVATE OP_PROTO_LIB) +target_compile_options(cust_op_proto PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_op_proto PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_op_proto PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_op_proto PROPERTIES OUTPUT_NAME + cust_opsproto_rt2.0 +) +add_library(cust_optiling SHARED ${ops_srcs}) +target_compile_definitions(cust_optiling PRIVATE OP_TILING_LIB) +target_compile_options(cust_optiling PRIVATE + -fvisibility=hidden +) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_optiling PRIVATE + ${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_optiling PRIVATE + intf_pub + exe_graph + register + tiling_api + -Wl,--whole-archive + rt2_registry + -Wl,--no-whole-archive +) +set_target_properties(cust_optiling PROPERTIES OUTPUT_NAME + cust_opmaster_rt2.0 +) + +file(GLOB aclnn_src ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) +file(GLOB aclnn_inc ${ASCEND_AUTOGEN_PATH}/aclnn_*.h) +add_library(cust_opapi SHARED ${aclnn_src}) +if(ENABLE_CROSS_COMPILE) + target_link_directories(cust_opapi PRIVATE + 
${CMAKE_COMPILE_COMPILER_LIBRARY} + ${CMAKE_COMPILE_RUNTIME_LIBRARY} + ) +endif() +target_link_libraries(cust_opapi PRIVATE intf_pub ascendcl nnopbase) + +add_custom_target(optiling_compat ALL + COMMAND ln -sf lib/linux/${CMAKE_SYSTEM_PROCESSOR}/$ + ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so +) + +install(TARGETS cust_op_proto + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_proto/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${ASCEND_AUTOGEN_PATH}/op_proto.h + DESTINATION packages/vendors/${vendor_name}/op_proto/inc) +install(TARGETS cust_optiling + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling/lib/linux/${CMAKE_SYSTEM_PROCESSOR}) +install(FILES ${CMAKE_CURRENT_BINARY_DIR}/liboptiling.so + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/op_tiling) +install(TARGETS cust_opapi + LIBRARY DESTINATION packages/vendors/${vendor_name}/op_api/lib) +install(FILES ${aclnn_inc} + DESTINATION packages/vendors/${vendor_name}/op_api/include) diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom.cpp b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5de999d7cc70620900f7b6ae3850695e1af26e8c --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom.cpp @@ -0,0 +1,84 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. 
+ * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "welford_update_custom_tiling.h" +#include "register/op_def_registry.h" + +namespace optiling { +constexpr uint32_t BLOCK_DIM = 1; +constexpr bool ISINPLACE = true; +constexpr uint8_t RN_SIZE = 1; +constexpr uint32_t AB_SIZE = 64; +constexpr uint32_t AB_LENGTH = 35; +constexpr float NREC = 1.0 / 8; + +static ge::graphStatus TilingFunc(gert::TilingContext *context) +{ + WelfordUpdateCustomTilingData tiling; + ComputeTiling(ISINPLACE, RN_SIZE, AB_SIZE, AB_LENGTH, NREC, tiling); + + context->SetBlockDim(BLOCK_DIM); + context->SetTilingKey(1); + tiling.SaveToBuffer(context->GetRawTilingData()->GetData(), context->GetRawTilingData()->GetCapacity()); + context->GetRawTilingData()->SetDataSize(tiling.GetDataSize()); + return ge::GRAPH_SUCCESS; +} +} + +namespace ge { +static ge::graphStatus InferShape(gert::InferShapeContext *context) +{ + const gert::Shape *x1_shape = context->GetInputShape(0); + gert::Shape *y_shape = context->GetOutputShape(0); + *y_shape = *x1_shape; + return GRAPH_SUCCESS; +} +} + +namespace ops { +class WelfordUpdateCustom : public OpDef { +public: + explicit WelfordUpdateCustom(const char *name) : OpDef(name) + { + this->Input("srcGm") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT16 }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + this->Input("inMeanGm") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + this->Input("inVarGm") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + + this->Output("outMeanGm") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT }) + 
.Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + this->Output("outVarGm") + .ParamType(REQUIRED) + .DataType({ ge::DT_FLOAT }) + .Format({ ge::FORMAT_ND }) + .UnknownShapeFormat({ ge::FORMAT_ND }); + + this->SetInferShape(ge::InferShape); + this->AICore().SetTiling(optiling::TilingFunc); + this->AICore().AddConfig("ascend910b"); + this->AICore().AddConfig("ascend310p"); + } +}; + +OP_ADD(WelfordUpdateCustom); +} diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..ef4335a3f3c62e6f66b142877586f84db95a22fc --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_host/welford_update_custom_tiling.h @@ -0,0 +1,53 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_TILING_H +#define EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" +#include "tiling/platform/platform_ascendc.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(WelfordUpdateCustomTilingData) + TILING_DATA_FIELD_DEF(bool, inplace); + TILING_DATA_FIELD_DEF(uint32_t, nLength); + TILING_DATA_FIELD_DEF(uint32_t, rLength); + TILING_DATA_FIELD_DEF(uint32_t, abComputeLength); + TILING_DATA_FIELD_DEF(float, nRec); + TILING_DATA_FIELD_DEF(uint32_t, tmpLocalSize); +END_TILING_DATA_DEF; +REGISTER_TILING_DATA_CLASS(WelfordUpdateCustom, WelfordUpdateCustomTilingData) +} // namespace optiling + +constexpr bool ISREUSESOURCE = false; +constexpr bool ISINPLACE = true; + +void ComputeTiling(bool inplace, uint32_t nLength, uint32_t rLength, uint32_t abComputeLength, + float nRec, optiling::WelfordUpdateCustomTilingData &tiling) +{ + std::vector shapeVec = {nLength, rLength}; + ge::Shape srcShape(shapeVec); + uint32_t maxsize = 0; + uint32_t minsize = 0; + uint32_t dtypesizeT = 2; // half类型 + uint32_t dtypesizeU = 4; // float类型 + + tiling.set_inplace(inplace); + tiling.set_nLength(nLength); + tiling.set_rLength(rLength); + tiling.set_abComputeLength(abComputeLength); + tiling.set_nRec(nRec); + + AscendC::GetWelfordUpdateMaxMinTmpSize(srcShape, dtypesizeT, dtypesizeU, ISREUSESOURCE, ISINPLACE, maxsize, + minsize); + tiling.set_tmpLocalSize(minsize); +} + +#endif // EXAMPLES_NORMALIZATION_WELFORDUPDATE_CUSTOM_TILING_H diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_kernel/CMakeLists.txt b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_kernel/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..c50a409a20bd0e0cce495824295a18799e4f8be1 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_kernel/CMakeLists.txt @@ -0,0 
+1,69 @@ +# set custom compile options +if ("${CMAKE_BUILD_TYPE}x" STREQUAL "Debugx") + add_ops_compile_options(ALL OPTIONS -g -O0) +endif() +add_ops_compile_options(ALL OPTIONS -mllvm -cce-aicore-jump-expand=true) + +foreach(compute_unit ${ASCEND_COMPUTE_UNIT}) + + # generate aic-${compute_unit}-ops-info.json + add_ops_info_target(TARGET ops_info_gen_${compute_unit} + OUTPUT ${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core/${compute_unit}/aic-${compute_unit}-ops-info.json + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/config/${compute_unit} + ) + + # generate ascendc impl py once + if (NOT TARGET ascendc_impl_gen) + add_ops_impl_target(TARGET ascendc_impl_gen + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR} + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl + ) + endif() + + # dynamic shape binary compile + if (${ENABLE_BINARY_PACKAGE} AND NOT ${ENABLE_CROSS_COMPILE}) + add_bin_compile_target(TARGET ascendc_bin_${compute_unit} + OPS_INFO ${ASCEND_AUTOGEN_PATH}/aic-${compute_unit}-ops-info.ini + IMPL_DIR ${CMAKE_CURRENT_SOURCE_DIR} + ADP_DIR ${CMAKE_CURRENT_BINARY_DIR}/tbe/dynamic + OUT_DIR ${CMAKE_CURRENT_BINARY_DIR}/binary/${compute_unit} + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/kernel + COMPUTE_UNIT ${compute_unit} + ) + add_dependencies(ascendc_bin_${compute_unit} ascendc_impl_gen) + endif() + + if (${ENABLE_CROSS_COMPILE} AND ${ENABLE_BINARY_PACKAGE}) + add_cross_compile_target( + TARGET bin_${compute_unit} + OUT_DIR ${CMAKE_CURRENT_SOURCE_DIR}/../kernel + INSTALL_DIR packages/vendors/${vendor_name}/op_impl/ai_core/tbe/ + ) + endif() +endforeach() + +# generate npu_supported_ops.json +add_npu_support_target(TARGET npu_supported_ops + OPS_INFO_DIR ${ASCEND_AUTOGEN_PATH} + OUT_DIR 
${CMAKE_CURRENT_BINARY_DIR}/tbe/op_info_cfg/ai_core + INSTALL_DIR packages/vendors/${vendor_name}/framework/${ASCEND_FRAMEWORK_TYPE} +) + +if(ENABLE_TEST AND EXISTS ${CMAKE_CURRENT_SOURCE_DIR}/testcases) + add_subdirectory(testcases) +endif() + +# install kernel file +if (${ENABLE_SOURCE_PACKAGE}) + file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/*.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/*.h + ${CMAKE_CURRENT_SOURCE_DIR}/*.py + ) + install(FILES ${KERNEL_FILES} + DESTINATION packages/vendors/${vendor_name}/op_impl/ai_core/tbe/${vendor_name}_impl/dynamic + ) +endif() diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/op_kernel/welford_update_custom.cpp b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_kernel/welford_update_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d0fdc799bc4e3c499ccc418b8bbe79bb94f67c99 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/op_kernel/welford_update_custom.cpp @@ -0,0 +1,23 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "../../../../../../kernel_impl/welford_update_custom.h" + +extern "C" __global__ __aicore__ void welford_update_custom(GM_ADDR srcGm, GM_ADDR inMeanGm, GM_ADDR inVarGm, + GM_ADDR outMeanGm, GM_ADDR outVarGm, GM_ADDR workspace, GM_ADDR tiling) +{ + GET_TILING_DATA(tilingData, tiling); + MyCustomKernel::VecTiling vecTiling = *reinterpret_cast(&tilingData); + if (TILING_KEY_IS(1)) { + MyCustomKernel::KernelWelfordUpdate op; + op.Init(srcGm, inMeanGm, inVarGm, outMeanGm, outVarGm, vecTiling); + op.Process(); + } +} diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/gen_data.py b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..f65af46cbbeeeb50bf8f8c9e0a727761512c7a31 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/gen_data.py @@ -0,0 +1,42 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2024 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +RN_SIZE = 1 +AB_SIZE = 64 +AB_LENGTH = 35 +NREC = 1.0 / 8 + +def gen_golden_data_simple(): + x1 = np.random.uniform(1, 100, [RN_SIZE * AB_SIZE]).astype(np.float16) + x2 = np.random.uniform(-60000, 60000, [RN_SIZE * AB_SIZE]).astype(np.float32) + x3 = np.random.uniform(0, 60000, [RN_SIZE * AB_SIZE]).astype(np.float32) + golden1 = x2.copy() + golden2 = x3.copy() + + for i in range(AB_LENGTH): + n = np.float32(NREC) + golden1[i] = x2[i] + (x1[i] - x2[i]) * n + golden2[i] = x3[i] + (x1[i] - x2[i]) * (x1[i] - golden1[i]) + + os.system("mkdir -p ./input") + x1.tofile("./input/input_srcGm.bin") + x2.tofile("./input/input_inMeanGm.bin") + x3.tofile("./input/input_inVarGm.bin") + os.system("mkdir -p ./output") + golden1.tofile("./output/golden_outMeanGm.bin") + golden2.tofile("./output/golden_outVarGm.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/help.info b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/help.info new file mode 100644 index 0000000000000000000000000000000000000000..f4b28d57a8150f0df6c386473b7554c7d087c90f --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/help.info @@ -0,0 +1 @@ + --install-path Install operator package to specific dir path \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/install.sh b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/install.sh new file mode 100644 index 0000000000000000000000000000000000000000..8468c5a256f2c77fad5bf78ab108ca5b62aad672 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/install.sh @@ -0,0 +1,318 @@ +#!/bin/bash 
+vendor_name=customize +targetdir=/usr/local/Ascend/opp +target_custom=0 + +sourcedir=$PWD/packages +vendordir=vendors/$vendor_name + +QUIET="y" + +while true +do + case $1 in + --quiet) + QUIET="y" + shift + ;; + --install-path=*) + INSTALL_PATH=$(echo $1 | cut -d"=" -f2-) + INSTALL_PATH=${INSTALL_PATH%*/} + shift + ;; + --*) + shift + ;; + *) + break + ;; + esac +done + +log() { + cur_date=`date +"%Y-%m-%d %H:%M:%S"` + echo "[runtime] [$cur_date] "$1 +} + +if [ -n "${INSTALL_PATH}" ]; then + if [[ ! "${INSTALL_PATH}" = /* ]]; then + log "[ERROR] use absolute path for --install-path argument" + exit 1 + fi + if [ ! -d ${INSTALL_PATH} ]; then + mkdir ${INSTALL_PATH} >> /dev/null 2>&1 + if [ $? -ne 0 ]; then + log "[ERROR] create ${INSTALL_PATH} failed" + exit 1 + fi + fi + targetdir=${INSTALL_PATH} +elif [ -n "${ASCEND_CUSTOM_OPP_PATH}" ]; then + if [ ! -d ${ASCEND_CUSTOM_OPP_PATH} ]; then + mkdir -p ${ASCEND_CUSTOM_OPP_PATH} >> /dev/null 2>&1 + if [ $? -ne 0 ]; then + log "[ERROR] create ${ASCEND_CUSTOM_OPP_PATH} failed" + fi + fi + targetdir=${ASCEND_CUSTOM_OPP_PATH} +else + if [ "x${ASCEND_OPP_PATH}" == "x" ]; then + log "[ERROR] env ASCEND_OPP_PATH no exist" + exit 1 + fi + targetdir="${ASCEND_OPP_PATH}" +fi + +if [ ! -d $targetdir ];then + log "[ERROR] $targetdir no exist" + exit 1 +fi + +upgrade() +{ + if [ ! -d ${sourcedir}/$vendordir/$1 ]; then + log "[INFO] no need to upgrade ops $1 files" + return 0 + fi + + if [ ! -d ${targetdir}/$vendordir/$1 ];then + log "[INFO] create ${targetdir}/$vendordir/$1." + mkdir -p ${targetdir}/$vendordir/$1 + if [ $? -ne 0 ];then + log "[ERROR] create ${targetdir}/$vendordir/$1 failed" + return 1 + fi + else + has_same_file=-1 + for file_a in ${sourcedir}/$vendordir/$1/*; do + file_b=${file_a##*/}; + if [ "ls ${targetdir}/$vendordir/$1" = "" ]; then + log "[INFO] ${targetdir}/$vendordir/$1 is empty !!" + return 1 + fi + grep -q $file_b <<<`ls ${targetdir}/$vendordir/$1`; + if [[ $? 
-eq 0 ]]; then + echo -n "${file_b} " + has_same_file=0 + fi + done + if [ 0 -eq $has_same_file ]; then + if test $QUIET = "n"; then + echo "[INFO]: has old version in ${targetdir}/$vendordir/$1, \ + you want to Overlay Installation , please enter:[o]; \ + or replace directory installation , please enter: [r]; \ + or not install , please enter:[n]." + + while true + do + read orn + if [ "$orn" = n ]; then + return 0 + elif [ "$orn" = m ]; then + break; + elif [ "$0rn" = r ]; then + [ -n "${targetdir}/$vendordir/$1/" ] && rm -rf "${targetdir}/$vendordir/$1"/* + break; + else + echo "[ERROR] input error, please input again!" + fi + done + fi + fi + log "[INFO] replace or merge old ops $1 files .g....." + fi + + log "copy new ops $1 files ......" + if [ -d ${targetdir}/$vendordir/$1/ ]; then + chmod -R +w "$targetdir/$vendordir/$1/" >/dev/null 2>&1 + fi + cp -rf ${sourcedir}/$vendordir/$1/* $targetdir/$vendordir/$1/ + if [ $? -ne 0 ];then + log "[ERROR] copy new $1 files failed" + return 1 + fi + + return 0 +} +upgrade_proto() +{ + if [ ! -f ${sourcedir}/$vendordir/custom.proto ]; then + log "[INFO] no need to upgrade custom.proto files" + return 0 + fi + if [ ! -d ${targetdir}/$vendordir/framework/caffe ];then + log "[INFO] create ${targetdir}/$vendordir/framework/caffe." + mkdir -p ${targetdir}/$vendordir/framework/caffe + if [ $? -ne 0 ];then + log "[ERROR] create ${targetdir}/$vendordir/framework/caffe failed" + return 1 + fi + else + if [ -f ${targetdir}/$vendordir/framework/caffe/custom.proto ]; then + # 有老版本,判断是否要覆盖式安装 + if test $QUIET = "n"; then + echo "[INFO] ${targetdir}/$vendordir/framework/caffe has old version"\ + "custom.proto file. Do you want to replace? [y/n] " + + while true + do + read yn + if [ "$yn" = n ]; then + return 0 + elif [ "$yn" = y ]; then + break; + else + echo "[ERROR] input error, please input again!" + fi + done + fi + fi + log "[INFO] replace old caffe.proto files ......" 
+ fi + chmod -R +w "$targetdir/$vendordir/framework/caffe/" >/dev/null 2>&1 + cp -rf ${sourcedir}/$vendordir/custom.proto ${targetdir}/$vendordir/framework/caffe/ + if [ $? -ne 0 ];then + log "[ERROR] copy new custom.proto failed" + return 1 + fi + log "[INFO] copy custom.proto success" + + return 0 +} + +upgrade_file() +{ + if [ ! -e ${sourcedir}/$vendordir/$1 ]; then + log "[INFO] no need to upgrade ops $1 file" + return 0 + fi + + log "copy new $1 files ......" + cp -f ${sourcedir}/$vendordir/$1 $targetdir/$vendordir/$1 + if [ $? -ne 0 ];then + log "[ERROR] copy new $1 file failed" + return 1 + fi + + return 0 +} + +delete_optiling_file() +{ + if [ ! -d ${targetdir}/vendors ];then + log "[INFO] $1 not exist, no need to uninstall" + return 0 + fi + sys_info=$(uname -m) + if [ ! -d ${sourcedir}/$vendordir/$1/ai_core/tbe/op_tiling/lib/linux/${sys_info} ];then + rm -rf ${sourcedir}/$vendordir/$1/ai_core/tbe/op_tiling/liboptiling.so + fi + return 0 +} + +log "[INFO] copy uninstall sh success" + +if [ ! -d ${targetdir}/vendors ];then + log "[INFO] create ${targetdir}/vendors." + mkdir -p ${targetdir}/vendors + if [ $? -ne 0 ];then + log "[ERROR] create ${targetdir}/vendors failed" + return 1 + fi +fi +chmod u+w ${targetdir}/vendors + +echo "[ops_custom]upgrade framework" +upgrade framework +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade op proto" +upgrade op_proto +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade version.info" +upgrade_file version.info +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade op impl" +delete_optiling_file op_impl +upgrade op_impl +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade op api" +upgrade op_api +if [ $? -ne 0 ];then + exit 1 +fi + +upgrade_proto +if [ $? 
-ne 0 ];then + exit 1 +fi + +# set the set_env.bash +if [ -n "${INSTALL_PATH}" ] && [ -d ${INSTALL_PATH} ]; then + _ASCEND_CUSTOM_OPP_PATH=${targetdir}/${vendordir} + bin_path="${_ASCEND_CUSTOM_OPP_PATH}/bin" + set_env_variable="#!/bin/bash\nexport ASCEND_CUSTOM_OPP_PATH=${_ASCEND_CUSTOM_OPP_PATH}:\${ASCEND_CUSTOM_OPP_PATH}" + if [ ! -d ${bin_path} ]; then + mkdir -p ${bin_path} >> /dev/null 2>&1 + if [ $? -ne 0 ]; then + log "[ERROR] create ${bin_path} failed" + exit 1 + fi + fi + echo -e ${set_env_variable} > ${bin_path}/set_env.bash + if [ $? -ne 0 ]; then + log "[ERROR] write ASCEND_CUSTOM_OPP_PATH to set_env.bash failed" + exit 1 + else + log "[INFO] using requirements: when custom module install finished or before you run the custom module, \ + execute the command [ source ${bin_path}/set_env.bash ] to set the environment path" + fi +else + config_file=${targetdir}/vendors/config.ini + if [ ! -f ${config_file} ]; then + touch ${config_file} + chmod 640 ${config_file} + echo "load_priority=$vendor_name" > ${config_file} + if [ $? 
-ne 0 ];then + echo "echo load_priority failed" + exit 1 + fi + else + found_vendors="$(grep -w "load_priority" "$config_file" | cut --only-delimited -d"=" -f2-)" + found_vendor=$(echo $found_vendors | sed "s/$vendor_name//g" | tr ',' ' ') + vendor=$(echo $found_vendor | tr -s ' ' ',') + if [ "$vendor" != "" ]; then + sed -i "/load_priority=$found_vendors/s@load_priority=$found_vendors@load_priority=$vendor_name,$vendor@g" "$config_file" + fi + fi +fi + +chmod u-w ${targetdir}/vendors + +if [ -d ${targetdir}/$vendordir/op_impl/cpu/aicpu_kernel/impl/ ]; then + chmod -R 440 ${targetdir}/$vendordir/op_impl/cpu/aicpu_kernel/impl/* >/dev/null 2>&1 +fi +if [ -f ${targetdir}/ascend_install.info ]; then + chmod -R 440 ${targetdir}/ascend_install.info +fi +if [ -f ${targetdir}/scene.info ]; then + chmod -R 440 ${targetdir}/scene.info +fi +if [ -f ${targetdir}/version.info ]; then + chmod -R 440 ${targetdir}/version.info +fi + +echo "SUCCESS" +exit 0 + diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/upgrade.sh b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/upgrade.sh new file mode 100644 index 0000000000000000000000000000000000000000..e091734858534a6aa10bb5204b87302438004926 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/scripts/upgrade.sh @@ -0,0 +1,151 @@ +#!/bin/bash +vendor_name=customize +targetdir=/usr/local/Ascend/opp +target_custom=0 + +sourcedir=$PWD/packages +vendordir=vendors/$vendor_name + +log() { + cur_date=`date +"%Y-%m-%d %H:%M:%S"` + echo "[runtime] [$cur_date] "$1 +} + +if [[ "x${ASCEND_OPP_PATH}" == "x" ]];then + log "[ERROR] env ASCEND_OPP_PATH no exist" + exit 1 +fi + +targetdir=${ASCEND_OPP_PATH} + +if [ ! -d $targetdir ];then + log "[ERROR] $targetdir no exist" + exit 1 +fi + +upgrade() +{ + if [ ! -d ${sourcedir}/$vendordir/$1 ]; then + log "[INFO] no need to upgrade ops $1 files" + return 0 + fi + + if [ ! 
-d ${targetdir}/$vendordir/$1 ];then + log "[INFO] create ${targetdir}/$vendordir/$1." + mkdir -p ${targetdir}/$vendordir/$1 + if [ $? -ne 0 ];then + log "[ERROR] create ${targetdir}/$vendordir/$1 failed" + return 1 + fi + else + vendor_installed_dir=$(ls "$targetdir/vendors" 2> /dev/null) + for i in $vendor_installed_dir;do + vendor_installed_file=$(ls "$vendor_installed_dir/$vendor_name/$i" 2> /dev/null) + if [ "$i" = "$vendor_name" ] && [ "$vendor_installed_file" != "" ]; then + echo "[INFO]: $vendor_name custom opp package has been installed on the path $vendor_installed_dir, \ + you want to Overlay Installation , please enter:[o]; \ + or replace directory installation , please enter: [r]; \ + or not install , please enter:[n]." + fi + while true + do + read mrn + if [ "$mrn" = m ]; then + break + elif [ "$mrn" = r ]; then + [ -n "$vendor_installed_file"] && rm -rf "$vendor_installed_file" + break + elif [ "$mrn" = n ]; then + return 0 + else + echo "[WARNING]: Input error, please input m or r or n to choose!" + fi + done + done + log "[INFO] replace old ops $1 files ......" + fi + + log "copy new ops $1 files ......" + cp -rf ${sourcedir}/$vendordir/$1/* $targetdir/$vendordir/$1/ + if [ $? -ne 0 ];then + log "[ERROR] copy new $1 files failed" + return 1 + fi + + return 0 +} + +upgrade_file() +{ + if [ ! -e ${sourcedir}/$vendordir/$1 ]; then + log "[INFO] no need to upgrade ops $1 file" + return 0 + fi + + log "copy new $1 files ......" + cp -f ${sourcedir}/$vendordir/$1 $targetdir/$vendordir/$1 + if [ $? -ne 0 ];then + log "[ERROR] copy new $1 file failed" + return 1 + fi + + return 0 +} + +log "[INFO] copy uninstall sh success" + +echo "[ops_custom]upgrade framework" +upgrade framework +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade op proto" +upgrade op_proto +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade op impl" +upgrade op_impl +if [ $? -ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade op api" +upgrade op_api +if [ $? 
-ne 0 ];then + exit 1 +fi + +echo "[ops_custom]upgrade version.info" +upgrade_file version.info +if [ $? -ne 0 ];then + exit 1 +fi + +config_file=${targetdir}/vendors/config.ini +found_vendors="$(grep -w "load_priority" "$config_file" | cut --only-delimited -d"=" -f2-)" +found_vendor=$(echo $found_vendors | sed "s/$vendor_name//g" | tr ',' ' ') +vendor=$(echo $found_vendor | tr -s ' ' ',') +if [ "$vendor" != "" ]; then + sed -i "/load_priority=$found_vendors/s@load_priority=$found_vendors@load_priority=$vendor_name,$vendor@g" "$config_file" +fi + +changemode() +{ + if [ -d ${targetdir} ];then + chmod -R 550 ${targetdir}>/dev/null 2>&1 + fi + + return 0 +} +echo "[ops_custom]changemode..." +#changemode +if [ $? -ne 0 ];then + exit 1 +fi + +echo "SUCCESS" +exit 0 + diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/CMakeLists.txt b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..8d2d11c90ac38d0774fa9db32716a6c1e22bd3b5 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/CMakeLists.txt @@ -0,0 +1,2 @@ +include(cmake/fun.cmake) +add_subdirectory(npu) \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/cmake/fun.cmake b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/cmake/fun.cmake new file mode 100644 index 0000000000000000000000000000000000000000..024e26303a128ee4d8edb90b1d8a735a9851f4d7 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/cmake/fun.cmake @@ -0,0 +1,53 @@ + +set(UPER_CHARS A B C D E F G H I J K L M N O P Q R S T U V W X Y Z) +function(string_to_snake str_in snake_out) + set(str_cam ${str_in}) + foreach(uper_char ${UPER_CHARS}) + string(TOLOWER "${uper_char}" lower_char) + string(REPLACE 
${uper_char} "_${lower_char}" str_cam ${str_cam}) + endforeach() + string(SUBSTRING ${str_cam} 1 -1 str_cam) + set(${snake_out} "${str_cam}" PARENT_SCOPE) +endfunction() + +function(add_cpu_target) + cmake_parse_arguments(CPU_TEST "" "OP" "SRC" ${ARGN}) + string_to_snake("${CPU_TEST_OP}" op_snake) + add_custom_command(OUTPUT ${CMAKE_CURRENT_SOURCE_DIR}/${op_snake}_tiling.h + COMMAND python3 ${CMAKE_SOURCE_DIR}/cmake/util/tiling_data_def_build.py + ${CMAKE_SOURCE_DIR}/op_host/${op_snake}_tiling.h + ${CMAKE_CURRENT_SOURCE_DIR}/${op_snake}_tiling.h + DEPENDS ${CMAKE_SOURCE_DIR}/op_host/${op_snake}_tiling.h + ) + add_custom_target(gen_${op_snake}_tiling_header + DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/${op_snake}_tiling.h + ) + + add_executable(${op_snake}_cpu ${CPU_TEST_SRC}) + add_dependencies(${op_snake}_cpu gen_${op_snake}_tiling_header) + target_compile_options(${op_snake}_cpu PRIVATE -g -include ${CMAKE_CURRENT_SOURCE_DIR}/${op_snake}_tiling.h) + target_link_libraries(${op_snake}_cpu PRIVATE tikicpulib::ascend910B1) + set_target_properties(${op_snake}_cpu PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} + ) +endfunction() + +function(add_npu_target) + cmake_parse_arguments(NPU_TEST "" "OP" "SRC" ${ARGN}) + string_to_snake("${NPU_TEST_OP}" op_snake) + add_executable(${op_snake}_npu ${NPU_TEST_SRC}) + target_compile_options(${op_snake}_npu PRIVATE -g) + target_include_directories(${op_snake}_npu PRIVATE + ${ASCEND_CANN_PACKAGE_PATH}/include/acl + ${ASCEND_AUTOGEN_PATH} + ) + target_link_libraries(${op_snake}_npu PRIVATE + intf_pub + cust_opapi + ascendcl + nnopbase + ) + set_target_properties(${op_snake}_npu PROPERTIES + RUNTIME_OUTPUT_DIRECTORY ${CMAKE_BINARY_DIR} + ) +endfunction() diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/npu/CMakeLists.txt b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/npu/CMakeLists.txt new file mode 100644 index 
0000000000000000000000000000000000000000..6ea029137762737c155a94fd918508eb66cbab36 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/npu/CMakeLists.txt @@ -0,0 +1,10 @@ +add_npu_target(OP WelfordUpdateCustom SRC welford_update_custom_main.cpp) + +add_custom_target(run_npu_test + COMMAND echo "===============================================================================" + COMMAND echo " Run NPU test at ${CMAKE_CURRENT_BINARY_DIR}" + COMMAND echo "===============================================================================" + COMMAND $ + COMMAND echo "===============================================================================" + ) +add_dependencies(run_npu_test welford_update_custom_npu) \ No newline at end of file diff --git a/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/npu/welford_update_custom_main.cpp b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/npu/welford_update_custom_main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e7ab1b3d528895d9c766d71cfb7ce46b77d21d87 --- /dev/null +++ b/examples/normalization/welford_update/kernel_launch_method_by_framework/testcases/npu/welford_update_custom_main.cpp @@ -0,0 +1,215 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
#include <cstdint> +#include <string> +#include "acl/acl_rt.h"
(reinterpret_cast<const float *>(outputData))[i]; +        float b = (reinterpret_cast<const float *>(goldenData))[i]; +        float ae = std::abs(a - b); +        float re = ae / std::abs(b);
aclCreateTensor(info->dims, info->dimCnt, info->dtype, NULL, 0, info->fmt, info->dims, info->dimCnt, data); + } + + size_t workspaceSize = 0; + aclOpExecutor *handle; + int32_t ret = 0; + ret = aclnnWelfordUpdateCustomGetWorkspaceSize(tensors[0], tensors[1], tensors[INDEX_IN_VAR], + tensors[INDEX_OUT_MEAN], tensors[INDEX_OUT_VAR], &workspaceSize, &handle); + printf("aclnnWelfordUpdateCustomGetWorkspaceSize ret %u workspace size %lu\n", ret, workspaceSize); + void *workspace = nullptr; + if (workspaceSize != 0) { + CHECK_ACL(aclrtMalloc(&workspace, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); + } + ret = aclnnWelfordUpdateCustom(workspace, workspaceSize, handle, stream); + printf("aclnnWelfordUpdateCustom ret %u\n", ret); + if (aclrtSynchronizeStreamWithTimeout(stream, TIMEOUT) != ACL_SUCCESS) { + printf("Synchronize stream failed\n"); + } + + uint8_t *outMeanHost, *outVarHost; + int64_t outMeanHostSize = GetDataSize(&(tensorDesc[INDEX_OUT_MEAN])); + int64_t outVarHostSize = GetDataSize(&(tensorDesc[INDEX_OUT_VAR])); + + CHECK_ACL(aclrtMallocHost((void **)(&outMeanHost), outMeanHostSize)); + CHECK_ACL(aclrtMallocHost((void **)(&outVarHost), outVarHostSize)); + + CHECK_ACL(aclrtMemcpy(outMeanHost, outMeanHostSize, devMem[INDEX_OUT_MEAN], outMeanHostSize, + ACL_MEMCPY_DEVICE_TO_HOST)); + CHECK_ACL(aclrtMemcpy(outVarHost, outVarHostSize, devMem[INDEX_OUT_VAR], outVarHostSize, + ACL_MEMCPY_DEVICE_TO_HOST)); + + WriteFile("../output/output_outMeanGm.bin", outMeanHost, outMeanHostSize); + WriteFile("../output/output_outVarGm.bin", outVarHost, outVarHostSize); + + bool goldenResult = true; + goldenResult &= CompareResult(outMeanHost, outMeanHostSize, ParamNames[INDEX_OUT_MEAN]); + goldenResult &= CompareResult(outVarHost, outVarHostSize, ParamNames[INDEX_OUT_VAR]); + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + + CHECK_ACL(aclrtFreeHost(outMeanHost)); + CHECK_ACL(aclrtFreeHost(outVarHost)); + + for (auto i = 0; i < 
sizeof(tensorDesc) / sizeof(struct tensorInfo); i++) { + if (!tensors[i]) + continue; + if (devMem[i]) { + CHECK_ACL(aclrtFree(devMem[i])); + } + aclDestroyTensor(tensors[i]); + } + DestroyStream(stream, 0); + return 0; +} diff --git a/examples/readme.md b/examples/readme.md index b87070f404d9ffeffbb4d5393921e2845774bdd2..ebd14ef84f1cd01cc563983f63bba218aa8ecda6 100644 --- a/examples/readme.md +++ b/examples/readme.md @@ -53,6 +53,10 @@ layernorm_grad 计算layernorm的反向传播梯度。 + + welford_update + Welford算法的前处理,一种在线计算均值和方差的方法。 + pad broadcast diff --git a/impl/kfc/kernel_kfc.h b/impl/kfc/kernel_kfc.h index 5972e904a095bd1876c52e0fae158cacf7fc1876..5f7ad2beeb46cdbb3281ae09c1f1245763a4b33b 100644 --- a/impl/kfc/kernel_kfc.h +++ b/impl/kfc/kernel_kfc.h @@ -25,7 +25,7 @@ #include "lib/matmul/matmul_client.h" #include "../matmul/matmul_server.h" #endif -namespace Gemm { +namespace AscendC { constexpr uint16_t WORKSPACE_SYNC_ID = 15; __aicore__ inline void clearWorkspace(__gm__ uint8_t* workspace) { @@ -296,5 +296,5 @@ __aicore__ inline void SetMatrixKfc(TPipe* pipe, KfcCommClient* kfcClient, const } }; // namespace AscendC // Compatible with the previously used matmul namespace -namespace matmul = Gemm; +namespace matmul = AscendC; #endif diff --git a/impl/kfc/kfc_register_obj.h b/impl/kfc/kfc_register_obj.h index aee06e79e21e58d270f28718e7b851321505ef67..38a32c37f1184b2837c6c5680ec254b9748de9ac 100644 --- a/impl/kfc/kfc_register_obj.h +++ b/impl/kfc/kfc_register_obj.h @@ -205,7 +205,7 @@ __aicore__ inline void InitCurObj(AscendC::TPipe* tpipe, T& a, Args&&... 
b) AscendC::SetMatrixKfc(tpipe, &__kfcClient__, 0, workspace, __VA_ARGS__); \ AscendC::AscendCTimeStamp(static_cast(AscendC::TimeStampId::TIME_STAMP_MATMUL_MATRIX_KFC)); \ if constexpr (!asEnableMixDualMaster) { \ - AscendC::WaitEvent(Gemm::WORKSPACE_SYNC_ID); \ + AscendC::WaitEvent(AscendC::WORKSPACE_SYNC_ID); \ } \ AscendC::AscendCTimeStamp(static_cast(AscendC::TimeStampId::TIME_STAMP_MATMUL_WAIT_EVE)) @@ -224,7 +224,7 @@ __aicore__ inline void InitCurObj(AscendC::TPipe* tpipe, T& a, Args&&... b) } \ AscendC::SetMatrixKfc(tpipe, &__kfcClient__, 0, workspace, __VA_ARGS__); \ if constexpr (!asEnableMixDualMaster) { \ - AscendC::WaitEvent(Gemm::WORKSPACE_SYNC_ID); \ + AscendC::WaitEvent(AscendC::WORKSPACE_SYNC_ID); \ } #endif diff --git a/impl/matmul/matmul_call_back.h b/impl/matmul/matmul_call_back.h index 9796aa31abb66cd1672bf9819e678deb70786e6a..12914ecd07f72749ce6283f3ee31e2e8e4791073 100644 --- a/impl/matmul/matmul_call_back.h +++ b/impl/matmul/matmul_call_back.h @@ -15,8 +15,8 @@ #ifndef LIB_MATMUL_MATMUL_CALL_BACK_H #define LIB_MATMUL_MATMUL_CALL_BACK_H -namespace Gemm { -using namespace AscendC; +namespace AscendC { + template &co1Local, const void *dataCopyOutParams, const uint64_t tilingPtr, const uint64_t dataPtr) = nullptr, void (*CopyA1)(const LocalTensor &aMatrix, const __gm__ void *gm, int row, int col, int useM, int useK, @@ -32,5 +32,5 @@ struct MatmulCallBackFunc { int useK, int useN, const uint64_t tilingPtr, const uint64_t dataPtr) = CopyB1; }; -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/matmul_constant_tiling_impl.h b/impl/matmul/matmul_constant_tiling_impl.h index 31f09d14fc14db5298b061ded439d6f357b90a7d..a9050d6fddd3f061dbef440eab4873f2b2873d15 100644 --- a/impl/matmul/matmul_constant_tiling_impl.h +++ b/impl/matmul/matmul_constant_tiling_impl.h @@ -19,9 +19,8 @@ #include "matmul_utils.h" #include "kernel_operator.h" -namespace Gemm { -using namespace AscendC; - +namespace AscendC { +namespace Impl { constexpr 
int32_t C0_BYTE_SIZE = 32; constexpr int32_t HW_C0 = 16; constexpr int32_t DB_ON = 2; @@ -36,6 +35,7 @@ constexpr int32_t L1_SIZE = 512 * 1024; #else constexpr int32_t L1_SIZE = 512 * 1024; #endif +} template struct MatmulTiling { @@ -446,17 +446,17 @@ __aicore__ inline constexpr MatmulConfig ToMatmulConfig(const MatmulApiStaticTil template __aicore__ constexpr int32_t GetReduceC0Size() { - return C0_BYTE_SIZE / GetBitSize() * ONE_BYTE_BIT_SIZE; + return Impl::C0_BYTE_SIZE / GetBitSize() * ONE_BYTE_BIT_SIZE; } __aicore__ constexpr int32_t GetML0(const MatmulConfig &mmCFG) { - return CeilNoLog(mmCFG.basicM, HW_C0); + return CeilNoLog(mmCFG.basicM, Impl::HW_C0); } __aicore__ constexpr int32_t GetNL0(const MatmulConfig &mmCFG) { - return CeilNoLog(mmCFG.basicN, HW_C0); + return CeilNoLog(mmCFG.basicN, Impl::HW_C0); } template @@ -472,19 +472,19 @@ __aicore__ constexpr int32_t GetMTE1Loop(const MatmulConfig &mmCFG) int32_t nL0 = GetNL0(mmCFG); int32_t mL0 = GetML0(mmCFG); int32_t kL0 = GetKL0(mmCFG); - return MIN_MTE1_LOAD / ((nL0 == 1 ? 1 : kL0) + (kL0 == 1 ? 1 : mL0)); + return Impl::MIN_MTE1_LOAD / ((nL0 == 1 ? 1 : kL0) + (kL0 == 1 ? 
1 : mL0)); } __aicore__ constexpr int32_t GetMaxMAL1(const MatmulConfig &mmCFG) { - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); int32_t mL0 = GetML0(mmCFG); return CeilNoLog(m, mL0); } __aicore__ constexpr int32_t GetMaxNBL1(const MatmulConfig &mmCFG) { - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); int32_t nL0 = GetNL0(mmCFG); return CeilNoLog(n, nL0); } @@ -494,7 +494,7 @@ __aicore__ constexpr int32_t GetMaxKAL1(const MatmulConfig &mmCFG) { int32_t mL0 = GetML0(mmCFG); int32_t kL0 = GetKL0(mmCFG); - int32_t maxAL1 = ((MIN_MTE1_LOAD + mL0 - 1) / mL0 + kL0 - 1) / kL0; + int32_t maxAL1 = ((Impl::MIN_MTE1_LOAD + mL0 - 1) / mL0 + kL0 - 1) / kL0; return MaxValue(maxAL1, GetMTE1Loop(mmCFG)); } @@ -503,7 +503,7 @@ __aicore__ constexpr int32_t GetMaxKBL1(const MatmulConfig &mmCFG) { int32_t nL0 = GetNL0(mmCFG); int32_t kL0 = GetKL0(mmCFG); - int32_t maxBL1 = ((MIN_MTE1_LOAD + nL0 - 1) / nL0 + kL0 - 1) / kL0; + int32_t maxBL1 = ((Impl::MIN_MTE1_LOAD + nL0 - 1) / nL0 + kL0 - 1) / kL0; return MaxValue(maxBL1, GetMTE1Loop(mmCFG)); } @@ -613,19 +613,19 @@ __aicore__ constexpr int32_t CalcL1MaxLen(int32_t l1LeftSize, const L1Status &l1 int32_t maxLen = 1; switch (type) { case L1TilingType::KAL1_16: - maxLen = l1LeftSize / (l1Status.mAL1 * mmCFG.basicM * l1Status.dbAL1 * C0_BYTE_SIZE); + maxLen = l1LeftSize / (l1Status.mAL1 * mmCFG.basicM * l1Status.dbAL1 * Impl::C0_BYTE_SIZE); maxLen = AlignDown(maxLen, alignValue); break; case L1TilingType::KBL1_16: - maxLen = l1LeftSize / (l1Status.nBL1 * mmCFG.basicN * l1Status.dbBL1 * C0_BYTE_SIZE); + maxLen = l1LeftSize / (l1Status.nBL1 * mmCFG.basicN * l1Status.dbBL1 * Impl::C0_BYTE_SIZE); maxLen = AlignDown(maxLen, alignValue); break; case L1TilingType::M_AL1: - maxLen = l1LeftSize / (Align(l1Status.kAL1, alignValue) * mmCFG.basicM * l1Status.dbAL1 * C0_BYTE_SIZE); + maxLen = l1LeftSize / (Align(l1Status.kAL1, 
alignValue) * mmCFG.basicM * l1Status.dbAL1 * Impl::C0_BYTE_SIZE); break; case L1TilingType::N_BL1: - maxLen = l1LeftSize / (Align(l1Status.kBL1, alignValue) * mmCFG.basicN * l1Status.dbBL1 * C0_BYTE_SIZE + - GetChannelWise(mmCFG) * mmCFG.basicN * C0_BYTE_SIZE); + maxLen = l1LeftSize / (Align(l1Status.kBL1, alignValue) * mmCFG.basicN * l1Status.dbBL1 * Impl::C0_BYTE_SIZE + + GetChannelWise(mmCFG) * mmCFG.basicN * Impl::C0_BYTE_SIZE); break; } return maxLen; @@ -643,8 +643,8 @@ __aicore__ constexpr L1Status GetL1StatusBothFullLoad(const MatmulConfig &mmCFG, return {kAL1, kBL1, 1, 1, 1, 1, 0}; } L1Status l1Status {kAL1, kBL1, GetMaxMAL1(mmCFG), GetMaxNBL1(mmCFG), 1, 1, INT32_MAX}; - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); if (GetL1Size(l1Status, mmCFG) <= l1Size) { int32_t loadSize = (PhyPosIsL1(A_TYPE::pos) ? 0 : m) + (PhyPosIsL1(B_TYPE::pos) ? 0 : n); @@ -675,13 +675,13 @@ __aicore__ constexpr L1Status GetL1StatusAL1FullLoad(const MatmulConfig &mmCFG, return {0, 0, 0, 0, 0, 0, INT32_MAX}; } int32_t kaAlignValue = GetKAAlignValue(); - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); int32_t aL1Size = MaxValue(Align(k, kaAlignValue), Align(kAL1, kaAlignValue)) * - MaxValue(maxMAL1 * mmCFG.basicM, m * HW_C0) * C0_BYTE_SIZE; + MaxValue(maxMAL1 * mmCFG.basicM, m * Impl::HW_C0) * Impl::C0_BYTE_SIZE; int32_t bL1Size = PhyPosIsL1(A_TYPE::pos) ? 
l1Size : l1Size - aL1Size; - l1Status.dbBL1 = DB_ON; + l1Status.dbBL1 = Impl::DB_ON; if (GetL1Size(l1Status, mmCFG) > l1Size) { - l1Status.dbBL1 = DB_OFF; + l1Status.dbBL1 = Impl::DB_OFF; } int32_t biasSize = GetBiasL1Size(l1Status, mmCFG); int32_t dequantSize = GetDeQuantSize(l1Status, mmCFG); @@ -698,13 +698,13 @@ __aicore__ constexpr L1Status GetL1StatusAL1FullLoad(const MatmulConfig &mmCFG, int32_t nRepeat = CeilNoLog(mmCFG.singleCoreN, mmCFG.basicN); l1Status.nBL1 = GetNearestFactor(nRepeat, l1Status.nBL1); if (l1Status.nBL1 * mmCFG.basicN == mmCFG.singleCoreN) { - l1Status.dbBL1 = DB_OFF; + l1Status.dbBL1 = Impl::DB_OFF; } } bool invalidL1Status = (l1Status.nBL1 == 0 || l1Status.kBL1 == 0); int32_t mRepeat = CeilNoLog(mmCFG.singleCoreM, mmCFG.basicM); int32_t possibleMRepeat = (l1Status.kBL1 == k) ? 1 : mRepeat; - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); l1Status.loadSize = invalidL1Status ? INT32_MAX : (PhyPosIsL1(A_TYPE::pos) ? 0 : m) + possibleMRepeat * n; return l1Status; } @@ -722,13 +722,13 @@ __aicore__ constexpr L1Status GetL1StatusBL1FullLoad(const MatmulConfig &mmCFG, return {0, 0, 0, 0, 0, 0, INT32_MAX}; } int32_t kbAlignValue = GetKBAlignValue(); - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); int32_t bL1Size = MaxValue(Align(k, kbAlignValue), Align(kBL1, kbAlignValue)) * - MaxValue(maxNBL1 * mmCFG.basicN, n * HW_C0) * C0_BYTE_SIZE; + MaxValue(maxNBL1 * mmCFG.basicN, n * Impl::HW_C0) * Impl::C0_BYTE_SIZE; int32_t aL1Size = PhyPosIsL1(B_TYPE::pos) ? 
l1Size : l1Size - bL1Size; - l1Status.dbAL1 = DB_ON; + l1Status.dbAL1 = Impl::DB_ON; if (GetL1Size(l1Status, mmCFG) > l1Size) { - l1Status.dbAL1 = DB_OFF; + l1Status.dbAL1 = Impl::DB_OFF; } int32_t biasSize = GetBiasL1Size(l1Status, mmCFG); int32_t dequantSize = GetDeQuantSize(l1Status, mmCFG); @@ -743,13 +743,13 @@ __aicore__ constexpr L1Status GetL1StatusBL1FullLoad(const MatmulConfig &mmCFG, int32_t mRepeat = CeilNoLog(mmCFG.singleCoreM, mmCFG.basicM); l1Status.mAL1 = GetNearestFactor(mRepeat, l1Status.mAL1); if (l1Status.mAL1 * mmCFG.basicM == mmCFG.singleCoreM) { - l1Status.dbAL1 = DB_OFF; + l1Status.dbAL1 = Impl::DB_OFF; } } bool invalidL1Status = (l1Status.mAL1 == 0 || l1Status.kAL1 == 0); int32_t nRepeat = CeilNoLog(mmCFG.singleCoreN, mmCFG.basicN); int32_t possibleNRepeat = (l1Status.kAL1 == k) ? 1 : nRepeat; - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); l1Status.loadSize = invalidL1Status ? INT32_MAX : (PhyPosIsL1(B_TYPE::pos) ? 
0 : n) + possibleNRepeat * m; return l1Status; } @@ -791,8 +791,8 @@ __aicore__ constexpr L1Status GetL1StatusMFirst(const L1Status &l1Status, const l1MFirst, mmCFG, kbAlignValue, L1TilingType::N_BL1), GetMaxNBL1(mmCFG), nRepeat), 1); l1MFirst.nBL1 = GetNearestFactor(mRepeat, l1MFirst.nBL1); int32_t mL0 = GetML0(mmCFG); - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); l1MFirst.loadSize = m + n * CeilNoLog(m, l1MFirst.mAL1 * mL0); return l1MFirst; } @@ -819,8 +819,8 @@ __aicore__ constexpr L1Status GetL1StatusNFirst(const L1Status &l1Status, const l1NFirst.mAL1 = GetNearestFactor(mRepeat, l1NFirst.mAL1); l1NFirst.nBL1 = GetNearestFactor(mRepeat, l1NFirst.nBL1); int32_t nL0 = GetNL0(mmCFG); - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); l1NFirst.loadSize = n + m * CeilNoLog(n, l1NFirst.nBL1 * nL0); return l1NFirst; } @@ -832,26 +832,26 @@ __aicore__ constexpr L1Status GetL1DbNeitherFullLoad(const MatmulConfig &mmCFG, int32_t reduceC0Size = GetReduceC0Size(); int32_t k = CeilNoLog(mmCFG.singleCoreK, reduceC0Size); int32_t kL0 = GetKL0(mmCFG); - L1Status l1Status {kL0, DB_ON, 1, 1, DB_ON, DB_ON, 0}; + L1Status l1Status {kL0, Impl::DB_ON, 1, 1, Impl::DB_ON, Impl::DB_ON, 0}; if (GetL1Size(l1Status, mmCFG) > l1Size) { - l1Status.dbBL1 = DB_OFF; + l1Status.dbBL1 = Impl::DB_OFF; if (GetL1Size(l1Status, mmCFG) > l1Size) { - l1Status.dbAL1 = DB_OFF; + l1Status.dbAL1 = Impl::DB_OFF; } } l1Status.kBL1 = k; - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); int32_t mL0 = GetML0(mmCFG); bool bothDoubleBuffer = m != mL0 && mmCFG.singleCoreK > mmCFG.basicK && GetL1Size(l1Status, mmCFG) > l1Size; 
l1Status.kBL1 = kL0; if (bothDoubleBuffer) { - l1Status.dbAL1 = DB_ON; - l1Status.dbBL1 = DB_ON; + l1Status.dbAL1 = Impl::DB_ON; + l1Status.dbBL1 = Impl::DB_ON; if (GetL1Size(l1Status, mmCFG) > l1Size) { - l1Status.dbBL1 = DB_OFF; + l1Status.dbBL1 = Impl::DB_OFF; if (GetL1Size(l1Status, mmCFG) > l1Size) { - l1Status.dbAL1 = DB_OFF; + l1Status.dbAL1 = Impl::DB_OFF; } } } @@ -882,8 +882,8 @@ __aicore__ constexpr L1Status GetKL1NeitherFullLoadForNZ(const L1Status &l1Nz, } else { // when NeitherFullLoadMN change the nBL1 and mAL1 int32_t perK = MinValue((l1Size - biasSize - dequantSize) / - (mmCFG.basicM * C0_BYTE_SIZE * l1Status.dbAL1 + - mmCFG.basicN * C0_BYTE_SIZE * l1Status.dbBL1) / + (mmCFG.basicM * Impl::C0_BYTE_SIZE * l1Status.dbAL1 + + mmCFG.basicN * Impl::C0_BYTE_SIZE * l1Status.dbBL1) / kL0 * kL0, k); const int32_t aAlignedPerK = Align(perK, kaAlignValue); const int32_t bAlignedPerK = Align(perK, kbAlignValue); @@ -928,10 +928,10 @@ __aicore__ constexpr L1Status GetKL1NeitherFullLoad(const L1Status &l1Db, int32_t bL1Size = GetBL1Size(l1Status, mmCFG); int32_t aL1Size = l1Size - bL1Size; l1Status.kAL1 = MinValue((aL1Size - biasSize - dequantSize) / - (l1Status.mAL1 * mmCFG.basicM * l1Status.dbAL1 * C0_BYTE_SIZE), k); + (l1Status.mAL1 * mmCFG.basicM * l1Status.dbAL1 * Impl::C0_BYTE_SIZE), k); aL1Times = MaxValue(l1Status.kAL1 / kL0, 1); l1Status.kAL1 = aL1Times * kL0; - aL1Size = l1Status.kAL1 * l1Status.mAL1 * mmCFG.basicM * C0_BYTE_SIZE * l1Status.dbAL1; + aL1Size = l1Status.kAL1 * l1Status.mAL1 * mmCFG.basicM * Impl::C0_BYTE_SIZE * l1Status.dbAL1; bL1Size = l1Size - aL1Size; l1Status.kBL1 = MinValue((bL1Size - dequantSize - biasSize) / (l1Status.nBL1 * mmCFG.basicN * l1Status.dbBL1 * mmCFG.basicK * kL0 * GetBitSize() / ONE_BYTE_BIT_SIZE), k); @@ -944,11 +944,11 @@ __aicore__ constexpr L1Status GetKL1NeitherFullLoad(const L1Status &l1Db, int32_t aL1Size = GetAL1Size(l1Status, mmCFG); int32_t bL1Size = l1Size - aL1Size; l1Status.kBL1 = MinValue((bL1Size - 
biasSize - dequantSize) / - (l1Status.nBL1 * mmCFG.basicN * l1Status.dbBL1 * C0_BYTE_SIZE), k); + (l1Status.nBL1 * mmCFG.basicN * l1Status.dbBL1 * Impl::C0_BYTE_SIZE), k); int32_t bL1Times = MaxValue(l1Status.kBL1 / kL0, 1); bL1Times = GetNearestFactor(aL1Times, bL1Times); l1Status.kBL1 = bL1Times * kL0; - bL1Size = l1Status.kBL1 * l1Status.nBL1 * mmCFG.basicN * C0_BYTE_SIZE * l1Status.dbBL1; + bL1Size = l1Status.kBL1 * l1Status.nBL1 * mmCFG.basicN * Impl::C0_BYTE_SIZE * l1Status.dbBL1; aL1Size = l1Size - bL1Size; l1Status.kAL1 = MinValue((aL1Size - dequantSize - biasSize) / (l1Status.mAL1 * mmCFG.basicM * l1Status.dbAL1 * mmCFG.basicK * kL0 * GetBitSize() / ONE_BYTE_BIT_SIZE), k); @@ -999,8 +999,8 @@ __aicore__ constexpr L1Status GetL1StatusNeitherFullLoad(const MatmulConfig &mmC if (l1Status.kAL1 < k && l1Status.kBL1 < k) { l1Status.mAL1 = 1; l1Status.nBL1 = 1; - int32_t m = CeilNoLog(mmCFG.singleCoreM, HW_C0); - int32_t n = CeilNoLog(mmCFG.singleCoreN, HW_C0); + int32_t m = CeilNoLog(mmCFG.singleCoreM, Impl::HW_C0); + int32_t n = CeilNoLog(mmCFG.singleCoreN, Impl::HW_C0); int32_t nL0 = GetNL0(mmCFG); l1Status.loadSize = m * CeilNoLog(n, nL0) + n * CeilNoLog(m, nL0); } @@ -1077,12 +1077,12 @@ __aicore__ constexpr int32_t GetIterateOrder(const L1Status &l1Status, const Mat __aicore__ constexpr int32_t GetL0ADb(const MatmulConfig &mmCFG, uint32_t l0ASize) { - return (mmCFG.basicM * C0_BYTE_SIZE > l0ASize / DB_ON) ? DB_OFF : DB_ON; + return (mmCFG.basicM * Impl::C0_BYTE_SIZE > l0ASize / Impl::DB_ON) ? Impl::DB_OFF : Impl::DB_ON; } __aicore__ constexpr int32_t GetL0BDb(const MatmulConfig &mmCFG, uint32_t l0BSize) { - return (mmCFG.basicN * C0_BYTE_SIZE > l0BSize / DB_ON) ? DB_OFF : DB_ON; + return (mmCFG.basicN * Impl::C0_BYTE_SIZE > l0BSize / Impl::DB_ON) ? 
Impl::DB_OFF : Impl::DB_ON; } template @@ -1139,5 +1139,5 @@ __aicore__ constexpr int32_t GetTransLength(const MatmulConfig &mmCFG, const L1S } return MaxValue(a1Length, b1Length, c1Length, biasLength); } -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MATMUL_CONSTANT_TILING_IMPL_H \ No newline at end of file diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index 1fd7b996146a265d07610403d5cbce44cd566eee..2e7ebcb1c01a3d500660fc1c605abb59bad86c8e 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -20,9 +20,10 @@ #include "../../impl/matmul/modules/matmul_module.h" #include "../../impl/matmul/modules/matmul_param.h" #include "../../impl/matmul/matmul_macro_def.h" -namespace Gemm { - +namespace AscendC { +namespace Impl { constexpr int32_t DOUBLE_SIZE = 2; +} template , MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> @@ -641,13 +642,13 @@ __aicore__ inline void MatmulImplBase= 220 if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { - lenFactor = DOUBLE_SIZE; + lenFactor = Impl::DOUBLE_SIZE; } #endif MATMUL_MODULE(CubeOutBuffer)->Init(var.baseMN_, lenFactor); @@ -1053,7 +1054,7 @@ __aicore__ inline void MatmulImplBaseInitBuffer(var.qidBias_, 1, DOUBLE_SIZE * var.tiling_.GetBaseN() * sizeof(BiasT)); + var.tpipe_->InitBuffer(var.qidBias_, 1, Impl::DOUBLE_SIZE * var.tiling_.GetBaseN() * sizeof(BiasT)); } else { var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_.GetBaseN() * sizeof(BiasT)); } @@ -1175,7 +1176,7 @@ __aicore__ inline void MatmulImplBase= 220 if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { - lenFactor = DOUBLE_SIZE; + lenFactor = Impl::DOUBLE_SIZE; } #endif MATMUL_MODULE(CubeOutBuffer)->Init(var.baseMN_, lenFactor); @@ -1192,7 +1193,7 @@ __aicore__ inline void MatmulImplBaseInitBuffer(var.qidBias_, 1, DOUBLE_SIZE * var.tiling_.GetBaseN() * sizeof(BiasT)); + var.tpipe_->InitBuffer(var.qidBias_, 1, Impl::DOUBLE_SIZE * var.tiling_.GetBaseN() * 
sizeof(BiasT)); } else { var.tpipe_->InitBuffer(var.qidBias_, 1, var.tiling_.GetBaseN() * sizeof(BiasT)); } @@ -2973,8 +2974,8 @@ template __aicore__ inline void MatmulImplBase::ComputeNormWithMdb(int kInner) { - int dbLoop = (var.curM_ + 1 == var.mIter_) ? 1 : DOUBLE_SIZE; - MatmulInstr::useL0PingPong_ = (dbLoop == DOUBLE_SIZE) ? 1 : 0; + int dbLoop = (var.curM_ + 1 == var.mIter_) ? 1 : Impl::DOUBLE_SIZE; + MatmulInstr::useL0PingPong_ = (dbLoop == Impl::DOUBLE_SIZE) ? 1 : 0; LocalTensor bias; bool isBiasEnable = false; if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) { @@ -2989,7 +2990,7 @@ __aicore__ inline void MatmulImplBaseLoadData(kInner, var.curN_, var.baseUseK_, var.baseUseN_); for (int dbInner = 0; dbInner < dbLoop; dbInner++) { if (dbInner > 0) { - if (var.curM_ + DOUBLE_SIZE == var.mIter_) { + if (var.curM_ + Impl::DOUBLE_SIZE == var.mIter_) { // if tailM_ != baseM, reset sAL1M_ and sMadM_ dbUsedM = var.tailM_; MatmulInstr::sAL1M_ = CeilAlign(dbUsedM, BLOCK_CUBE); @@ -3044,8 +3045,8 @@ template __aicore__ inline void MatmulImplBase::ComputeNormWithNdb(int kInner) { - int dbLoop = (var.curN_ + 1 == var.nIter_) ? 1 : DOUBLE_SIZE; - MatmulInstr::useL0PingPong_ = (dbLoop == DOUBLE_SIZE) ? 1 : 0; + int dbLoop = (var.curN_ + 1 == var.nIter_) ? 1 : Impl::DOUBLE_SIZE; + MatmulInstr::useL0PingPong_ = (dbLoop == Impl::DOUBLE_SIZE) ? 
1 : 0; LocalTensor bias; bool isBiasEnable = false; if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) { @@ -3060,7 +3061,7 @@ __aicore__ inline void MatmulImplBaseLoadData(var.curM_, kInner, var.baseUseM_, var.baseUseK_); for (int dbInner = 0; dbInner < dbLoop; dbInner++) { if (dbInner > 0) { - if (var.curN_ + DOUBLE_SIZE == var.nIter_) { + if (var.curN_ + Impl::DOUBLE_SIZE == var.nIter_) { // if tailN_ != baseN, reset sBL1N_ and sMadN_ dbUsedN = var.tailN_; MatmulInstr::sBL1N_ = CeilAlign(dbUsedN, BLOCK_CUBE); @@ -6697,5 +6698,5 @@ __aicore__ inline void MatmulImplBase; }; -} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_MACRO_DEF_H_ \ No newline at end of file diff --git a/impl/matmul/matmul_macro_utils.h b/impl/matmul/matmul_macro_utils.h index b18a5f993df2293d7d5bc9309551196cf7a4b368..7abb77b8abb9e1325ce91007d27605615df1418e 100644 --- a/impl/matmul/matmul_macro_utils.h +++ b/impl/matmul/matmul_macro_utils.h @@ -15,6 +15,8 @@ #ifndef IMPL_MATMUL_MATMUL_MACRO_UTILS_H #define IMPL_MATMUL_MATMUL_MACRO_UTILS_H +namespace AscendC { +namespace Impl { #define HW_N0 16 #define HW_M0 16 #define ALIGN_NUM 16 @@ -35,7 +37,8 @@ constexpr int32_t SHIFT_48_BIT = 48; constexpr int32_t SHIFT_56_BIT = 56; constexpr int32_t CTRL_51_BIT = 51; constexpr uint8_t padList[4] = {0, 0, 0, 0}; -namespace Gemm { +} + __aicore__ inline uint16_t CeilDiv(uint16_t num1, uint16_t num2) { ASSERT(num2 > 0); @@ -65,9 +68,9 @@ template __aicore__ inline constexpr bool IsL0ACache() { if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { - return ToMatmulConfig(MM_CFG).basicM * ToMatmulConfig(MM_CFG).basicK * sizeof(A_TYPE) * DB_FACTOR <= L0AUF_SIZE; + return ToMatmulConfig(MM_CFG).basicM * ToMatmulConfig(MM_CFG).basicK * sizeof(A_TYPE) * Impl::DB_FACTOR <= L0AUF_SIZE; } else { - return ToMatmulConfig(MM_CFG).singleCoreK <= ToMatmulConfig(MM_CFG).basicK * DB_FACTOR; + return ToMatmulConfig(MM_CFG).singleCoreK <= ToMatmulConfig(MM_CFG).basicK * 
Impl::DB_FACTOR; } } @@ -75,9 +78,9 @@ template __aicore__ inline constexpr bool IsL0BCache() { if constexpr (ToMatmulConfig(MM_CFG).scheduleType == ScheduleType::OUTER_PRODUCT) { - return ToMatmulConfig(MM_CFG).basicK * ToMatmulConfig(MM_CFG).basicN * sizeof(B_TYPE) * DB_FACTOR <= L0BUF_SIZE; + return ToMatmulConfig(MM_CFG).basicK * ToMatmulConfig(MM_CFG).basicN * sizeof(B_TYPE) * Impl::DB_FACTOR <= L0BUF_SIZE; } else { - return ToMatmulConfig(MM_CFG).singleCoreK <= ToMatmulConfig(MM_CFG).basicK * DB_FACTOR; + return ToMatmulConfig(MM_CFG).singleCoreK <= ToMatmulConfig(MM_CFG).basicK * Impl::DB_FACTOR; } } @@ -96,5 +99,5 @@ __aicore__ inline constexpr bool IsL0Cache() } return IsL0ACache() || IsL0BCache(); } -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/matmul_macro_v200_impl.h b/impl/matmul/matmul_macro_v200_impl.h index 64646f546fc8851b5bacf0eedd42ba45e0084e32..f20dba135451c4a27ad88a19b89c2c991236698c 100644 --- a/impl/matmul/matmul_macro_v200_impl.h +++ b/impl/matmul/matmul_macro_v200_impl.h @@ -18,9 +18,7 @@ #include "kernel_operator.h" #include "matmul_macro_utils.h" -namespace Gemm { -using namespace AscendC; - +namespace AscendC { // ===========mad template=================/ // Cmatrix type, Amatrix type, Bmatrix type, L0C_using_uniflag, L0C_using_hset template @@ -145,11 +143,11 @@ inline __aicore__ void MacroMatmulV200 0) { uint16_t wAlign = CeilAlign(sAL1K_, HW_M0); - Load3DSetFMatrixCal(sFmH_, wAlign, padList); + Load3DSetFMatrixCal(sFmH_, wAlign, Impl::padList); } else { // fmatrix w should be 16 aligned uint16_t wAlign = CeilAlign(sAL1M_, HW_M0); - Load3DSetFMatrixCal(sFmH_, wAlign, padList); + Load3DSetFMatrixCal(sFmH_, wAlign, Impl::padList); } if (isGemv_) { int32_t fracSize = BYTE_PER_FRACTAL / sizeof(A_T); @@ -195,10 +193,10 @@ inline __aicore__ void MacroMatmulV200::value && IsSameType::value) { kDirectionAlign_ = 1; } - Load3DSetFMatrixCal(1, sAL1K_, padList); + Load3DSetFMatrixCal(1, sAL1K_, Impl::padList); } 
else { // fmatrix w should be 16 aligned - Load3DSetFMatrixCal(1, ToMatmulConfig(MM_CFG).basicM, padList); + Load3DSetFMatrixCal(1, ToMatmulConfig(MM_CFG).basicM, Impl::padList); } if (ssBmatrixTranspose1_ < 1) { - Load3DSetFMatrixBCal(1, sBL1K_, padList); + Load3DSetFMatrixBCal(1, sBL1K_, Impl::padList); } if constexpr (isBias) { if (sL0cInit_) { @@ -404,5 +403,5 @@ inline __aicore__ void MacroMatmulBasic &cMatrix, uint16_t mmadK, uint8_t unitFlag, bool l0c_initial) {} }; -} // namespace Gemm +} // namespace AscendC #endif \ No newline at end of file diff --git a/impl/matmul/matmul_macro_v220_l0cache_impl.h b/impl/matmul/matmul_macro_v220_l0cache_impl.h index e8a9acd271136dbf4dcc89d7ab73f18f58b4fbb6..c73b2515e3ddd6882303ce1b3f53d55e44675811 100644 --- a/impl/matmul/matmul_macro_v220_l0cache_impl.h +++ b/impl/matmul/matmul_macro_v220_l0cache_impl.h @@ -17,8 +17,8 @@ #include "matmul_macro_v220_intf.h" -namespace Gemm { -using namespace AscendC; +namespace AscendC { + // ===========mad template=================/ // Cmatrix type, Amatrix type, Bmatrix type, L0C_using_uniflag, L0C_using_hset @@ -592,14 +592,14 @@ inline __aicore__ void MacroMatmul::value) { l0a = l0a[L0AUF_SIZE / sizeof(A_T)]; } else { - l0a = l0a[L0AUF_SIZE / DB_FACTOR / sizeof(A_T)]; + l0a = l0a[L0AUF_SIZE / Impl::DB_FACTOR / sizeof(A_T)]; } } @@ -635,7 +635,7 @@ inline __aicore__ void MacroMatmul struct IBShareCache { __aicore__ inline IBShareCache() {}; }; @@ -879,14 +879,14 @@ private: template __aicore__ inline constexpr bool IsSharedMatmul() { - if constexpr (!Gemm::ToMatmulConfig(MM_CFG).enableInit || - Gemm::ToMatmulConfig(MM_CFG).enableMixDualMaster) { + if constexpr (!AscendC::ToMatmulConfig(MM_CFG).enableInit || + AscendC::ToMatmulConfig(MM_CFG).enableMixDualMaster) { return true; } return false; } template , + const auto& MM_CFG = CFG_NORM, class MM_CB = AscendC::MatmulCallBackFunc, MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> struct MatmulInstBase { __aicore__ inline MatmulInstBase(){}; @@ 
-895,13 +895,13 @@ template struct MatmulInstShared : MatmulInstBase { __aicore__ inline MatmulInstShared(){}; - Gemm::MatmulService cubeObj[1]; + AscendC::MatmulService cubeObj[1]; }; template struct MatmulInst : MatmulInstBase { __aicore__ inline MatmulInst(){}; - Gemm::MatmulService cubeObj[MIX_NUM]; + AscendC::MatmulService cubeObj[MIX_NUM]; }; template , template, MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> +class MM_CB = AscendC::MatmulCallBackFunc, MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> class MatmulServiceAuxBase { using SrcT = typename A_TYPE::T; using SrcAT = typename A_TYPE::T; @@ -1257,5 +1257,5 @@ class MatmulServiceAux __aicore__ inline constexpr int32_t GetC0Size() { @@ -88,7 +88,7 @@ struct DataCopyOutParams { uint64_t cbufWorkspaceAddr = 0; }; -constexpr int32_t ONE_BYTE_BIT_SIZE = 8; +namespace Impl { constexpr int32_t QUEUE_DEPTH = 1; constexpr int32_t NZ_MASK_VAlUE = 2; constexpr int32_t FLOAT_FACTOR = 2; @@ -96,11 +96,8 @@ constexpr int32_t B4_C0SIZE = 64; constexpr int32_t B8_C0SIZE = 32; constexpr int32_t B32_C0SIZE = 8; constexpr int32_t B16_C0SIZE = 16; -constexpr int32_t CTRL_46_BIT = 46; -constexpr int32_t CTRL_47_BIT = 47; constexpr int32_t L0_SIZE = 64 * 1024; constexpr int32_t MAX_BLOCK_COUNT_SIZE = 4095; -constexpr int32_t INT4_TWO = 2; #if __CCE_AICORE__ < 200 constexpr int32_t DB_FACTOR = 1; #else @@ -121,6 +118,10 @@ const LocalTensor NULL_TENSOR; // equal: sizeof(KfcMsg) * MAX_MATMUL_OBJ * MAX_AIV_NUM + // equal: TOTAL_UB_SIZE * MAX_AIV_NUM constexpr int64_t GM_OFFSET = 128 * 2 * 64 * 50 + 128 * 8 * 50 + 192 * 1024 * 50; +} + +template +const LocalTensor NULL_TENSOR; template struct GetDstType { using Type = T; @@ -153,7 +154,7 @@ int32_t constexpr GetNdNzMask(CubeFormat dstFormat, CubeFormat srcFormat) if ((srcFormat == CubeFormat::ND) && (dstFormat == CubeFormat::NZ)) { return 1; } else if ((srcFormat == CubeFormat::NZ) && (dstFormat == CubeFormat::ND)) { - return NZ_MASK_VAlUE; + return Impl::NZ_MASK_VAlUE; } return 0; } @@ -162,7 
+163,7 @@ template __aicore__ inline constexpr static int32_t AuxGetFactor() { if (sizeof(SrcT) == sizeof(float)) { - return FLOAT_FACTOR; + return Impl::FLOAT_FACTOR; } return 1; } @@ -171,13 +172,13 @@ template __aicore__ inline constexpr static int32_t AuxGetC0Size() { if (sizeof(SrcT) == sizeof(float)) { - return B32_C0SIZE; + return Impl::B32_C0SIZE; } else if (IsSameType::value) { - return B8_C0SIZE; + return Impl::B8_C0SIZE; } else if (IsSameType::value) { - return B4_C0SIZE; + return Impl::B4_C0SIZE; } - return B16_C0SIZE; + return Impl::B16_C0SIZE; } __aicore__ constexpr bool DoMatmulNorm(MatmulConfig mmCFG) @@ -391,8 +392,8 @@ __aicore__ constexpr bool IsBasic(const MatmulApiStaticTiling &mmCFG) __aicore__ constexpr int GetL0PingPong(MatmulConfig mmCFG) { - return ((mmCFG.basicM * mmCFG.basicK * DB_FACTOR) <= L0_SIZE) && - ((mmCFG.basicK * mmCFG.basicN * DB_FACTOR) <= L0_SIZE) ? 1 : 0; + return ((mmCFG.basicM * mmCFG.basicK * Impl::DB_FACTOR) <= Impl::L0_SIZE) && + ((mmCFG.basicK * mmCFG.basicN * Impl::DB_FACTOR) <= Impl::L0_SIZE) ? 
1 : 0; } __aicore__ constexpr int GetL0PingPong(const MatmulApiStaticTiling &mmCFG) @@ -479,13 +480,6 @@ __aicore__ constexpr bool GetDstNzC0Stride(const MatmulApiStaticTiling &mmCFG) return GetDstNzC0Stride(mmCFG.cfg); } -__aicore__ inline int Ceil(int num1, int num2) -{ - ASCENDC_ASSERT((num2 > 0), - { KERNEL_LOG(KERNEL_ERROR, "num2 is %d , which should be larger than 0", num2); }); - return (num1 + num2 - 1) / num2; -} - template __aicore__ inline T CeilT(T num1, T num2) { @@ -694,5 +688,5 @@ __aicore__ inline T CeilAlign(T num1, T num2) return Ceil(num1, num2) * num2; } -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/context/context.h b/impl/matmul/modules/context/context.h index bc4ae71b0ced2511b8a572d0644c6244040627ff..dbbe11dadf97bd918141341a4fdfdecf747ecfb0 100644 --- a/impl/matmul/modules/context/context.h +++ b/impl/matmul/modules/context/context.h @@ -17,7 +17,7 @@ #ifndef IMPL_MATMUL_MODULES_CONTEXT_CONTEXT_H #define IMPL_MATMUL_MODULES_CONTEXT_CONTEXT_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -35,5 +35,5 @@ public: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif //IMPL_MATMUL_MODULES_CONTEXT_CONTEXT_H \ No newline at end of file diff --git a/impl/matmul/modules/dfx/dfx_config.h b/impl/matmul/modules/dfx/dfx_config.h index 33b0efdc10298a237cccb15c698fe63cde06dde5..5690003782498620e5e874faba2221d737695e0c 100644 --- a/impl/matmul/modules/dfx/dfx_config.h +++ b/impl/matmul/modules/dfx/dfx_config.h @@ -19,7 +19,7 @@ #include "handlers/dfx_chain_handler.h" #include "dfx_func_info.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { struct DfxConfig { @@ -28,5 +28,5 @@ struct DfxConfig { }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/dfx/dfx_func_info.h b/impl/matmul/modules/dfx/dfx_func_info.h index 
55cc0e852bd242d2686482c06b4445a516f03cd6..2e6951543b1047999dc31e0eb89f23e9be73c8d1 100644 --- a/impl/matmul/modules/dfx/dfx_func_info.h +++ b/impl/matmul/modules/dfx/dfx_func_info.h @@ -16,7 +16,7 @@ #ifndef MATMUL_DFX_FUNC_INFO_H #define MATMUL_DFX_FUNC_INFO_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { struct DfxFuncInfo { @@ -29,5 +29,5 @@ struct DfxFuncInfo { }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/dfx/dfx_handler.h b/impl/matmul/modules/dfx/dfx_handler.h index 92ac062b108c5e848546d0bd8b6c30423c39cf03..ddc67ca05fe0b8f3c04054f7587e2668cc876b0c 100644 --- a/impl/matmul/modules/dfx/dfx_handler.h +++ b/impl/matmul/modules/dfx/dfx_handler.h @@ -18,7 +18,7 @@ #include "dfx_config.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -40,5 +40,5 @@ struct DfxHandler { } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/dfx/dfx_proxy.h b/impl/matmul/modules/dfx/dfx_proxy.h index 855da1b8b6385c7a7cad6dd0ab718385215f148e..a243bd2f0d3631ed66519d10a725bffa18d78ccb 100644 --- a/impl/matmul/modules/dfx/dfx_proxy.h +++ b/impl/matmul/modules/dfx/dfx_proxy.h @@ -19,7 +19,7 @@ #include #include "dfx_handler.h" -namespace Gemm { +namespace AscendC { template using enable_if_t = typename std::enable_if::type; @@ -172,5 +172,5 @@ private: \ } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/dfx/dfx_registry.h b/impl/matmul/modules/dfx/dfx_registry.h index 9cea2a72b30f81d36c70a1eeea3d87dfcdfc3a65..5ce6a846ff343b4fd20ee363a4a920939d041d8c 100644 --- a/impl/matmul/modules/dfx/dfx_registry.h +++ b/impl/matmul/modules/dfx/dfx_registry.h @@ -19,11 +19,11 @@ #include "dfx_proxy.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { MATMUL_DFX_PROXY_REGISTER(InputL1Cache, 
ClearAL1Cache, ClearBL1Cache); } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/dfx/handlers/dfx_chain_handler.h b/impl/matmul/modules/dfx/handlers/dfx_chain_handler.h index 6be4698bca2d0da859d3b353317814b54439eed3..b1eb816546299d96d5560556237ea031811b3c7d 100644 --- a/impl/matmul/modules/dfx/handlers/dfx_chain_handler.h +++ b/impl/matmul/modules/dfx/handlers/dfx_chain_handler.h @@ -16,7 +16,7 @@ #ifndef MATMUL_DFX_CHAIN_HANDLER_H #define MATMUL_DFX_CHAIN_HANDLER_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -41,5 +41,5 @@ struct DfxChainHandler { } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/feature_trait/matmul_chip_cap.h b/impl/matmul/modules/feature_trait/matmul_chip_cap.h index 038c2296a7d27e09747fdee5e32cbd9a319920b9..aed7df70c10be523caabb10d31662201cb9055b4 100644 --- a/impl/matmul/modules/feature_trait/matmul_chip_cap.h +++ b/impl/matmul/modules/feature_trait/matmul_chip_cap.h @@ -15,7 +15,7 @@ #ifndef IMPL_MATMUL_MODULES_MATMUL_CHIP_CAP_H #define IMPL_MATMUL_MODULES_MATMUL_CHIP_CAP_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -80,5 +80,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_CHIP_CAP_H_ diff --git a/impl/matmul/modules/feature_trait/matmul_feature_trait.h b/impl/matmul/modules/feature_trait/matmul_feature_trait.h index 25a274389fabcb0b143b5764ee53370ec9d9faea..5531b3e691f0667bca14bed80c9b512403701789 100644 --- a/impl/matmul/modules/feature_trait/matmul_feature_trait.h +++ b/impl/matmul/modules/feature_trait/matmul_feature_trait.h @@ -19,7 +19,7 @@ #include "matmul_chip_cap.h" #include "matmul_iter_ctrl_cfg.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -55,5 +55,5 @@ public: }; } // namespace Detail } // namespace Impl 
-} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_FEATURE_TRAIT_H_ diff --git a/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h b/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h index 25680363198491039e2af16ab038adaf61a8c603..a092212bfc0e8501ee1a2e3cbb5e0420de6ed66b 100644 --- a/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h +++ b/impl/matmul/modules/feature_trait/matmul_iter_ctrl_cfg.h @@ -18,7 +18,7 @@ #include "../../../../lib/matmul/tiling.h" #include "../../../../lib/matmul/constant_tiling.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -31,5 +31,5 @@ struct MatmulIterCtrlCfg { } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_ITER_CTRL_CFG_H_ \ No newline at end of file diff --git a/impl/matmul/modules/iterator/matmul_iterate_controller.h b/impl/matmul/modules/iterator/matmul_iterate_controller.h index da4cd7c7fe43f42a1d53c997135fae1de3908846..455e39c4c3c847aef72ef3b70ec866432907663b 100644 --- a/impl/matmul/modules/iterator/matmul_iterate_controller.h +++ b/impl/matmul/modules/iterator/matmul_iterate_controller.h @@ -20,7 +20,7 @@ #include "../../matmul_utils.h" #include "../feature_trait/matmul_iter_ctrl_cfg.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -150,5 +150,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif diff --git a/impl/matmul/modules/matmul_local_workspace.h b/impl/matmul/modules/matmul_local_workspace.h index fccb8b3b126b74334796ec68e66e593a770d6566..c6461475bf4f9580138aaf6d9790e1e02d53cc5d 100644 --- a/impl/matmul/modules/matmul_local_workspace.h +++ b/impl/matmul/modules/matmul_local_workspace.h @@ -17,7 +17,7 @@ #include "matmul_module.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { constexpr int32_t ENVEC_UBUNREUSE_COEFF = 2; @@ -293,5 +293,5 @@ private: } // namespace Detail } // namespace 
Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_MATMUL_LOCAL_WORKSPACE_H \ No newline at end of file diff --git a/impl/matmul/modules/matmul_module.h b/impl/matmul/modules/matmul_module.h index db9968eb1ef2f54b00b25b80973147d99038ddd5..f3226e9752125b73d295ff0d89edbde3432688e6 100644 --- a/impl/matmul/modules/matmul_module.h +++ b/impl/matmul/modules/matmul_module.h @@ -19,14 +19,14 @@ #include "dfx/dfx_config.h" /* MatmulModuleBase */ -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template using void_t = void; // if user define self-implement module, but inherited from base module implemented in matmul, -// child module shoud declare : using BASE_MODULE = Gemm::XXXModuleName; +// child module shoud declare : using BASE_MODULE = AscendC::XXXModuleName; struct MatmulNullBase {}; template @@ -41,7 +41,7 @@ struct MatmulModuleBase> { } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC /* MatmulImplBase */ #define MATMUL_IMPL__ IMPL #define MATMUL_POLICY__ POLICY @@ -56,23 +56,23 @@ struct MatmulModuleBase> { (static_cast(MATMUL_CAST_TO_CONST_IMPL())) #define MATMUL_CAST_TO_PROXY_OF(NAME) \ -typename Gemm::Impl::Detail::DfxProxy (*MATMUL_CAST_TO_IMPL_OF(NAME)) +typename AscendC::Impl::Detail::DfxProxy (*MATMUL_CAST_TO_IMPL_OF(NAME)) #define MATMUL_CAST_TO_CONST_PROXY_OF(NAME) \ -typename Gemm::Impl::Detail::DfxProxy (*MATMUL_CAST_TO_CONST_IMPL_OF(NAME)) +typename AscendC::Impl::Detail::DfxProxy (*MATMUL_CAST_TO_CONST_IMPL_OF(NAME)) #define MATMUL_MODULE(NAME) cast_to_##NAME() #define MATMUL_USE_MODULE(NAME) \ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) { \ - if constexpr (Gemm::Impl::Detail::DfxConfig::ENABLE) { \ + if constexpr (AscendC::Impl::Detail::DfxConfig::ENABLE) { \ return MATMUL_CAST_TO_PROXY_OF(NAME); \ } else { \ return MATMUL_CAST_TO_IMPL_OF(NAME); \ } \ } \ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) const { \ - if constexpr 
(Gemm::Impl::Detail::DfxConfig::ENABLE) { \ + if constexpr (AscendC::Impl::Detail::DfxConfig::ENABLE) { \ return MATMUL_CAST_TO_CONST_PROXY_OF(NAME); \ } else { \ return MATMUL_CAST_TO_CONST_IMPL_OF(NAME); \ @@ -81,14 +81,14 @@ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) const { \ #define MATMUL_USE_MODULE_ON(NAME, ...) \ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) { \ - if constexpr (Gemm::Impl::Detail::DfxConfig::ENABLE) { \ + if constexpr (AscendC::Impl::Detail::DfxConfig::ENABLE) { \ return MATMUL_CAST_TO_PROXY_OF(template NAME<__VA_ARGS__>); \ } else { \ return MATMUL_CAST_TO_IMPL_OF(template NAME<__VA_ARGS__>); \ } \ } \ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) const { \ - if constexpr (Gemm::Impl::Detail::DfxConfig::ENABLE) { \ + if constexpr (AscendC::Impl::Detail::DfxConfig::ENABLE) { \ return MATMUL_CAST_TO_CONST_PROXY_OF(template NAME<__VA_ARGS__>);\ } else { \ return MATMUL_CAST_TO_CONST_IMPL_OF(template NAME<__VA_ARGS__>); \ @@ -100,7 +100,7 @@ __aicore__ inline constexpr decltype(auto) MATMUL_MODULE(NAME) const { \ #define MATMUL_POLICY_DEFAULT_OF(DEFAULT) \ template \ - class MATMUL_POLICY = Gemm::Impl::Detail::DEFAULT + class MATMUL_POLICY = AscendC::Impl::Detail::DEFAULT #define MATMUL_POLICY_TEMPLATE_OF(NAME) \ template class NAME @@ -118,14 +118,14 @@ MATMUL_POLICY_TEMPLATE::type; \ +friend typename AscendC::Impl::Detail::MatmulModuleBase::type; \ friend NAME #define MATMUL_ALLOW_USING_TEMPLATE(NAME, ...) \ using NAME = typename MATMUL_MODULE_IN_POLICY(template NAME<__VA_ARGS__>) /* Matmul Private Module */ -#define MATMUL_PRIVATE_TEMPLATE Gemm::Impl::Detail::MatmulPrivateModules +#define MATMUL_PRIVATE_TEMPLATE AscendC::Impl::Detail::MatmulPrivateModules #define MATMUL_MODULE_IN_PRIVATE(...) 
\ MATMUL_PRIVATE_TEMPLATE::__VA_ARGS__ diff --git a/impl/matmul/modules/matmul_param.h b/impl/matmul/modules/matmul_param.h index 3f89a0f33d378296a4b678d5a0c12c382408e0e5..c37d87b74211e7cf0d6fb89f68136342518410fa 100644 --- a/impl/matmul/modules/matmul_param.h +++ b/impl/matmul/modules/matmul_param.h @@ -24,7 +24,7 @@ #include "matmul_type_def.h" #include "resource/cube_in_buffer/global_cache.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* ************************************************************************************************** @@ -454,5 +454,5 @@ struct MatmulParams::Type; using Context = MatmulContext; - using CubeOutBuffer = Gemm::Impl::Detail::CubeOutBuffer; - using CopyCubeOut = Gemm::Impl::Detail::CopyCubeOut; - using CopyCubeInA = Gemm::Impl::Detail::CopyCubeIn, MM_CFG>; + using CubeOutBuffer = AscendC::Impl::Detail::CubeOutBuffer; + using CopyCubeOut = AscendC::Impl::Detail::CopyCubeOut; + using CopyCubeInA = AscendC::Impl::Detail::CopyCubeIn, MM_CFG>; using CopyCubeInB = CopyCubeIn, MM_CFG>; using CubeInBufferA = CubeInBuffer, MM_CFG>; using CubeInBufferB = CubeInBuffer, MM_CFG>; }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_POLICY_H_ diff --git a/impl/matmul/modules/matmul_private_modules.h b/impl/matmul/modules/matmul_private_modules.h index 6bc0f5674863a26c3eed38cf733f152c91205ce2..a21dc2161d8b29379ce1066534f36b00939f8509 100644 --- a/impl/matmul/modules/matmul_private_modules.h +++ b/impl/matmul/modules/matmul_private_modules.h @@ -29,7 +29,7 @@ #include "param/matmul_usr_define_info.h" #include "iterator/matmul_iterate_controller.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -39,8 +39,8 @@ struct MatmulPrivateModules { using CopyCubeInParamsB = CopyCubeInParams>; using MatmulTensorInfoA = MatmulTensorInfo>; using MatmulTensorInfoB = MatmulTensorInfo>; - using MatmulSubBlockInfo = 
Gemm::Impl::Detail::MatmulSubBlockInfo; - using MatmulShapeTiling = Gemm::Impl::Detail::MatmulShapeTiling; + using MatmulSubBlockInfo = AscendC::Impl::Detail::MatmulSubBlockInfo; + using MatmulShapeTiling = AscendC::Impl::Detail::MatmulShapeTiling; using DataCopyUtilsA = DataCopyWrapper>; using DataCopyUtilsB = DataCopyWrapper>; using BatchDataCopyUtilsA = BatchDataCopyWrapper>; @@ -50,14 +50,14 @@ struct MatmulPrivateModules { using BatchCopyCubeInA = BatchCopyCubeIn>; using BatchCopyCubeInB = BatchCopyCubeIn>; using IterateController = - Gemm::Impl::Detail::MatmulIterateController::iterCtrlCfg>; + AscendC::Impl::Detail::MatmulIterateController::iterCtrlCfg>; using LocalWorkspace = MatmulLocalWorkspace; - using MatmulShapeInfo = Gemm::Impl::Detail::MatmulShapeInfo; - using MatmulQuantProcessor = Gemm::Impl::Detail::MatmulQuantProcessor; - using MatmulUserDefineInfo = Gemm::Impl::Detail::MatmulUserDefineInfo; + using MatmulShapeInfo = AscendC::Impl::Detail::MatmulShapeInfo; + using MatmulQuantProcessor = AscendC::Impl::Detail::MatmulQuantProcessor; + using MatmulUserDefineInfo = AscendC::Impl::Detail::MatmulUserDefineInfo; }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_PRIVATE_MODULES_H_ diff --git a/impl/matmul/modules/matmul_type_def.h b/impl/matmul/modules/matmul_type_def.h index 60b29b4ecf08dafe8e3189ff9cc29a081d7c1059..f157549326e1a271004f1d05bbe1ae2deafb6f57 100644 --- a/impl/matmul/modules/matmul_type_def.h +++ b/impl/matmul/modules/matmul_type_def.h @@ -17,7 +17,7 @@ #include "lib/matmul/tiling.h" -namespace Gemm { +namespace AscendC { enum class InputTypeTag : uint8_t { A = 0, @@ -54,5 +54,5 @@ struct MatmulInputCType : INPUT_TYPE { constexpr static InputTypeTag TAG = InputTypeTag::C; }; -} // namespace Gemm +} // namespace AscendC #endif // _MATMUL_TYPE_DEF_H_ \ No newline at end of file diff --git a/impl/matmul/modules/param/matmul_shape_info.h b/impl/matmul/modules/param/matmul_shape_info.h 
index 91ae26d0567d06afad7b49b8d7a9187d7148fba5..bc8cb9e01b02f92ba740ce961d14fbe0e1a758c9 100644 --- a/impl/matmul/modules/param/matmul_shape_info.h +++ b/impl/matmul/modules/param/matmul_shape_info.h @@ -18,7 +18,7 @@ #include "../matmul_module.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -141,5 +141,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_SHAPE_INFO_H diff --git a/impl/matmul/modules/param/matmul_shape_tiling.h b/impl/matmul/modules/param/matmul_shape_tiling.h index 4f1f9855e4597ce5182822511f39f430819ba054..a31707b382e70890cfccd6a1b3ee49eec0f9c625 100644 --- a/impl/matmul/modules/param/matmul_shape_tiling.h +++ b/impl/matmul/modules/param/matmul_shape_tiling.h @@ -18,7 +18,7 @@ #include "../matmul_module.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -31,5 +31,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_SHAPE_TILING_H diff --git a/impl/matmul/modules/param/matmul_subblock_info.h b/impl/matmul/modules/param/matmul_subblock_info.h index 88bf9426b839fd953624f3828f9d91620d5b87cc..fd7d02d06ced0326ee28b9757cade94b41b0d8ac 100644 --- a/impl/matmul/modules/param/matmul_subblock_info.h +++ b/impl/matmul/modules/param/matmul_subblock_info.h @@ -18,7 +18,7 @@ #include "../matmul_module.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -36,5 +36,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_SUBBLOCK_INFO_H diff --git a/impl/matmul/modules/param/matmul_tensor_info.h b/impl/matmul/modules/param/matmul_tensor_info.h index 89eaa007cad6f2231ba70dd66abaec5ab365c801..0361addf13d2b259ba5fe383d4309fc7273f5dd2 100644 --- 
a/impl/matmul/modules/param/matmul_tensor_info.h +++ b/impl/matmul/modules/param/matmul_tensor_info.h @@ -18,7 +18,7 @@ #include "../matmul_module.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -106,5 +106,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_TENSOR_INFO_H diff --git a/impl/matmul/modules/param/matmul_usr_define_info.h b/impl/matmul/modules/param/matmul_usr_define_info.h index ef6610d5514b1fedd6fd0b5bdf31fbe33b2a9b2b..3571c5fe4682dcba9aacca11b8a8c429a984849a 100644 --- a/impl/matmul/modules/param/matmul_usr_define_info.h +++ b/impl/matmul/modules/param/matmul_usr_define_info.h @@ -15,7 +15,7 @@ #ifndef IMPL_MATMUL_MODULES_PARAM_MATMUL_USER_DEFINE_INFO_H_ #define IMPL_MATMUL_MODULES_PARAM_MATMUL_USER_DEFINE_INFO_H_ -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -33,5 +33,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_PARAM_MATMUL_USER_DEFINE_INFO_H_ \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_buffer.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_buffer.h index 2a4eb1f93d5242a8c74dcbd4ca73c8c0f0ff7587..a136f40b98766c761033019749d8cf031d0e656a 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_buffer.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_buffer.h @@ -18,7 +18,7 @@ #include "cube_in_buffer_intf.h" #include "../../param/matmul_shape_tiling.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { constexpr int32_t BANK_CONFLICT_SIZE = 512; @@ -38,7 +38,7 @@ public: __aicore__ inline ~CubeInBuffer() {} __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) { - int32_t matrixByteSize = baseBlockSize * 
Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + int32_t matrixByteSize = baseBlockSize * AscendC::GetBitSize() / ONE_BYTE_BIT_SIZE; int32_t stepSize = GetTotalCacheNum(); cacheFactor_ = (cacheNum / stepSize - 1) & 1; int32_t queDepth = cacheFactor_ == 0 ? SINGLE_QUE : DOUBLE_QUE; @@ -245,5 +245,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_DOUBLE_BUFFER_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h index 1b8c4b2b49d6f5d27c992ef59a854d6fead6f205..b0d3eb0335361d8fbdfe52163803ba2ffdc82a68 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_double_global_buffer.h @@ -18,7 +18,7 @@ #include "cube_in_buffer_intf.h" #include "global_cache.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -39,7 +39,7 @@ public: baseBlockSize_ = baseBlockSize; groupCache0_.Init(); groupCache1_.Init(); - int32_t matrixByteSize = baseBlockSize_ * Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + int32_t matrixByteSize = baseBlockSize_ * AscendC::GetBitSize() / ONE_BYTE_BIT_SIZE; groupCache0_.InitBuffer(matrixByteSize * cacheNum); groupCache1_.InitBuffer(matrixByteSize * cacheNum); } @@ -150,5 +150,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_DOUBLE_GLOBAL_BUFFER_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h index 6dba5339344b3634cafa28c3db584f46db6c9d35..a24220281994c6412a74a4062ca0c156ce7b401c 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_intf.h @@ -19,7 +19,7 @@ #include 
"../../../matmul_utils.h" #include "cube_in_buffer_utils.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -116,5 +116,5 @@ public: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_INTF_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_normal.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_normal.h index 713e441a724ba521eb5657944497c33d6bd37ae4..6cd296545ce790ad2a3a6ad969cda8e03c7f9812 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_normal.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_normal.h @@ -17,7 +17,7 @@ #include "cube_in_buffer_intf.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -37,7 +37,7 @@ public: __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) { baseBlockSize_ = baseBlockSize; - int32_t matrixByteSize = baseBlockSize_ * Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + int32_t matrixByteSize = baseBlockSize_ * AscendC::GetBitSize() / ONE_BYTE_BIT_SIZE; int32_t reduceAxisCnt = MATMUL_MODULE(MatmulShapeInfo)->GetKIter(); auto tpipePtr = GetTPipePtr(); if (cacheNum > DB_FACTOR) { @@ -179,5 +179,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_NORMAL_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h index 73f6ae4ff1a808eda3e1430366114e69b2bf860e..449f47739b8682e0b24634f7303eee62d659724a 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_buffer.h @@ -17,7 +17,7 @@ #include "cube_in_buffer_intf.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -35,7 +35,7 @@ public: __aicore__ inline void 
Init(int32_t baseBlockSize, int32_t cacheNum) { (void) cacheNum; - int32_t matrixByteSize = baseBlockSize * Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + int32_t matrixByteSize = baseBlockSize * AscendC::GetBitSize() / ONE_BYTE_BIT_SIZE; GetTPipePtr()->InitBuffer(qid_, SINGLE_QUE, matrixByteSize); } @@ -100,5 +100,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_SINGLE_BUFFER_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_global_buffer.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_global_buffer.h index b094afb683d7a9d2024990d35e5c47f570dfdb1f..d2cb23c63511352f7ce8980314ed31f49f242834 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_global_buffer.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_single_global_buffer.h @@ -18,7 +18,7 @@ #include "cube_in_buffer_intf.h" #include "global_cache.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -37,7 +37,7 @@ public: __aicore__ inline void Init(int32_t baseBlockSize, int32_t cacheNum) { baseBlockSize_ = baseBlockSize; - int32_t matrixByteSize = baseBlockSize_ * Gemm::GetBitSize() / ONE_BYTE_BIT_SIZE; + int32_t matrixByteSize = baseBlockSize_ * AscendC::GetBitSize() / ONE_BYTE_BIT_SIZE; GetGlobalCachePtr()->InitBuffer(matrixByteSize * cacheNum); } @@ -128,5 +128,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_SINGLE_GLOBAL_BUFFER_H_ diff --git a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_utils.h b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_utils.h index e2d8d579ec8d9cab30dd32e83a321419eb5b51aa..aba48174cc96f65d90592d658e35ca5871cf2cf4 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_utils.h +++ b/impl/matmul/modules/resource/cube_in_buffer/cube_in_buffer_utils.h @@ -16,7 +16,7 @@ 
#include "../../matmul_type_def.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -92,5 +92,5 @@ __aicore__ inline constexpr CubeInBufferType GetCubeInBufferType() } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _CUBE_IN_BUFFER_UTILS_H_ \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_in_buffer/global_cache.h b/impl/matmul/modules/resource/cube_in_buffer/global_cache.h index 5f721412b53cc8baf91673fb6d543996d2e03a6f..ccaf0d94ff1c9e764f7768f33ea648f8c8d4ad64 100644 --- a/impl/matmul/modules/resource/cube_in_buffer/global_cache.h +++ b/impl/matmul/modules/resource/cube_in_buffer/global_cache.h @@ -14,20 +14,20 @@ #ifndef IMPL_MATMUL_MODULES_GLOBAL_CACHE_H_ #define IMPL_MATMUL_MODULES_GLOBAL_CACHE_H_ -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { class GlobalCache; } // namespace Detail } // namespace Impl -} // namespace Gemm -__BLOCK_LOCAL__ __inline__ Gemm::Impl::Detail::GlobalCache* gL1Cache; -__aicore__ inline Gemm::Impl::Detail::GlobalCache* GetGlobalCachePtr() +} // namespace AscendC +__BLOCK_LOCAL__ __inline__ AscendC::Impl::Detail::GlobalCache* gL1Cache; +__aicore__ inline AscendC::Impl::Detail::GlobalCache* GetGlobalCachePtr() { return gL1Cache; } -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -158,5 +158,5 @@ public: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _GLOBAL_CACHE_H_ \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_base.h b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_base.h index a85f717b299076331496e128216e0287bf36cf73..62f2b2796a7e8cb86a73317319cb2151fc12a27b 100644 --- a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_base.h +++ b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_base.h @@ -14,7 +14,7 @@ #ifndef 
IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_BASE_H #define IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_BASE_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { enum class UNIT_FLAG_CTRL : uint8_t { @@ -43,5 +43,5 @@ struct L0cType { }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_BASE_H \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_no_unit_flag.h b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_no_unit_flag.h index 2c7637adb76adcec3fe67639c50e986d945b20bb..222db563dbccc1e6c791ba2f82357f6d296900d7 100644 --- a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_no_unit_flag.h +++ b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_no_unit_flag.h @@ -18,7 +18,7 @@ #include "../../../matmul_utils.h" #include "lib/matmul/tiling.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -85,5 +85,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_NO_UNIT_FLAG_H \ No newline at end of file diff --git a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_unit_flag.h b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_unit_flag.h index 6051bb610d004bbe7b55852bea73e9638c9321a1..f81a329575287b33fc4b471a1483a394065eaeb0 100644 --- a/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_unit_flag.h +++ b/impl/matmul/modules/resource/cube_out_buffer/cube_out_buffer_unit_flag.h @@ -19,7 +19,7 @@ #include "lib/matmul/tiling.h" #include "../../feature_trait/matmul_feature_trait.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -78,5 +78,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} 
// namespace AscendC #endif // IMPL_MATMUL_MODULES_RESOURCE_CUBE_OUT_BUFFER_CUBE_OUT_BUFFER_UNIT_FLAG_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/copy_cube_in/async_tensor.h b/impl/matmul/modules/stage/copy_cube_in/async_tensor.h index 0625a308a7f43478bb72aee7114526b303d2e385..4f4726597ae5cbdc875852a10eb5093a96a89272 100644 --- a/impl/matmul/modules/stage/copy_cube_in/async_tensor.h +++ b/impl/matmul/modules/stage/copy_cube_in/async_tensor.h @@ -16,7 +16,7 @@ #ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_ASYNC_TENSOR_H #define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_ASYNC_TENSOR_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -64,5 +64,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif //_ASYNC_TENSOR_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h index 5226ee1340594582830ca2316912b859c7606b65..5bde8c9014a8b344d3a98fd45558b1c20887dc07 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in.h @@ -21,7 +21,7 @@ #include "../../../resource/cube_in_buffer/cube_in_buffer.h" #include "../copy_cube_in_params.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { // Specialized Template Class of Batch Matmul CopyIn @@ -444,5 +444,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_H diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h index 0cc2bc69643c534c98af5b27d9f23299ee379a48..012b98237825e634662f8fdc6e3ae10d3c101719 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h +++ 
b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_intf.h @@ -15,10 +15,10 @@ #ifndef IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ #define IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { -using namespace AscendC; + template class BatchCopyCubeIn @@ -90,5 +90,5 @@ public: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _BATCH_COPY_CUBE_IN_INTF_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_params.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_params.h index 674c28baaef914607c4e7daeab35427e6bdd8da5..a6342074ddc078b4bd455e54ffa98a393c00d4f8 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_params.h +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_params.h @@ -16,7 +16,7 @@ #ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_PARAMS_H #define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_PARAMS_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -85,5 +85,5 @@ public: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_PARAMS_H diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_using_ub.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_using_ub.h index a8322e3cc4a723cc97192c0d6663d55997d47597..6fe09993d0a41bb8e6479ef13d24cc2cc066d397 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_using_ub.h +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_copy_cube_in_using_ub.h @@ -21,7 +21,7 @@ #include "../../../resource/cube_in_buffer/cube_in_buffer.h" #include "../copy_cube_in_params.h" -namespace Gemm { +namespace AscendC { namespace Impl { 
namespace Detail { // Specialized Template Class of Batch Matmul CopyIn @@ -470,5 +470,5 @@ private: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_BATCH_BATCH_COPY_CUBE_IN_USING_UB_H diff --git a/impl/matmul/modules/stage/copy_cube_in/batch/batch_data_copy_wrapper.h b/impl/matmul/modules/stage/copy_cube_in/batch/batch_data_copy_wrapper.h index 0515326ac069f748523cfcb0ed71493ee203fa08..8ab82a4abd350d3f78fe1a9562cd4c30c4b83c4e 100644 --- a/impl/matmul/modules/stage/copy_cube_in/batch/batch_data_copy_wrapper.h +++ b/impl/matmul/modules/stage/copy_cube_in/batch/batch_data_copy_wrapper.h @@ -20,12 +20,10 @@ #include "../copy_cube_in_utils.h" #include "../copy_cube_in_params.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { -using namespace AscendC; - template class BatchDataCopyWrapper { @@ -817,5 +815,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_BATCH_COPY_CUBE_IN_BATCH_DATA_COPY_WRAPPER_H diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h index 72a70024fb1bf18dfe58db10b1ad15f2bcde159a..62304feef856708a96693c788db6f97baa8217ed 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_from_l1.h @@ -20,7 +20,7 @@ #include "data_copy_wrapper.h" #include "copy_cube_in_intf.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -77,5 +77,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _COPY_CUBE_IN_FROM_L1_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h index 
4af9fe74d1ae1d5e300ac39c0ed9a95dcc610f9d..04d4065fc192079397d3d3bf64f430fe74be2ba2 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_intf.h @@ -14,10 +14,10 @@ #ifndef IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ #define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_INTF_H_ -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { -using namespace AscendC; + /* CopyCubeIn is considered entirely experimental. We retain the freedom to make incompatible changes, but do not guarantee the stability. @@ -95,5 +95,5 @@ public: } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _COPY_CUBE_IN_INTF_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h index 0e023261d427868f74235b2b1bd1eb261a204518..25462b44222b9ac10e244cc757d33457b2cfb846 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_mdl.h @@ -21,7 +21,7 @@ #include "copy_cube_in_intf.h" #include "async_tensor.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -156,5 +156,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _COPY_CUBE_IN_MDL_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h index 27024b41c584fd51d4697d8c806a84b7bf5e7397..333aa1ee44a4530ed49bc80ef075673737ecddbd 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_norm.h @@ -21,7 +21,7 @@ #include "copy_cube_in_intf.h" #include "async_tensor.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -224,5 +224,5 @@ private: }; } // namespace Detail } // 
namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _COPY_CUBE_IN_NORM_H_ diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_params.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_params.h index 8a2bc529fd0e045c5e2a4ab0a528d70b836a3b5d..d690a9f9fd3cdb6a36bc334147edf2488b9b67e1 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_params.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_params.h @@ -18,7 +18,7 @@ #include "../../param/matmul_shape_tiling.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { template @@ -465,5 +465,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_PARAMS_H diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h index 31fdae655f464f93e8c8f09fd0f304e2d9c3667b..3ef607960702c0d01b3bf1e77eb6ad8d531377ee 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h +++ b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_using_ub.h @@ -20,10 +20,10 @@ #include "../../matmul_param.h" #include "copy_cube_in_intf.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { -using namespace AscendC; + constexpr int32_t FIRST_16BIT_OFFSET_MM_API = 16; constexpr int32_t SECOND_16BIT_OFFSET_MM_API = 32; @@ -1822,5 +1822,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_COPY_CUBE_IN_SET_UB_H diff --git a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_utils.h b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_utils.h index 95290033ef266a78923b4e636b9fa8f0dc4a55c7..de8ed33d42bfe41d0e58b37343a372a396681c2a 100644 --- a/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_utils.h +++ 
b/impl/matmul/modules/stage/copy_cube_in/copy_cube_in_utils.h @@ -16,7 +16,7 @@ #include "../../feature_trait/matmul_feature_trait.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { enum class CopyCubeInType : uint8_t { @@ -64,5 +64,5 @@ __aicore__ inline constexpr CopyCubeInType GetCopyCubeInType() } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // _COPY_CUBE_IN_UTILS_H_ \ No newline at end of file diff --git a/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h b/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h index 4e7958c6fbf4d21948b6545ed801312997bd19db..558e0e94c9e08c9d1a04a4c23a80b542173091e2 100644 --- a/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h +++ b/impl/matmul/modules/stage/copy_cube_in/data_copy_wrapper.h @@ -20,12 +20,10 @@ #include "copy_cube_in_utils.h" #include "copy_cube_in_params.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { -using namespace AscendC; - template class DataCopyWrapper { using TransT = typename INPUT_TYPE::TRANS_T; @@ -524,5 +522,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_IN_DATA_COPY_WRAPPER_H diff --git a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_datacopy.h b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_datacopy.h index 248f6ec923454404f2fa29467a0751e3f1ff184b..90fa43c16fc5d6ec1b812be467574bf28dd3319c 100644 --- a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_datacopy.h +++ b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_datacopy.h @@ -20,7 +20,7 @@ #include "../../matmul_param.h" #include "copy_cube_out_intf.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -1054,5 +1054,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // 
IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_DATACOPY_H diff --git a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_fixpipe.h b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_fixpipe.h index 4c0ae19e4f5c42d20757508eb8264fa4137935ce..81786a86dcab6eb4e819c9c5164355835688ea6c 100644 --- a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_fixpipe.h +++ b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_fixpipe.h @@ -24,7 +24,7 @@ #include "../quant/quant_processor_utils.h" #include "copy_cube_out_utils.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -228,5 +228,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_FIXPIPE_H diff --git a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h index e65dad088bf3653f044b76b12ac4e1ef6f07ac09..d0cf6c6a5d27f28f3e61ed06b3589adb8d7d21e2 100644 --- a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h +++ b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_intf.h @@ -17,7 +17,7 @@ #define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_INTF_H #include "../../feature_trait/matmul_chip_cap.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { /* @@ -73,5 +73,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_INTF_H diff --git a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_utils.h b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_utils.h index eca08a35ed177dee0ec2d4d4d1c7124523145a6b..1cf4c4188ae4bb3a8c203e09282e568a0548b8e4 100644 --- a/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_utils.h +++ b/impl/matmul/modules/stage/copy_cube_out/copy_cube_out_utils.h @@ -16,7 +16,7 @@ #ifndef 
IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_UTILS_H #define IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_UTILS_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -128,5 +128,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_COPY_CUBE_OUT_COPY_CUBE_OUT_UTILS_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/quant/quant_processor_datacopy.h b/impl/matmul/modules/stage/quant/quant_processor_datacopy.h index 952e837e89af95dd0cbb8970cafc9af4d38aeea8..170dfe07938e28e2eca4567e372bcdd1da7b6cbe 100644 --- a/impl/matmul/modules/stage/quant/quant_processor_datacopy.h +++ b/impl/matmul/modules/stage/quant/quant_processor_datacopy.h @@ -20,7 +20,7 @@ #include "../../matmul_param.h" #include "quant_processor_intf.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -127,5 +127,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_DATACOPY_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/quant/quant_processor_fixpipe.h b/impl/matmul/modules/stage/quant/quant_processor_fixpipe.h index 84e0b74e717c65eb6e7937dac50c8b82a7a076d7..1233a8b5a00f706133dcbe8a2c9b7297b44fec2c 100644 --- a/impl/matmul/modules/stage/quant/quant_processor_fixpipe.h +++ b/impl/matmul/modules/stage/quant/quant_processor_fixpipe.h @@ -21,7 +21,7 @@ #include "quant_processor_intf.h" #include "quant_processor_utils.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -167,5 +167,5 @@ private: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_FIXPIPE_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/quant/quant_processor_intf.h 
b/impl/matmul/modules/stage/quant/quant_processor_intf.h index 852f32769ce5bcdc78f25f33cf9095196469595c..4a9be58898a699b042a39d0c3258b7f55cb20720 100644 --- a/impl/matmul/modules/stage/quant/quant_processor_intf.h +++ b/impl/matmul/modules/stage/quant/quant_processor_intf.h @@ -17,7 +17,7 @@ #define IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_INTF_H #include "../../feature_trait/matmul_chip_cap.h" -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -99,5 +99,5 @@ public: }; } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_INTF_H \ No newline at end of file diff --git a/impl/matmul/modules/stage/quant/quant_processor_utils.h b/impl/matmul/modules/stage/quant/quant_processor_utils.h index 5324a1aebe56c3fdbaabfa4f27fd4b865c1a8a2c..2a831512115c1f817ab8d913cb367e3b088a2d54 100644 --- a/impl/matmul/modules/stage/quant/quant_processor_utils.h +++ b/impl/matmul/modules/stage/quant/quant_processor_utils.h @@ -16,7 +16,7 @@ #ifndef IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_UTILS_H #define IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_UTILS_H -namespace Gemm { +namespace AscendC { namespace Impl { namespace Detail { @@ -36,5 +36,5 @@ __aicore__ inline constexpr static bool IsQuantSenario() } } // namespace Detail } // namespace Impl -} // namespace Gemm +} // namespace AscendC #endif // IMPL_MATMUL_MODULES_STAGE_QUANT_QUANT_PROCESSOR_UTILS_H \ No newline at end of file diff --git a/impl/normalization/layernorm/layernorm_common_basic_impl.h b/impl/normalization/layernorm/layernorm_common_basic_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..bdd6e297d42340b0773cd7ec74638b793d196a62 --- /dev/null +++ b/impl/normalization/layernorm/layernorm_common_basic_impl.h @@ -0,0 +1,389 @@ +/* Copyright (c) Huawei Technologies Co., Ltd. 2024. All rights reserved. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! + * \file layernorm_common_basic_impl.h + * \brief + */ +#ifndef IMPL_NORMALIZATION_LAYERNORM_LAYERNORM_COMMON_BASIC_IMPL_H +#define IMPL_NORMALIZATION_LAYERNORM_LAYERNORM_COMMON_BASIC_IMPL_H +namespace AscendC { +constexpr uint32_t MASK_LOW_6BITS = 0x3f; +constexpr uint32_t MASK_HIGH_26BITS = 0xFFFFFFC0; + +struct LayerNormPara { + uint32_t aLength; + uint32_t rLength; + uint32_t rLengthWithPadding; +}; + +struct LayerNormConfig { + bool isNoBeta = false; + bool isNoGamma = false; + bool isOnlyOutput = false; +}; + +struct WelfordUpdateParam { + uint32_t rnLength; + uint32_t abLength; + uint32_t abComputeLength; + float nRec; +}; +struct WelfordUpdateConfig { + __aicore__ constexpr WelfordUpdateConfig(const bool isInplaceIn): isInplace(isInplaceIn) {} + bool isInplace = false; +}; + +__aicore__ constexpr LayerNormConfig GetLayerNormNormalConfig() +{ + return {.isNoBeta = false, .isNoGamma = false, .isOnlyOutput = false}; +} +constexpr LayerNormConfig LNCFG_NORM = GetLayerNormNormalConfig(); + +template +__aicore__ inline void LayerNormReduceSumImpl(const LocalTensor& dstMVTmp, const LocalTensor& dst, + const LocalTensor& src, const uint32_t bsLength, const uint32_t hLength) +{ + ResetMask(); + SetMaskNorm(); + // Contract the horizontal axis to one repeat length 64 (2^6) + constexpr uint32_t rightShiftSix = 6; + if (hLength > ONE_REPEAT_FLOAT_SIZE) { + uint32_t addRepeatTime = (hLength >> rightShiftSix) - 1; + uint32_t addTailNumber = (hLength & MASK_LOW_6BITS); 
+ if ((hLength & MASK_LOW_6BITS) == 0) { + for (uint32_t i = 0; i < bsLength * hLength; i += hLength) { + LocalTensor dstTmp = src[i]; + LocalTensor srcTmp = src[i + ONE_REPEAT_FLOAT_SIZE]; + Add(dstTmp, srcTmp, dstTmp, ONE_REPEAT_FLOAT_SIZE, addRepeatTime, + { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); + PipeBarrier(); + } + } else if (addRepeatTime > 0) { + for (uint32_t i = 0; i < bsLength * hLength; i += hLength) { + LocalTensor dstTmp = src[i]; + LocalTensor srcTmp = src[i + ONE_REPEAT_FLOAT_SIZE]; + LocalTensor srcTailTmp = src[i + (hLength & MASK_HIGH_26BITS)]; + Add(dstTmp, srcTmp, dstTmp, ONE_REPEAT_FLOAT_SIZE, addRepeatTime, + { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); + PipeBarrier(); + Add(dstTmp, srcTailTmp, dstTmp, addTailNumber, 1, + { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); + PipeBarrier(); + } + } else { + for (uint32_t i = 0; i < bsLength * hLength; i += hLength) { + LocalTensor dstTmp = src[i]; + LocalTensor srcTailTmp = src[i + (hLength & MASK_HIGH_26BITS)]; + Add(dstTmp, srcTailTmp, dstTmp, addTailNumber, 1, + { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); + PipeBarrier(); + } + } + } + + uint32_t repeatTime = bsLength; + uint32_t cursorSrc = 0; + uint32_t wholeReduceSumHLength = (hLength > ONE_REPEAT_FLOAT_SIZE) ? 
ONE_REPEAT_FLOAT_SIZE : hLength; + constexpr uint32_t rightShiftThree = 3; + const uint32_t reduceSumSrcRepeatStride = hLength >> rightShiftThree; + + while (repeatTime >= MAX_REPEAT_TIMES) { + LocalTensor srcTmp = src[cursorSrc * MAX_REPEAT_TIMES * hLength]; + LocalTensor dstTmp = dst[cursorSrc * MAX_REPEAT_TIMES * hLength]; + if constexpr (isRelocate) { + WholeReduceSum(dstMVTmp[cursorSrc * MAX_REPEAT_TIMES], srcTmp, wholeReduceSumHLength, + MAX_REPEAT_TIMES, 1, DEFAULT_BLK_STRIDE, reduceSumSrcRepeatStride); + } + WholeReduceSum(dstTmp, srcTmp, wholeReduceSumHLength, MAX_REPEAT_TIMES, hLength, DEFAULT_BLK_STRIDE, + reduceSumSrcRepeatStride); + PipeBarrier(); + repeatTime -= MAX_REPEAT_TIMES; + ++cursorSrc; + } + + uint32_t reduceSumSrcRepeatTimeTail = bsLength - cursorSrc * MAX_REPEAT_TIMES; + if (reduceSumSrcRepeatTimeTail > 0) { + LocalTensor srcTmp = src[cursorSrc * MAX_REPEAT_TIMES * hLength]; + LocalTensor dstTmp = dst[cursorSrc * MAX_REPEAT_TIMES * hLength]; + if constexpr (isRelocate) { + WholeReduceSum(dstMVTmp[cursorSrc * MAX_REPEAT_TIMES], srcTmp, wholeReduceSumHLength, + reduceSumSrcRepeatTimeTail, 1, DEFAULT_BLK_STRIDE, reduceSumSrcRepeatStride); + } + WholeReduceSum(dstTmp, srcTmp, wholeReduceSumHLength, reduceSumSrcRepeatTimeTail, hLength, + DEFAULT_BLK_STRIDE, reduceSumSrcRepeatStride); + PipeBarrier(); + } + + SetMaskCount(); +} + +constexpr WelfordUpdateConfig WFUPDATE_DEFAULT_CFG = {false}; +__aicore__ inline void BroadcastLastDim(const LocalTensor& dst, const LocalTensor& src, + const uint32_t bsLength, const uint32_t hLength) +{ + SetVectorMask(0, hLength); + + SetCmpMask(src); + PipeBarrier(); + + LocalTensor maskLocal = src.ReinterpretCast(); + + const UnaryRepeatParams unaryParams; + Muls(maskLocal, maskLocal, 0, MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + + const BinaryRepeatParams binaryParams; + Select(dst, maskLocal, dst, 1, binaryParams); + PipeBarrier(); + + for (uint32_t i = 1; i < bsLength; i++) { + SetCmpMask(src[i * 
hLength]); + PipeBarrier(); + + Select(dst[i * hLength], maskLocal, dst, 1, binaryParams); + PipeBarrier(); + } +} + +__aicore__ inline void DuplicateMulImpl(const LocalTensor& dst, const LocalTensor& src0, + const LocalTensor& src1, const uint32_t bsLength, const uint32_t hLength) +{ + const BinaryRepeatParams binaryParams; + for (uint32_t i = 0; i < bsLength; i++) { + Mul(dst[i * hLength], src0[i * hLength], src1, MASK_PLACEHOLDER, 1, binaryParams); + } + PipeBarrier(); +} + +__aicore__ inline void DuplicateAddImpl(const LocalTensor& dst, const LocalTensor& src0, + const LocalTensor& src1, const uint32_t bsLength, const uint32_t hLength) +{ + const BinaryRepeatParams binaryParams; + for (uint32_t i = 0; i < bsLength; i++) { + Add(dst[i * hLength], src0[i * hLength], src1, MASK_PLACEHOLDER, 1, binaryParams); + } + PipeBarrier(); +} + +template +__aicore__ inline void GetLayerNormNDTensorInfo(const LocalTensor& inputX, const LocalTensor& outputMean, + const LocalTensor& outputVariance, const LocalTensor& stackBuffer, const LayerNormTiling& tiling, + LayerNormParams& params) +{ + params.tempTensorA = stackBuffer[tiling.firstTmpStartPos]; + params.tempTensorB = stackBuffer[tiling.secondTmpStartPos]; + params.tempTensorC = stackBuffer[tiling.thirdTmpStartPos]; + params.meanTmpTensor = stackBuffer[tiling.meanTmpTensorPos]; + params.varianceTmpTensor = stackBuffer[tiling.varianceTmpTensorPos]; + ASCENDC_ASSERT((tiling.thirdTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), { + KERNEL_LOG(KERNEL_ERROR, "thirdTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)", + tiling.thirdTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize); + }); + ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), { + KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) should >= tiling.tmpBufSize is (%d)", + stackBuffer.GetSize(), tiling.tmpBufSize); + }); +} + +template <> +__aicore__ inline void GetLayerNormNDTensorInfo(const LocalTensor &inputX, + const LocalTensor 
&outputMean, const LocalTensor &outputVariance, + const LocalTensor &stackBuffer, const LayerNormTiling &tiling, LayerNormParams ¶ms) +{ + params.meanTmpTensor = outputMean; + params.varianceTmpTensor = outputVariance; + + params.tempTensorA = stackBuffer[tiling.firstTmpStartPos]; + params.tempTensorB = stackBuffer[tiling.secondTmpStartPos]; + params.tempTensorC = stackBuffer[tiling.thirdTmpStartPos]; + + ASCENDC_ASSERT((tiling.thirdTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), { + KERNEL_LOG(KERNEL_ERROR, "thirdTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)", + tiling.thirdTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize); + }); + + ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), { + KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) >= tiling.tmpBufSize is (%d)", stackBuffer.GetSize(), + tiling.tmpBufSize); + }); +} + +template <> +__aicore__ inline void GetLayerNormNDTensorInfo(const LocalTensor &inputX, + const LocalTensor &outputMean, const LocalTensor &outputVariance, + const LocalTensor &stackBuffer, const LayerNormTiling &tiling, LayerNormParams ¶ms) +{ + params.meanTmpTensor = outputMean; + params.varianceTmpTensor = outputVariance; + + params.tempTensorA = inputX; + params.tempTensorB = stackBuffer[tiling.firstTmpStartPos]; + params.tempTensorC = stackBuffer[tiling.secondTmpStartPos]; + + ASCENDC_ASSERT((tiling.secondTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), { + KERNEL_LOG(KERNEL_ERROR, "secondTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)", + tiling.secondTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize); + }); + + ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), { + KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) >= tiling.tmpBufSize is (%d)", stackBuffer.GetSize(), + tiling.tmpBufSize); + }); +} + +__aicore__ inline void GetOutputMeanVariance(const LocalTensor& outputMean, + const LocalTensor& outputVariance, const LayerNormTiling& tiling, const LayerNormParams& 
params) +{ + SetVectorMask(0, tiling.meanVarSize); + + UnaryRepeatParams unaryParams; + unaryParams.dstRepStride = DEFAULT_REPEAT_STRIDE / sizeof(half); + + Cast(outputMean, params.meanTmpTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + + Cast(outputVariance, params.varianceTmpTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + unaryParams); + PipeBarrier(); +} + +__aicore__ inline void WelfordUpdateComputeMean(const LocalTensor& tmpVreg, const LocalTensor& src, + const LocalTensor& inMean, const LocalTensor& outVreg, const LocalTensor& outMean, + const UnaryRepeatParams unaryParams, const BinaryRepeatParams binaryParams, const WelfordUpdateParam ¶) +{ + PipeBarrier(); + Sub(tmpVreg, src, inMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + Muls(outVreg, tmpVreg, static_cast(para.nRec), MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + Add(outMean, outVreg, inMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); +} + +__aicore__ inline void WelfordUpdateComputeVar(const LocalTensor& tmpVreg, const LocalTensor& inVar, + const LocalTensor& outVar, const UnaryRepeatParams unaryParams, const BinaryRepeatParams binaryParams, + const WelfordUpdateParam ¶) +{ + PipeBarrier(); + Add(outVar, tmpVreg, inVar, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); +} + +template +__aicore__ inline constexpr uint32_t WelfordUpdateGetTmpSize() +{ + if constexpr (sizeof(T) == sizeof(half)) { + return 0x3; + } + + if constexpr (isReuseSource) { + return 1; + } + return 0x2; +} + +__aicore__ inline void GetLayerNormOutputMean(const LocalTensor& outputMean, const LocalTensor& inputX, + const LayerNormTiling& tiling, const LayerNormParams& params, const LocalTensor& tmpMean) +{ + SetVectorMask(0, tiling.bshCurLength); + + const UnaryRepeatParams unaryParams; + Muls(params.tempTensorC, inputX, tiling.lastDimValueBack, MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + + LayerNormReduceSumImpl(tmpMean, outputMean, 
params.tempTensorC, tiling.bsCurLength, tiling.hLength); +} + +__aicore__ inline void GetLayerNormOutputVariance(const LocalTensor& outputVariance, + const LocalTensor& inputX, const LocalTensor& inputMean, const LayerNormTiling& tiling, + const LayerNormParams& params, const LocalTensor& tmpVariance) +{ + LocalTensor tempTensorA = params.tempTensorA; + LocalTensor tempTensorB = params.tempTensorB; + LocalTensor tempTensorC = params.tempTensorC; + + BroadcastLastDim(tempTensorC, inputMean, tiling.bsCurLength, tiling.hLength); + + SetVectorMask(0, tiling.bshCurLength); + + const BinaryRepeatParams binaryParams; + Sub(tempTensorB, inputX, tempTensorC, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + + Mul(tempTensorC, tempTensorB, tempTensorB, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + + const UnaryRepeatParams unaryParams; + Muls(tempTensorA, tempTensorC, tiling.lastDimValueBack, MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + + LayerNormReduceSumImpl(tmpVariance, outputVariance, tempTensorA, tiling.bsCurLength, tiling.hLength); + PipeBarrier(); +} + +template +__aicore__ inline void WelfordUpdateInplaceCompute(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& inMean, const LocalTensor& inVar, const WelfordUpdateParam ¶, uint32_t alignNum) +{ + uint32_t inPlaceLength = AlignUp(para.abLength - para.abComputeLength, alignNum); + uint32_t dstOffset = para.abLength - inPlaceLength; + + DataCopy(outMean[dstOffset], inMean[dstOffset], inPlaceLength); + DataCopy(outVar[dstOffset], inVar[dstOffset], inPlaceLength); + PipeBarrier(); +} +__aicore__ inline void WelfordUpdateInplace(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& inMean, const LocalTensor& inVar, const WelfordUpdateParam ¶) +{ + WelfordUpdateInplaceCompute(outMean, outVar, inMean, inVar, para, B32_DATA_NUM_PER_BLOCK); +} + +__aicore__ inline void WelfordUpdateInplace(const LocalTensor& outMean, const LocalTensor& outVar, + const 
LocalTensor& inMean, const LocalTensor& inVar, const WelfordUpdateParam ¶) +{ + WelfordUpdateInplaceCompute(outMean, outVar, inMean, inVar, para, B16_DATA_NUM_PER_BLOCK); +} + +__aicore__ inline void GetLayerNormOutputPre(const LocalTensor& xSubMean, + const LocalTensor& inputVariance, const float epsilon, const LayerNormTiling& tiling, + const LayerNormParams& params) +{ + const float exponent = -0.5; + LocalTensor tempTensorA = params.tempTensorA; + LocalTensor tempTensorB = params.tempTensorB; + LocalTensor tempTensorC = params.tempTensorC; + + BroadcastLastDim(tempTensorA, inputVariance, tiling.bsCurLength, tiling.hLength); + + SetVectorMask(0, tiling.bshCurLength); + + const UnaryRepeatParams unaryParams; + Adds(tempTensorC, tempTensorA, epsilon, MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + + Sqrt(tempTensorA, tempTensorC, MASK_PLACEHOLDER, 1, unaryParams); + PipeBarrier(); + + SetVectorMask(0, B32_DATA_NUM_PER_BLOCK); + Duplicate(tempTensorC, 1, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); + PipeBarrier(); + + SetVectorMask(0, tiling.bshCurLength); + Div(tempTensorA, tempTensorC, tempTensorA, MASK_PLACEHOLDER, 1, + { 1, 0, 1, DEFAULT_REPEAT_STRIDE, 0, DEFAULT_REPEAT_STRIDE }); + PipeBarrier(); + + const BinaryRepeatParams binaryParams; + Mul(tempTensorC, tempTensorA, xSubMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); +} + +} // namespace AscendC +#endif // IMPL_NORMALIZATION_LAYERNORM_LAYERNORM_COMMON_BASIC_IMPL_H \ No newline at end of file diff --git a/impl/normalization/layernorm/layernorm_common_impl.h b/impl/normalization/layernorm/layernorm_common_impl.h index 87def13b24fb82d964c0d8e5d36bfef98768647f..db07d32162cd0ef6dd453c060bb5e38ab19900a8 100644 --- a/impl/normalization/layernorm/layernorm_common_impl.h +++ b/impl/normalization/layernorm/layernorm_common_impl.h @@ -18,207 +18,9 @@ #include "kernel_tensor.h" #include "kernel_pop_stack_buffer.h" #include "kernel_tiling/kernel_tiling.h" +#include 
"layernorm_common_basic_impl.h" namespace AscendC { -constexpr uint32_t MASK_LOW_6BITS = 0x3f; -constexpr uint32_t MASK_HIGH_26BITS = 0xFFFFFFC0; -template -__aicore__ inline void LayerNormReduceSumImpl(const LocalTensor& dstMVTmp, const LocalTensor& dst, - const LocalTensor& src, const uint32_t bsLength, const uint32_t hLength) -{ - ResetMask(); - SetMaskNorm(); - // Contract the horizontal axis to one repeat length 64 (2^6) - constexpr uint32_t rightShiftSix = 6; - if (hLength > ONE_REPEAT_FLOAT_SIZE) { - uint32_t addRepeatTime = (hLength >> rightShiftSix) - 1; - uint32_t addTailNumber = (hLength & MASK_LOW_6BITS); - if ((hLength & MASK_LOW_6BITS) == 0) { - for (uint32_t i = 0; i < bsLength * hLength; i += hLength) { - LocalTensor dstTmp = src[i]; - LocalTensor srcTmp = src[i + ONE_REPEAT_FLOAT_SIZE]; - Add(dstTmp, srcTmp, dstTmp, ONE_REPEAT_FLOAT_SIZE, addRepeatTime, - { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); - PipeBarrier(); - } - } else if (addRepeatTime > 0) { - for (uint32_t i = 0; i < bsLength * hLength; i += hLength) { - LocalTensor dstTmp = src[i]; - LocalTensor srcTmp = src[i + ONE_REPEAT_FLOAT_SIZE]; - LocalTensor srcTailTmp = src[i + (hLength & MASK_HIGH_26BITS)]; - Add(dstTmp, srcTmp, dstTmp, ONE_REPEAT_FLOAT_SIZE, addRepeatTime, - { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); - PipeBarrier(); - Add(dstTmp, srcTailTmp, dstTmp, addTailNumber, 1, - { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); - PipeBarrier(); - } - } else { - for (uint32_t i = 0; i < bsLength * hLength; i += hLength) { - LocalTensor dstTmp = src[i]; - LocalTensor srcTailTmp = src[i + (hLength & MASK_HIGH_26BITS)]; - Add(dstTmp, srcTailTmp, dstTmp, addTailNumber, 1, - { DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, DEFAULT_BLK_STRIDE, 0, DEFAULT_REPEAT_STRIDE, 0 }); - PipeBarrier(); - } - } - } - - uint32_t repeatTime = bsLength; - uint32_t 
cursorSrc = 0; - uint32_t wholeReduceSumHLength = (hLength > ONE_REPEAT_FLOAT_SIZE) ? ONE_REPEAT_FLOAT_SIZE : hLength; - constexpr uint32_t rightShiftThree = 3; - const uint32_t reduceSumSrcRepeatStride = hLength >> rightShiftThree; - - while (repeatTime >= MAX_REPEAT_TIMES) { - LocalTensor srcTmp = src[cursorSrc * MAX_REPEAT_TIMES * hLength]; - LocalTensor dstTmp = dst[cursorSrc * MAX_REPEAT_TIMES * hLength]; - if constexpr (isRelocate) { - WholeReduceSum(dstMVTmp[cursorSrc * MAX_REPEAT_TIMES], srcTmp, wholeReduceSumHLength, - MAX_REPEAT_TIMES, 1, DEFAULT_BLK_STRIDE, reduceSumSrcRepeatStride); - } - WholeReduceSum(dstTmp, srcTmp, wholeReduceSumHLength, MAX_REPEAT_TIMES, hLength, DEFAULT_BLK_STRIDE, - reduceSumSrcRepeatStride); - PipeBarrier(); - repeatTime -= MAX_REPEAT_TIMES; - ++cursorSrc; - } - - uint32_t reduceSumSrcRepeatTimeTail = bsLength - cursorSrc * MAX_REPEAT_TIMES; - if (reduceSumSrcRepeatTimeTail > 0) { - LocalTensor srcTmp = src[cursorSrc * MAX_REPEAT_TIMES * hLength]; - LocalTensor dstTmp = dst[cursorSrc * MAX_REPEAT_TIMES * hLength]; - if constexpr (isRelocate) { - WholeReduceSum(dstMVTmp[cursorSrc * MAX_REPEAT_TIMES], srcTmp, wholeReduceSumHLength, - reduceSumSrcRepeatTimeTail, 1, DEFAULT_BLK_STRIDE, reduceSumSrcRepeatStride); - } - WholeReduceSum(dstTmp, srcTmp, wholeReduceSumHLength, reduceSumSrcRepeatTimeTail, hLength, - DEFAULT_BLK_STRIDE, reduceSumSrcRepeatStride); - PipeBarrier(); - } - - SetMaskCount(); -} - -__aicore__ inline void GetLayerNormOutputMean(const LocalTensor& outputMean, const LocalTensor& inputX, - const LayerNormTiling& tiling, const LayerNormParams& params, const LocalTensor& tmpMean) -{ - SetVectorMask(0, tiling.bshCurLength); - - const UnaryRepeatParams unaryParams; - Muls(params.tempTensorC, inputX, tiling.lastDimValueBack, MASK_PLACEHOLDER, 1, unaryParams); - PipeBarrier(); - - LayerNormReduceSumImpl(tmpMean, outputMean, params.tempTensorC, tiling.bsCurLength, tiling.hLength); -} - -__aicore__ inline void 
BroadcastLastDim(const LocalTensor& dst, const LocalTensor& src, - const uint32_t bsLength, const uint32_t hLength) -{ - SetVectorMask(0, hLength); - - SetCmpMask(src); - PipeBarrier(); - - LocalTensor maskLocal = src.ReinterpretCast(); - - const UnaryRepeatParams unaryParams; - Muls(maskLocal, maskLocal, 0, MASK_PLACEHOLDER, 1, unaryParams); - PipeBarrier(); - - const BinaryRepeatParams binaryParams; - Select(dst, maskLocal, dst, 1, binaryParams); - PipeBarrier(); - - for (uint32_t i = 1; i < bsLength; i++) { - SetCmpMask(src[i * hLength]); - PipeBarrier(); - - Select(dst[i * hLength], maskLocal, dst, 1, binaryParams); - PipeBarrier(); - } -} - -__aicore__ inline void GetLayerNormOutputVariance(const LocalTensor& outputVariance, - const LocalTensor& inputX, const LocalTensor& inputMean, const LayerNormTiling& tiling, - const LayerNormParams& params, const LocalTensor& tmpVariance) -{ - LocalTensor tempTensorA = params.tempTensorA; - LocalTensor tempTensorB = params.tempTensorB; - LocalTensor tempTensorC = params.tempTensorC; - - BroadcastLastDim(tempTensorC, inputMean, tiling.bsCurLength, tiling.hLength); - - SetVectorMask(0, tiling.bshCurLength); - - const BinaryRepeatParams binaryParams; - Sub(tempTensorB, inputX, tempTensorC, MASK_PLACEHOLDER, 1, binaryParams); - PipeBarrier(); - - Mul(tempTensorC, tempTensorB, tempTensorB, MASK_PLACEHOLDER, 1, binaryParams); - PipeBarrier(); - - const UnaryRepeatParams unaryParams; - Muls(tempTensorA, tempTensorC, tiling.lastDimValueBack, MASK_PLACEHOLDER, 1, unaryParams); - PipeBarrier(); - - LayerNormReduceSumImpl(tmpVariance, outputVariance, tempTensorA, tiling.bsCurLength, tiling.hLength); - PipeBarrier(); -} - -__aicore__ inline void GetLayerNormOutputPre(const LocalTensor& xSubMean, - const LocalTensor& inputVariance, const float epsilon, const LayerNormTiling& tiling, - const LayerNormParams& params) -{ - const float exponent = -0.5; - LocalTensor tempTensorA = params.tempTensorA; - LocalTensor tempTensorB = 
params.tempTensorB; - LocalTensor tempTensorC = params.tempTensorC; - - BroadcastLastDim(tempTensorA, inputVariance, tiling.bsCurLength, tiling.hLength); - - SetVectorMask(0, tiling.bshCurLength); - - const UnaryRepeatParams unaryParams; - Adds(tempTensorC, tempTensorA, epsilon, MASK_PLACEHOLDER, 1, unaryParams); - PipeBarrier(); - - Sqrt(tempTensorA, tempTensorC, MASK_PLACEHOLDER, 1, unaryParams); - PipeBarrier(); - - SetVectorMask(0, B32_DATA_NUM_PER_BLOCK); - Duplicate(tempTensorC, 1, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); - PipeBarrier(); - - SetVectorMask(0, tiling.bshCurLength); - Div(tempTensorA, tempTensorC, tempTensorA, MASK_PLACEHOLDER, 1, - { 1, 0, 1, DEFAULT_REPEAT_STRIDE, 0, DEFAULT_REPEAT_STRIDE }); - PipeBarrier(); - - const BinaryRepeatParams binaryParams; - Mul(tempTensorC, tempTensorA, xSubMean, MASK_PLACEHOLDER, 1, binaryParams); - PipeBarrier(); -} - -__aicore__ inline void DuplicateMulImpl(const LocalTensor& dst, const LocalTensor& src0, - const LocalTensor& src1, const uint32_t bsLength, const uint32_t hLength) -{ - const BinaryRepeatParams binaryParams; - for (uint32_t i = 0; i < bsLength; i++) { - Mul(dst[i * hLength], src0[i * hLength], src1, MASK_PLACEHOLDER, 1, binaryParams); - } - PipeBarrier(); -} - -__aicore__ inline void DuplicateAddImpl(const LocalTensor& dst, const LocalTensor& src0, - const LocalTensor& src1, const uint32_t bsLength, const uint32_t hLength) -{ - const BinaryRepeatParams binaryParams; - for (uint32_t i = 0; i < bsLength; i++) { - Add(dst[i * hLength], src0[i * hLength], src1, MASK_PLACEHOLDER, 1, binaryParams); - } - PipeBarrier(); -} template __aicore__ inline void GetLayerNormOutput(const LocalTensor& output, const LocalTensor& inputY, @@ -323,88 +125,6 @@ __aicore__ inline void LayerNormExe(const LocalTensor& inputX, con GetLayerNormOutput(output, tempTensorC, gamma, beta, tiling, params); } -template -__aicore__ inline void GetLayerNormNDTensorInfo(const LocalTensor& inputX, const 
LocalTensor& outputMean, - const LocalTensor& outputVariance, const LocalTensor& stackBuffer, const LayerNormTiling& tiling, - LayerNormParams& params) -{ - params.tempTensorA = stackBuffer[tiling.firstTmpStartPos]; - params.tempTensorB = stackBuffer[tiling.secondTmpStartPos]; - params.tempTensorC = stackBuffer[tiling.thirdTmpStartPos]; - params.meanTmpTensor = stackBuffer[tiling.meanTmpTensorPos]; - params.varianceTmpTensor = stackBuffer[tiling.varianceTmpTensorPos]; - ASCENDC_ASSERT((tiling.thirdTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), { - KERNEL_LOG(KERNEL_ERROR, "thirdTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)", - tiling.thirdTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize); - }); - ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), { - KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) should >= tiling.tmpBufSize is (%d)", - stackBuffer.GetSize(), tiling.tmpBufSize); - }); -} - -template <> -__aicore__ inline void GetLayerNormNDTensorInfo(const LocalTensor &inputX, - const LocalTensor &outputMean, const LocalTensor &outputVariance, - const LocalTensor &stackBuffer, const LayerNormTiling &tiling, LayerNormParams ¶ms) -{ - params.meanTmpTensor = outputMean; - params.varianceTmpTensor = outputVariance; - - params.tempTensorA = stackBuffer[tiling.firstTmpStartPos]; - params.tempTensorB = stackBuffer[tiling.secondTmpStartPos]; - params.tempTensorC = stackBuffer[tiling.thirdTmpStartPos]; - - ASCENDC_ASSERT((tiling.thirdTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), { - KERNEL_LOG(KERNEL_ERROR, "thirdTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)", - tiling.thirdTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize); - }); - - ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), { - KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) >= tiling.tmpBufSize is (%d)", stackBuffer.GetSize(), - tiling.tmpBufSize); - }); -} - -template <> -__aicore__ inline void GetLayerNormNDTensorInfo(const 
LocalTensor &inputX, - const LocalTensor &outputMean, const LocalTensor &outputVariance, - const LocalTensor &stackBuffer, const LayerNormTiling &tiling, LayerNormParams ¶ms) -{ - params.meanTmpTensor = outputMean; - params.varianceTmpTensor = outputVariance; - - params.tempTensorA = inputX; - params.tempTensorB = stackBuffer[tiling.firstTmpStartPos]; - params.tempTensorC = stackBuffer[tiling.secondTmpStartPos]; - - ASCENDC_ASSERT((tiling.secondTmpStartPos + tiling.oneTmpSize <= tiling.tmpBufSize), { - KERNEL_LOG(KERNEL_ERROR, "secondTmpStartPos + oneTmpSize is (%d) should <= tmpBufSize is (%d)", - tiling.secondTmpStartPos + tiling.oneTmpSize, tiling.tmpBufSize); - }); - - ASCENDC_ASSERT((stackBuffer.GetSize() >= tiling.tmpBufSize), { - KERNEL_LOG(KERNEL_ERROR, "stackBuffer.GetSize is (%d) >= tiling.tmpBufSize is (%d)", stackBuffer.GetSize(), - tiling.tmpBufSize); - }); -} - -__aicore__ inline void GetOutputMeanVariance(const LocalTensor& outputMean, - const LocalTensor& outputVariance, const LayerNormTiling& tiling, const LayerNormParams& params) -{ - SetVectorMask(0, tiling.meanVarSize); - - UnaryRepeatParams unaryParams; - unaryParams.dstRepStride = DEFAULT_REPEAT_STRIDE / sizeof(half); - - Cast(outputMean, params.meanTmpTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, unaryParams); - PipeBarrier(); - - Cast(outputVariance, params.varianceTmpTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, - unaryParams); - PipeBarrier(); -} - template __aicore__ inline void LayerNormND(const LocalTensor& inputX, const LocalTensor& gamma, const LocalTensor& beta, const LocalTensor& output, const LocalTensor& outputMean, @@ -477,5 +197,180 @@ __aicore__ inline void LayerNormImpl(const LocalTensor& output, const LocalTe LayerNormImpl(output, outputMean, outputVariance, inputX, gamma, beta, sharedTmpBuffer, epsilon, tiling); } + +template +__aicore__ inline void WelfordUpdateCompute(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& src, const 
LocalTensor& inMean, const LocalTensor& inVar, + const LocalTensor& sharedTmpBuffer, const WelfordUpdateParam ¶, const uint32_t tmpNum, + const UnaryRepeatParams unaryParams, const BinaryRepeatParams binaryParams) +{ + LocalTensor srcVreg = sharedTmpBuffer.ReinterpretCast(); + uint32_t tmpIndex = B32_DATA_NUM_PER_REPEAT * tmpNum; + LocalTensor tmpVreg = srcVreg[tmpIndex]; + LocalTensor outVreg = srcVreg[tmpIndex + tmpIndex]; + + PipeBarrier(); + Cast(srcVreg, src, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + {1, 1, DEFAULT_REPEAT_STRIDE, HALF_DEFAULT_REPEAT_STRIDE}); + + WelfordUpdateComputeMean(tmpVreg, srcVreg, inMean, outVreg, outMean, unaryParams, binaryParams, para); + + Sub(outVreg, srcVreg, outMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + Mul(tmpVreg, tmpVreg, outVreg, MASK_PLACEHOLDER, 1, binaryParams); + + WelfordUpdateComputeVar(tmpVreg, inVar, outVar, unaryParams, binaryParams, para); +} + +__aicore__ inline void WelfordUpdateComputeTo32Res(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& src, const LocalTensor& inMean, const LocalTensor& inVar, + const LocalTensor& sharedTmpBuffer, const WelfordUpdateParam ¶, const uint32_t tmpNum, + const UnaryRepeatParams unaryParams, const BinaryRepeatParams binaryParams) +{ + LocalTensor tmpVreg = sharedTmpBuffer.ReinterpretCast(); + + WelfordUpdateComputeMean(tmpVreg, src, inMean, tmpVreg, outMean, unaryParams, binaryParams, para); + + Sub(tmpVreg, src, outMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + Sub(src, src, inMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + Mul(tmpVreg, tmpVreg, src, MASK_PLACEHOLDER, 1, binaryParams); + + WelfordUpdateComputeVar(tmpVreg, inVar, outVar, unaryParams, binaryParams, para); +} + +__aicore__ inline void WelfordUpdateComputeTo32(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& src, const LocalTensor& inMean, const LocalTensor& inVar, + const LocalTensor& sharedTmpBuffer, const 
WelfordUpdateParam ¶, const uint32_t tmpNum, + const UnaryRepeatParams unaryParams, const BinaryRepeatParams binaryParams) +{ + LocalTensor tmpVreg = sharedTmpBuffer.ReinterpretCast(); + LocalTensor outVreg = tmpVreg[B32_DATA_NUM_PER_REPEAT * tmpNum]; + + WelfordUpdateComputeMean(tmpVreg, src, inMean, outVreg, outMean, unaryParams, binaryParams, para); + + Sub(outVreg, src, outMean, MASK_PLACEHOLDER, 1, binaryParams); + PipeBarrier(); + Mul(tmpVreg, tmpVreg, outVreg, MASK_PLACEHOLDER, 1, binaryParams); + + WelfordUpdateComputeVar(tmpVreg, inVar, outVar, unaryParams, binaryParams, para); +} + +template +__aicore__ inline void WelfordUpdateCompute(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& src, const LocalTensor& inMean, const LocalTensor& inVar, + const LocalTensor& sharedTmpBuffer, const WelfordUpdateParam ¶, const uint32_t tmpNum, + const UnaryRepeatParams unaryParams, const BinaryRepeatParams binaryParams) +{ + if (isReuseSource) { + WelfordUpdateComputeTo32Res(outMean, outVar, src, inMean, inVar, sharedTmpBuffer, para, tmpNum, unaryParams, + binaryParams); + } else { + WelfordUpdateComputeTo32(outMean, outVar, src, inMean, inVar, sharedTmpBuffer, para, tmpNum, unaryParams, + binaryParams); + } +} + +template +__aicore__ inline void WelfordUpdateComputeImpl(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& src, const LocalTensor& inMean, const LocalTensor& inVar, + const LocalTensor& sharedTmpBuffer, const WelfordUpdateParam ¶) +{ + constexpr uint32_t tmpBufNum = WelfordUpdateGetTmpSize(); + + uint32_t tmpNum = sharedTmpBuffer.GetSize() / (ONE_REPEAT_BYTE_SIZE * tmpBufNum); +#if ASCENDC_CPU_DEBUG + ASCENDC_ASSERT((tmpNum != 0), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check the size of sharedTmpBuffer, the size of sharedTmpBuffer is %dB, it is smaller.", + sharedTmpBuffer.GetSize()); + }); +#endif + + const uint32_t round = para.abComputeLength / (B32_DATA_NUM_PER_REPEAT * tmpNum); + const uint32_t tail 
= para.abComputeLength % (B32_DATA_NUM_PER_REPEAT * tmpNum); + + SetVectorMask(0, B32_DATA_NUM_PER_REPEAT * tmpNum); + uint32_t offset = 0; + + const UnaryRepeatParams unaryParams; + const BinaryRepeatParams binaryParams; + + for (uint32_t i = 0; i < round; ++i) { + WelfordUpdateCompute(outMean[offset], outVar[offset], src[offset], inMean[offset], + inVar[offset], sharedTmpBuffer, para, tmpNum, unaryParams, binaryParams); + offset = offset + B32_DATA_NUM_PER_REPEAT * tmpNum; + } + + if (tail != 0) { + SetVectorMask(0, tail); + WelfordUpdateCompute(outMean[offset], outVar[offset], src[offset], inMean[offset], + inVar[offset], sharedTmpBuffer, para, tmpNum, unaryParams, binaryParams); + } +} + +template +__aicore__ inline void WelfordUpdateImpl(const LocalTensor& outputMean, const LocalTensor& outputVariance, + const LocalTensor& inputMean, const LocalTensor& inputVariance, const LocalTensor& inputX, + const LocalTensor& sharedTmpBuffer, const WelfordUpdateParam& para) +{ + static_assert((std::is_same::value || std::is_same::value), + "Failed to check dtype of inputX, inputX support dtype is: half/float."); + static_assert((std::is_same::value), + "Failed to check dtype of mean/var, mean/var support dtype is: float."); +#if ASCENDC_CPU_DEBUG + ASCENDC_ASSERT(((QuePosition)inputX.GetPosition() == TPosition::VECIN || + (QuePosition)inputX.GetPosition() == TPosition::VECOUT || + (QuePosition)inputX.GetPosition() == TPosition::VECCALC), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check dtype of input position, support position is VECIN, VECOUT, VECCALC."); + }); + ASCENDC_ASSERT((para.abLength <= inputX.GetSize()), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check para.abLength, current size is %u, which should not larger than inputX size %u.", + para.abLength, inputX.GetSize()); + }); + ASCENDC_ASSERT((para.abComputeLength <= para.abLength), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check para.abComputeLength, current size is %u, which should not larger than abLength size 
%u.", + para.abComputeLength, para.abLength); + }); + ASCENDC_ASSERT((para.abComputeLength > 0), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check para.abComputeLength, para.abComputeLength should be greater than 0.", + para.abComputeLength, para.abLength); + }); + ASCENDC_ASSERT((para.rnLength == 1), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check para.rnLength, rnLength is %u, which should is 1.", para.rnLength); + }); + ASCENDC_ASSERT((para.abLength % (ONE_BLK_SIZE / sizeof(T)) == 0), { + KERNEL_LOG(KERNEL_ERROR, + "Failed to check para.abLength, para.abLength should be 32B aligned."); + }); +#endif + SetMaskCount(); + if (config.isInplace && (para.abComputeLength < para.abLength)) { + WelfordUpdateInplace(outputMean, outputVariance, inputMean, inputVariance, para); + } + WelfordUpdateComputeImpl(outputMean, outputVariance, inputX, inputMean, inputVariance, + sharedTmpBuffer, para); + SetMaskNorm(); + ResetMask(); +} + +template +__aicore__ inline void WelfordUpdateImpl(const LocalTensor& outMean, const LocalTensor& outVar, + const LocalTensor& inMean, const LocalTensor& inVar, const LocalTensor& srcUb, + const WelfordUpdateParam& para) +{ + LocalTensor stackTensor; + bool ans = PopStackBuffer(stackTensor); + ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); + + WelfordUpdateImpl(outMean, outVar, inMean, inVar, srcUb, stackTensor, para); +} + } // namespace AscendC #endif // IMPL_NORMALIZATION_LAYERNORM_LAYERNORM_COMMON_IMPL_H \ No newline at end of file diff --git a/impl/normalization/layernorm/layernorm_tiling_impl.cpp b/impl/normalization/layernorm/layernorm_tiling_impl.cpp index 47dc41644012d164b7e5cf6c0956bdd73d791dfe..f0d93b335e7ac4fa78ba578f4f8c48034ac7ecbc 100644 --- a/impl/normalization/layernorm/layernorm_tiling_impl.cpp +++ b/impl/normalization/layernorm/layernorm_tiling_impl.cpp @@ -24,6 +24,9 @@ constexpr uint32_t LAYERNORM_ONE_BLK_SHIFT_AMOUNT = 5; constexpr uint32_t LAYERNORM_ONE_NUMBER = 1; constexpr uint32_t 
LAYERNOR_ZERO_NUMBER = 0; constexpr float LAYERNOR_LAST_DIM_INIT_VALUE = 1.0; +constexpr uint32_t WEL_UP_REP_SIZE = 256; +constexpr uint32_t WEL_UP_FLOAT_SIZE = 256 / sizeof(float); +constexpr uint32_t SHAPE_DIM = 2; uint32_t GetLayerNormMaxTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource) { @@ -166,4 +169,27 @@ void GetLayerNormNDTillingInfo(const ge::Shape& srcShape, const uint32_t stackBu tilling.set_bsCurLength(bsCurLength); tilling.set_lastDimValueBack(lastDimValueBack); } + +void GetWelfordUpdateMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSizeT, const uint32_t typeSizeU, + const bool isReuseSource, const bool isInplace, uint32_t& maxValue, uint32_t& minValue) +{ + (void)isInplace; + (void)typeSizeU; + + std::vector shapeDims = srcShape.GetDims(); + ASCENDC_HOST_ASSERT(shapeDims.size() == SHAPE_DIM, return, "srcShape dims must be 2."); + + const uint32_t rnLength = static_cast(shapeDims[0]); + const uint32_t abLength = static_cast(shapeDims[1]); + + if (typeSizeT == sizeof(uint16_t)) { + minValue = 0x3 * WEL_UP_REP_SIZE; // dispense 3 buffers + } else if (isReuseSource) { + minValue = 1 * WEL_UP_REP_SIZE; // dispense 1 buffer + } else { + minValue = 0x2 * WEL_UP_REP_SIZE; // dispense 2 buffers + } + maxValue = (rnLength * abLength + WEL_UP_FLOAT_SIZE - 1) / WEL_UP_FLOAT_SIZE * minValue; +} + } // namespace AscendC \ No newline at end of file diff --git a/impl/utils/init_global_memory/init_global_memory_v200_impl.h b/impl/utils/init_global_memory/init_global_memory_v200_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..93ca1ddb5cd8cef61c6425a6fa546eeda31a1907 --- /dev/null +++ b/impl/utils/init_global_memory/init_global_memory_v200_impl.h @@ -0,0 +1,83 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! + * \file init_global_memory_v200_impl.h + * \brief + */ +#ifndef IMPL_UTILS_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_V200_IMPL_H +#define IMPL_UTILS_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_V200_IMPL_H + +#include "kernel_tensor.h" +#include "kernel_operator_intf.h" + +namespace AscendC { +template +__aicore__ inline void InitGlobalMemoryImpl(GlobalTensor &gmWorkspaceAddr, const uint64_t size, const T value) +{ + if ASCEND_IS_AIC { + return; + } + LocalTensor popBuffer; + constexpr uint32_t MAX_REPEAT_LEN = 256; + bool ret = PopStackBuffer(popBuffer); + ASCENDC_ASSERT(ret, { KERNEL_LOG(KERNEL_ERROR, "No space left to allocate in Unified Buffer"); }); + constexpr uint32_t maxBurstSize = (MAX_REPEAT_TIMES * MAX_REPEAT_LEN) / sizeof(T); + const uint32_t popSize = popBuffer.GetSize() >= maxBurstSize ? maxBurstSize : popBuffer.GetSize(); + const uint32_t round = size / popSize; + const uint32_t tail = size % popSize; + const uint32_t roundSize = round != 0 ? 
popSize : 0; + Duplicate(popBuffer, value, popSize); + event_t eventIDVToMTE3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); + SetFlag(eventIDVToMTE3); + WaitFlag(eventIDVToMTE3); + struct DataCopyParams repeatParams; + repeatParams.blockCount = 1; + uint32_t comOffset = 0; + // compute the main block + if ((roundSize * sizeof(T)) % ONE_BLK_SIZE == 0) { + repeatParams.blockLen = static_cast(roundSize * sizeof(T)) / ONE_BLK_SIZE; + for (uint32_t index = 0; index < round; ++index) { + DataCopy(gmWorkspaceAddr[comOffset], popBuffer, repeatParams); + comOffset += roundSize; + } + } else { + const uint32_t roundSizeExtra = roundSize * sizeof(T) % ONE_BLK_SIZE; + const uint32_t roundSizeAlign = roundSize * sizeof(T) - roundSizeExtra; + repeatParams.blockLen = static_cast(roundSizeAlign) / ONE_BLK_SIZE; + for (uint32_t index = 0; index < round; ++index) { + DataCopy(gmWorkspaceAddr[comOffset],popBuffer, repeatParams); + comOffset += roundSize; + for (uint64_t i = comOffset - roundSizeExtra / sizeof(T); i < comOffset; ++i) { + gmWorkspaceAddr.SetValue(i, value); + } + } + } + // compute the tail block + if (tail != 0) { + if ((tail * sizeof(T)) % ONE_BLK_SIZE == 0) { + repeatParams.blockLen = static_cast(tail * sizeof(T)) / ONE_BLK_SIZE; + comOffset = round * roundSize; + DataCopy(gmWorkspaceAddr[comOffset], popBuffer, repeatParams); + } else { + const uint32_t tailExtra = tail * sizeof(T) % ONE_BLK_SIZE; + const uint32_t tailAlign = tail * sizeof(T) - tailExtra; + repeatParams.blockLen = static_cast(tailAlign) / ONE_BLK_SIZE; + comOffset = round * roundSize; + DataCopy(gmWorkspaceAddr[comOffset], popBuffer, repeatParams); + for (uint64_t i = comOffset + tailAlign / sizeof(T); i < size; ++i) { + gmWorkspaceAddr.SetValue(i, value); + } + } + } + PipeBarrier(); +} +} // namespace AscendC +#endif // IMPL_UTILS_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_V200_IMPL_H diff --git a/impl/utils/init_global_memory/init_global_memory_v220_impl.h 
b/impl/utils/init_global_memory/init_global_memory_v220_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..9035a4c12cd9297b8e18048e5adf7e9064c9c15c --- /dev/null +++ b/impl/utils/init_global_memory/init_global_memory_v220_impl.h @@ -0,0 +1,59 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! + * \file init_global_memory_v220_impl.h + * \brief + */ +#ifndef IMPL_UTILS_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_V220_IMPL_H +#define IMPL_UTILS_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_V220_IMPL_H + +#include "kernel_tensor.h" +#include "kernel_operator_intf.h" + +namespace AscendC { +template +__aicore__ inline void InitGlobalMemoryImpl(GlobalTensor &gmWorkspaceAddr, const uint64_t size, const T value) +{ + if ASCEND_IS_AIC { + return; + } + LocalTensor popBuffer; + constexpr uint32_t MAX_REPEAT_LEN = 256; + bool ret = PopStackBuffer(popBuffer); + ASCENDC_ASSERT(ret, { KERNEL_LOG(KERNEL_ERROR, "No space left to allocate in Unified Buffer"); }); + constexpr uint32_t maxBurstSize = (MAX_REPEAT_TIMES * MAX_REPEAT_LEN) / sizeof(T); + const uint32_t popSize = popBuffer.GetSize() >= maxBurstSize ? maxBurstSize : popBuffer.GetSize(); + const uint32_t round = size / popSize; + const uint32_t tail = size % popSize; + const uint32_t roundSize = round != 0 ? 
popSize : 0; + Duplicate(popBuffer, value, popSize); + event_t eventIDVToMTE3 = static_cast(GetTPipePtr()->FetchEventID(HardEvent::V_MTE3)); + SetFlag(eventIDVToMTE3); + WaitFlag(eventIDVToMTE3); + struct DataCopyExtParams repeatParams; + repeatParams.blockCount = 1; + uint32_t comOffset = 0; + // compute the main block + repeatParams.blockLen = static_cast(roundSize * sizeof(T)); + for (uint32_t index = 0; index < round; ++index) { + DataCopyPad(gmWorkspaceAddr[comOffset], popBuffer, repeatParams); + comOffset += roundSize; + } + // compute the tail block + repeatParams.blockLen = static_cast(tail * sizeof(T)); + if (tail != 0) { + comOffset = round * roundSize; + DataCopyPad(gmWorkspaceAddr[comOffset], popBuffer, repeatParams); + } + PipeBarrier(); +} +} // namespace AscendC +#endif // IMPL_UTILS_INIT_GLOBAL_MEMORY_INIT_GLOBAL_MEMORY_V220_IMPL_H diff --git a/lib/matmul/constant_tiling.h b/lib/matmul/constant_tiling.h index 4ddf535bc886f75a65dbaacc69c775a1d3927ba9..f4d9754a3de115baf879906882bcb755ce623f34 100644 --- a/lib/matmul/constant_tiling.h +++ b/lib/matmul/constant_tiling.h @@ -17,9 +17,9 @@ #include "../../impl/matmul/matmul_constant_tiling_impl.h" -namespace Gemm { +namespace AscendC { template -__aicore__ constexpr MatmulApiStaticTiling GetMatmulApiTiling(const MatmulConfig &mmCFG, int32_t l1Size = L1_SIZE) +__aicore__ constexpr MatmulApiStaticTiling GetMatmulApiTiling(const MatmulConfig &mmCFG, int32_t l1Size = Impl::L1_SIZE) { MatmulApiStaticTiling tiling; tiling.cfg = mmCFG; @@ -69,5 +69,5 @@ __aicore__ constexpr MatmulApiStaticTiling GetMatmulApiTiling(const MatmulConfig tiling.shareUbSize = 0; return tiling; } -} // namespace matmul +} // namespace AscendC #endif // LIB_MATMUL_CONSTANT_TILING_H \ No newline at end of file diff --git a/lib/matmul/matmul.h b/lib/matmul/matmul.h index f1884fb234eafd8ae9789f00dff0c64bd983c3fa..0b8aaa043d610ccf7f2e13421967cd4c68a2ac26 100644 --- a/lib/matmul/matmul.h +++ b/lib/matmul/matmul.h @@ -20,8 +20,8 @@ #include 
"lib/matmul/constant_tiling.h" #include "../../impl/matmul/matmul_call_back.h" -namespace Gemm { -using namespace AscendC; +namespace AscendC { + template struct MatmulApiConfig { @@ -114,8 +114,8 @@ public: using CallBack = MM_CB; }; -} // namespace Gemm +} // namespace AscendC // Compatible with the previously used matmul namespace -namespace matmul = Gemm; +namespace matmul = AscendC; #include "../../impl/matmul/matmul_impl.h" #endif diff --git a/lib/matmul/matmul_client.h b/lib/matmul/matmul_client.h index 9da6e34ae5592b50c8ebde06e4298ddd9f8bfed8..dd474ffc613605a1143bda45eb94532297518f5b 100644 --- a/lib/matmul/matmul_client.h +++ b/lib/matmul/matmul_client.h @@ -26,8 +26,8 @@ #include "../../impl/matmul/matmul_server.h" #endif -namespace Gemm { -using namespace AscendC; +namespace AscendC { + constexpr int32_t VECTOR_QUANT_MODE = 2; @@ -1216,5 +1216,5 @@ class MatmulClient, MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulImpl; -} #else -namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulClient; -} #endif #else -namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulImpl; -} #endif #else #ifdef __DAV_C220_CUBE__ #ifdef ASCENDC_CUBE_ONLY -namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulImpl; -} #else -namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulServiceAux; -} #endif #elif defined(__DAV_C220_VEC__) -namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulClient; -} #else -namespace Gemm { template , MATMUL_POLICY_VARIADIC_TEMPLATE_OF(MATMUL_POLICY)> using Matmul = MatmulImpl; -} //namespace Gemm #endif #endif +} //namespace AscendC #endif \ No newline at end of file diff --git a/lib/normalization/layernorm.h b/lib/normalization/layernorm.h index 
ced532ca3538eda802da930b4e8dc23f16fb0702..c234a73ea6d4e2e787d7e885958954e5410ca413 100644 --- a/lib/normalization/layernorm.h +++ b/lib/normalization/layernorm.h @@ -67,6 +67,55 @@ __aicore__ inline void LayerNorm(const LocalTensor& output, const LocalTensor { LayerNormImpl(output, outputMean, outputVariance, inputX, gamma, beta, epsilon, tiling); } + +/*! + * \brief Calculate the mean and variance for each time using the Welford algorithm. + * + * \note support data type: T(half and float)、U(float) + * + * \param [out] outputMean, output LocalTensor, shape is [A, R] + * \param [out] outputVariance, output LocalTensor, shape is [A, R] + * \param [in] inputMean, input LocalTensor, shape is [A, R] + * \param [in] inputVariance, input LocalTensor, shape is [A, R] + * \param [in] inputX, input LocalTensor, shape is [A, R] + * \param [in] para, para detailed information about the original data shape + */ +template +__aicore__ inline void WelfordUpdate(const LocalTensor& outputMean, const LocalTensor& outputVariance, + const LocalTensor& inputMean, const LocalTensor& inputVariance, const LocalTensor& inputX, + const WelfordUpdateParam& para) +{ + if ASCEND_IS_AIC { + return; + } + WelfordUpdateImpl(outputMean, outputVariance, inputMean, inputVariance, inputX, para); +} + +/*! + * \brief Calculate the mean and variance for each time using the Welford algorithm. 
+ * + * \note support data type: T(half and float)、U(float) + * + * \param [out] outputMean, output LocalTensor, shape is [A, R] + * \param [out] outputVariance, output LocalTensor, shape is [A, R] + * \param [in] inputMean, input LocalTensor, shape is [A, R] + * \param [in] inputVariance, input LocalTensor, shape is [A, R] + * \param [in] inputX, input LocalTensor, shape is [A, R] + * \param [in] sharedTmpBuffer, input local temporary Tensor + * \param [in] para, para detailed information about the original data shape + */ +template +__aicore__ inline void WelfordUpdate(const LocalTensor& outputMean, const LocalTensor& outputVariance, + const LocalTensor& inputMean, const LocalTensor& inputVariance, const LocalTensor& inputX, + const LocalTensor& sharedTmpBuffer, const WelfordUpdateParam& para) +{ + if ASCEND_IS_AIC { + return; + } + WelfordUpdateImpl(outputMean, outputVariance, inputMean, inputVariance, inputX, + sharedTmpBuffer, para); +} + #pragma end_pipe } // namespace AscendC #endif // LIB_NORMALIZATION_LAYERNORM_H \ No newline at end of file diff --git a/lib/normalization/layernorm_tiling.h b/lib/normalization/layernorm_tiling.h index a82fb2c4aef5bee3aaf39573b5e6845557d2dce2..4d8f086cd4fc4b42bad0a85bcebb13efc427e412 100644 --- a/lib/normalization/layernorm_tiling.h +++ b/lib/normalization/layernorm_tiling.h @@ -22,5 +22,18 @@ void GetLayerNormMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSiz void GetLayerNormNDTillingInfo(const ge::Shape& srcShape, const uint32_t stackBufferSize, const uint32_t typeSize, const bool isReuseSource, optiling::LayerNormTiling& tilling); + +/*! + * \brief calculate max and min tmp buffer size for WelfordUpdate interface. + * \param [in] srcShape: input shape + * \param [in] typeSizeU: data type size: sizeof(U) + * \param [in] typeSizeT: data type size: sizeof(T) + * \param [in] isReuseSource: indicate whether to reuse source tensor. Reserved paramater. 
+ * \param [in] isInplace: indicate whether outputs that are not calculated are multiplexed inputs. + * \param [out] maxValue: max size required for tmp buffer + * \param [out] minValue: min size required for tmp buffer + */ +void GetWelfordUpdateMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSizeT, const uint32_t typeSizeU, + const bool isReuseSource, const bool isInplace, uint32_t& maxValue, uint32_t& minValue); } #endif // LIB_NORMALIZATION_LAYERNORM_TILING_H diff --git a/lib/utils/init_global_memory.h b/lib/utils/init_global_memory.h new file mode 100644 index 0000000000000000000000000000000000000000..e64b08f72083c1f26ab6c024acf5ee11722ff382 --- /dev/null +++ b/lib/utils/init_global_memory.h @@ -0,0 +1,51 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/* ! + * \file init_global_memory.h + * \brief + */ +#ifndef LIB_UTILS_INIT_GLOBAL_MEMORY_H +#define LIB_UTILS_INIT_GLOBAL_MEMORY_H + +#if __CCE_AICORE__ == 200 +#include "../../impl/utils/init_global_memory/init_global_memory_v200_impl.h" +#elif __CCE_AICORE__ == 220 +#include "../../impl/utils/init_global_memory/init_global_memory_v220_impl.h" +#endif + +namespace AscendC { +/* ! + * \brief This function realizes the clear global memory function. 
+ * + * \note support data type: uint16_t, int16_t, half, float, uint32_t, int32_t + * + * \param [out] GlobalTensor + * \param [in] size, size of space to be initialized + * \param [in] value, value to be initialized in global memory + */ +#if __CCE_AICORE__ == 200 +template +__aicore__ inline __in_pipe__(V) + __out_pipe__(MTE3, S) void InitGlobalMemory(GlobalTensor &gmWorkspaceAddr, const uint64_t size, const T value) +{ + InitGlobalMemoryImpl(gmWorkspaceAddr, size, value); +} + +#elif __CCE_AICORE__ == 220 +template +__aicore__ inline __in_pipe__(V) + __out_pipe__(MTE3) void InitGlobalMemory(GlobalTensor &gmWorkspaceAddr, const uint64_t size, const T value) +{ + InitGlobalMemoryImpl(gmWorkspaceAddr, size, value); +} +#endif +} // namespace AscendC +#endif // LIB_UTILS_INIT_GLOBAL_MEMORY_H diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt index e02da30f529c357856fc504d3fbae661aadcbb76..7823902ac9f8a209ab95882735563ac57a6b904d 100644 --- a/tests/CMakeLists.txt +++ b/tests/CMakeLists.txt @@ -62,6 +62,7 @@ file(GLOB ASCENDC_TEST_ascend310p_CASE_SRC_FILES ${ASCENDC_TESTS_DIR}/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp ${ASCENDC_TESTS_DIR}/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp ${ASCENDC_TESTS_DIR}/matmul/test_operator_matmul_v200.cpp + ${ASCENDC_TESTS_DIR}/normalization/welfordupdate/test_operator_welfordupdate.cpp ) # ascend910B1 aiv test cases @@ -82,6 +83,7 @@ file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES # ${ASCENDC_TESTS_DIR}/normalization/layernorm/test_operator_layernormgrad.cpp ${ASCENDC_TESTS_DIR}/normalization/layernorm/test_operator_layernormgradbeta.cpp ${ASCENDC_TESTS_DIR}/normalization/rmsnorm/test_operator_rmsnorm.cpp + ${ASCENDC_TESTS_DIR}/normalization/welfordupdate/test_operator_welfordupdate.cpp ${ASCENDC_TESTS_DIR}/quantization/antiquant/test_ascend_antiquant_scalar.cpp ${ASCENDC_TESTS_DIR}/quantization/antiquant/test_ascend_antiquant_weight_scalar.cpp 
${ASCENDC_TESTS_DIR}/quantization/antiquant/test_ascend_antiquant_weight.cpp @@ -90,6 +92,7 @@ file(GLOB ASCENDC_TEST_ascend910B1_AIV_CASE_SRC_FILES ${ASCENDC_TESTS_DIR}/quantization/quant/test_operator_quant.cpp ${ASCENDC_TESTS_DIR}/quantization/quant/test_operator_quant_per_channel.cpp ${ASCENDC_TESTS_DIR}/sort/topk/test_operator_topk.cpp + ${ASCENDC_TESTS_DIR}/utils/init_global_memory/test_operator_init_global_memory.cpp ) # ascend910B1 aic test cases diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl.cpp index b89a24bd962154a840dbff3c15ba46ff6c48ed00..924ac98106b928231d4b0b233d54bc4828bf5f69 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template @@ -77,7 +77,7 @@ public: private: TQueBind qid_; + 1, GetNdNzMask(CubeFormat::NZ, INPUT_TYPE::format)> qid_; LocalTensor tensor_; int32_t cacheProc_ = 0; }; diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp index 51f5eae6013bb53ce02d02321ad3f00022757cb9..1ff4ef02ca097134997742eaa6e3195d5524e03a 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_mdl_310p.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template @@ -77,7 +77,7 @@ public: private: TQueBind qid_; + 1, GetNdNzMask(CubeFormat::NZ, INPUT_TYPE::format)> qid_; LocalTensor tensor_; int32_t cacheProc_ = 0; }; diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp index 6d8184993173f7fdb876fdecceabb6d53426863d..428d5654630b660acf0c3e295bb1337e817c45d2 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp +++ 
b/tests/matmul/copy_cube_in/test_copy_cube_in_norm.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template @@ -77,7 +77,7 @@ public: private: TQueBind qid_; + 1, GetNdNzMask(CubeFormat::NZ, INPUT_TYPE::format)> qid_; LocalTensor tensor_; int32_t cacheProc_ = 0; }; diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp index baa79f58697960e9e4348bbf38f773276bb658d2..a1325f506f8b6bdc7decc70fadbf000c65dbda8b 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_norm_310p.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template @@ -77,7 +77,7 @@ public: private: TQueBind qid_; + 1, GetNdNzMask(CubeFormat::NZ, INPUT_TYPE::format)> qid_; LocalTensor tensor_; int32_t cacheProc_ = 0; }; diff --git a/tests/matmul/copy_cube_in/test_copy_cube_in_params.cpp b/tests/matmul/copy_cube_in/test_copy_cube_in_params.cpp index ded109cf459d4630491b83835eb461a0f5fca20b..ad72f0f2caa418e4bbec000a0389fa5b168c946b 100644 --- a/tests/matmul/copy_cube_in/test_copy_cube_in_params.cpp +++ b/tests/matmul/copy_cube_in/test_copy_cube_in_params.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_global_buffer.cpp b/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_global_buffer.cpp index e18e5e1ccf7cf70ec406ad4babba66ceba575ea6..186a1c6f65e2df151d71ef8e5128aae3a9e8b9ab 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_global_buffer.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_double_global_buffer.cpp @@ -14,7 +14,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_normal.cpp 
b/tests/matmul/cube_in_buffer/test_cube_in_buffer_normal.cpp index 6b07bf9b71c8694b0584e9c30154dba41d55b1b6..df32c7d21b28ecbb51d67a29e541ce8a84e0125b 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_normal.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_normal.cpp @@ -14,7 +14,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_params.cpp b/tests/matmul/cube_in_buffer/test_cube_in_buffer_params.cpp index ecf91f4fe74940f6686a78eca334b089740bf84a..af22f4f71153bcbad756461dbc4057a9cf32bab8 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_params.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_params.cpp @@ -14,7 +14,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template diff --git a/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_global_buffer.cpp b/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_global_buffer.cpp index 7af390d520e16d761b304fbf64fd6b20ce2c05f4..2d9b7d7a17c2f36e1c24bc16961cfdcdb108acd9 100644 --- a/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_global_buffer.cpp +++ b/tests/matmul/cube_in_buffer/test_cube_in_buffer_single_global_buffer.cpp @@ -14,7 +14,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template diff --git a/tests/matmul/test_matmul_channel_split.cpp b/tests/matmul/test_matmul_channel_split.cpp index 64ee080ae61248d0649d8cb1dbf5559bd28d2208..f002feae29637ed2c7287d2bd1cdefb080372afe 100644 --- a/tests/matmul/test_matmul_channel_split.cpp +++ b/tests/matmul/test_matmul_channel_split.cpp @@ -190,7 +190,7 @@ __aicore__ inline void main_kernel_matmul_channel_split(GM_ADDR aGM, GM_ADDR bGM TQue qidA1; TQue qidB1; - Gemm::MatmulImpl mm; + AscendC::MatmulImpl mm; mm.SetSubBlockIdx(0); mm.Init(&tiling, &que); @@ -268,10 +268,10 @@ protected: A_DType, B_DType, C_DType, BIAS_DType, isTransposeA, 
isTransposeB, enSequentialWrite) \ namespace Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling##_##enOuter##_##enOrderM \ { \ - typedef Gemm::MatmulType aType; \ - typedef Gemm::MatmulType bType; \ - typedef Gemm::MatmulType cType; \ - typedef Gemm::MatmulType biasType; \ + typedef AscendC::MatmulType aType; \ + typedef AscendC::MatmulType bType; \ + typedef AscendC::MatmulType cType; \ + typedef AscendC::MatmulType biasType; \ constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM; \ constexpr static MatmulFuncParams mFuncParams{false, false, false, false, 0, IterateOrder::ORDER_M, ScheduleType::INNER_PRODUCT, true, false, false, true}; \ constexpr static MatmulConfig MM_CFG = GetMMConfig(mFuncParams); \ diff --git a/tests/matmul/test_matmul_config.cpp b/tests/matmul/test_matmul_config.cpp index 8b1990fc4eaf32b14c56344792d57100fb9cb69d..b37b09db8679e35f393b75bea8c1dec0fa552fa6 100644 --- a/tests/matmul/test_matmul_config.cpp +++ b/tests/matmul/test_matmul_config.cpp @@ -11,7 +11,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + class TestMatmulConfig : public testing::Test { protected: diff --git a/tests/matmul/test_matmul_iterate_controller.cpp b/tests/matmul/test_matmul_iterate_controller.cpp index 93905e2403d1393501ae793d44d70b6276b40c2b..6692290850150b47eb2a12849b4fe810f8d42c0b 100644 --- a/tests/matmul/test_matmul_iterate_controller.cpp +++ b/tests/matmul/test_matmul_iterate_controller.cpp @@ -14,7 +14,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + using A_TYPE = MatmulType; using B_TYPE = MatmulType; diff --git a/tests/matmul/test_matmul_l0c_buffer.cpp b/tests/matmul/test_matmul_l0c_buffer.cpp index 
d7ce8817e9085277574235c98abc31e004bb488b..f5415b8966f8cfb27432afe31e4c8b42e416845b 100644 --- a/tests/matmul/test_matmul_l0c_buffer.cpp +++ b/tests/matmul/test_matmul_l0c_buffer.cpp @@ -14,7 +14,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { diff --git a/tests/matmul/test_matmul_l0db.cpp b/tests/matmul/test_matmul_l0db.cpp index 741b6db5278eb4b86057c3ef49b1e5014a046ec9..d53bbd5ecde4cdb5f0951ce476bd7775ea0f804a 100644 --- a/tests/matmul/test_matmul_l0db.cpp +++ b/tests/matmul/test_matmul_l0db.cpp @@ -199,8 +199,8 @@ __aicore__ inline void main_kernel_matmul_l0db(GM_ADDR aGM, GM_ADDR bGM, GM_ADDR TQue qidA1; TQue qidB1; - Gemm::MatmulImpl mm1; - Gemm::MatmulImpl mm2; + AscendC::MatmulImpl mm1; + AscendC::MatmulImpl mm2; mm1.SetSubBlockIdx(0); mm1.Init(&tiling, &que); mm2.SetSubBlockIdx(0); @@ -235,10 +235,10 @@ protected: #define KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL_L0DB, tilingParams, A_Pos, B_Pos, C_Pos, BIAS_Pos, A_Format, B_Format, C_Format, BIAS_Format, \ A_DType, B_DType, C_DType, BIAS_DType, isTransposeA, isTransposeB, enSequentialWrite) \ namespace Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##enSequentialWrite{ \ - typedef Gemm::MatmulType aType; \ - typedef Gemm::MatmulType bType; \ - typedef Gemm::MatmulType cType; \ - typedef Gemm::MatmulType biasType; \ + typedef AscendC::MatmulType aType; \ + typedef AscendC::MatmulType bType; \ + typedef AscendC::MatmulType cType; \ + typedef AscendC::MatmulType biasType; \ constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM;\ constexpr static MatmulFuncParams dbFuncParams{false, false, false, false, 0, IterateOrder::UNDEF, ScheduleType::INNER_PRODUCT, true, true, false, true};\ constexpr static MatmulConfig CFG_NORM_DB = GetMMConfig(dbFuncParams);\ diff --git 
a/tests/matmul/test_matmul_shape_info.cpp b/tests/matmul/test_matmul_shape_info.cpp index b53e9395059f1daf8a608e605a500f0e811c780a..b2b609225dddeaf47fd7fec1fdb22db453addf22 100644 --- a/tests/matmul/test_matmul_shape_info.cpp +++ b/tests/matmul/test_matmul_shape_info.cpp @@ -16,7 +16,7 @@ using namespace std; using namespace AscendC; -using namespace Gemm; + namespace { template qidA1; TQue qidB1; - Gemm::MatmulImpl mm; + AscendC::MatmulImpl mm; mm.SetSubBlockIdx(0); mm.Init(&tiling, &que); @@ -337,12 +337,12 @@ protected: #define KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tilingParams, A_Pos, B_Pos, C_Pos, BIAS_Pos, A_Format, B_Format, C_Format, BIAS_Format, \ A_DType, B_DType, C_DType, BIAS_DType, isTransposeA, isTransposeB, CFG_Mode, enSequentialWrite, enTiling) \ namespace Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling{ \ - typedef Gemm::MatmulType aType; \ - typedef Gemm::MatmulType bType; \ - typedef Gemm::MatmulType cType; \ - typedef Gemm::MatmulType biasType; \ + typedef AscendC::MatmulType aType; \ + typedef AscendC::MatmulType bType; \ + typedef AscendC::MatmulType cType; \ + typedef AscendC::MatmulType biasType; \ constexpr static MatmulConfig mmCFG = CFG_Mode; \ - constexpr static MatmulApiStaticTiling mmTiling = Gemm::GetMatmulApiTiling(mmCFG); \ + constexpr static MatmulApiStaticTiling mmTiling = AscendC::GetMatmulApiTiling(mmCFG); \ TEST_F(TEST_KERNEL_MATMUL, Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling) \ { \ const int32_t left_data_size = tilingParams.M_ * tilingParams.K_; \ diff --git 
a/tests/matmul/test_operator_matmul_v220.cpp b/tests/matmul/test_operator_matmul_v220.cpp index b3a83d4182d651f7d0d05bb8c6b10799097579ff..84f8a92334b2406bd8b51dc423da513774927ad1 100644 --- a/tests/matmul/test_operator_matmul_v220.cpp +++ b/tests/matmul/test_operator_matmul_v220.cpp @@ -201,7 +201,7 @@ __aicore__ inline void main_kernel_matmul(GM_ADDR aGM, GM_ADDR bGM, GM_ADDR cGM, TQue qidA1; TQue qidB1; - Gemm::MatmulImpl mm; + AscendC::MatmulImpl mm; mm.SetSubBlockIdx(0); mm.Init(&tiling, &que); @@ -375,12 +375,12 @@ protected: #define KERNEL_MATMUL_TESTCASE(TEST_KERNEL_MATMUL, tilingParams, A_Pos, B_Pos, C_Pos, BIAS_Pos, A_Format, B_Format, C_Format, BIAS_Format, \ A_DType, B_DType, C_DType, BIAS_DType, isTransposeA, isTransposeB, CFG_Mode, enSequentialWrite, enTiling, enOuter, enOrderM) \ namespace Kernel_Matmul_Case_##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##enSequentialWrite##_##enTiling##_##enOuter##_##enOrderM{ \ - typedef Gemm::MatmulType aType; \ - typedef Gemm::MatmulType bType; \ - typedef Gemm::MatmulType cType; \ - typedef Gemm::MatmulType biasType; \ + typedef AscendC::MatmulType aType; \ + typedef AscendC::MatmulType bType; \ + typedef AscendC::MatmulType cType; \ + typedef AscendC::MatmulType biasType; \ constexpr static MatmulConfig mmCFG = CFG_Mode; \ - constexpr static MatmulApiStaticTiling mmTiling = Gemm::GetMatmulApiTiling(mmCFG); \ + constexpr static MatmulApiStaticTiling mmTiling = AscendC::GetMatmulApiTiling(mmCFG); \ constexpr static MatmulConfigMode configMode = MatmulConfigMode::CONFIG_NORM;\ constexpr static MatmulFuncParams mFuncParams{false, false, false, false, 0, IterateOrder::ORDER_M, ScheduleType::OUTER_PRODUCT};\ constexpr static MatmulConfig normOuterM = GetMMConfig(mFuncParams);\ diff --git a/tests/matmul/test_operator_matmul_v300.cpp 
b/tests/matmul/test_operator_matmul_v300.cpp index 19587e381dcafb0903e24474a2af42c3e44e8590..e61aae495541944429395677b4cc206e0a8d98cf 100644 --- a/tests/matmul/test_operator_matmul_v300.cpp +++ b/tests/matmul/test_operator_matmul_v300.cpp @@ -229,7 +229,7 @@ __aicore__ inline void kernel_matmul(GM_ADDR aGM, GM_ADDR bGM, GM_ADDR cGM, GM_A set_atomic_none(); - Gemm::MatmulImpl mm; + AscendC::MatmulImpl mm; if constexpr(mmMatmul) { REGIST_MATMUL_OBJ(&que, GetSysWorkSpacePtr(), mm); mm.Init(&tiling); @@ -309,10 +309,10 @@ TilingParams g_tilingParams = { 1, 16, 32, 32, 16, 32, 32, 16, 32, 32, 1, 1, 1, TEST_F(TEST_KERNEL_MATMUL, \ Kernel_Matmul_Case##tilingParams##_##A_Pos##_##B_Pos##_##C_Pos##_##BIAS_Pos##_##A_Format##_##B_Format##_##C_Format##_##BIAS_Format##_##A_DType##_##B_DType##_##C_DType##_##BIAS_DType##_##isTransposeA##_##isTransposeB##_##CFG_Mode##_##MM_Matmul) \ { \ - typedef Gemm::MatmulType aType; \ - typedef Gemm::MatmulType bType; \ - typedef Gemm::MatmulType cType; \ - typedef Gemm::MatmulType biasType; \ + typedef AscendC::MatmulType aType; \ + typedef AscendC::MatmulType bType; \ + typedef AscendC::MatmulType cType; \ + typedef AscendC::MatmulType biasType; \ TilingParams tilingParam = tilingParams; \ const int32_t left_data_size = tilingParam.M_ * tilingParam.K_; \ const int32_t right_data_size = tilingParam.K_ * tilingParam.N_; \ diff --git a/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp b/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b7b7d2b9646af5d63e2691bf298faf25a40b7a6a --- /dev/null +++ b/tests/normalization/welfordupdate/test_operator_welfordupdate.cpp @@ -0,0 +1,245 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include +#define private public +#define protect public +#include "kernel_operator.h" +#include +using namespace std; +using namespace AscendC; + +constexpr uint32_t WEL_UP_BLOCK_SIZE = 32; +constexpr WelfordUpdateConfig WELFORD_UPDATE_ENABLE_INPLACE_CFG = {true}; +constexpr WelfordUpdateConfig WELFORD_UPDATE_UNENABLE_INPLACE_CFG = {false}; +namespace TEST_CASE { +constexpr uint32_t WEL_UP_REP_SIZE = 256; +constexpr uint32_t WEL_UP_FLOAT_SIZE = 256 / sizeof(float); + +bool GetWelfordUpdateMaxMinTmpSize(const int32_t rnLength, const int32_t abLength, const uint32_t typeSizeT, + const uint32_t typeSizeU, const bool isReuseSource, const bool isInplace, uint32_t& maxValue, uint32_t& minValue) +{ + if (typeSizeT == sizeof(uint16_t)) { + minValue = 0x3 * WEL_UP_REP_SIZE; + } else if (isReuseSource) { + minValue = 1 * WEL_UP_REP_SIZE; + } else { + minValue = 0x2 * WEL_UP_REP_SIZE; + } + maxValue = (rnLength * abLength + WEL_UP_FLOAT_SIZE - 1) / WEL_UP_FLOAT_SIZE * minValue; + return true; +} + +} // namespace TEST_CASE + +template +class KernelWelfordUpdate { +public: + __aicore__ inline KernelWelfordUpdate() + {} + __aicore__ inline void Init(GM_ADDR inputX_gm, GM_ADDR inputMean_gm, GM_ADDR inputVar_gm, GM_ADDR outputMean_gm, + GM_ADDR outputVar_gm, int32_t rnLength, int32_t abLength, int32_t abComputeLength, float nRec) + { + m_rnLength = rnLength; + m_abLength = abLength; + m_abComputeLength = abComputeLength; + m_nRec = nRec; + bshLength = rnLength * abLength; + inplace = isInplace; + + inputX_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(inputX_gm), bshLength); + 
inputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputMean_gm), bshLength); + inputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(inputVar_gm), bshLength); + outputMean_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputMean_gm), bshLength); + outputVar_global.SetGlobalBuffer(reinterpret_cast<__gm__ U *>(outputVar_gm), bshLength); + + pipe.InitBuffer(inQueueX, 1, bshLength * sizeof(T)); + pipe.InitBuffer(inQueueMean, 1, bshLength * sizeof(U)); + pipe.InitBuffer(inQueueVar, 1, bshLength * sizeof(U)); + pipe.InitBuffer(outQueueMean, 1, bshLength * sizeof(U)); + pipe.InitBuffer(outQueueVar, 1, bshLength * sizeof(U)); + + } + __aicore__ inline void Process() + { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() + { + LocalTensor inputXLocal = inQueueX.AllocTensor(); + LocalTensor inputMeanLocal = inQueueMean.AllocTensor(); + LocalTensor inputVarLocal = inQueueVar.AllocTensor(); + + DataCopy(inputXLocal, inputX_global, bshLength); + DataCopy(inputMeanLocal, inputMean_global, bshLength); + DataCopy(inputVarLocal, inputVar_global, bshLength); + + inQueueX.EnQue(inputXLocal); + inQueueMean.EnQue(inputMeanLocal); + inQueueVar.EnQue(inputVarLocal); + } + __aicore__ inline void Compute() + { + LocalTensor inputXLocal = inQueueX.DeQue(); + LocalTensor inputMeanLocal = inQueueMean.DeQue(); + LocalTensor inputVarLocal = inQueueVar.DeQue(); + + LocalTensor outMeanLocal = outQueueMean.AllocTensor(); + LocalTensor outVarLocal = outQueueVar.AllocTensor(); + + AscendC::Duplicate(outMeanLocal, (U)(0.0), bshLength); + AscendC::Duplicate(outVarLocal, (U)(0.0), bshLength); + + struct WelfordUpdateParam para = {m_rnLength, m_abLength, m_abComputeLength, m_nRec}; + if (tmpLocal) { + TEST_CASE::GetWelfordUpdateMaxMinTmpSize(m_rnLength, m_abLength, sizeof(T), sizeof(U), isReuseSource, + isInplace, tmpMaxBytes, tmpMinBytes); + if (tmpMinBytes % WEL_UP_BLOCK_SIZE != 0) { + tmpMinBytes = (tmpMinBytes + WEL_UP_BLOCK_SIZE - 1) / 
WEL_UP_BLOCK_SIZE * WEL_UP_BLOCK_SIZE; + } + pipe.InitBuffer(tmpLocalBuf, tmpMinBytes); + LocalTensor tmpLocalTensor = tmpLocalBuf.Get(); + if (inplace) { + WelfordUpdate(outMeanLocal, outVarLocal, + inputMeanLocal, inputVarLocal, inputXLocal, tmpLocalTensor, para); + } else { + WelfordUpdate(outMeanLocal, outVarLocal, + inputMeanLocal, inputVarLocal, inputXLocal, tmpLocalTensor, para); + } + } else { + if (inplace) { + WelfordUpdate(outMeanLocal, outVarLocal, + inputMeanLocal, inputVarLocal, inputXLocal, para); + } else { + WelfordUpdate(outMeanLocal, outVarLocal, + inputMeanLocal, inputVarLocal, inputXLocal, para); + } + } + + outQueueMean.EnQue(outMeanLocal); + outQueueVar.EnQue(outVarLocal); + + inQueueX.FreeTensor(inputXLocal); + inQueueMean.FreeTensor(inputMeanLocal); + inQueueVar.FreeTensor(inputVarLocal); + } + __aicore__ inline void CopyOut() + { + LocalTensor outMeanLocal = outQueueMean.DeQue(); + LocalTensor outVarLocal = outQueueVar.DeQue(); + + DataCopy(outputMean_global, outMeanLocal, bshLength); + DataCopy(outputVar_global, outVarLocal, bshLength); + + outQueueMean.FreeTensor(outMeanLocal); + outQueueVar.FreeTensor(outVarLocal); + } + +private: + TPipe pipe; + TQue inQueueX; + TQue inQueueMean; + TQue inQueueVar; + TQue outQueueMean; + TQue outQueueVar; + + GlobalTensor inputX_global; + GlobalTensor inputMean_global; + GlobalTensor inputVar_global; + GlobalTensor outputMean_global; + GlobalTensor outputVar_global; + TBuf tmpLocalBuf; + + uint32_t m_rnLength; + uint32_t m_abLength; + uint32_t m_abComputeLength; + float m_nRec; + uint32_t bshLength; + bool inplace; + uint32_t tmpMinBytes = 0; + uint32_t tmpMaxBytes = 0; +}; + +template +__aicore__ void main_WelfordUpdate_test(GM_ADDR srcGm, GM_ADDR inMeanGm, GM_ADDR inVarGm, GM_ADDR outMeanGm, + GM_ADDR outVarGm, int32_t rnLength, int32_t abLength, int32_t abComputeLength, float nRec) +{ + KernelWelfordUpdate op; + op.Init(srcGm, inMeanGm, inVarGm, outMeanGm, outVarGm, rnLength, abLength, 
abComputeLength, nRec); + op.Process(); +} + +struct WelfordUpdateTestParams { + int64_t rnLength; + int64_t abLength; + int64_t abComputeLength; + float nRec; + uint32_t TypeSizeT; + uint32_t TypeSizeU; + void (*calFunc)(uint8_t *, uint8_t *, uint8_t *, uint8_t *, uint8_t *, int32_t, int32_t, int32_t, float); +}; + +class WelfordUpdateTestSuite : public testing::Test, public testing::WithParamInterface { +protected: + static void SetUpTestCase() + { + std::cout << "WelfordUpdateTestSuite SetUpTestCase" << std::endl; + } + static void TearDownTestCase() + { + std::cout << "WelfordUpdateTestSuite TearDownTestCase" << std::endl; + } + virtual void SetUp() + {} + virtual void TearDown() + {} +}; + +INSTANTIATE_TEST_CASE_P(TEST_PACKAGE_WelfordUpdate, WelfordUpdateTestSuite, + ::testing::Values( + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 16, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 16, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(half), sizeof(float), main_WelfordUpdate_test }, + + WelfordUpdateTestParams { 1, 8, 3, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 13, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + 
WelfordUpdateTestParams { 1, 16, 16, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test }, + WelfordUpdateTestParams { 1, 16, 3, 0.8, sizeof(float), sizeof(float), main_WelfordUpdate_test } +)); + +TEST_P(WelfordUpdateTestSuite, WelfordUpdateTestCase) +{ + auto param = GetParam(); + uint32_t srcSize = param.rnLength * param.abLength; + uint8_t srcGm[srcSize * param.TypeSizeT]{0x00}; + uint8_t inMeanGm[srcSize * param.TypeSizeU]{0x00}; + uint8_t inVarGm[srcSize * param.TypeSizeU]{0x00}; + uint8_t outMeanGm[srcSize * param.TypeSizeU]{0x00}; + uint8_t outVarGm[srcSize * param.TypeSizeU]{0x00}; + param.calFunc(srcGm, inMeanGm, inVarGm, outMeanGm, outVarGm, param.rnLength, param.abLength, param.abComputeLength, + param.nRec); + for (int32_t i = 0; i < srcSize; i++) { + EXPECT_EQ(outMeanGm[i], 0x00); + EXPECT_EQ(outVarGm[i], 0x00); + } +} + diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index 4568f444de0ceaa2320289fbc44f066e51e8ebb3..936a7180416dc940153de54abffa8d53f8aeb6f4 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -3473,6 +3473,38 @@ TEST_F(TestTiling, tiling_compute_error) EXPECT_EQ(ret, -1); } +TEST_F(TestTiling, TestWelfordUpdateTiling) +{ + std::vector shapeDims1d = {1, 128}; + auto shape1d = ge::Shape(shapeDims1d); + uint32_t maxsize = 0; + uint32_t minsize = 0; + uint32_t dtypesizeT = 2; // half类型 + uint32_t dtypesizeU = 4; // float类型 + bool isReuseSource = false; + GetWelfordUpdateMaxMinTmpSize(shape1d, dtypesizeT, dtypesizeU, isReuseSource, false, maxsize, minsize); + EXPECT_EQ(minsize, 3 * 256); + EXPECT_EQ(maxsize, 2 * 3 * 256); + + std::vector shapeDims2d = {1, 72}; + auto shape2d = ge::Shape(shapeDims2d); + dtypesizeT = 4; // float类型 + dtypesizeU = 4; // float类型 + isReuseSource = false; + 
GetWelfordUpdateMaxMinTmpSize(shape2d, dtypesizeT, dtypesizeU, isReuseSource, false, maxsize, minsize); + EXPECT_EQ(minsize, 2 * 256); + EXPECT_EQ(maxsize, 2 * 2 * 256); + + std::vector shapeDims3d = {1, 256}; + auto shape3d = ge::Shape(shapeDims3d); + dtypesizeT = 4; // float类型 + dtypesizeU = 4; // float类型 + isReuseSource = true; + GetWelfordUpdateMaxMinTmpSize(shape3d, dtypesizeT, dtypesizeU, isReuseSource, false, maxsize, minsize); + EXPECT_EQ(minsize, 1 * 256); + EXPECT_EQ(maxsize, 4 * 1 * 256); +} + TEST_F(TestTiling, TestNZFp32UnalignedK) { matmul_tiling::PlatformInfo plat {.socVersion = platform_ascendc::SocVersion::ASCEND910B, .l1Size = 524288, diff --git a/tests/utils/init_global_memory/test_operator_init_global_memory.cpp b/tests/utils/init_global_memory/test_operator_init_global_memory.cpp new file mode 100644 index 0000000000000000000000000000000000000000..0c3cdd892e730b6199eb1e5729b69497fb391f95 --- /dev/null +++ b/tests/utils/init_global_memory/test_operator_init_global_memory.cpp @@ -0,0 +1,59 @@ +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include +#include "kernel_operator.h" + +using namespace std; +using namespace AscendC; + +class TEST_INIT_GLOBAL_MEMORY : public testing::Test { +protected: + void SetUp() + { + AscendC::SetGCoreType(2); + } + void TearDown() + { + AscendC::SetGCoreType(0); + } +}; + +template +void main_init_global_memory_demo(__gm__ uint8_t *__restrict__ dst_gm, const uint64_t dataSize) +{ + TPipe tpipe; + GlobalTensor dst_global; + dst_global.SetGlobalBuffer(reinterpret_cast<__gm__ T *>(dst_gm), dataSize); + InitGlobalMemory(dst_global, dataSize, (T)10); + pipe_barrier(PIPE_ALL); +} +#define VEC_INIT_GLOBAL_MEMORY_TESTCASE(DATASIZE, DATA_TYPE) \ + TEST_F(TEST_INIT_GLOBAL_MEMORY, INIT_GLOBAL_MEMORY##_##DATASIZE##_##DATA_TYPE##_##Case) \ + { \ + uint8_t output_gm[DATASIZE * sizeof(DATA_TYPE)] = {0}; \ + main_init_global_memory_demo(output_gm, DATASIZE); \ + for (uint32_t i = 0; i < DATASIZE; i++) { \ + EXPECT_EQ(output_gm[i], 0x00); \ + } \ + } + +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8192, half); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8193, half); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8192, float); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8193, float); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8192, uint16_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8193, uint16_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8192, int16_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8193, int16_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8192, uint32_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8193, uint32_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8192, int32_t); +VEC_INIT_GLOBAL_MEMORY_TESTCASE(8193, int32_t); \ No newline at end of file