diff --git a/examples/filter/dropout/CMakeLists.txt b/examples/filter/dropout/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..00cc3f66543121cadf0f897998346c6c773ac2d3 --- /dev/null +++ b/examples/filter/dropout/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/dropout_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(dropout_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/dropout_custom_tiling.cpp +) + +target_compile_options(dropout_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(dropout_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(dropout_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(dropout_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS dropout_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/filter/dropout/README.md b/examples/filter/dropout/README.md new file mode 100644 index 0000000000000000000000000000000000000000..5a4aa0cf593bab3bd3d7d2b15048b32995039967 --- /dev/null +++ b/examples/filter/dropout/README.md @@ -0,0 +1,93 @@ + + +## 概述 + +本样例介绍了调用Dropout高阶API实现dropout算子,并按照核函数直调的方式分别给出了对应的端到端实现,关于Dropout高阶API的具体内容请参考《[Ascend C算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi)》中的 "高阶 API > 数据过滤 > Dropout" 章节。 + +本样例以直调的方式调用算子核函数。 + 
+直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品AI Core + +## 目录结构 + +| 目录 | 描述 +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + + DropOutCustom算子根据输入MaskTensor对输入SrcTensor进行过滤得到输出DstTensor,本算子仅支持输入shape为ND格式。DropOut API支持的过滤功能包括两种模式,字节模式和比特模式,本算子样例使用DropOut的字节模式。 + +- 算子规格: + + + + + + + + + + + + +
+| 算子类型(OpType) | DropoutCustom  |          |           |        |
+| ---------------- | -------------- | -------- | --------- | ------ |
+| 算子输入         | name           | shape    | data type | format |
+|                  | x              | 8*2048   | float     | ND     |
+|                  | mask           | 8*2048   | uint8_t   | ND     |
+| 算子输出         | y              | 8*2048   | float     | ND     |
+| 核函数名         | dropout_custom |          |           |        |
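+
+作为补充,下面给出一段NumPy参考示意,说明字节模式DropOut的预期计算语义:mask非0的元素保留并除以probValue(本样例为0.8),mask为0的元素置0,与scripts/gen_data.py中的真值生成逻辑一致。该示意仅用于理解计算语义,并非算子kernel实现本身。
+
+```python
+import numpy as np
+
+prob_value = 0.8  # 与kernel中的probValue一致
+x = np.random.uniform(1, 100, [8, 2048]).astype(np.float32)
+mask = np.random.randint(0, 2, size=[8, 2048]).astype(np.uint8)
+# mask非0:保留并缩放为x / probValue;mask为0:置0
+y = np.where(mask != 0, x / prob_value, np.zeros_like(x))
+```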
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[8, 2048],输出y[8, 2048]的DropoutCustom算子。 + +- Kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Dropout高阶API接口完成dropout计算,得到最终结果,再搬出到外部存储上。 + + DropoutCustom算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行dropout计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- Tiling实现 + + 根据输入数据x、mask的内轴、外轴长度和所用核数量确定所需tiling参数,并调用GetDropOutMaxMinTmpSize接口获取DropOut计算所需的临时空间大小。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/filter/dropout/cmake/cpu_lib.cmake b/examples/filter/dropout/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..693f15ac115d655aacd3218bc5b14060c0a3de2f --- /dev/null +++ b/examples/filter/dropout/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/filter/dropout/cmake/npu_lib.cmake b/examples/filter/dropout/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8ad136f38b80bea109925ab797fbde0871874964 --- /dev/null +++ b/examples/filter/dropout/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/filter/dropout/main.cpp b/examples/filter/dropout/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1b255de2ebe75478897e540ce269b280ca1f06f3 --- /dev/null +++ b/examples/filter/dropout/main.cpp @@ 
-0,0 +1,169 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#include "./op_host/dropout_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_dropout_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void dropout_custom(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR maskGm, + DropoutCustomTilingData tiling); +#endif + +namespace { +constexpr uint32_t USED_CORE_NUM = 8; +constexpr uint32_t TILINGDATA_SIZE = 5; +constexpr uint32_t FIRST_AXIS = 8; +constexpr uint32_t SRC_LAST_AXIS = 2048; +} + +extern void GenerateTilingData(uint8_t *tilingBuf, const uint32_t firstAxis, const uint32_t srcLastAxis, + const uint32_t maskLastAxis, const uint32_t coreNum); + +static bool CompareResult(const void *outputData, uint32_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (size_t i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + printf("wrongNum: %ld\n", wrongNum); + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + size_t inputSize = FIRST_AXIS * SRC_LAST_AXIS * sizeof(uint32_t); + size_t maskSize = FIRST_AXIS * SRC_LAST_AXIS; + size_t outputSize = inputSize; + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *mask = (uint8_t *)AscendC::GmAlloc(maskSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + ReadFile("../input/input_mask.bin", maskSize, mask, maskSize); + + GenerateTilingData(tiling, FIRST_AXIS, SRC_LAST_AXIS, SRC_LAST_AXIS, USED_CORE_NUM); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(dropout_custom, USED_CORE_NUM, x, y, mask, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + WriteFile("../output/output.bin", y, outputSize); + + bool goldenResult = true; + goldenResult = 
CompareResult(y, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)mask); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost, *maskHost, *tiling; + uint8_t *xDevice, *yDevice, *maskDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&maskHost), maskSize)); + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)) + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&maskDevice, maskSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + ReadFile("../input/input_mask.bin", maskSize, maskHost, maskSize); + + GenerateTilingData(tiling, FIRST_AXIS, SRC_LAST_AXIS, SRC_LAST_AXIS, USED_CORE_NUM); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(maskDevice, maskSize, maskHost, maskSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(dropout_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, maskDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(yHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(maskDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(maskHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} diff --git a/examples/filter/dropout/op_host/dropout_custom_tiling.cpp b/examples/filter/dropout/op_host/dropout_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c668709dc5b8160363d599746c034b2dc208cb6d --- /dev/null +++ b/examples/filter/dropout/op_host/dropout_custom_tiling.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "dropout_custom_tiling.h" +#include "tiling/tiling_api.h" +#include + +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t firstAxis, const uint32_t srcLastAxis, + const uint32_t maskLastAxis, const uint32_t coreNum) { + std::vector srcDims = {firstAxis, srcLastAxis, maskLastAxis}; + + uint32_t typeSize = 4; + ge::Shape shape(srcDims); + uint32_t minValue = 0; + uint32_t maxValue = 0; + + AscendC::GetDropOutMaxMinTmpSize(shape, typeSize, false, maxValue, minValue); + + DropoutCustomTilingData *tiling = reinterpret_cast(tilingBuf); + + tiling->firstAxis = firstAxis; + tiling->srcLastAxis = srcLastAxis; + tiling->maskLastAxis = maskLastAxis; + + tiling->tmpBufferSize = minValue; + tiling->tileNum = coreNum; +} \ No newline at end of file diff --git a/examples/filter/dropout/op_host/dropout_custom_tiling.h b/examples/filter/dropout/op_host/dropout_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..ddf7470c8ff5dad3b8c240a99ce1092ff1329ca6 --- /dev/null +++ b/examples/filter/dropout/op_host/dropout_custom_tiling.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_FILTER_DROPOUT_CUSTOM_TILING_H +#define EXAMPLES_FILTER_DROPOUT_CUSTOM_TILING_H + +#include + +struct DropoutCustomTilingData { + uint32_t firstAxis; + uint32_t srcLastAxis; + uint32_t maskLastAxis; + uint32_t tmpBufferSize; + uint32_t tileNum; +}; + +#endif // EXAMPLES_FILTER_DROPOUT_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/filter/dropout/op_kernel/dropout_custom.cpp b/examples/filter/dropout/op_kernel/dropout_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8bae3d52cacf9e97400efd7a3bb0b9758ffa45f7 --- /dev/null +++ b/examples/filter/dropout/op_kernel/dropout_custom.cpp @@ -0,0 +1,23 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "dropout_custom_impl.h" +#include "kernel_operator.h" + +extern "C" __global__ __aicore__ void dropout_custom(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR maskGm, + DropoutCustomTilingData tiling) { + if ASCEND_IS_AIC { + return; + } + AscendC::TPipe pipe; + MyCustomKernel::KernelDropout op; + op.Init(srcGm, dstGm, maskGm, tiling, &pipe); + op.Process(); +} \ No newline at end of file diff --git a/examples/filter/dropout/op_kernel/dropout_custom_impl.h b/examples/filter/dropout/op_kernel/dropout_custom_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..18541c2430be3848390b87764c0a4768034de9e5 --- /dev/null +++ b/examples/filter/dropout/op_kernel/dropout_custom_impl.h @@ -0,0 +1,104 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_FILTER_DROPOUT_CUSTOM_IMPL_H +#define EXAMPLES_FILTER_DROPOUT_CUSTOM_IMPL_H + +#include "../op_host/dropout_custom_tiling.h" +#include "kernel_operator.h" + +namespace MyCustomKernel { +constexpr int32_t BUFFER_NUM = 2; + +template +class KernelDropout { +public: + __aicore__ inline KernelDropout() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR m, DropoutCustomTilingData tilingData, AscendC::TPipe *pipeIn) { + ASCENDC_ASSERT(AscendC::GetBlockNum() != 0, { KERNEL_LOG(KERNEL_ERROR, "block dim can not be zero!"); }); + srcSize = tilingData.firstAxis * tilingData.srcLastAxis; + tileNum = tilingData.tileNum; + blockLength = srcSize / AscendC::GetBlockNum(); + tileLength = this->blockLength / tileNum / BUFFER_NUM; + + info.firstAxis = tilingData.firstAxis / tilingData.tileNum; + info.srcLastAxis = tileLength; + info.maskLastAxis = tileLength; + + xGm.SetGlobalBuffer((__gm__ T *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ T *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + maskGm.SetGlobalBuffer((__gm__ uint8_t *)m + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + + pipe = pipeIn; + pipe->InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(T)); + pipe->InitBuffer(inQueueMask, BUFFER_NUM, this->tileLength * sizeof(uint8_t)); + pipe->InitBuffer(outQueue, BUFFER_NUM, this->tileLength * sizeof(T)); + pipe->InitBuffer(tmpBuf, tilingData.tmpBufferSize * sizeof(uint8_t)); + } + __aicore__ inline void Process() { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor maskLocal = inQueueMask.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(maskLocal, maskGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + inQueueMask.EnQue(maskLocal); + } + __aicore__ inline void Compute(int32_t progress) { + AscendC::LocalTensor xLocal = 
inQueueX.DeQue(); + AscendC::LocalTensor maskLocal = inQueueMask.DeQue(); + + AscendC::LocalTensor yLocal = outQueue.AllocTensor(); + AscendC::LocalTensor sharedTmpBuffer = tmpBuf.AllocTensor(); + + AscendC::DropOut(yLocal, xLocal, maskLocal, sharedTmpBuffer, probValue, info); + + outQueue.EnQue(yLocal); + + inQueueX.FreeTensor(xLocal); + inQueueMask.FreeTensor(maskLocal); + tmpBuf.FreeTensor(sharedTmpBuffer); + } + __aicore__ inline void CopyOut(int32_t progress) { + AscendC::LocalTensor yLocal = outQueue.DeQue(); + AscendC::DataCopy(yGm[progress * this->tileLength], yLocal, this->tileLength); + outQueue.FreeTensor(yLocal); + } + +private: + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor maskGm; + + AscendC::TPipe *pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueMask; + AscendC::TQue outQueue; + AscendC::TBuf tmpBuf; + + uint32_t srcSize = 0; + uint32_t tileNum = 0; + uint32_t blockLength = 0; + uint32_t tileLength = 0; + float probValue = 0.8; + AscendC::DropOutShapeInfo info; +}; +} + +#endif \ No newline at end of file diff --git a/examples/filter/dropout/run.sh b/examples/filter/dropout/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..76ad17bbb6186bd6806c92e608f2a0a9203e40d5 --- /dev/null +++ b/examples/filter/dropout/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. 
+make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./dropout_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./dropout_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./dropout_direct_kernel_op +fi \ No newline at end of file diff --git a/examples/filter/dropout/scripts/gen_data.py b/examples/filter/dropout/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..22b92a1c5e84822486d7e0a5502b432b3621a331 --- /dev/null +++ b/examples/filter/dropout/scripts/gen_data.py @@ -0,0 +1,32 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [8, 2048] + + input_x = np.random.uniform(1, 100, input_shape).astype(dtype) + input_mask = np.random.randint(0, 2, size=input_shape, dtype=bool) + golden = np.where(input_mask, input_x / 0.8, np.zeros_like(input_x)) + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + input_mask.tofile("./input/input_mask.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() \ No newline at end of file diff --git a/examples/math/cos/CMakeLists.txt b/examples/math/cos/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..188a58818efc58f6a78f6c0bdc893047f1fe4e79 --- /dev/null +++ b/examples/math/cos/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/cos_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(cos_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/cos_custom_tiling.cpp +) + +target_compile_options(cos_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(cos_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(cos_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(cos_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS cos_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/math/cos/README.md b/examples/math/cos/README.md new file mode 100644 index 0000000000000000000000000000000000000000..303f5bba4bf7c330c4b0d36272f9942212493737 --- /dev/null +++ b/examples/math/cos/README.md @@ -0,0 +1,92 @@ + + +## 概述 + +本样例介绍了调用Cos高阶API实现cos算子,并按照核函数直调的方式分别给出了对应的端到端实现。关于Cos高阶API的具体内容请参考《[Ascend C算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi)》中的 "高阶 API > 数学库 > Cos" 章节 + +本样例以直调的方式调用算子核函数。 + +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品AI Core + +## 目录结构 + +| 目录 | 描述 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + + CosCustom算子对输入tensor按元素做三角函数余弦运算。 + +- 算子规格: + + + + + + + + + + + +
+| 算子类型(OpType) | CosCustom  |          |           |        |
+| ---------------- | ---------- | -------- | --------- | ------ |
+| 算子输入         | name       | shape    | data type | format |
+|                  | x          | 8*2048   | float     | ND     |
+| 算子输出         | y          | 8*2048   | float     | ND     |
+| 核函数名         | cos_custom |          |           |        |
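+
+作为补充,下面给出与scripts/gen_data.py真值生成逻辑一致的NumPy参考示意,仅用于说明预期计算结果,并非算子kernel实现本身:
+
+```python
+import numpy as np
+
+# 对输入逐元素取余弦,输入取值范围与gen_data.py一致
+x = np.random.uniform(-65504, 65504, [8, 2048]).astype(np.float32)
+golden = np.cos(x).astype(np.float32)
+```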
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[8, 2048],输出y[8, 2048]的CosCustom算子。 + +- Kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Cos高阶API接口完成cos计算,得到最终结果,再搬出到外部存储上。 + + CosCustom算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行cos计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- Tiling实现 + + 根据输入长度和所用核数量设置tiling参数totalLength和coreNum。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/math/cos/cmake/cpu_lib.cmake b/examples/math/cos/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..693f15ac115d655aacd3218bc5b14060c0a3de2f --- /dev/null +++ b/examples/math/cos/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/math/cos/cmake/npu_lib.cmake b/examples/math/cos/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8ad136f38b80bea109925ab797fbde0871874964 --- /dev/null +++ b/examples/math/cos/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/math/cos/main.cpp b/examples/math/cos/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..2c1c704e0c0e5b9451d1278c9a40570e58121dbb --- /dev/null +++ b/examples/math/cos/main.cpp @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#include "op_host/cos_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_cos_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void cos_custom(GM_ADDR x, GM_ADDR y, CosCustomTilingData tiling); +#endif + +namespace { +constexpr uint32_t USED_CORE_NUM = 8; +constexpr uint32_t TILINGDATA_SIZE = 2; +constexpr uint32_t TOTAL_LENGTH = 8 * 2048; +} + +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t totalLength, const uint32_t coreNum); + +static bool CompareResult(const void *outputData, uint32_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (size_t i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + size_t inputSize = TOTAL_LENGTH * sizeof(uint32_t); + size_t outputSize = inputSize; + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + + GenerateTilingData(tiling, TOTAL_LENGTH, USED_CORE_NUM); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(cos_custom, USED_CORE_NUM, x, y, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + WriteFile("../output/output.bin", y, outputSize); + + bool goldenResult = true; + goldenResult = CompareResult(y, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost, *tiling; + uint8_t *xDevice, *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&tiling), 
tilingSize)) + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + + GenerateTilingData(tiling, TOTAL_LENGTH, USED_CORE_NUM); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(cos_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(yHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} diff --git a/examples/math/cos/op_host/cos_custom_tiling.cpp b/examples/math/cos/op_host/cos_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..352215a9931bc58122741c3568652754f1b84fed --- /dev/null +++ b/examples/math/cos/op_host/cos_custom_tiling.cpp @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "cos_custom_tiling.h" +#include "tiling/tiling_api.h" +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t totalLength, const uint32_t coreNum) { + CosCustomTilingData *tiling = reinterpret_cast(tilingBuf); + + tiling->totalLength = totalLength; + tiling->tileNum = coreNum; +} \ No newline at end of file diff --git a/examples/math/cos/op_host/cos_custom_tiling.h b/examples/math/cos/op_host/cos_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..1ba0775f5cf345748d164d26cf35c9b9adfd2f6d --- /dev/null +++ b/examples/math/cos/op_host/cos_custom_tiling.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. 
+ * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATH_COS_CUSTOM_TILING_H +#define EXAMPLES_MATH_COS_CUSTOM_TILING_H + +#include + +struct CosCustomTilingData { + uint32_t totalLength; + uint32_t tileNum; +}; + +#endif // EXAMPLES_MATH_COS_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/math/cos/op_kernel/cos_custom.cpp b/examples/math/cos/op_kernel/cos_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..86e77d7eab7972ab8d278f82e655d6dd2c5641da --- /dev/null +++ b/examples/math/cos/op_kernel/cos_custom.cpp @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "cos_custom_impl.h" +#include "kernel_operator.h" + +extern "C" __global__ __aicore__ void cos_custom(GM_ADDR x, GM_ADDR y, CosCustomTilingData tiling) { + if ASCEND_IS_AIC { + return; + } + AscendC::TPipe pipe; + MyCustomKernel::KernelCos op; + op.Init(x, y, tiling, &pipe); + op.Process(); +} \ No newline at end of file diff --git a/examples/math/cos/op_kernel/cos_custom_impl.h b/examples/math/cos/op_kernel/cos_custom_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..09ee14d7f26092a25ef7148a6244004389f03a38 --- /dev/null +++ b/examples/math/cos/op_kernel/cos_custom_impl.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_MATH_COS_CUSTOM_IMPL_H +#define EXAMPLES_MATH_COS_CUSTOM_IMPL_H +#include "../op_host/cos_custom_tiling.h" +#include "kernel_operator.h" + +namespace MyCustomKernel { +constexpr int32_t BUFFER_NUM = 2; + +template +class KernelCos { +public: + __aicore__ inline KernelCos() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, CosCustomTilingData tilingData, AscendC::TPipe *pipeIn) { + ASCENDC_ASSERT(AscendC::GetBlockNum() != 0, { KERNEL_LOG(KERNEL_ERROR, "block dim can not be zero!"); }); + this->tileNum = tilingData.tileNum; + this->blockLength = tilingData.totalLength / AscendC::GetBlockNum(); + this->tileLength = this->blockLength / tileNum / BUFFER_NUM; + + xGm.SetGlobalBuffer((__gm__ T *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ T *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + + pipe = pipeIn; + pipe->InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(T)); + pipe->InitBuffer(outQueueY, BUFFER_NUM, this->tileLength * sizeof(T)); + } + __aicore__ inline void Process() { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); + AscendC::Cos(yLocal, xLocal, this->tileLength); + outQueueY.EnQue(yLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) { + AscendC::LocalTensor yLocal = outQueueY.DeQue(); + AscendC::DataCopy(yGm[progress * this->tileLength], yLocal, this->tileLength); + outQueueY.FreeTensor(yLocal); + } + +private: + AscendC::TPipe* pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueY; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + + uint32_t tileNum = 0; + uint32_t blockLength = 0; + uint32_t tileLength = 0; +}; +} // namespace MyCustomKernel + +#endif \ No newline at end of file diff --git a/examples/math/cos/run.sh b/examples/math/cos/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..2f8c76a41493ebf1ad8c5c31959b19e1d4cd4c2a --- /dev/null +++ b/examples/math/cos/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./cos_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./cos_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./cos_direct_kernel_op +fi \ No newline at end of file diff --git a/examples/math/cos/scripts/gen_data.py b/examples/math/cos/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..c295292ee9d933a2b02b3ac1103042506d8fd336 --- /dev/null +++ b/examples/math/cos/scripts/gen_data.py @@ -0,0 +1,30 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [8, 2048] + # generate value between [-65504, 65504] + input_x = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) + golden = np.cos(input_x).astype(dtype) + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/examples/readme.md b/examples/readme.md index 5af4dde79d76a75d8513887ad839e37851bcabb8..b3ebb211645bd3848a3dfb9f15082640bb987b65 100644 --- a/examples/readme.md +++ b/examples/readme.md @@ -146,6 +146,26 @@ welford_finalize Welford算法的后处理,一种在线计算均值和方差的方法。 + + math + cos + 对输入tensor做Cos计算。 + + + filter + dropout + 对输入tensor进行数据过滤。 + + + reduce + sum + 计算输入tensor最后一个维度的元素之和。 + + + transpose + confusion_transpose + 对输入数据进行数据排布及Reshape操作。 + pad broadcast diff --git a/examples/reduce/sum/CMakeLists.txt b/examples/reduce/sum/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..db2524fce36b508c5eed1a2e004b6a9a96b468f4 --- /dev/null +++ b/examples/reduce/sum/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/sum_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(sum_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/sum_custom_tiling.cpp +) + +target_compile_options(sum_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(sum_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(sum_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(sum_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS sum_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/reduce/sum/README.md b/examples/reduce/sum/README.md new file mode 100644 index 0000000000000000000000000000000000000000..9a492b2229c3862592ebc71d1b15aeb36a93c102 --- /dev/null +++ b/examples/reduce/sum/README.md @@ -0,0 +1,92 @@ + + +## 概述 + +本样例介绍了调用Sum高阶API实现sum算子,并按照核函数直调的方式分别给出了对应的端到端实现,关于Sum高阶API的具体内容请参考《[Ascend C算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi)》中的 "高阶 API > 归约操作 > Sum" 章节。 + +本样例以直调的方式调用算子核函数。 + +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品 AI Core + +## 目录结构 + +| 目录 | 描述 +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + + SumCustom算子,获取输入数据最后一个维度的元素总和。如果输入是向量,则对向量中各元素进行累加;如果输入是矩阵,则沿最后一个维度对每行中元素求和。 + +- 算子规格: + + + + + + + + + + + +
+| 算子类型(OpType) | SumCustom  |          |           |        |
+| ---------------- | ---------- | -------- | --------- | ------ |
+| 算子输入         | name       | shape    | data type | format |
+|                  | x          | 7*2023   | float     | ND     |
+| 算子输出         | y          | 8*1      | float     | ND     |
+| 核函数名         | sum_custom |          |           |        |
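+
+作为补充,下面给出一段NumPy参考示意,说明SumCustom的预期计算:对x沿最后一个维度逐行求和得到7个有效结果,输出y补齐到8个元素。示意中的输入取值范围和padding填充值(此处假设补0)均为假设,实际请以本样例的真值生成脚本为准:
+
+```python
+import numpy as np
+
+x = np.random.uniform(-10, 10, [7, 2023]).astype(np.float32)  # 取值范围仅为示意
+row_sum = np.sum(x, axis=-1)             # shape: [7],7个有效结果
+y = np.zeros([8, 1], dtype=np.float32)   # 最后1个元素为padding,此处假设补0
+y[:7, 0] = row_sum
+```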
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[7, 2023],输出y[8]的SumCustom算子,其中y中的有效值数量为7,对输入x的每行元素求和后,输出y的有效数据为前7位,最后一位为padding填充的数据。 + +- Kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Sum高阶API接口完成sum计算,得到最终结果,再搬出到外部存储上。 + + SumCustom算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行sum计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- Tiling实现 + + 根据输入数据的内轴长度、内轴实际长度、外轴长度确定所需tiling参数,例如输出内轴补齐后长度等。调用GetSumMaxMinTmpSize接口获取Sum接口完成计算所需的临时空间大小。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品 AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/reduce/sum/cmake/cpu_lib.cmake b/examples/reduce/sum/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..693f15ac115d655aacd3218bc5b14060c0a3de2f --- /dev/null +++ b/examples/reduce/sum/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/reduce/sum/cmake/npu_lib.cmake b/examples/reduce/sum/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8ad136f38b80bea109925ab797fbde0871874964 --- /dev/null +++ b/examples/reduce/sum/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/reduce/sum/main.cpp b/examples/reduce/sum/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..c3ec5710ebf2f08f98f79c432de8b1fde70ac2b6 --- /dev/null +++ b/examples/reduce/sum/main.cpp @@ 
-0,0 +1,162 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#include "./op_host/sum_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_sum_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void sum_custom(GM_ADDR srcGm, GM_ADDR dstGm, SumCustomTilingData tiling); +#endif + +namespace { +constexpr uint32_t USED_CORE_NUM = 1; +constexpr uint32_t TILINGDATA_SIZE = 5; +constexpr uint32_t M = 7; // outter +constexpr uint32_t N = 2023; // inner_actual +} + +extern void GenerateTilingData(uint8_t *tilingBuf, const uint32_t M, const uint32_t N); + +static bool CompareResult(const void *outputData, uint32_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (size_t i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + printf("wrongNum: %ld\n", wrongNum); + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + uint8_t *tiling = nullptr; + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost; + uint8_t *xDevice, *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)) +#endif + GenerateTilingData(tiling, M, N); + + auto tilingData = reinterpret_cast(tiling); + + size_t inputSize = tilingData->outter * tilingData->inner * sizeof(uint32_t); + size_t outputSize = tilingData->out_inner * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(sum_custom, USED_CORE_NUM, x, y, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + 
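// Write the CPU-mode output to file and compare it with the golden data
// generated by scripts/gen_data.py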
WriteFile("../output/output.bin", y, outputSize); + + bool goldenResult = true; + goldenResult = CompareResult(y, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(sum_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(yHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.cpp b/examples/reduce/sum/op_host/sum_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..ebfc06e6f61fb18690aeb1a76d85324a239a4c99 --- /dev/null +++ b/examples/reduce/sum/op_host/sum_custom_tiling.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "sum_custom_tiling.h" +#include "tiling/tiling_api.h" +#include + +namespace { +constexpr uint32_t PADDING_BYTE = 32U; +} + +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t M, const uint32_t N) { + uint32_t minValue = 0; + uint32_t maxValue = 0; + + AscendC::GetSumMaxMinTmpSize(N, sizeof(uint32_t), false, maxValue, minValue); + + SumCustomTilingData *tiling = reinterpret_cast(tilingBuf); + + auto paddingFunc = [](const uint32_t n, const uint32_t typeSize) -> uint32_t { + if (typeSize == 0) { + return 0; + } + return (n * typeSize + PADDING_BYTE - 1U) / PADDING_BYTE * PADDING_BYTE / typeSize; + }; + + tiling->outter = M; + tiling->inner = paddingFunc(N, sizeof(uint32_t)); + tiling->n = N; + tiling->tmpBufSize = minValue; + + tiling->out_inner = paddingFunc(M, sizeof(uint32_t)); +} \ No newline at end of file diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.h b/examples/reduce/sum/op_host/sum_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..e39475357ed4ae7393d4fbf2ba72c0638f6ba4c1 --- /dev/null +++ b/examples/reduce/sum/op_host/sum_custom_tiling.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_REDUCE_SUM_COSTOM_TILING_H +#define EXAMPLES_REDUCE_SUM_COSTOM_TILING_H + +#include + +struct SumCustomTilingData { + uint32_t inner; + uint32_t outter; + uint32_t n; + uint32_t tmpBufSize; + uint32_t out_inner; +}; + +#endif // EXAMPLES_REDUCE_SUM_COSTOM_TILING_H \ No newline at end of file diff --git a/examples/reduce/sum/op_kernel/sum_custom.cpp b/examples/reduce/sum/op_kernel/sum_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..7d18e0e8b8edfcf1164c94c3e220e30947baa393 --- /dev/null +++ b/examples/reduce/sum/op_kernel/sum_custom.cpp @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "kernel_operator.h" +#include "sum_custom_impl.h" + +extern "C" __global__ __aicore__ void sum_custom(GM_ADDR srcGm, GM_ADDR dstGm, SumCustomTilingData tiling) { + if ASCEND_IS_AIC { + return; + } + AscendC::TPipe pipe; + MyCustomKernel::KernelSum op; + op.Init(srcGm, dstGm, tiling, &pipe); + op.Process(); +} \ No newline at end of file diff --git a/examples/reduce/sum/op_kernel/sum_custom_impl.h b/examples/reduce/sum/op_kernel/sum_custom_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..71f8bef190a523191aeb76f7857400a2f882efdb --- /dev/null +++ b/examples/reduce/sum/op_kernel/sum_custom_impl.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_REDUCE_SUM_CUSTOM_IMPL_H +#define EXAMPLES_REDUCE_SUM_CUSTOM_IMPL_H + +#include "../op_host/sum_custom_tiling.h" +#include "kernel_operator.h" + +namespace MyCustomKernel { +template +class KernelSum { +public: + __aicore__ inline KernelSum() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, SumCustomTilingData tilingData, AscendC::TPipe *pipeIn) { + ASCENDC_ASSERT(AscendC::GetBlockNum() != 0, { KERNEL_LOG(KERNEL_ERROR, "block dim can not be zero!"); }); + inner = tilingData.inner; + outter = tilingData.outter; + n = tilingData.n; + tmpBufSize = tilingData.tmpBufSize; + out_inner = tilingData.out_inner; + + params.inner = inner; + params.outter = outter; + params.n = n; + + xGm.SetGlobalBuffer((__gm__ T *)x); + yGm.SetGlobalBuffer((__gm__ T *)y); + + pipe = pipeIn; + pipe->InitBuffer(inQueue, 1, inner * outter * sizeof(T)); + pipe->InitBuffer(outQueue, 1, out_inner * sizeof(T)); + pipe->InitBuffer(tmpBuf, tmpBufSize * sizeof(uint8_t)); + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() { + AscendC::LocalTensor xLocal = inQueue.AllocTensor(); + AscendC::DataCopy(xLocal, xGm, inner * outter); + inQueue.EnQue(xLocal); + } + __aicore__ inline void Compute() { + AscendC::LocalTensor xLocal = inQueue.DeQue(); + AscendC::LocalTensor yLocal = outQueue.AllocTensor(); + AscendC::LocalTensor sharedTmpBuffer = tmpBuf.AllocTensor(); + + T scalar(0); + AscendC::Duplicate(yLocal, scalar, out_inner); + AscendC::Sum(yLocal, xLocal, sharedTmpBuffer, params); + + outQueue.EnQue(yLocal); + inQueue.FreeTensor(xLocal); + tmpBuf.FreeTensor(sharedTmpBuffer); + } + __aicore__ inline void CopyOut() { + AscendC::LocalTensor yLocal = outQueue.DeQue(); + AscendC::DataCopy(yGm, yLocal, out_inner); + outQueue.FreeTensor(yLocal); + } + +private: + AscendC::TPipe* pipe; + AscendC::TQue inQueue; + AscendC::TQue outQueue; + AscendC::TBuf tmpBuf; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + + uint32_t inner = 0; + uint32_t outter = 0; + uint32_t n = 0; + uint32_t tmpBufSize = 0; + uint32_t out_inner = 0; + AscendC::SumParams params; +}; +} + +#endif \ No newline at end of file diff --git a/examples/reduce/sum/run.sh b/examples/reduce/sum/run.sh new file mode 
100644 index 0000000000000000000000000000000000000000..796c0fd787e97537762719509f4fcff26fd809c4 --- /dev/null +++ b/examples/reduce/sum/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./sum_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./sum_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./sum_direct_kernel_op +fi \ No newline at end of file diff --git a/examples/reduce/sum/scripts/gen_data.py b/examples/reduce/sum/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..9538198ba677e42cf8ae53dd835dc6634f97403b --- /dev/null +++ b/examples/reduce/sum/scripts/gen_data.py @@ -0,0 +1,30 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [7, 2024] + input_x = np.random.uniform(-100, 100, input_shape).astype(dtype) + + golden = np.sum(input_x[:, :-1], axis=1).astype(dtype) + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/examples/transpose/confusion_transpose/CMakeLists.txt b/examples/transpose/confusion_transpose/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..ef7fd4a9d807536ce598ce80ee962018ad37504e --- /dev/null +++ b/examples/transpose/confusion_transpose/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/confusion_transpose_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(confusion_transpose_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/confusion_transpose_custom_tiling.cpp +) + +target_compile_options(confusion_transpose_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(confusion_transpose_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(confusion_transpose_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(confusion_transpose_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS confusion_transpose_direct_kernel_op + 
LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/README.md b/examples/transpose/confusion_transpose/README.md new file mode 100644 index 0000000000000000000000000000000000000000..f633d6d6be7c69e3d51a326522b17c3661e208f7 --- /dev/null +++ b/examples/transpose/confusion_transpose/README.md @@ -0,0 +1,96 @@ + + +## 概述 + +本样例介绍了调用ConfusionTranspose高阶API实现confusionTranspose算子,并按照核函数直调的方式分别给出了对应的端到端实现,关于ConfusionTranspose高阶API的具体内容请参考《[Ascend C算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi)》中的 "高阶 API > 变形 > ConfusionTranspose" 章节。 + +本样例以直调的方式调用算子核函数。 + +直调:核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧和仿真三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品 AI Core + +## 目录结构 + +| 目录 | 描述 +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 + + ConfusionTransposeCustom算子,对输入数据进行数据排布及Reshape操作。 + +- 算子规格: + + + + + + + + + + + +
Operator type (OpType): ConfusionTransposeCustom

| | name | shape | data type | format |
| --- | --- | --- | --- | --- |
| Operator input | x | 1\*2\*2\*4\*16\*16 | half | NZ |
| Operator output | y | 1\*64\*2\*32 | half | ND |

Kernel function name: confusion_transpose_custom
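
For reference, the data rearrangement this operator performs can be reproduced in NumPy. This is a minimal sketch, consistent with scripts/gen_data.py and the implementation notes that follow, shown only for illustration (the variable names mirror the constants in main.cpp and are not part of the sample):

```
import numpy as np

B, N, S, HN_DIV = 1, 2, 64, 32          # shapes used by this sample
origin = np.random.uniform(-1000, 1000, (B, N, S, HN_DIV)).astype(np.float16)

# NZ layout fed to the kernel: split S and H/N into 16x16 fractal blocks,
# giving [B, N, H/N/16, S/16, 16, 16] (matches scripts/gen_data.py).
x_nz = origin.reshape(B, N, S // 16, 16, HN_DIV // 16, 16).transpose(0, 1, 4, 2, 3, 5)

# Expected ND output: axes 1 and 2 of the origin tensor are swapped -> [B, S, N, H/N].
golden = origin.transpose(0, 2, 1, 3)
```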
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[1, 2, 2, 4, 16, 16],输出y[1, 64, 2, 32]的ConfusionTransposeCustom算子,针对NZ2ND场景,实现1、2轴互换。 + +输入Tensor { shape:[B, N, H/N/16, S/16, 16, 16], origin_shape:[B, N, S, H/N], format:"NZ", origin_format:"ND"} + +输出Tensor { shape:[B, S, N, H/N], origin_shape:[B, S, N, H/N], format:"ND", origin_format:"ND"} + +- Kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用ConfusionTranspose高阶API接口完成confusionTranspose计算,得到最终结果,再搬出到外部存储上。 + + ConfusionTransposeCustom算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行confusionTranspose计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- Tiling实现 + + 使用Ascend C提供的GetConfusionTransposeTilingInfo接口,获取所需的Tiling参数,并调用GetConfusionTransposeMaxMinTmpSize接口获取ConfusionTranspose接口计算所需的临时空间大小。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品 AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/cmake/cpu_lib.cmake b/examples/transpose/confusion_transpose/cmake/cpu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..693f15ac115d655aacd3218bc5b14060c0a3de2f --- /dev/null +++ b/examples/transpose/confusion_transpose/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/cmake/npu_lib.cmake b/examples/transpose/confusion_transpose/cmake/npu_lib.cmake new file mode 100644 index 0000000000000000000000000000000000000000..8ad136f38b80bea109925ab797fbde0871874964 --- /dev/null +++ b/examples/transpose/confusion_transpose/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether 
the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/main.cpp b/examples/transpose/confusion_transpose/main.cpp new file mode 100644 index 0000000000000000000000000000000000000000..8bd7759de4eca059b07aa5944a13ff67b6ffbfee --- /dev/null +++ b/examples/transpose/confusion_transpose/main.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_confusion_transpose_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void confusion_transpose_custom(GM_ADDR x, GM_ADDR y, GM_ADDR tiling); +#endif + +namespace { +constexpr uint32_t USED_CORE_NUM = 1; +constexpr uint32_t TILINGDATA_SIZE = 22; +constexpr uint32_t B = 1; +constexpr uint32_t N = 2; +constexpr uint32_t S = 64; +constexpr uint32_t HNDIV = 32; +} + +extern uint8_t *GenerateTiling(uint32_t b, uint32_t n, uint32_t s, uint32_t hnDiv); + +int32_t main(int32_t argc, char *argv[]) { + size_t inputSize = B * S * N * HNDIV * sizeof(uint16_t); + size_t outputSize = B * S * N * HNDIV * sizeof(uint16_t); + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + + uint8_t *buf = GenerateTiling(B, N, S, HNDIV); + + memcpy_s(tiling, tilingSize, buf, tilingSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); + ICPU_RUN_KF(confusion_transpose_custom, USED_CORE_NUM, x, y, tiling); + + WriteFile("../output/output.bin", y, outputSize); + + free(buf); + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost; + uint8_t *xDevice, *yDevice, *tilingDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + + uint8_t *buf = GenerateTiling(B, N, S, HNDIV); + + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingSize, buf, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + 
// Execute the kernel + ACLRT_LAUNCH_KERNEL(confusion_transpose_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, tilingDevice); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Clean up memory + free(buf); + + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + return 0; +} diff --git a/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cfb1244e4123eb2f522c6142dd453cb270169ff8 --- /dev/null +++ b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp @@ -0,0 +1,24 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "confusion_transpose_custom_tiling.h" + +uint8_t *GetTilingBuf(optiling::ConfusionTransposeCustomTilingData *tilingData) { + uint32_t tilingSize = sizeof(optiling::ConfusionTransposeCustomTilingData); + uint8_t *buf = (uint8_t *)malloc(tilingSize); + tilingData->SaveToBuffer(buf, tilingSize); + return buf; +} + +uint8_t *GenerateTiling(uint32_t b, uint32_t n, uint32_t s, uint32_t hnDiv) { + optiling::ConfusionTransposeCustomTilingData tiling; + ComputeTiling(b, n, s, hnDiv, tiling); + return GetTilingBuf(&tiling); +} \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..b5098bf30ba2e001945de42436093ff2e9c13b2f --- /dev/null +++ b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_TILING_H +#define EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/platform/platform_ascendc.h" +#include "tiling/tiling_api.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(ConfusionTransposeCustomTilingData) + TILING_DATA_FIELD_DEF(uint32_t, b); + TILING_DATA_FIELD_DEF(uint32_t, n); + TILING_DATA_FIELD_DEF(uint32_t, s); + TILING_DATA_FIELD_DEF(uint32_t, hnDiv); + TILING_DATA_FIELD_DEF_STRUCT(ConfusionTransposeTiling, confusionTransposeTilingData); +END_TILING_DATA_DEF; +REGISTER_TILING_DATA_CLASS(ConfusionTransposeCustom, ConfusionTransposeCustomTilingData) +} // namespace optiling + +void ComputeTiling(uint32_t b, uint32_t n, uint32_t s, uint32_t hnDiv, + optiling::ConfusionTransposeCustomTilingData &tiling) { + tiling.set_b(b); + tiling.set_n(n); + tiling.set_s(s); + tiling.set_hnDiv(hnDiv); + + std::vector shapeVec = {b, n, s, hnDiv}; + ge::Shape srcShape(shapeVec); + + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetConfusionTransposeMaxMinTmpSize(srcShape, sizeof(uint16_t), 1, maxValue, minValue); + const uint32_t stackBufferSize = minValue; + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, sizeof(uint16_t), 1, tiling.confusionTransposeTilingData); +} + +#endif // EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_TILING_H diff --git a/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fd6ff38a3a54a5058b24a3d4a8a7671e638bd5e6 --- /dev/null +++ b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp @@ -0,0 +1,33 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ +#include "confusion_transpose_custom_impl.h" +#include "kernel_operator.h" + +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling *tiling, GM_ADDR tilingGM) { + uint32_t *ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); + + for (uint32_t i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +extern "C" __global__ __aicore__ void confusion_transpose_custom(GM_ADDR x, GM_ADDR y, GM_ADDR tiling) { + if ASCEND_IS_AIC { + return; + } + AscendC::TPipe pipe; + MyCustomKernel::KernelConfusionTranspose op; + MyCustomKernel::VecTiling tilingData; + CopyTiling(&tilingData, tiling); + op.Init(x, y, tilingData, &pipe); + op.Process(); +} \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h new file mode 100644 index 0000000000000000000000000000000000000000..ae2ea6ac167c69077365b5676c06c4edc41cb55b --- /dev/null +++ b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_IMPL_H +#define EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_IMPL_H + +#include "kernel_operator.h" + +namespace MyCustomKernel { +struct VecTiling { + uint32_t b; + uint32_t n; + uint32_t s; + uint32_t hnDiv; + ConfusionTransposeTiling confusionTransposeTilingData; +}; + +template +class KernelConfusionTranspose { +public: + __aicore__ inline KernelConfusionTranspose() {} + __aicore__ inline void Init(__gm__ uint8_t *srcGm, __gm__ uint8_t *dstGm, const VecTiling &tilingData, AscendC::TPipe *pipeIn) { + ASCENDC_ASSERT(AscendC::GetBlockNum() != 0, { KERNEL_LOG(KERNEL_ERROR, "block dim can not be zero!"); }); + this->b = tilingData.b; + this->n = tilingData.n; + this->s = tilingData.s; + this->hnDiv = tilingData.hnDiv; + + srcGlobal.SetGlobalBuffer((__gm__ T *)srcGm, b * n * s * hnDiv); + dstGlobal.SetGlobalBuffer((__gm__ T *)dstGm, b * n * s * hnDiv); + + pipe = pipeIn; + pipe->InitBuffer(inQueueSrcVecIn, 1, b * n * s * hnDiv * sizeof(T)); + pipe->InitBuffer(inQueueSrcVecOut, 1, b * n * s * hnDiv * sizeof(T)); + this->tiling = tilingData.confusionTransposeTilingData; + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() { + AscendC::LocalTensor srcLocal = inQueueSrcVecIn.AllocTensor(); + AscendC::DataCopy(srcLocal, srcGlobal, b * n * s * hnDiv); + inQueueSrcVecIn.EnQue(srcLocal); + } + __aicore__ inline void Compute() { + AscendC::LocalTensor srcLocal = inQueueSrcVecIn.DeQue(); + AscendC::LocalTensor dstLocal = inQueueSrcVecOut.AllocTensor(); + AscendC::ConfusionTranspose(dstLocal, srcLocal, AscendC::TransposeType::TRANSPOSE_NZ2ND_0213, this->tiling); + inQueueSrcVecOut.EnQue(dstLocal); + inQueueSrcVecIn.FreeTensor(srcLocal); + } + __aicore__ inline void CopyOut() { + AscendC::LocalTensor dstLocal = inQueueSrcVecOut.DeQue(); + AscendC::DataCopy(dstGlobal, dstLocal, b * n * s * hnDiv); + inQueueSrcVecOut.FreeTensor(dstLocal); + } + +private: + AscendC::TPipe *pipe; + AscendC::TQue inQueueSrcVecIn; + AscendC::TQue inQueueSrcVecOut; + AscendC::GlobalTensor srcGlobal; + AscendC::GlobalTensor dstGlobal; + uint32_t b = 0; + uint32_t n = 0; + uint32_t s = 0; + uint32_t hnDiv = 0; + ConfusionTransposeTiling tiling; +}; +} // namespace MyCustomKernel +#endif \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/run.sh b/examples/transpose/confusion_transpose/run.sh new file mode 100644 index 0000000000000000000000000000000000000000..cddce28f098bdb4770b57c95990f35f40e6d1fe0 --- /dev/null +++ b/examples/transpose/confusion_transpose/run.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./confusion_transpose_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./confusion_transpose_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./confusion_transpose_direct_kernel_op +fi + +cd .. +python3 scripts/verify_data.py output/output.bin output/golden.bin diff --git a/examples/transpose/confusion_transpose/scripts/gen_data.py b/examples/transpose/confusion_transpose/scripts/gen_data.py new file mode 100644 index 0000000000000000000000000000000000000000..7afe9444f5ec4ee510fdcf4edccd41bd777b7771 --- /dev/null +++ b/examples/transpose/confusion_transpose/scripts/gen_data.py @@ -0,0 +1,38 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float16 + + b, n, s, hn_div = 1, 2, 64, 32 + + origin_shape = (b, n, s, hn_div) + + origin_data = np.random.uniform(-1000.0, 1000.0, origin_shape).astype(dtype) + + reshape_shape = (b, n, s // 16, 16, hn_div // 16, 16) + reshaped_data = origin_data.reshape(reshape_shape) + + transposed_data = np.transpose(reshaped_data, (0, 1, 4, 2, 3, 5)) + + golden = np.transpose(origin_data, (0, 2, 1, 3)) + + os.system("mkdir -p ./input") + transposed_data.tofile("./input/input_x.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/scripts/verify_data.py b/examples/transpose/confusion_transpose/scripts/verify_data.py new file mode 100644 index 0000000000000000000000000000000000000000..19122a78e348feed3cbd32480ba43320b12f6cb5 --- /dev/null +++ b/examples/transpose/confusion_transpose/scripts/verify_data.py @@ -0,0 +1,56 @@ +#!/usr/bin/python3 +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import sys +import numpy as np + +# for float16 +RELATIVE_TOL = 1e-3 +ABSOLUTE_TOL = 1e-5 +ERROR_TOL = 1e-3 + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float16).reshape(-1) + golden = np.fromfile(golden, dtype=np.float16).reshape(-1) + + different_element_results = np.isclose(output, + golden, + rtol=RELATIVE_TOL, + atol=ABSOLUTE_TOL, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, ERROR_TOL)) + return error_ratio <= ERROR_TOL + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) \ No newline at end of file diff --git a/impl/activation/geglu/geglu_tiling.cpp b/impl/activation/geglu/geglu_tiling.cpp index 91204a8e62112372c853f7e72366e7af3abe6691..3afc5da060e39aef9ffa50bc1ec7bcd970154e9c 100644 --- a/impl/activation/geglu/geglu_tiling.cpp +++ b/impl/activation/geglu/geglu_tiling.cpp @@ -57,10 +57,10 @@ void GetGeGLUMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c maxValue = GetGeGLUMaxTmpSize(inputSize, typeSize); } -void GetGeGLUTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetGeGLUTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { HighLevelApiCheck::TypeSizeVerifyingParameters(typeSize, SUPPORT_TYPESIZE); extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? GEGLU_FLOAT_CALC_FAC : GEGLU_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
GEGLU_FLOAT_CALC_FAC : GEGLU_HALF_CALC_FAC; } } // namespace AscendC diff --git a/impl/activation/softmax/softmax_tiling.cpp b/impl/activation/softmax/softmax_tiling.cpp index 401bf3bf1aa1a1c60ab4bb74c8d7bd1f174cbbee..08f5ab196228f2322ab61799ce5859a528211932 100644 --- a/impl/activation/softmax/softmax_tiling.cpp +++ b/impl/activation/softmax/softmax_tiling.cpp @@ -617,7 +617,7 @@ void SoftMaxFlashV2TilingFunc(const ge::Shape& srcShape, const uint32_t dataType if (isFlashOutputBrc && (softmaxFlashTiling.get_rangeM() > 1 || softmaxFlashTiling.get_tailM() != 0)) { ASCENDC_HOST_ASSERT((softmaxFlashTiling.get_reduceM() % (SOFTMAX_DEFAULT_BLK_SIZE / dataTypeSize1) == 0), return, - "When dataTypeSize1(%d) is float(or half), softmaxFlashTiling.reduceM(%d) must be a multiple of 8(or 16), " + "When dataTypeSize1(%u) is float(or half), softmaxFlashTiling.reduceM(%u) must be a multiple of 8(or 16), " "Adjust the input parameter -> localWorkSpaceSize.\n", dataTypeSize1, softmaxFlashTiling.get_reduceM()); } } diff --git a/impl/activation/swish/swish_common_impl.h b/impl/activation/swish/swish_common_impl.h index 047005d67202e379e2c29e7ac4074d8600c40474..b682cb6d6bc4c262c5390bdda3bd5442e65b4d1a 100644 --- a/impl/activation/swish/swish_common_impl.h +++ b/impl/activation/swish/swish_common_impl.h @@ -52,14 +52,6 @@ __aicore__ inline __inout_pipe__(V) void SwishCompute( if ASCEND_IS_AIC { return; } -#if ASCENDC_CPU_DEBUG - bool ret = (dataSize <= srcLocal.GetSize()) && (dataSize <= dstLocal.GetSize()) && (dataSize > 0); - ASCENDC_ASSERT( - ret, { KERNEL_LOG(KERNEL_ERROR, "DataSize must bigger than 0 and smaller than or equal to src&dst tensor."); }); - - ret = (std::is_same::value) || (std::is_same::value); - ASCENDC_ASSERT(ret, { KERNEL_LOG(KERNEL_ERROR, "DataType must be half or float."); }); -#endif T scalar = static_cast(static_cast(-1) * static_cast(scalarValue)); #if __CCE_AICORE__ >= 200 SetMaskCount(); diff --git a/impl/math/acosh/acosh_tiling.cpp b/impl/math/acosh/acosh_tiling.cpp index 200618a8e49f796a38ed03e5c9a72cbc11746b1e..08ed5ef07f1a2354ad8982df7f74073dab434a5b 100644 --- a/impl/math/acosh/acosh_tiling.cpp +++ b/impl/math/acosh/acosh_tiling.cpp @@ -48,9 +48,9 @@ void GetAcoshMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c maxValue = GetAcoshMaxTmpSize(inputSize, typeSize); } -void GetAcoshTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetAcoshTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? ACOSH_FLOAT_CALC_PROC : ACOSH_HALF_CALC_PROC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? ACOSH_FLOAT_CALC_PROC : ACOSH_HALF_CALC_PROC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/asinh/asinh_tiling.cpp b/impl/math/asinh/asinh_tiling.cpp index e0a917feda6949135c1024f9db656c12b10326ce..3d87ed0fc14edb7b212350f9a9d9d13a0997f67f 100644 --- a/impl/math/asinh/asinh_tiling.cpp +++ b/impl/math/asinh/asinh_tiling.cpp @@ -48,9 +48,9 @@ void GetAsinhMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c maxValue = GetAsinhMaxTmpSize(inputSize, typeSize); } -void GetAsinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetAsinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? 
ASINH_FLOAT_CALC_PROC : ASINH_HALF_CALC_PROC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? ASINH_FLOAT_CALC_PROC : ASINH_HALF_CALC_PROC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/atan/atan_common_impl.h b/impl/math/atan/atan_common_impl.h index c727294d162272b3cd8119a61c9fec5ecd7d0729..80e2d4304a028665427997edaaf84249d0e29312 100644 --- a/impl/math/atan/atan_common_impl.h +++ b/impl/math/atan/atan_common_impl.h @@ -270,7 +270,7 @@ __aicore__ inline void AtanImpl(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - AtanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + AtanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC diff --git a/impl/math/atan/atan_tiling_impl.cpp b/impl/math/atan/atan_tiling_impl.cpp index b58abf939f5b4b164554bb1b6f22669ea98d5247..6ffcc20633e2a140712f8a10f648473a67703724 100644 --- a/impl/math/atan/atan_tiling_impl.cpp +++ b/impl/math/atan/atan_tiling_impl.cpp @@ -47,9 +47,9 @@ void GetAtanMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co maxValue = GetAtanMaxTmpSize(inputSize, typeSize); } -void GetAtanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetAtanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? ATAN_FLOAT_CALC_FAC : ATAN_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? ATAN_FLOAT_CALC_FAC : ATAN_HALF_CALC_FAC; } } // AscendC \ No newline at end of file diff --git a/impl/math/atanh/atanh_tiling.cpp b/impl/math/atanh/atanh_tiling.cpp index 493ebe9d53541fe8117f99fb7fe88552c20a0826..2b437a733f9874172219031e850956b05a0e6dd4 100644 --- a/impl/math/atanh/atanh_tiling.cpp +++ b/impl/math/atanh/atanh_tiling.cpp @@ -44,9 +44,9 @@ void GetAtanhMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c maxValue = GetAtanhMaxTmpSize(inputSize, typeSize); } -void GetAtanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetAtanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? ATANH_FLOAT_CALC_PROC : ATANH_HALF_CALC_PROC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? ATANH_FLOAT_CALC_PROC : ATANH_HALF_CALC_PROC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/axpy/axpy_tiling_impl.cpp b/impl/math/axpy/axpy_tiling_impl.cpp index 9d22afcd00e6575dcf71c050672028295dd0ef77..6df81faddc24783cbf6f5e2f2f314605c61dd1b7 100644 --- a/impl/math/axpy/axpy_tiling_impl.cpp +++ b/impl/math/axpy/axpy_tiling_impl.cpp @@ -53,9 +53,9 @@ void GetAxpyMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, co maxValue = GetAxpyMaxTmpSize(inputSize, typeSize); } -void GetAxpyTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCnt, uint32_t& extraBuf) +void GetAxpyTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCount, uint32_t& extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? AXPY_FLOAT_CALC_PROC : AXPY_HALF_CALC_PROC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
AXPY_FLOAT_CALC_PROC : AXPY_HALF_CALC_PROC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/ceil/ceil_tiling_impl.cpp b/impl/math/ceil/ceil_tiling_impl.cpp index 8d76e57c6a86e37a6d7fb4b94dcd55fd801e6c64..d5ec08eb75e9210a68f239221075c089243f3d1f 100644 --- a/impl/math/ceil/ceil_tiling_impl.cpp +++ b/impl/math/ceil/ceil_tiling_impl.cpp @@ -45,9 +45,9 @@ void GetCeilMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co maxValue = GetCeilMaxTmpSize(inputSize, typeSize); } -void GetCeilTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetCeilTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? CEIL_FLOAT_CALC_FAC : CEIL_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? CEIL_FLOAT_CALC_FAC : CEIL_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/clamp/clamp_tiling_impl.cpp b/impl/math/clamp/clamp_tiling_impl.cpp index 0df62079a1bfee80a9fdc83b7e5a8e652d88291b..1ce8255eec81c0b853b38e3d09423ba7e2036662 100644 --- a/impl/math/clamp/clamp_tiling_impl.cpp +++ b/impl/math/clamp/clamp_tiling_impl.cpp @@ -37,10 +37,10 @@ void GetClampMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c } } -void GetClampTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetClampTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { (void)typeSize; extraBuf = 0; - maxLiveNodeCnt = CLAMP_CALC_FAC; + maxLiveNodeCount = CLAMP_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/cos/cos_tiling_impl.cpp b/impl/math/cos/cos_tiling_impl.cpp index c4cf8bcecb2e4fc793d831004d6b5abfb0503d2b..1f7d3ae5debe38277fb8e7150d9e4e7ce28e4cb6 100644 --- a/impl/math/cos/cos_tiling_impl.cpp +++ b/impl/math/cos/cos_tiling_impl.cpp @@ -62,9 +62,9 @@ void GetCosMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, con maxValue = GetCosMaxTmpSize(inputSize, typeSize, isReuseSource); } -void GetCosTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetCosTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? COS_FLOAT_NOREUSE_CALC_FAC : COS_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
COS_FLOAT_NOREUSE_CALC_FAC : COS_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/cosh/cosh_common_impl.h b/impl/math/cosh/cosh_common_impl.h index 70dad85a38e06b0063168e36813ce1a9ec9d2ba1..b45df24f32bfd9b2e4112504507cf40450b9fc59 100644 --- a/impl/math/cosh/cosh_common_impl.h +++ b/impl/math/cosh/cosh_common_impl.h @@ -157,7 +157,7 @@ __aicore__ inline void CoshImpl(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - CoshImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + CoshImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC diff --git a/impl/math/erfc/erfc_common_impl.h b/impl/math/erfc/erfc_common_impl.h index ce63b39f9da6f366c2a018dd465ca5f6eb84da5c..a5706c2ffd2a042fe6274f930fce5dd1dd6b1f6b 100644 --- a/impl/math/erfc/erfc_common_impl.h +++ b/impl/math/erfc/erfc_common_impl.h @@ -346,7 +346,7 @@ __aicore__ inline void ErfcImpl(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - ErfcImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + ErfcImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC #endif // IMPL_MATH_ERFC_ERFC_COMMON_IMPL_H diff --git a/impl/math/erfc/erfc_tiling_impl.cpp b/impl/math/erfc/erfc_tiling_impl.cpp index 00d5ecc284167b6ea7e61a089a4834629500b147..44bfeea0198bf62f5d3428ae350ca92bfbae7007 100644 --- a/impl/math/erfc/erfc_tiling_impl.cpp +++ b/impl/math/erfc/erfc_tiling_impl.cpp @@ -48,9 +48,9 @@ void GetErfcMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co maxValue = GetErfcMaxTmpSize(inputSize, typeSize); } -void GetErfcTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetErfcTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? ERFC_FLOAT_CALC_FAC : ERFC_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? ERFC_FLOAT_CALC_FAC : ERFC_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/floor/floor_tiling_impl.cpp b/impl/math/floor/floor_tiling_impl.cpp index b959cb64dd4c76ff7e072530abc2a5d579032f45..ac50fc2c4208e96a016189fd4cfa19f4cdb0ff2f 100644 --- a/impl/math/floor/floor_tiling_impl.cpp +++ b/impl/math/floor/floor_tiling_impl.cpp @@ -44,9 +44,9 @@ void GetFloorMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c maxValue = GetFloorMaxTmpSize(inputSize, typeSize); } -void GetFloorTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetFloorTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? 0 : FLOOR_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
0 : FLOOR_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/fmod/fmod_common_impl.h b/impl/math/fmod/fmod_common_impl.h index 264afdebbff4c93ec9bb5a49ceac2bee4cbb8fcf..1cec4fa9c184afa183b6d660302289344a7015cc 100644 --- a/impl/math/fmod/fmod_common_impl.h +++ b/impl/math/fmod/fmod_common_impl.h @@ -94,10 +94,10 @@ __aicore__ inline void FmodImpl(const LocalTensor &dstTensor, const LocalTens return; } - constexpr uint32_t maxLiveNodeCnt = 8; // The corresponding maxLiveNodeCnt for half is 8, extra is 3 * 2 + trunc 2. + constexpr uint32_t maxLiveNodeCount = 8; // The corresponding maxLiveNodeCount for half is 8, extra is 3 * 2 + trunc 2. uint32_t bufferSize = sharedTmpBuffer.GetSize(); uint32_t stackSize = - bufferSize / sizeof(T) / maxLiveNodeCnt / ONE_BLK_SIZE * ONE_BLK_SIZE; // divided by how many counts + bufferSize / sizeof(T) / maxLiveNodeCount / ONE_BLK_SIZE * ONE_BLK_SIZE; // divided by how many counts CheckTmpBufferSize(stackSize, 0, bufferSize); ASCENDC_ASSERT((src0Tensor.GetSize() > 0), { KERNEL_LOG(KERNEL_ERROR, "src0Tensor size must > 0!"); }); stackSize = stackSize > src0Tensor.GetSize() ? src0Tensor.GetSize() : stackSize; // No more than localTensor diff --git a/impl/math/fmod/fmod_tiling_impl.cpp b/impl/math/fmod/fmod_tiling_impl.cpp index 1f68f81dc15c94d79799f2a3cc7f23f1fb57e1d8..103ba893cc66cf1336bf1960a2d489964f405289 100644 --- a/impl/math/fmod/fmod_tiling_impl.cpp +++ b/impl/math/fmod/fmod_tiling_impl.cpp @@ -50,10 +50,10 @@ void GetFmodMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co minValue = GetFmodMinTmpSize(typeSize, truncCalPro); } -void GetFmodTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetFmodTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - GetTruncTmpBufferFactorSize(typeSize, maxLiveNodeCnt, extraBuf); - maxLiveNodeCnt += (typeSize == sizeof(float)) ? FMOD_FLOAT_TENSOR_NUM : FMOD_HALF_TENSOR_NUM; + GetTruncTmpBufferFactorSize(typeSize, maxLiveNodeCount, extraBuf); + maxLiveNodeCount += (typeSize == sizeof(float)) ? FMOD_FLOAT_TENSOR_NUM : FMOD_HALF_TENSOR_NUM; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/frac/frac_tiling_impl.cpp b/impl/math/frac/frac_tiling_impl.cpp index 5fa01948c7c2d4cbdcd832ddd2b95467150b7ee8..caafa6e5e08205ac226867f9f2bec6c5659419bd 100644 --- a/impl/math/frac/frac_tiling_impl.cpp +++ b/impl/math/frac/frac_tiling_impl.cpp @@ -49,9 +49,9 @@ void GetFracMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co maxValue = GetFracMaxTmpSize(inputSize, typeSize); } -void GetFracTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetFracTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? FRAC_FLOAT_CALC_FAC : FRAC_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
FRAC_FLOAT_CALC_FAC : FRAC_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/lgamma/lgamma_common_basic_impl.h b/impl/math/lgamma/lgamma_common_basic_impl.h index 57b3a7c44fb40751d24951cbd9cefad0d1197c2c..87f5b7553d2f045785f11d0ffc1f0d6a3025af30 100644 --- a/impl/math/lgamma/lgamma_common_basic_impl.h +++ b/impl/math/lgamma/lgamma_common_basic_impl.h @@ -54,7 +54,7 @@ __aicore__ inline void LGammaCalcMulAdd(const LocalTensor &tmp, const Loc } // cal result of 0.7 <= x < 1.5 on tmp1, Ln return inf when x is 0 -__aicore__ inline void LGamma007(const LocalTensor &src, const LGammaFParams ¶ms) +__aicore__ inline void LGamma007(const LocalTensor &src, const LGammaParams ¶ms) { // tmp1 = MulAdd(x) LGammaCalcMulAdd(params.tmp1, src, params.unaryParams, params.binaryParams, params007, params007Len); @@ -71,7 +71,7 @@ __aicore__ inline void LGamma007(const LocalTensor &src, const LGammaFPar } // cal result of 0.7 <= x < 1.5 on tmp2 -__aicore__ inline void LGamma0715(const LocalTensor &src, const LGammaFParams ¶ms) +__aicore__ inline void LGamma0715(const LocalTensor &src, const LGammaParams ¶ms) { // tmp1 = 1.0 - x Muls(params.tmp1, src, fn1, MASK_PLACEHOLDER, 1, params.unaryParams); @@ -84,7 +84,7 @@ __aicore__ inline void LGamma0715(const LocalTensor &src, const LGammaFPa } // cal result of 1.5 <= x < 3 on tmp2 -__aicore__ inline void LGamma153(const LocalTensor &src, const LGammaFParams ¶ms) +__aicore__ inline void LGamma153(const LocalTensor &src, const LGammaParams ¶ms) { // tmp1 = x - 2.0 Adds(params.tmp1, src, fn2, MASK_PLACEHOLDER, 1, params.unaryParams); @@ -95,7 +95,7 @@ __aicore__ inline void LGamma153(const LocalTensor &src, const LGammaFPar } // cal result of 3 <= x < 5.8 on tmp3 -__aicore__ inline void LGamma358(const LocalTensor &src, const LGammaFParams ¶ms) +__aicore__ inline void LGamma358(const LocalTensor &src, const LGammaParams ¶ms) { // tmp1 = x - 3.0 Adds(params.tmp1, src, fn3, MASK_PLACEHOLDER, 1, params.unaryParams); @@ -123,7 +123,7 @@ __aicore__ inline void LGamma358(const LocalTensor &src, const LGammaFPar } // cal result of x >= 5.8 on tmp1 -__aicore__ inline void LGamma58(const LocalTensor &src, const LGammaFParams ¶ms) +__aicore__ inline void LGamma58(const LocalTensor &src, const LGammaParams ¶ms) { // tmp1 = ln(x) * 0.5 Ln(params.tmp1, src, MASK_PLACEHOLDER, 1, params.unaryParams); @@ -167,7 +167,7 @@ __aicore__ inline void LGamma58(const LocalTensor &src, const LGammaFPara // gen mask, src < scalar set 1, other set 0, used tmp1 __aicore__ inline void LGammaGenLTMask( - const LocalTensor &mask, const LocalTensor &src, const LGammaFParams ¶ms, const float scalar) + const LocalTensor &mask, const LocalTensor &src, const LGammaParams ¶ms, const float scalar) { Duplicate(params.tmp1, scalar, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); PipeBarrier(); @@ -179,7 +179,7 @@ __aicore__ inline void LGammaGenLTMask( // gen mask, src >= scalar set 1, other set 0, used tmp1 __aicore__ inline void LGammaGenGEMask( - const LocalTensor &mask, const LocalTensor &src, const LGammaFParams ¶ms, const float scalar) + const LocalTensor &mask, const LocalTensor &src, const LGammaParams ¶ms, const float scalar) { Duplicate(params.tmp1, scalar, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); PipeBarrier(); @@ -191,7 +191,7 @@ __aicore__ inline void LGammaGenGEMask( // gen mask on params.mask, min > src >= max set 1, other set 0, used tmp1 __aicore__ inline void LGammaGenRangeMask( - const LocalTensor &src, const 
LGammaFParams ¶ms, const float min, const float max) + const LocalTensor &src, const LGammaParams ¶ms, const float min, const float max) { LGammaGenLTMask(params.mask, src, params, max); LGammaGenGEMask(params.tmpMask1, src, params, min); @@ -209,7 +209,7 @@ __aicore__ inline void LGammaGenRangeMask( // Select the value of src at mask 1, and accumulate the result onto dst, used tmp1 __aicore__ inline void LGammaSelect(const LocalTensor &dst, const LocalTensor &src, - const LocalTensor &mask, const LGammaFParams ¶ms) + const LocalTensor &mask, const LGammaParams ¶ms) { SetCmpMask(params.tmpScalar); PipeBarrier(); @@ -220,7 +220,7 @@ __aicore__ inline void LGammaSelect(const LocalTensor &dst, const LocalTe } // tmp6 is |x|, res on tmp5 -__aicore__ inline void LGammaPositive(const LGammaFParams ¶ms) +__aicore__ inline void LGammaPositive(const LGammaParams ¶ms) { Duplicate(params.tmp5, 0.0f, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); PipeBarrier(); @@ -258,7 +258,7 @@ __aicore__ inline void LGammaPositive(const LGammaFParams ¶ms) } // cal tmp val on tmp2, mask for odd, tmpMask1 for even, tmp6 is |x| -__aicore__ inline void LGammaCalNegTmp1(const LGammaFParams ¶ms) +__aicore__ inline void LGammaCalNegTmp1(const LGammaParams ¶ms) { // tmp2 = floor(tmp6 + tmp6 + 0.5) Add(params.tmp2, params.tmp6, params.tmp6, MASK_PLACEHOLDER, 1, params.binaryParams); @@ -298,7 +298,7 @@ __aicore__ inline void LGammaCalNegTmp1(const LGammaFParams ¶ms) } // input is tmp2, mask for odd, tmpMask1 for even, result on tmp2 -__aicore__ inline void LGammaCalNegTmp2(const LGammaFParams ¶ms) +__aicore__ inline void LGammaCalNegTmp2(const LGammaParams ¶ms) { // tmp3 = tmp2 * tmp2 Mul(params.tmp3, params.tmp2, params.tmp2, MASK_PLACEHOLDER, 1, params.binaryParams); @@ -325,7 +325,7 @@ __aicore__ inline void LGammaCalNegTmp2(const LGammaFParams ¶ms) } // Get final result, tmp3 save for inf -__aicore__ inline void LGammaCalNegTmp3(const LGammaFParams ¶ms) +__aicore__ inline void LGammaCalNegTmp3(const LGammaParams ¶ms) { // tmp2 = 1.1447298526763916015625 - ln(|tmp2| * tmp6) - tmp5 Abs(params.tmp1, params.tmp2, MASK_PLACEHOLDER, 1, params.unaryParams); @@ -347,7 +347,7 @@ __aicore__ inline void LGammaCalNegTmp3(const LGammaFParams ¶ms) } // Near zero negative, input as |x|, result on tmp1 -__aicore__ inline void LGammaCalMinNeg(const LocalTensor &src, const LGammaFParams ¶ms) +__aicore__ inline void LGammaCalMinNeg(const LocalTensor &src, const LGammaParams ¶ms) { // tmp1 = -ln(src) Ln(params.tmp1, src, MASK_PLACEHOLDER, 1, params.unaryParams); @@ -357,7 +357,7 @@ __aicore__ inline void LGammaCalMinNeg(const LocalTensor &src, const LGam } // cal for x < 0, result on tmp4, tmp6 is |x|, tmp5 is pos res -__aicore__ inline void LGammaNegative(const LGammaFParams ¶ms) +__aicore__ inline void LGammaNegative(const LGammaParams ¶ms) { Duplicate(params.tmp4, 0.0f, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); PipeBarrier(); @@ -431,7 +431,7 @@ __aicore__ inline void LGammaNegative(const LGammaFParams ¶ms) template __aicore__ inline void LGammaInitFParams( - const LocalTensor &tmp, const uint32_t splitSize, const LocalTensor &src, LGammaFParams ¶ms) + const LocalTensor &tmp, const uint32_t splitSize, const LocalTensor &src, LGammaParams ¶ms) { params.tmp1 = tmp; params.tmp2 = tmp[splitSize]; @@ -464,6 +464,37 @@ __aicore__ inline void LGammaInitFParams( params.splitSize = splitSize; } + +template +__aicore__ inline void LGammaInitHParams( + const LocalTensor &tmp, const uint32_t splitSize, 
const LocalTensor &src, LGammaParams ¶ms) +{ + params.tmp1 = tmp; + params.tmp2 = tmp[splitSize]; + params.tmp3 = params.tmp2[splitSize]; + params.tmp4 = params.tmp3[splitSize]; + params.tmp5 = params.tmp4[splitSize]; + params.tmpScalar = params.tmp5[splitSize]; + params.mask = params.tmpScalar[splitSize].ReinterpretCast(); + params.tmpMask1 = params.mask[splitSize]; + params.tmpMask2 = params.tmpMask1[splitSize]; + params.tmpMask3 = params.tmpMask2[splitSize]; + params.tmp6 = params.tmpScalar[splitSize * i2]; + + params.tmp1.SetSize(splitSize); + params.tmp2.SetSize(splitSize); + params.tmp3.SetSize(splitSize); + params.tmp4.SetSize(splitSize); + params.tmp5.SetSize(splitSize); + params.mask.SetSize(splitSize); + params.tmpMask1.SetSize(splitSize); + params.tmpMask2.SetSize(splitSize); + params.tmpMask3.SetSize(splitSize); + params.tmpScalar.SetSize(splitSize); + params.tmp6.SetSize(splitSize * i6); + + params.splitSize = splitSize; +} } // namespace AscendC #endif #endif // IMPL_MATH_LGAMMA_LGAMMA_COMMOM_BASIC_IMPL_H \ No newline at end of file diff --git a/impl/math/lgamma/lgamma_common_impl.h b/impl/math/lgamma/lgamma_common_impl.h index 6be655e66c82e9ff7ab6ac677801a71d37bc9ffd..e53a6865e4b7df1d28a7e16f6a77936d2a33976b 100644 --- a/impl/math/lgamma/lgamma_common_impl.h +++ b/impl/math/lgamma/lgamma_common_impl.h @@ -265,102 +265,73 @@ __aicore__ inline void LGammaSelectINF(const LocalTensor &dstTensor, cons PipeBarrier(); } -__aicore__ inline void LgammaComputeImpl(const LocalTensor &dstTensor, const LocalTensor &srcTensor, - const LocalTensor &tmpTensor, const uint32_t &splitSize, bool isReuseSource) +__aicore__ inline void LgammaComputeImpl(const LocalTensor &dstTensor, const LocalTensor &srcTensor, + LGammaParams ¶ms, const uint32_t splitSize) { - (void)isReuseSource; - const UnaryRepeatParams unaryParams; - const BinaryRepeatParams binParams; - // half-->float - LocalTensor restmpBuffer = tmpTensor; - LocalTensor srctmpBuffer = restmpBuffer[splitSize]; - Duplicate(restmpBuffer, 0.0f, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); + Duplicate(params.tmp1, 0.0f, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); PipeBarrier(); - Cast(srctmpBuffer, srcTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, + Cast(params.tmp2, srcTensor, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, {1, 1, DEFAULT_REPEAT_STRIDE, HALF_DEFAULT_REPEAT_STRIDE}); PipeBarrier(); - // compute result x >= 0 - LocalTensor TensorPosRes = srctmpBuffer[splitSize]; - // compute result x < 0 - LocalTensor TensorNegRes = TensorPosRes[splitSize]; - LocalTensor tmp1Tensor = TensorNegRes[splitSize]; // all 0 tensor - LocalTensor tmpScalar = tmp1Tensor[splitSize]; - Duplicate(tmpScalar, 0.0f, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); - PipeBarrier(); - LocalTensor MaskPos = tmpScalar[splitSize].ReinterpretCast(); - LocalTensor MaskNeg = MaskPos[splitSize]; - LocalTensor tmpMask1 = MaskNeg[splitSize]; - LocalTensor tmpMask2 = tmpMask1[splitSize]; - LocalTensor stackTensor = tmpScalar[splitSize*i2]; - - restmpBuffer.SetSize(splitSize); - srctmpBuffer.SetSize(splitSize); - TensorPosRes.SetSize(splitSize); - TensorNegRes.SetSize(splitSize); - tmp1Tensor.SetSize(splitSize); - tmpScalar.SetSize(splitSize); - MaskNeg.SetSize(splitSize); - MaskPos.SetSize(splitSize); - tmpMask1.SetSize(splitSize); - tmpMask2.SetSize(splitSize); - stackTensor.SetSize(splitSize * i6); + Duplicate(params.tmpScalar, 0.0f, MASK_PLACEHOLDER, 1, DEFAULT_BLK_STRIDE, DEFAULT_REPEAT_STRIDE); + PipeBarrier(); 
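    // Overall flow of the refactored half-precision path: params.tmp2 holds the float cast
    // of the half input, and params.tmp1 (zero-initialized above) accumulates the result.
    // The x >= 0 and x < 0 branch results are computed separately and merged into tmp1 by
    // mask selection; lanes that match neither mask (NaN inputs) are passed through,
    // |x| >= 65504 (the largest finite half) is resolved through LGammaSelectINF, and the
    // float result is cast back to half at the end.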
// compute result x >= 0 - LgammaComputePosHalf(TensorPosRes, srctmpBuffer, stackTensor, splitSize); + LgammaComputePosHalf(params.tmp3, params.tmp2, params.tmp6, splitSize); PipeBarrier(); // compute mask x >= 0 - LGammaGenGEMaskHalf(MaskPos, srctmpBuffer, tmp1Tensor, 0.0f, splitSize); + LGammaGenGEMaskHalf(params.mask, params.tmp2, params.tmp5, 0.0f, splitSize); PipeBarrier(); - LGammaSelectHalf(restmpBuffer, TensorPosRes, MaskPos, tmp1Tensor, tmpScalar); + LGammaSelectHalf(params.tmp1, params.tmp3, params.mask, params.tmp5, params.tmpScalar); PipeBarrier(); // compute result x < 0 - LgammaComputeNegHalf(TensorNegRes, srctmpBuffer, stackTensor, splitSize); + LgammaComputeNegHalf(params.tmp4, params.tmp2, params.tmp6, splitSize); PipeBarrier(); // compute mask x < 0 - LGammaGenLTMaskHalf(MaskNeg, srctmpBuffer, tmp1Tensor, 0.0f, splitSize); + LGammaGenLTMaskHalf(params.tmpMask1, params.tmp2, params.tmp5, 0.0f, splitSize); PipeBarrier(); - LGammaSelectHalf(restmpBuffer, TensorNegRes, MaskNeg, tmp1Tensor, tmpScalar); + LGammaSelectHalf(params.tmp1, params.tmp4, params.tmpMask1, params.tmp5, params.tmpScalar); PipeBarrier(); // for nan - SetVectorMask(0, ConstCeil(splitSize, sizeof(uint16_t) * ONE_BYTE_BIT_SIZE)); - Not(tmpMask1.ReinterpretCast(), MaskPos.ReinterpretCast(), - MASK_PLACEHOLDER, 1, unaryParams); - Not(tmpMask2.ReinterpretCast(), MaskNeg.ReinterpretCast(), - MASK_PLACEHOLDER, 1, unaryParams); + SetVectorMask(0, ConstCeil(params.splitSize, sizeof(uint16_t) * ONE_BYTE_BIT_SIZE)); + Not(params.tmpMask2.ReinterpretCast(), params.mask.ReinterpretCast(), + MASK_PLACEHOLDER, 1, params.unaryParams); + Not(params.tmpMask3.ReinterpretCast(), params.tmpMask1.ReinterpretCast(), + MASK_PLACEHOLDER, 1, params.unaryParams); PipeBarrier(); - And(tmpMask1.ReinterpretCast(), tmpMask1.ReinterpretCast(), - tmpMask2.ReinterpretCast(), MASK_PLACEHOLDER, 1, binParams); + And(params.tmpMask2.ReinterpretCast(), params.tmpMask1.ReinterpretCast(), + params.tmpMask3.ReinterpretCast(), MASK_PLACEHOLDER, 1, params.binaryParams); PipeBarrier(); - SetVectorMask(0, splitSize); - LGammaSelectHalf(restmpBuffer, srctmpBuffer, tmpMask1, tmp1Tensor, tmpScalar); + SetVectorMask(0, params.splitSize); + LGammaSelectHalf(params.tmp1, params.tmp2, params.tmpMask2, params.tmp4, params.tmpScalar); PipeBarrier(); // for inf/-inf - Abs(srctmpBuffer, srctmpBuffer, MASK_PLACEHOLDER, 1, unaryParams); + Abs(params.tmp2, params.tmp2, MASK_PLACEHOLDER, 1, params.unaryParams); PipeBarrier(); // generate |x| >= 65504 mask - LGammaGenGEMaskHalf(tmpMask1, srctmpBuffer, tmp1Tensor, 65504.0f, splitSize); + LGammaGenGEMaskHalf(params.tmpMask2, params.tmp2, params.tmp4, 65504.0f, splitSize); PipeBarrier(); - SetVectorMask(0, ConstCeil(splitSize, sizeof(uint16_t) * ONE_BYTE_BIT_SIZE)); - Not(tmpMask2.ReinterpretCast(), tmpMask1.ReinterpretCast(), - MASK_PLACEHOLDER, 1, unaryParams); + SetVectorMask(0, ConstCeil(params.splitSize, sizeof(uint16_t) * ONE_BYTE_BIT_SIZE)); + Not(params.tmpMask3.ReinterpretCast(), params.tmpMask2.ReinterpretCast(), + MASK_PLACEHOLDER, 1, params.unaryParams); PipeBarrier(); - SetVectorMask(0, splitSize); - LGammaSelectINF(restmpBuffer, restmpBuffer, tmpMask2, tmp1Tensor, tmpScalar); + SetVectorMask(0, params.splitSize); + LGammaSelectINF(params.tmp1, params.tmp1, params.tmpMask3, params.tmp4, params.tmpScalar); PipeBarrier(); // float-->half - Cast(dstTensor, restmpBuffer, RoundMode::CAST_NONE, MASK_PLACEHOLDER, + Cast(dstTensor, params.tmp1, RoundMode::CAST_NONE, MASK_PLACEHOLDER, 1, {1, 1, 
HALF_DEFAULT_REPEAT_STRIDE, DEFAULT_REPEAT_STRIDE}); } __aicore__ inline void LgammaComputeImpl( - const LocalTensor &dst, const LocalTensor &src, LGammaFParams ¶ms) + const LocalTensor &dst, const LocalTensor &src, LGammaParams ¶ms) { // Gen masks with x >= 0 and < 0, which will not be overwritten in the future LGammaGenGEMask(params.tmpMask2, src, params, 0.0f); @@ -423,25 +394,30 @@ __aicore__ inline void LgammaCompute(const LocalTensor &dstTensor, const L CheckTmpBufferSize(tmpBufferSize, 0, bufferSize); LocalTensor tmpBuffer = sharedTmpBuffer.ReinterpretCast(); - uint32_t stackSize = 0; + uint32_t splitSize = 0; - stackSize = tmpBufferSize / LGAMMA_HALF_CALC_PROCEDURE / ONE_BLK_SIZE * ONE_BLK_SIZE; // 32 byte - CheckTmpBufferSize(stackSize, 0, bufferSize); + splitSize = tmpBufferSize / LGAMMA_HALF_CALC_PROCEDURE / ONE_BLK_SIZE * ONE_BLK_SIZE; // 32 byte + CheckTmpBufferSize(splitSize, 0, bufferSize); + + // init params + LGammaParams params; + LGammaInitHParams(tmpBuffer, splitSize, srcTensor, params); - const uint32_t round = calCount / stackSize; - const uint32_t tail = calCount % stackSize; + const uint32_t round = calCount / splitSize; + const uint32_t tail = calCount % splitSize; SetMaskCount(); - SetVectorMask(0, stackSize); + SetVectorMask(0, splitSize); uint32_t offset = 0; for (uint32_t i = 0; i < round; i++) { - LgammaComputeImpl(dstTensor[offset], srcTensor[offset], tmpBuffer, stackSize, isReuseSource); - offset = offset + stackSize; + LgammaComputeImpl(dstTensor[offset], srcTensor[offset], params, splitSize); + offset = offset + splitSize; } if (tail > 0) { SetVectorMask(0, tail); + params.splitSize = tail; LgammaComputeImpl( - dstTensor[round * stackSize], srcTensor[round * stackSize], tmpBuffer, stackSize, isReuseSource); + dstTensor[round * splitSize], srcTensor[round * splitSize], params, splitSize); } SetMaskNorm(); AscendCUtils::ResetMask(); @@ -469,7 +445,7 @@ __aicore__ inline void LgammaCompute(const LocalTensor &dst, const LocalT CheckTmpBufferSize(splitSize, 0, tmpBufferSize); // init params - LGammaFParams params; + LGammaParams params; LGammaInitFParams(tmpBuffer, splitSize, src, params); const uint32_t loopCount = calCount / splitSize; @@ -480,7 +456,6 @@ __aicore__ inline void LgammaCompute(const LocalTensor &dst, const LocalT LgammaComputeImpl(dst[i * splitSize], src[i * splitSize], params); } if (calcTail > 0) { - calcTail = (calcTail + ONE_BYTE_BIT_SIZE - 1U) / ONE_BYTE_BIT_SIZE * ONE_BYTE_BIT_SIZE; SetVectorMask(0, calcTail); params.splitSize = calcTail; LgammaComputeImpl(dst[loopCount * splitSize], src[loopCount * splitSize], params); diff --git a/impl/math/lgamma/lgamma_common_utils.h b/impl/math/lgamma/lgamma_common_utils.h index 123389918a72bd4e55c2f326457404d54b252926..bfc535250e92fd14edff117244a1755361ab9d8d 100644 --- a/impl/math/lgamma/lgamma_common_utils.h +++ b/impl/math/lgamma/lgamma_common_utils.h @@ -97,8 +97,8 @@ constexpr float negParamsEven[negParamsEvenLen] = { -0.000195746586541645228862762451171875, 0.0083327032625675201416015625, -0.16666662693023681640625}; } // namespace -struct LGammaFParams { - __aicore__ LGammaFParams() +struct LGammaParams { + __aicore__ LGammaParams() {} LocalTensor tmp1; LocalTensor tmp2; diff --git a/impl/math/log/log_tiling.cpp b/impl/math/log/log_tiling.cpp index 43c9542939d2add606fe54f4f06fab4b49bf3995..389db54e3948b45566be5212a3ddb990647105a0 100644 --- a/impl/math/log/log_tiling.cpp +++ b/impl/math/log/log_tiling.cpp @@ -51,11 +51,11 @@ void GetLogMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t 
typeSize, con minValue = 0; } -void GetLogTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetLogTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { (void)typeSize; extraBuf = 0; - maxLiveNodeCnt = 0; + maxLiveNodeCount = 0; } void GetLog10MaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, const bool isReuseSource, @@ -68,11 +68,11 @@ void GetLog10MaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c minValue = 0; } -void GetLog10TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetLog10TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { (void)typeSize; extraBuf = 0; - maxLiveNodeCnt = 0; + maxLiveNodeCount = 0; } void GetLog2MaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, const bool isReuseSource, @@ -86,10 +86,10 @@ void GetLog2MaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co maxValue = GetLog2MaxTmpSize(inputSize, typeSize); } -void GetLog2TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetLog2TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { auto calcFactor = (typeSize == sizeof(float)) ? LOG2_FLOAT_CALC_FAC : LOG2_HALF_CALC_FAC; extraBuf = 0; - maxLiveNodeCnt = calcFactor; + maxLiveNodeCount = calcFactor; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/round/round_tiling_impl.cpp b/impl/math/round/round_tiling_impl.cpp index 760e226eda434845b54780b0040ba9faf77ee17e..98d6a5daa60f5f5257bd4a0b34162268d92dee77 100644 --- a/impl/math/round/round_tiling_impl.cpp +++ b/impl/math/round/round_tiling_impl.cpp @@ -47,22 +47,22 @@ void GetRoundMaxMinTmpSize(const platform_ascendc::PlatformAscendC &ascendcPlatf } void GetRoundTmpBufferFactorSize(const platform_ascendc::PlatformAscendC &ascendcPlatform, const uint32_t typeSize, - uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) + uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; platform_ascendc::SocVersion socVersion = ascendcPlatform.GetSocVersion(); constexpr uint32_t liveNodeTwo = 2; if (socVersion == platform_ascendc::SocVersion::ASCEND910B) { if (typeSize == sizeof(float)) { - maxLiveNodeCnt = 0; + maxLiveNodeCount = 0; } else { - maxLiveNodeCnt = 1; + maxLiveNodeCount = 1; } } else if (socVersion == platform_ascendc::SocVersion::ASCEND310P) { if (typeSize == sizeof(float)) { - maxLiveNodeCnt = 1; + maxLiveNodeCount = 1; } else { - maxLiveNodeCnt = liveNodeTwo; + maxLiveNodeCount = liveNodeTwo; } } } diff --git a/impl/math/sin/sin_common_impl.h b/impl/math/sin/sin_common_impl.h index 432584b317108f86bd0e37acc23db138a35cae5e..8d125e1db3056eeb9ba1c1db12a856ad420e109b 100644 --- a/impl/math/sin/sin_common_impl.h +++ b/impl/math/sin/sin_common_impl.h @@ -263,7 +263,7 @@ __aicore__ inline void SinImpl(const LocalTensor& dstTensor, const LocalTenso LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - SinImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + SinImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC diff --git a/impl/math/sin/sin_tiling_impl.cpp b/impl/math/sin/sin_tiling_impl.cpp index 86870e202f4869e4f8224ea54ab1fca8d27789bb..b12b16c1c3613e2cd7e2e40373ac91e4f9c1e31e 100644 --- a/impl/math/sin/sin_tiling_impl.cpp +++ 
b/impl/math/sin/sin_tiling_impl.cpp @@ -25,10 +25,10 @@ constexpr uint32_t SIN_FLOAT_REUSE_CALC_FAC = 2; constexpr uint32_t SIN_ONE_REPEAT_BYTE_SIZE = 256; } // namespace -void GetSinTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCnt, uint32_t& extraBuf) +void GetSinTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCount, uint32_t& extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? SIN_FLOAT_NOREUSE_CALC_FAC : SIN_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? SIN_FLOAT_NOREUSE_CALC_FAC : SIN_HALF_CALC_FAC; } inline uint32_t GetSinMaxTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, const bool isReuseSource) diff --git a/impl/math/sinh/sinh_common_impl.h b/impl/math/sinh/sinh_common_impl.h index 9a0ab9a844a676548cf8a5b5bfb20caa0fe1e224..15fa0cbd5cb07d97c6fb047cb0130218b2f856d0 100644 --- a/impl/math/sinh/sinh_common_impl.h +++ b/impl/math/sinh/sinh_common_impl.h @@ -160,7 +160,7 @@ __aicore__ inline void SinhImpl(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - SinhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + SinhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC diff --git a/impl/math/sinh/sinh_tiling_impl.cpp b/impl/math/sinh/sinh_tiling_impl.cpp index 6474d1f9580e084ab60642cc29badf299b518f03..156319dad4231b9ec22ce24eb26346551b6341e2 100644 --- a/impl/math/sinh/sinh_tiling_impl.cpp +++ b/impl/math/sinh/sinh_tiling_impl.cpp @@ -48,9 +48,9 @@ void GetSinhMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co maxValue = GetSinhMaxTmpSize(inputSize, typeSize); } -void GetSinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetSinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? SINH_FLOAT_CALC_PROC : SINH_HALF_CALC_PROC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
SINH_FLOAT_CALC_PROC : SINH_HALF_CALC_PROC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/tan/tan_common_impl.h b/impl/math/tan/tan_common_impl.h index cf413b7085be72a97e0a746d3923a8858480c5d9..1f2283224a44a438847ae0c4b4e9455d4abe5cda 100644 --- a/impl/math/tan/tan_common_impl.h +++ b/impl/math/tan/tan_common_impl.h @@ -332,7 +332,7 @@ __aicore__ inline void TanImpl(const LocalTensor& dstTensor, const LocalTenso LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - TanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + TanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC diff --git a/impl/math/tan/tan_tiling_impl.cpp b/impl/math/tan/tan_tiling_impl.cpp index ffe5daf3b7e4658129255ec5440679df2273b0db..3bf548a6f6a959825672a9605a808819fe8abd70 100644 --- a/impl/math/tan/tan_tiling_impl.cpp +++ b/impl/math/tan/tan_tiling_impl.cpp @@ -48,9 +48,9 @@ void GetTanMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, con maxValue = GetTanMaxTmpSize(inputSize, typeSize); } -void GetTanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetTanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? TAN_FLOAT_CALC_FAC : TAN_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? TAN_FLOAT_CALC_FAC : TAN_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/tanh/tanh_common_impl.h b/impl/math/tanh/tanh_common_impl.h index 798bebb11651b129ce8cc8fddbf1fadddf6eaac0..331ec4b9442ef4793480f368338ed60a8403efec 100644 --- a/impl/math/tanh/tanh_common_impl.h +++ b/impl/math/tanh/tanh_common_impl.h @@ -165,7 +165,7 @@ __aicore__ inline void TanhImpl(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - TanhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + TanhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } } // namespace AscendC diff --git a/impl/math/tanh/tanh_tiling_impl.cpp b/impl/math/tanh/tanh_tiling_impl.cpp index e96438f6e1741e492fad4715da46647c43120454..3854e5765fcabf9ab99a9dfe91abfad760d034fd 100644 --- a/impl/math/tanh/tanh_tiling_impl.cpp +++ b/impl/math/tanh/tanh_tiling_impl.cpp @@ -48,9 +48,9 @@ void GetTanhMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, co maxValue = GetTanhMaxTmpSize(inputSize, typeSize); } -void GetTanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCnt, uint32_t& extraBuf) +void GetTanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCount, uint32_t& extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? TANH_FLOAT_CALC_PROC : TANH_HALF_CALC_PROC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? 
TANH_FLOAT_CALC_PROC : TANH_HALF_CALC_PROC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/math/trunc/trunc_tiling_impl.cpp b/impl/math/trunc/trunc_tiling_impl.cpp index 75c7577ed7bc42b57ebca2c5b174cda313324f51..bad7750af8a39a84c1c5aca2d303cf631a72c7ff 100644 --- a/impl/math/trunc/trunc_tiling_impl.cpp +++ b/impl/math/trunc/trunc_tiling_impl.cpp @@ -47,9 +47,9 @@ void GetTruncMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c maxValue = GetTruncMaxTmpSize(inputSize, typeSize); } -void GetTruncTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf) +void GetTruncTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf) { extraBuf = 0; - maxLiveNodeCnt = (typeSize == sizeof(float)) ? TRUNC_FLOAT_CALC_FAC : TRUNC_HALF_CALC_FAC; + maxLiveNodeCount = (typeSize == sizeof(float)) ? TRUNC_FLOAT_CALC_FAC : TRUNC_HALF_CALC_FAC; } } // namespace AscendC \ No newline at end of file diff --git a/impl/normalization/layernormgrad/layernormgrad_common_impl.h b/impl/normalization/layernormgrad/layernormgrad_common_impl.h index 1460d91c6ef8622f9fbb1bec935f8d2012a4e64e..b3cffd201dffa196bc072d47398f12019e9f8ff4 100644 --- a/impl/normalization/layernormgrad/layernormgrad_common_impl.h +++ b/impl/normalization/layernormgrad/layernormgrad_common_impl.h @@ -509,7 +509,7 @@ __aicore__ inline void LayerNormGradImpl(const LocalTensor &outputPdX, const ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); LayerNormGradImpl(outputPdX, resForGamma, inputDy, inputX, inputVariance, inputMean, inputGamma, - stackBuffer, epsilon, tiling); + stackBuffer, epsilon, tiling, shapeInfo); } } // namespace AscendC #endif // IMPL_NORMALIZATION_LAYERNORMGRAD_LAYERNORMGRAD_COMMON_IMPL_H diff --git a/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp b/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp index 244456092f3ea848a0fcbd09f77ffaa73af861e3..65abcb07ee617a4987efff7324b3e51489031dee 100644 --- a/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp +++ b/impl/reduce/reduce_xor_sum/reduce_xor_sum_tiling.cpp @@ -38,7 +38,7 @@ void GetReduceXorSumMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t type inputSize); std::vector shapeDims = srcShape.GetDims(); ASCENDC_HOST_ASSERT(shapeDims.size() > 0UL, return, - "[ReduceXorSum][GetReduceXorSumMaxMinTmpSize] The parameter srcShape dimension number is %lli, expected is greater than 0!", + "[ReduceXorSum][GetReduceXorSumMaxMinTmpSize] The parameter srcShape dimension number is %lu, expected is greater than 0!", shapeDims.size()); ASCENDC_HOST_ASSERT(typeSize == 2U, return, "[ReduceXorSum][GetReduceXorSumMaxMinTmpSize] The parameter typeSize is %u, expected is 2!", typeSize); diff --git a/lib/activation/geglu_tiling.h b/lib/activation/geglu_tiling.h index 742d6dda9cc688943c7702f34aef5fd286321b36..7284ad795a3c5c53531d4f1cbc3b7ed6fc5d39eb 100644 --- a/lib/activation/geglu_tiling.h +++ b/lib/activation/geglu_tiling.h @@ -33,13 +33,13 @@ void GetGeGLUMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c /*! * \brief The calculation of the GeGLU interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. 
+ * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetGeGLUTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetGeGLUTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_ACTIVATION_GEGLU_TILING_H diff --git a/lib/activation/reglu.h b/lib/activation/reglu.h index cd1abc2b6bca2e2732d8f8b71970b644631b8341..f166db2fff8b51aa7e19c36c3cab52ac43e7a123 100644 --- a/lib/activation/reglu.h +++ b/lib/activation/reglu.h @@ -36,7 +36,7 @@ __aicore__ inline void ReGlu(const LocalTensor& dstTensor, const LocalTensor< if (g_coreType == AIC) { return; } - ReGluImpl(dstTensor, srcTensor0, srcTensor1, sharedTmpBuffer, calCount); + ReGluImpl(dstTensor, srcTensor0, srcTensor1, sharedTmpBuffer, calCount); } /* @@ -52,7 +52,7 @@ __aicore__ inline void ReGlu(const LocalTensor& dstTensor, const LocalTensor< if (g_coreType == AIC) { return; } - ReGluImpl(dstTensor, srcTensor0, srcTensor1, calCount); + ReGluImpl(dstTensor, srcTensor0, srcTensor1, calCount); } #pragma end_pipe diff --git a/lib/activation/silu.h b/lib/activation/silu.h index 482c1a9ed5d9d454786b343e56a2c2a6b8949ec4..ada158151869d1fb5da331a26244802a37011123 100644 --- a/lib/activation/silu.h +++ b/lib/activation/silu.h @@ -38,7 +38,7 @@ __aicore__ inline __inout_pipe__(V) void Silu(const LocalTensor &dstLocal, co if ASCEND_IS_AIC { return; } - SiluCompute(dstLocal, srcLocal, dataSize); + SiluCompute(dstLocal, srcLocal, dataSize); } } // namespace AscendC diff --git a/lib/activation/swiglu_tiling.h b/lib/activation/swiglu_tiling.h index 0170fbd96df9b61da5fd517c41dc0abae68334b2..ef2bf1e652ba08607f025bdfc65d38d3a08614fe 100644 --- a/lib/activation/swiglu_tiling.h +++ b/lib/activation/swiglu_tiling.h @@ -35,11 +35,11 @@ void GetSwiGLUMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, /*! * \brief The calculation of the SwiGLU interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
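 * A minimal usage sketch (illustrative only: the 16 KiB buffer size is an assumption, and a
 * non-zero maxLiveNodeCount is assumed before dividing):
 *   uint32_t maxLiveNodeCount = 0;
 *   uint32_t extraBuf = 0;
 *   GetGeGLUTmpBufferFactorSize(sizeof(float), maxLiveNodeCount, extraBuf);
 *   // elements that fit into one pass over a 16 KiB shared temporary buffer
 *   uint32_t iterationSize = (16 * 1024 - extraBuf) / maxLiveNodeCount / sizeof(float);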
* * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetSwiGLUTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/activation/swish.h b/lib/activation/swish.h index edb2ea938be8f632e36fd25848c2b688f802e518..5ed05ba0abbb2995aa6115db62e72d73990934e0 100644 --- a/lib/activation/swish.h +++ b/lib/activation/swish.h @@ -38,7 +38,7 @@ __aicore__ inline __inout_pipe__(V) void Swish( if ASCEND_IS_AIC { return; } - SwishCompute(dstLocal, srcLocal, dataSize, scalarValue); + SwishCompute(dstLocal, srcLocal, dataSize, scalarValue); } } // namespace AscendC #endif // LIB_SWISH_SWISH_H \ No newline at end of file diff --git a/lib/math/acos_tiling.h b/lib/math/acos_tiling.h index bb3bb41bf2b1b0f5598a37f9e6200f9d00d96ee0..5997cff9e81d63747b58c6fbb6d7afb8f2435479 100644 --- a/lib/math/acos_tiling.h +++ b/lib/math/acos_tiling.h @@ -24,7 +24,7 @@ namespace AscendC { * iterationSize = (remainFreeSapce - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetAcosTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/math/acosh_tiling.h b/lib/math/acosh_tiling.h index a57ec81d85be1fa9087f5ac6cbe0ce8eb2c74c3a..1b1ee4f1b0319cef8ff189911c9067c0f5f411cf 100644 --- a/lib/math/acosh_tiling.h +++ b/lib/math/acosh_tiling.h @@ -34,13 +34,13 @@ void GetAcoshMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c /*! * \brief The calculation of the Acosh interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetAcoshTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetAcoshTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AcsendC #endif // LIB_MATH_ACOSH_TILING \ No newline at end of file diff --git a/lib/math/asin_tiling.h b/lib/math/asin_tiling.h index 8e845953bb98062668e1847acaac13270737b57d..17fe9ecbcee1e687ec75833f2a62f875637111df 100644 --- a/lib/math/asin_tiling.h +++ b/lib/math/asin_tiling.h @@ -24,7 +24,7 @@ namespace AscendC { * iterationSize = (remainFreeSapce - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetAsinTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/math/asinh_tiling.h b/lib/math/asinh_tiling.h index a37360bc0040118b206dbd9233c7453f7e121541..b4edef07b082a6917179706e962333e4e1161703 100644 --- a/lib/math/asinh_tiling.h +++ b/lib/math/asinh_tiling.h @@ -33,13 +33,13 @@ void GetAsinhMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c /*! * \brief The calculation of the Asinh interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetAsinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetAsinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } #endif // LIB_MATH_ASINH_TILING_H \ No newline at end of file diff --git a/lib/math/atan.h b/lib/math/atan.h index c25b39347a2d2240026e58aa06f8ab568aa29f67..f3e59887caa5cbfbfe7c9c32f48b14d2b431c2f2 100644 --- a/lib/math/atan.h +++ b/lib/math/atan.h @@ -70,7 +70,7 @@ template __aicore__ inline void Atan(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const uint32_t calCount) { - AtanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + AtanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! 
@@ -104,7 +104,7 @@ template __aicore__ inline void Atan(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const uint32_t calCount) { - AtanImpl(dstTensor, srcTensor, calCount); + AtanImpl(dstTensor, srcTensor, calCount); } #pragma end_pipe } // namespace AscendC diff --git a/lib/math/atan_tiling.h b/lib/math/atan_tiling.h index 5e92c5ceffa6bddbc0517cc63add37f8a761da8e..43b715052afc3aca381fb748b31732bfd36e2647 100644 --- a/lib/math/atan_tiling.h +++ b/lib/math/atan_tiling.h @@ -34,13 +34,13 @@ void GetAtanMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co /*! * \brief The calculation of the Atan interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer, the size of the extra temporary space */ -void GetAtanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetAtanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_ATAN_TILING_H \ No newline at end of file diff --git a/lib/math/atanh.h b/lib/math/atanh.h index 47ccd80dfeab07fd89e2ca01dbd934e7398fe474..44ddc13c3d9f6d0c70d8b5c51d137cf70747a767 100644 --- a/lib/math/atanh.h +++ b/lib/math/atanh.h @@ -45,7 +45,7 @@ __aicore__ inline void Atanh(const LocalTensor &dstTensor, return; } - AtanhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + AtanhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /* ! diff --git a/lib/math/atanh_tiling.h b/lib/math/atanh_tiling.h index 9a3926992c956edd74d0242a18676884ee07160b..96e3b2d78232597bfdf06daca115efc810ad168e 100644 --- a/lib/math/atanh_tiling.h +++ b/lib/math/atanh_tiling.h @@ -30,14 +30,14 @@ void GetAtanhMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c /*! * \brief The calculation of the Atanh interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* * \param [in] ascendcPlatform, platform Information * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetAtanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetAtanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_ATANH_TILING_H \ No newline at end of file diff --git a/lib/math/axpy_tiling.h b/lib/math/axpy_tiling.h index f6460291dfc7ae265a7e1d189d5d578dca76b2e3..60daae9cfcea5cf9cd86a4bf688f4ce847acd610 100644 --- a/lib/math/axpy_tiling.h +++ b/lib/math/axpy_tiling.h @@ -34,12 +34,12 @@ void GetAxpyMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, co /*! * \brief The calculation of the Axpy interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * \param [in] typeSize: size of the input data type, in bytes - * \param [out] maxLiveNodeCnt: the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount: the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf: the size of the extra temporary space */ -void GetAxpyTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCnt, uint32_t& extraBuf); +void GetAxpyTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCount, uint32_t& extraBuf); } // namespace AscendC #endif // LIB_MATH_AXPY_TILING_H \ No newline at end of file diff --git a/lib/math/ceil.h b/lib/math/ceil.h index e0cfcab0733fe2fa7d1f9e1f56012216862effe4..2f61a244998b02d08de2dc20471e9b24647920cc 100644 --- a/lib/math/ceil.h +++ b/lib/math/ceil.h @@ -46,7 +46,7 @@ __aicore__ inline void Ceil(const LocalTensor &dstTensor, const LocalTensor(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! 
@@ -65,7 +65,7 @@ __aicore__ inline void Ceil(const LocalTensor &dstTensor, const LocalTensor(dstTensor, srcTensor, calCount); } #pragma end_pipe diff --git a/lib/math/ceil_tiling.h b/lib/math/ceil_tiling.h index 79c0f8514492864bcde4b7d51b3e8739458a7f9e..aa822dce5c459460d1b3379e861085260c2b8522 100644 --- a/lib/math/ceil_tiling.h +++ b/lib/math/ceil_tiling.h @@ -38,9 +38,9 @@ void GetCeilMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co * iterationSize = (remainFreeSapce - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetCeilTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetCeilTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_CEIL_TILING_H diff --git a/lib/math/clamp.h b/lib/math/clamp.h index 2e1c85bc85cd1473c38e5cce7dce47f419c69710..b1cfa6f5182c8fa8a1924055bb4b99dbc86b2016 100644 --- a/lib/math/clamp.h +++ b/lib/math/clamp.h @@ -45,7 +45,7 @@ __aicore__ inline void ClampMax(const LocalTensor& dstTensor, const LocalTens if ASCEND_IS_AIC { return; } - ClampMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); + ClampMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); } template @@ -60,7 +60,7 @@ __aicore__ inline void ClampMax(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ret = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ret), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - ClampMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); + ClampMaxImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); } /* ! 
@@ -87,7 +87,7 @@ __aicore__ inline void ClampMin(const LocalTensor& dstTensor, const LocalTens if ASCEND_IS_AIC { return; } - ClampMinImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); + ClampMinImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); } template @@ -102,7 +102,7 @@ __aicore__ inline void ClampMin(const LocalTensor& dstTensor, const LocalTens LocalTensor sharedTmpBuffer; bool ret = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ret), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - ClampMinImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); + ClampMinImpl(dstTensor, srcTensor, sharedTmpBuffer, scalar, calCount); } #pragma end_pipe } // namespace AscendC diff --git a/lib/math/clamp_tiling.h b/lib/math/clamp_tiling.h index 675b7acf6489c1402810ebf8186b78329810c44a..3e6f4d1ce90bbf1b6a5a81424c258bf47364322e 100644 --- a/lib/math/clamp_tiling.h +++ b/lib/math/clamp_tiling.h @@ -38,9 +38,9 @@ void GetClampMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c * iterationSize = (remainFreeSapce - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetClampTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetClampTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } #endif // LIB_MATH_CLAMP_TILING_H \ No newline at end of file diff --git a/lib/math/cos_tiling.h b/lib/math/cos_tiling.h index 80ff4954c8a6416272a0db64f452abca1f50154c..4bb5381928d6b2a419a31d90e1c659a1914ddb83 100644 --- a/lib/math/cos_tiling.h +++ b/lib/math/cos_tiling.h @@ -34,13 +34,13 @@ void GetCosMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, con /*! * \brief The calculation of the Cos interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer, the size of the extra temporary space */ -void GetCosTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetCosTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AcsendC #endif // LIB_MATH_COS_TILING_H \ No newline at end of file diff --git a/lib/math/cosh.h b/lib/math/cosh.h index 035ce1d8a2a34cd2c4d253a93c1a9afaf0f83390..7d246be36245f6d6815d096123aca60383b3f223 100644 --- a/lib/math/cosh.h +++ b/lib/math/cosh.h @@ -65,7 +65,7 @@ template __aicore__ inline void Cosh(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const uint32_t calCount) { - CoshImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + CoshImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! @@ -98,7 +98,7 @@ __aicore__ inline void Cosh(const LocalTensor& dstTensor, const LocalTensor __aicore__ inline void Cosh(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const uint32_t calCount) { - CoshImpl(dstTensor, srcTensor, calCount); + CoshImpl(dstTensor, srcTensor, calCount); } #pragma end_pipe } // namespace AscendC diff --git a/lib/math/cosh_tiling.h b/lib/math/cosh_tiling.h index 58b173619accbe21741944db7d33b74b6d0b83e2..a89c628824a41b5fd10b2a723d4fda9e1d272b94 100644 --- a/lib/math/cosh_tiling.h +++ b/lib/math/cosh_tiling.h @@ -34,8 +34,8 @@ void GetCoshMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co /*! * \brief The calculation of the Cosh interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space diff --git a/lib/math/erf_tiling.h b/lib/math/erf_tiling.h index cd69dcdf708e309252c1750f9653fd7d2e3f0acd..f5887e46d5fbc612f678f6cb0e9dcf829dd20b16 100644 --- a/lib/math/erf_tiling.h +++ b/lib/math/erf_tiling.h @@ -24,7 +24,7 @@ namespace AscendC { * iterationSize = (remainFreeSapce - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetErfTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/math/erfc.h b/lib/math/erfc.h index c82246ea02321cbc9f676ec22188eda13d319d0f..fac81c9df81c4dcf697a1bb2a0cc39cc1f53aab4 100644 --- a/lib/math/erfc.h +++ b/lib/math/erfc.h @@ -61,7 +61,7 @@ template __aicore__ inline void Erfc(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const uint32_t calCount) { - ErfcImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + ErfcImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! @@ -92,7 +92,7 @@ __aicore__ inline void Erfc(const LocalTensor& dstTensor, const LocalTensor __aicore__ inline void Erfc(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const uint32_t calCount) { - ErfcImpl(dstTensor, srcTensor, calCount); + ErfcImpl(dstTensor, srcTensor, calCount); } /*! diff --git a/lib/math/erfc_tiling.h b/lib/math/erfc_tiling.h index e5cbd9480b8f21c7e7e8c5368db1d45b0b36499a..85560c2d75fe438ff1d2bd19ab9aa060b0371ec9 100644 --- a/lib/math/erfc_tiling.h +++ b/lib/math/erfc_tiling.h @@ -24,7 +24,7 @@ namespace AscendC { * iterationSize = (remainFreeSapce - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetErfcMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, const bool isReuseSource, @@ -40,6 +40,6 @@ void GetErfcMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co * \param [out] maxValue, maximum temporary space required * \param [out] minValue, minimum temporary space required */ -void GetErfcTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetErfcTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_ERFC_TILING_H \ No newline at end of file diff --git a/lib/math/exp_tiling.h b/lib/math/exp_tiling.h index 41f841f80fb4d48519d30db50b7f52c0fb133148..0fae539b6ebb3812f0ae3755003e7ed2aa38a133 100644 --- a/lib/math/exp_tiling.h +++ b/lib/math/exp_tiling.h @@ -22,8 +22,8 @@ namespace AscendC { /*! * \brief The calculation of the Exp interface requires the developer to reserve or apply for temporary space. 
The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * \param [in] typeSize: size of the input data type, in bytes * \param [out] maxLiveNodeCount: the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer: the size of the extra temporary space diff --git a/lib/math/floor_tiling.h b/lib/math/floor_tiling.h index 9e0163ee48632e58307d091bbd8d1dcad1166918..60141f45cd775958a98a2ef2542c88ef521ed21d 100644 --- a/lib/math/floor_tiling.h +++ b/lib/math/floor_tiling.h @@ -38,9 +38,9 @@ void GetFloorMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c * iterationSize = (remainFreeSpace - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetFloorTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetFloorTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_FLOOR_TILING_H \ No newline at end of file diff --git a/lib/math/fmod_tiling.h b/lib/math/fmod_tiling.h index 7ebb76a1c56cf06d983fac48323938074d8f80dd..98db7f19e3aeb7ff9e607a8bfe14726f6c3b8c16 100644 --- a/lib/math/fmod_tiling.h +++ b/lib/math/fmod_tiling.h @@ -38,9 +38,9 @@ void GetFmodMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co * iterationSize = (remainFreeSpace - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetFmodTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetFmodTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_FMOD_TILING_H \ No newline at end of file diff --git a/lib/math/frac_tiling.h b/lib/math/frac_tiling.h index b8260b3e29bb0019089e9249b04509726c00364f..c2a80d948657737cd754ad25b91f49f7b05a46cc 100644 --- a/lib/math/frac_tiling.h +++ b/lib/math/frac_tiling.h @@ -37,9 +37,9 @@ void GetFracMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co * iterationSize = (remainFreeSpace - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetFracTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt,
uint32_t &extraBuf); +void GetFracTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_FRAC_TILING_H \ No newline at end of file diff --git a/lib/math/lgamma_tiling.h b/lib/math/lgamma_tiling.h index 50370b11221bf30e0b4e3a32b3ef1a51b1753308..8327f84c8e521b94f12f279f4afad00f925b566f 100644 --- a/lib/math/lgamma_tiling.h +++ b/lib/math/lgamma_tiling.h @@ -22,11 +22,11 @@ namespace AscendC { /*! * \brief The calculation of the Lgamma interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetLgammaTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/math/log_tiling.h b/lib/math/log_tiling.h index 99df3308a9cc8c734cafa288cb44cc66c37d5686..0a1175ac896539203b9a6b4918b32f55c9f4edd2 100644 --- a/lib/math/log_tiling.h +++ b/lib/math/log_tiling.h @@ -34,14 +34,14 @@ void GetLogMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, con /*! * \brief The calculation of the Log interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetLogTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetLogTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); /*! * \brief This interface is used to obtain the maximum and minimum temporary space reserved or applied. @@ -59,14 +59,14 @@ void GetLog10MaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c /*! * \brief The calculation of the Log10 interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. 
+ * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetLog10TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetLog10TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); /*! * \brief This interface is used to obtain the maximum and minimum temporary space reserved or applied. @@ -84,13 +84,13 @@ void GetLog2MaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co /*! * \brief The calculation of the Log2 interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetLog2TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetLog2TmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_LOG_TILING \ No newline at end of file diff --git a/lib/math/power_tiling.h b/lib/math/power_tiling.h index 9057c7e7f24e0024b0f0543a561e43b162b2d5eb..8ded75656a68f816a66e01a01b929347feda99f9 100644 --- a/lib/math/power_tiling.h +++ b/lib/math/power_tiling.h @@ -38,8 +38,8 @@ void GetPowerMaxMinTmpSize(const ge::Shape& srcShape1, const ge::Shape& srcShape * @ingroup GetPowerTmpBufferFactorSize for V220 * @brief The calculation of the Round interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* @param [in] baseIsTensor : whether base input is a Tensor * @param [in] expIsTensor : whether exponent input is a Tensor * @param [in] typeIsInt : whether the src type is int diff --git a/lib/math/round.h b/lib/math/round.h index d16c1acd673415c532e63e6ce61408aea8b796e0..9634af5870a95f5d0363dd1f18cb5f89a0e5f7a3 100644 --- a/lib/math/round.h +++ b/lib/math/round.h @@ -39,7 +39,7 @@ __aicore__ inline void Round(const LocalTensor &dstTensor, const LocalTensor< if ASCEND_IS_AIC { return; } - RoundImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + RoundImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! @@ -56,7 +56,7 @@ __aicore__ inline void Round(const LocalTensor &dstTensor, const LocalTensor< if ASCEND_IS_AIC { return; } - RoundImpl(dstTensor, srcTensor, calCount); + RoundImpl(dstTensor, srcTensor, calCount); } #pragma end_pipe } // namespace AscendC diff --git a/lib/math/round_tiling.h b/lib/math/round_tiling.h index 94b8a45d724b3e08084792cffaf7da1649275a73..e22c7a511af4c27b20b5f24fdff8d0b93579852d 100644 --- a/lib/math/round_tiling.h +++ b/lib/math/round_tiling.h @@ -35,15 +35,15 @@ void GetRoundMaxMinTmpSize(const platform_ascendc::PlatformAscendC &ascendcPlatf /*! * \brief The calculation of the Round interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] ascendcPlatform, platform Information * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetRoundTmpBufferFactorSize(const platform_ascendc::PlatformAscendC &ascendcPlatform, const uint32_t typeSize, - uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); + uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } #endif // LIB_MATH_ROUND_TILING_INTF \ No newline at end of file diff --git a/lib/math/sign_tiling.h b/lib/math/sign_tiling.h index 08d130503de326adc92e9aa389e1eb94eec66fb3..19da68254d466f2b860ea7ed7d806acf52501709 100644 --- a/lib/math/sign_tiling.h +++ b/lib/math/sign_tiling.h @@ -22,11 +22,11 @@ namespace AscendC { /*! * \brief The calculation of the Sign interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetSignTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/math/sin.h b/lib/math/sin.h index 1b09b150a6fca8c7a8ba33d391fdb16e7db1686e..97378cf21d3f0ab9f43100a0df19d68a0a64c36b 100644 --- a/lib/math/sin.h +++ b/lib/math/sin.h @@ -60,7 +60,7 @@ template __aicore__ inline void Sin(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const uint32_t calCount) { - SinImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + SinImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! @@ -100,7 +100,7 @@ template __aicore__ inline void Sin(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const uint32_t calCount) { - SinImpl(dstTensor, srcTensor, calCount); + SinImpl(dstTensor, srcTensor, calCount); } /*! diff --git a/lib/math/sin_tiling.h b/lib/math/sin_tiling.h index 211b2373c9d9896bdbc534284d2d323fceca93ef..329ddd0b2b19cea071f5b1faf36022b07b109c6d 100644 --- a/lib/math/sin_tiling.h +++ b/lib/math/sin_tiling.h @@ -34,13 +34,13 @@ void GetSinMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, con /*! * \brief The calculation of the Sin interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer, the size of the extra temporary space */ -void GetSinTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetSinTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_SIN_TILING_H \ No newline at end of file diff --git a/lib/math/sinh_tiling.h b/lib/math/sinh_tiling.h index 2f23d7e4ae3b5959eb02dec761f84bb93894d96c..0b02095517bbd7cfdb6236d2835ab5fe500c3f0e 100644 --- a/lib/math/sinh_tiling.h +++ b/lib/math/sinh_tiling.h @@ -34,13 +34,13 @@ void GetSinhMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, co /*! * \brief The calculation of the Sinh interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. 
* * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer, the size of the extra temporary space */ -void GetSinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetSinhTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_SINH_TILING_H \ No newline at end of file diff --git a/lib/math/tan.h b/lib/math/tan.h index 7fa5075398f7e47acbae17935175b357dfa650a2..8250adfc13d7aaf4fd3bb84c3a23fae7d6a3cbf1 100644 --- a/lib/math/tan.h +++ b/lib/math/tan.h @@ -93,7 +93,7 @@ template __aicore__ inline void Tan(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const uint32_t calCount) { - TanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + TanImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! @@ -111,7 +111,7 @@ __aicore__ inline void Tan(const LocalTensor& dstTensor, const LocalTensor template __aicore__ inline void Tan(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const uint32_t calCount) { - TanImpl(dstTensor, srcTensor, calCount); + TanImpl(dstTensor, srcTensor, calCount); } /*! diff --git a/lib/math/tan_tiling.h b/lib/math/tan_tiling.h index 68f0c4c3b319c950ea5739e68ea78b51c961047a..f6506a5263f1f6cef9281bc0dc1719d476f2432e 100644 --- a/lib/math/tan_tiling.h +++ b/lib/math/tan_tiling.h @@ -35,13 +35,13 @@ void GetTanMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, con /*! * \brief The calculation of the Tan interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer, the size of the extra temporary space */ -void GetTanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetTanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } #endif // LIB_MATH_TAN_TILING_H \ No newline at end of file diff --git a/lib/math/tanh.h b/lib/math/tanh.h index 06fb09d96143920a27f7c86030f9aa4c6cb4490c..ecea5f974ff63b483619ef457eb828e4532a4d36 100644 --- a/lib/math/tanh.h +++ b/lib/math/tanh.h @@ -65,7 +65,7 @@ template __aicore__ inline void Tanh(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const LocalTensor& sharedTmpBuffer, const uint32_t calCount) { - TanhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); + TanhImpl(dstTensor, srcTensor, sharedTmpBuffer, calCount); } /*! 
@@ -99,7 +99,7 @@ template __aicore__ inline void Tanh(const LocalTensor& dstTensor, const LocalTensor& srcTensor, const uint32_t calCount) { - TanhImpl(dstTensor, srcTensor, calCount); + TanhImpl(dstTensor, srcTensor, calCount); } #pragma end_pipe } // namespace AscendC diff --git a/lib/math/tanh_tiling.h b/lib/math/tanh_tiling.h index 4d13550d0cfdfa4cbfcf7ac124ce2e785753b3a2..3930d84ac96b07badff3069b9d6acbea20017fa2 100644 --- a/lib/math/tanh_tiling.h +++ b/lib/math/tanh_tiling.h @@ -35,13 +35,13 @@ void GetTanhMaxMinTmpSize(const ge::Shape& srcShape, const uint32_t typeSize, co /*! * \brief The calculation of the Tanh interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuffer, the size of the extra temporary space */ -void GetTanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCnt, uint32_t& extraBuf); +void GetTanhTmpBufferFactorSize(const uint32_t typeSize, uint32_t& maxLiveNodeCount, uint32_t& extraBuf); } // namespace AscendC #endif // LIB_MATH_TANH_TILING_H \ No newline at end of file diff --git a/lib/math/trunc_tiling.h b/lib/math/trunc_tiling.h index 1505220a238947b33c721a6a96079a5d632dfab9..6fa9185449c1bf4cd5f31af761000cf1e7c0c850 100644 --- a/lib/math/trunc_tiling.h +++ b/lib/math/trunc_tiling.h @@ -38,9 +38,9 @@ void GetTruncMaxMinTmpSize(const ge::Shape &srcShape, const uint32_t typeSize, c * iterationSize = (remainFreeSpace - extraBuf) / maxLivedNodeCnt / typeSize * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ -void GetTruncTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCnt, uint32_t &extraBuf); +void GetTruncTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuf); } // namespace AscendC #endif // LIB_MATH_TRUNC_TILING_H \ No newline at end of file diff --git a/lib/math/xor.h b/lib/math/xor.h index 8f3f4a2b2ecee48bb3aa8e0baeaf2c73367d610f..096f1829ce89dce039e4b6bda6b7d8d4cc95184a 100644 --- a/lib/math/xor.h +++ b/lib/math/xor.h @@ -48,7 +48,7 @@ __aicore__ inline void Xor(const LocalTensor& dstTensor, const LocalTensor static_assert((std::is_same::value || std::is_same::value), "Failed to check the data types, current api support data types are int16_t/uint16_t."); - XorImpl(dstTensor, src0Tensor, src1Tensor, sharedTmpBuffer, calCount); + XorImpl(dstTensor, src0Tensor, src1Tensor, sharedTmpBuffer, calCount); } /* * @brief Xor Computes the element-wise logical XOR of the given input tensors.
Zeros are treated as False and nonzeros diff --git a/lib/math/xor_tiling.h b/lib/math/xor_tiling.h index 374e830f37affdc808ab1a5b22fdfc4a869724a3..8c2af82517a2977c692353af0b26d2d7772aede7 100644 --- a/lib/math/xor_tiling.h +++ b/lib/math/xor_tiling.h @@ -22,11 +22,11 @@ namespace AscendC { /*! * \brief The calculation of the Xor interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetXorTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer); diff --git a/lib/reduce/mean.h b/lib/reduce/mean.h index dbc87a136297fc3fc81daa06c611394f92670bd8..838d152205d16a3e4ea07a0f02b9cb19335f2f73 100644 --- a/lib/reduce/mean.h +++ b/lib/reduce/mean.h @@ -87,7 +87,7 @@ __aicore__ inline void Mean( LocalTensor sharedTmpBuffer; bool ans = PopStackBuffer(sharedTmpBuffer); ASCENDC_ASSERT((ans), { KERNEL_LOG(KERNEL_ERROR, "PopStackBuffer Error!"); }); - Mean(dstTensor, srcTensor, sharedTmpBuffer, meanParams); + Mean(dstTensor, srcTensor, sharedTmpBuffer, meanParams); } #pragma end_pipe } // namespace AscendC diff --git a/lib/reduce/mean_tiling.h b/lib/reduce/mean_tiling.h index 8133d1e92c2f12af306251144a6c810b17eacb61..b435338111e19dc13d823c8736a0be33822071c5 100644 --- a/lib/reduce/mean_tiling.h +++ b/lib/reduce/mean_tiling.h @@ -33,11 +33,11 @@ void GetMeanMaxMinTmpSize(const uint32_t n, const uint32_t srcTypeSize, const ui /*! * \brief The calculation of the Mean interface requires the developer to reserve or apply for temporary space. The * relationship between the maximum temporary space (maxTmpBuffer) and the space occupied by the input (inputSize x - * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCnt * inputSize * typeSize + extraBuf - * This interface is used to obtain maxLiveNodeCnt and extraBuf. + * typeSize) is as follows: maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuf + * This interface is used to obtain maxLiveNodeCount and extraBuf. * * \param [in] typeSize, size of the input data type, in bytes - * \param [out] maxLiveNodeCnt, the multiple of the maximum temporary space to the input occupied space + * \param [out] maxLiveNodeCount, the multiple of the maximum temporary space to the input occupied space * \param [out] extraBuf, the size of the extra temporary space */ void GetMeanTmpBufferFactorSize(const uint32_t typeSize, uint32_t &maxLiveNodeCount, uint32_t &extraBuffer);
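Editor's note: the tiling headers touched above all document the same relationship, maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuffer, and the per-iteration bound iterationSize = (remainFreeSpace - extraBuffer) / maxLiveNodeCount / typeSize. The following host-side sketch shows how those two formulas might be evaluated after calling one of the renamed factor-size queries; it is an illustration only, not part of this patch, and the helper names, include path, and the availability of cosh_tiling.h in the include search path are assumptions.

```cpp
#include <cstdint>
#include "lib/math/cosh_tiling.h"  // declares AscendC::GetCoshTmpBufferFactorSize (path assumed)

// maxTmpBuffer = maxLiveNodeCount * inputSize * typeSize + extraBuffer
uint32_t ComputeCoshMaxTmpBuffer(uint32_t inputSize, uint32_t typeSize)
{
    uint32_t maxLiveNodeCount = 0;
    uint32_t extraBuffer = 0;
    AscendC::GetCoshTmpBufferFactorSize(typeSize, maxLiveNodeCount, extraBuffer);
    return maxLiveNodeCount * inputSize * typeSize + extraBuffer;
}

// iterationSize = (remainFreeSpace - extraBuffer) / maxLiveNodeCount / typeSize,
// i.e. the largest element count that still fits in the remaining free buffer space.
uint32_t ComputeIterationSize(uint32_t remainFreeSpace, uint32_t typeSize,
                              uint32_t maxLiveNodeCount, uint32_t extraBuffer)
{
    return (remainFreeSpace - extraBuffer) / maxLiveNodeCount / typeSize;
}
```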
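For the Round (and Power) variants, the factor-size query shown above additionally takes platform information. A minimal host-side sketch follows, again as an illustration rather than part of the patch: obtaining ascendcPlatform from the tiling context, the include path, and the AscendC namespace placement are assumptions.

```cpp
#include <cstdint>
#include "lib/math/round_tiling.h"  // declares GetRoundTmpBufferFactorSize (path assumed)

uint32_t ComputeRoundMaxTmpBuffer(const platform_ascendc::PlatformAscendC &ascendcPlatform,
                                  uint32_t inputSize, uint32_t typeSize)
{
    uint32_t maxLiveNodeCount = 0;
    uint32_t extraBuf = 0;
    AscendC::GetRoundTmpBufferFactorSize(ascendcPlatform, typeSize, maxLiveNodeCount, extraBuf);
    // Same sizing relationship as the other math APIs in this patch.
    return maxLiveNodeCount * inputSize * typeSize + extraBuf;
}
```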
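On the kernel side, each math API in this patch keeps two overloads: one that takes an explicit sharedTmpBuffer and one that obtains working space internally. The sketch below uses Cosh as a representative; it is illustrative only. The diff rendering above elides the template arguments, so the LocalTensor<half> operands, the LocalTensor<uint8_t> temporary buffer type, and the omitted TPipe/queue setup are all assumptions.

```cpp
__aicore__ inline void CoshCompute(const AscendC::LocalTensor<half> &dstLocal,
                                   const AscendC::LocalTensor<half> &srcLocal,
                                   const AscendC::LocalTensor<uint8_t> &tmpLocal,
                                   uint32_t calCount)
{
    // Overload with an explicit temporary buffer: the caller sizes tmpLocal using the
    // GetCoshMaxMinTmpSize / GetCoshTmpBufferFactorSize queries from cosh_tiling.h.
    AscendC::Cosh(dstLocal, srcLocal, tmpLocal, calCount);

    // Overload without a temporary buffer: the library acquires stack space internally,
    // in the same spirit as the PopStackBuffer fallback visible in lib/reduce/mean.h.
    AscendC::Cosh(dstLocal, srcLocal, calCount);
}
```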
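Finally, the xor.h hunk above keeps a static_assert restricting the element type to int16_t/uint16_t. The short sketch below shows a call that satisfies that constraint; it is not part of the patch, and the int16_t tensors, the uint8_t temporary buffer, and the omitted buffer setup are assumptions.

```cpp
__aicore__ inline void XorCompute(const AscendC::LocalTensor<int16_t> &dstTensor,
                                  const AscendC::LocalTensor<int16_t> &src0Tensor,
                                  const AscendC::LocalTensor<int16_t> &src1Tensor,
                                  const AscendC::LocalTensor<uint8_t> &sharedTmpBuffer,
                                  uint32_t calCount)
{
    // A float or half element type would trip the static_assert at compile time.
    AscendC::Xor(dstTensor, src0Tensor, src1Tensor, sharedTmpBuffer, calCount);
}
```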