From 16263da7f7c0f69d4b0f04166d48a9771c7a7b5c Mon Sep 17 00:00:00 2001 From: shinoda Date: Sun, 11 May 2025 11:33:33 +0800 Subject: [PATCH] =?UTF-8?q?=E6=B7=BB=E5=8A=A0=20Cos=E3=80=81Dropout?= =?UTF-8?q?=E3=80=81Sum=E3=80=81ConfusionTranspose=20=E7=AB=AF=E5=88=B0?= =?UTF-8?q?=E7=AB=AF=E6=A0=B7=E4=BE=8B?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- examples/filter/dropout/CMakeLists.txt | 82 +++++++++ examples/filter/dropout/README.md | 93 ++++++++++ examples/filter/dropout/cmake/cpu_lib.cmake | 26 +++ examples/filter/dropout/cmake/npu_lib.cmake | 12 ++ examples/filter/dropout/main.cpp | 167 ++++++++++++++++++ .../dropout/op_host/dropout_custom_tiling.cpp | 34 ++++ .../dropout/op_host/dropout_custom_tiling.h | 24 +++ .../dropout/op_kernel/dropout_custom.cpp | 22 +++ .../dropout/op_kernel/dropout_custom_impl.h | 100 +++++++++++ examples/filter/dropout/run.sh | 58 ++++++ examples/filter/dropout/scripts/gen_data.py | 33 ++++ examples/math/cos/CMakeLists.txt | 82 +++++++++ examples/math/cos/README.md | 90 ++++++++++ examples/math/cos/cmake/cpu_lib.cmake | 26 +++ examples/math/cos/cmake/npu_lib.cmake | 12 ++ examples/math/cos/main.cpp | 153 ++++++++++++++++ .../math/cos/op_host/cos_custom_tiling.cpp | 18 ++ examples/math/cos/op_host/cos_custom_tiling.h | 21 +++ examples/math/cos/op_kernel/cos_custom.cpp | 21 +++ examples/math/cos/op_kernel/cos_custom_impl.h | 75 ++++++++ examples/math/cos/run.sh | 58 ++++++ examples/math/cos/scripts/gen_data.py | 31 ++++ examples/readme.md | 20 +++ examples/reduce/sum/CMakeLists.txt | 82 +++++++++ examples/reduce/sum/README.md | 90 ++++++++++ examples/reduce/sum/cmake/cpu_lib.cmake | 26 +++ examples/reduce/sum/cmake/npu_lib.cmake | 12 ++ examples/reduce/sum/main.cpp | 160 +++++++++++++++++ .../reduce/sum/op_host/sum_custom_tiling.cpp | 32 ++++ .../reduce/sum/op_host/sum_custom_tiling.h | 24 +++ examples/reduce/sum/op_kernel/sum_custom.cpp | 21 +++ .../reduce/sum/op_kernel/sum_custom_impl.h | 85 +++++++++ examples/reduce/sum/run.sh | 58 ++++++ examples/reduce/sum/scripts/gen_data.py | 31 ++++ .../confusion_transpose/CMakeLists.txt | 82 +++++++++ .../transpose/confusion_transpose/README.md | 93 ++++++++++ .../confusion_transpose/cmake/cpu_lib.cmake | 26 +++ .../confusion_transpose/cmake/npu_lib.cmake | 12 ++ .../transpose/confusion_transpose/main.cpp | 146 +++++++++++++++ .../confusion_transpose_custom_tiling.cpp | 24 +++ .../confusion_transpose_custom_tiling.h | 45 +++++ .../op_kernel/confusion_transpose_custom.cpp | 32 ++++ .../confusion_transpose_custom_impl.h | 78 ++++++++ examples/transpose/confusion_transpose/run.sh | 61 +++++++ .../confusion_transpose/scripts/gen_data.py | 39 ++++ .../scripts/verify_data.py | 57 ++++++ 46 files changed, 2574 insertions(+) create mode 100644 examples/filter/dropout/CMakeLists.txt create mode 100644 examples/filter/dropout/README.md create mode 100644 examples/filter/dropout/cmake/cpu_lib.cmake create mode 100644 examples/filter/dropout/cmake/npu_lib.cmake create mode 100644 examples/filter/dropout/main.cpp create mode 100644 examples/filter/dropout/op_host/dropout_custom_tiling.cpp create mode 100644 examples/filter/dropout/op_host/dropout_custom_tiling.h create mode 100644 examples/filter/dropout/op_kernel/dropout_custom.cpp create mode 100644 examples/filter/dropout/op_kernel/dropout_custom_impl.h create mode 100644 examples/filter/dropout/run.sh create mode 100644 examples/filter/dropout/scripts/gen_data.py create mode 100644 examples/math/cos/CMakeLists.txt create mode 
100644 examples/math/cos/README.md create mode 100644 examples/math/cos/cmake/cpu_lib.cmake create mode 100644 examples/math/cos/cmake/npu_lib.cmake create mode 100644 examples/math/cos/main.cpp create mode 100644 examples/math/cos/op_host/cos_custom_tiling.cpp create mode 100644 examples/math/cos/op_host/cos_custom_tiling.h create mode 100644 examples/math/cos/op_kernel/cos_custom.cpp create mode 100644 examples/math/cos/op_kernel/cos_custom_impl.h create mode 100644 examples/math/cos/run.sh create mode 100644 examples/math/cos/scripts/gen_data.py create mode 100644 examples/reduce/sum/CMakeLists.txt create mode 100644 examples/reduce/sum/README.md create mode 100644 examples/reduce/sum/cmake/cpu_lib.cmake create mode 100644 examples/reduce/sum/cmake/npu_lib.cmake create mode 100644 examples/reduce/sum/main.cpp create mode 100644 examples/reduce/sum/op_host/sum_custom_tiling.cpp create mode 100644 examples/reduce/sum/op_host/sum_custom_tiling.h create mode 100644 examples/reduce/sum/op_kernel/sum_custom.cpp create mode 100644 examples/reduce/sum/op_kernel/sum_custom_impl.h create mode 100644 examples/reduce/sum/run.sh create mode 100644 examples/reduce/sum/scripts/gen_data.py create mode 100644 examples/transpose/confusion_transpose/CMakeLists.txt create mode 100644 examples/transpose/confusion_transpose/README.md create mode 100644 examples/transpose/confusion_transpose/cmake/cpu_lib.cmake create mode 100644 examples/transpose/confusion_transpose/cmake/npu_lib.cmake create mode 100644 examples/transpose/confusion_transpose/main.cpp create mode 100644 examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp create mode 100644 examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h create mode 100644 examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp create mode 100644 examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h create mode 100644 examples/transpose/confusion_transpose/run.sh create mode 100644 examples/transpose/confusion_transpose/scripts/gen_data.py create mode 100644 examples/transpose/confusion_transpose/scripts/verify_data.py diff --git a/examples/filter/dropout/CMakeLists.txt b/examples/filter/dropout/CMakeLists.txt new file mode 100644 index 00000000..00cc3f66 --- /dev/null +++ b/examples/filter/dropout/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/dropout_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(dropout_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/dropout_custom_tiling.cpp +) + +target_compile_options(dropout_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(dropout_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(dropout_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(dropout_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS dropout_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/filter/dropout/README.md b/examples/filter/dropout/README.md new file mode 100644 index 00000000..acf91d50 --- /dev/null +++ b/examples/filter/dropout/README.md @@ -0,0 +1,93 @@ + + +## 概述 + +本样例介绍了调用 Dropout 高阶 API 实现 dropout 算子,并按照核函数直调的方式分别给出了对应的端到端实现,关于 Dropout 高阶 API 的具体内容请参考 [Ascend C 算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi) 中的 " 高阶 API > 数据过滤 > Dropout " 章节。 + +- 直调:使用核函数直调 dropout 自定义算子。 + + 核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧、仿真侧三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A3 训练系列产品/Atlas A3 推理系列产品 +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品AI Core + +## 目录结构 + +| 目录 | 描述 +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 +dropout 算子,根据MaskTensor对SrcTensor(源操作数,输入Tensor)进行过滤得到DstTensor(目的操作数、输出Tensor)。仅支持输入shape为ND格式。该过滤功能包括两种模式,字节模式和比特模式,本样例支持字节模式。 + +- 算子规格: + + + + + + + + + + + +
| 算子类型(OpType) | DropoutCustom | | | |
| ---------------- | -------------- | ------ | --------- | ------ |
| 算子输入 | name | shape | data type | format |
| | x | 8*2048 | float | ND |
| | mask | 8*2048 | uint8_t | ND |
| 算子输出 | y | 8*2048 | float | ND |
| 核函数名 | dropout_custom | | | |
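下面给出一个按字节模式语义逐元素计算的参考示意(非本样例源码;函数名 `DropoutGolden` 与参数 `keepProb` 为示意用的假设命名,保留概率按本样例取 0.8),用于说明上表中 mask 对 x 的过滤方式:

```cpp
// 示意代码:字节模式 dropout 的逐元素参考计算(假设保留概率 keepProb = 0.8)
#include <cstddef>
#include <cstdint>
#include <vector>

std::vector<float> DropoutGolden(const std::vector<float> &x,
                                 const std::vector<uint8_t> &mask,
                                 float keepProb = 0.8f)
{
    std::vector<float> y(x.size());
    for (std::size_t i = 0; i < x.size(); ++i) {
        // mask 非零时保留该元素并按 1/keepProb 缩放,否则置 0
        y[i] = (mask[i] != 0) ? (x[i] / keepProb) : 0.0f;
    }
    return y;
}
```

scripts/gen_data.py 生成真值时采用的是同样的计算方式(np.where(mask, x / 0.8, 0))。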
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[8][2048],输出y[8][2048]的dropout算子。 + +- kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Dropout高阶API接口完成dropout计算,得到最终结果,再搬出到外部存储上。 + + dropout算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行dropout计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- tiling实现 + + dropout算子的tiling实现流程如下:根据输入数据 x 及 mask 的总维度和所用核数量确定所需tiling参数,并调用GetDropOutMaxMinTmpSize接口获取tmpBuf的大小。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A3 训练系列产品/Atlas A3 推理系列产品 + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/filter/dropout/cmake/cpu_lib.cmake b/examples/filter/dropout/cmake/cpu_lib.cmake new file mode 100644 index 00000000..693f15ac --- /dev/null +++ b/examples/filter/dropout/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/filter/dropout/cmake/npu_lib.cmake b/examples/filter/dropout/cmake/npu_lib.cmake new file mode 100644 index 00000000..8ad136f3 --- /dev/null +++ b/examples/filter/dropout/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/filter/dropout/main.cpp b/examples/filter/dropout/main.cpp new file mode 100644 index 00000000..202821ea --- /dev/null +++ b/examples/filter/dropout/main.cpp @@ -0,0 +1,167 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#include "./op_host/dropout_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_dropout_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void dropout_custom(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR maskGm, + DropoutCustomTilingData tiling); +#endif + +constexpr uint32_t USED_CORE_NUM = 8; +constexpr uint32_t TILINGDATA_SIZE = 5; +constexpr uint32_t FIRST_AXIS = 8; +constexpr uint32_t SRC_LAST_AXIS = 2048; + +extern void GenerateTilingData(uint8_t *tilingBuf, const uint32_t firstAxis, const uint32_t srcLastAxis, + const uint32_t maskLastAxis, const uint32_t coreNum); + +static bool CompareResult(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + printf("wrongNum: %ld\n", wrongNum); + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + size_t inputSize = FIRST_AXIS * SRC_LAST_AXIS * sizeof(uint32_t); + size_t maskSize = FIRST_AXIS * SRC_LAST_AXIS; + size_t outputSize = inputSize; + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *mask = (uint8_t *)AscendC::GmAlloc(maskSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + ReadFile("../input/input_mask.bin", maskSize, mask, maskSize); + + GenerateTilingData(tiling, FIRST_AXIS, SRC_LAST_AXIS, SRC_LAST_AXIS, USED_CORE_NUM); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(dropout_custom, USED_CORE_NUM, x, y, mask, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + WriteFile("../output/output.bin", y, outputSize); + + bool goldenResult = true; + goldenResult = CompareResult(y, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)mask); + AscendC::GmFree((void 
*)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost, *maskHost, *tiling; + uint8_t *xDevice, *yDevice, *maskDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&maskHost), maskSize)); + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)) + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&maskDevice, maskSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + ReadFile("../input/input_mask.bin", maskSize, maskHost, maskSize); + + GenerateTilingData(tiling, FIRST_AXIS, SRC_LAST_AXIS, SRC_LAST_AXIS, USED_CORE_NUM); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(maskDevice, maskSize, maskHost, maskSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(dropout_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, maskDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(yHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(maskDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(maskHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} diff --git a/examples/filter/dropout/op_host/dropout_custom_tiling.cpp b/examples/filter/dropout/op_host/dropout_custom_tiling.cpp new file mode 100644 index 00000000..1f306703 --- /dev/null +++ b/examples/filter/dropout/op_host/dropout_custom_tiling.cpp @@ -0,0 +1,34 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "dropout_custom_tiling.h" +#include "tiling/tiling_api.h" +#include + +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t firstAxis, const uint32_t srcLastAxis, + const uint32_t maskLastAxis, const uint32_t coreNum) { + std::vector srcDims = {firstAxis, srcLastAxis, maskLastAxis}; + + uint32_t typeSize = 4; + ge::Shape shape(srcDims); + uint32_t minValue{}; + uint32_t maxValue{}; + + AscendC::GetDropOutMaxMinTmpSize(shape, typeSize, false, maxValue, minValue); + + DropoutCustomTilingData *tiling = reinterpret_cast(tilingBuf); + + tiling->firstAxis = firstAxis; + tiling->srcLastAxis = srcLastAxis; + tiling->maskLastAxis = maskLastAxis; + + tiling->tmpBufferSize = minValue; + tiling->tileNum = coreNum; +} \ No newline at end of file diff --git a/examples/filter/dropout/op_host/dropout_custom_tiling.h b/examples/filter/dropout/op_host/dropout_custom_tiling.h new file mode 100644 index 00000000..ddf7470c --- /dev/null +++ b/examples/filter/dropout/op_host/dropout_custom_tiling.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_FILTER_DROPOUT_CUSTOM_TILING_H +#define EXAMPLES_FILTER_DROPOUT_CUSTOM_TILING_H + +#include + +struct DropoutCustomTilingData { + uint32_t firstAxis; + uint32_t srcLastAxis; + uint32_t maskLastAxis; + uint32_t tmpBufferSize; + uint32_t tileNum; +}; + +#endif // EXAMPLES_FILTER_DROPOUT_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/filter/dropout/op_kernel/dropout_custom.cpp b/examples/filter/dropout/op_kernel/dropout_custom.cpp new file mode 100644 index 00000000..7e9e3387 --- /dev/null +++ b/examples/filter/dropout/op_kernel/dropout_custom.cpp @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "dropout_custom_impl.h" +#include "kernel_operator.h" + +extern "C" __global__ __aicore__ void dropout_custom(GM_ADDR srcGm, GM_ADDR dstGm, GM_ADDR maskGm, + DropoutCustomTilingData tiling) { + if ASCEND_IS_AIC { + return; + } + KernelDropout op; + op.Init(srcGm, dstGm, maskGm, tiling); + op.Process(); +} \ No newline at end of file diff --git a/examples/filter/dropout/op_kernel/dropout_custom_impl.h b/examples/filter/dropout/op_kernel/dropout_custom_impl.h new file mode 100644 index 00000000..d51237b9 --- /dev/null +++ b/examples/filter/dropout/op_kernel/dropout_custom_impl.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. 
+ * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_FILTER_DROPOUT_CUSTOM_H +#define EXAMPLES_FILTER_DROPOUT_CUSTOM_H + +#include "../op_host/dropout_custom_tiling.h" +#include "kernel_operator.h" + +constexpr int32_t BUFFER_NUM = 2; + +template class KernelDropout { +public: + __aicore__ inline KernelDropout() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR m, DropoutCustomTilingData tilingData) { + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); + srcSize = tilingData.firstAxis * tilingData.srcLastAxis; + tileNum = tilingData.tileNum; + blockLength = srcSize / AscendC::GetBlockNum(); + tileLength = this->blockLength / tileNum / BUFFER_NUM; + + info.firstAxis = tilingData.firstAxis / tilingData.tileNum; + info.srcLastAxis = tileLength; + info.maskLastAxis = tileLength; + + xGm.SetGlobalBuffer((__gm__ T *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ T *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + maskGm.SetGlobalBuffer((__gm__ uint8_t *)m + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(T)); + pipe.InitBuffer(inQueueMask, BUFFER_NUM, this->tileLength * sizeof(uint8_t)); + pipe.InitBuffer(outQueue, BUFFER_NUM, this->tileLength * sizeof(T)); + pipe.InitBuffer(tmpBuf, tilingData.tmpBufferSize * sizeof(uint8_t)); + } + __aicore__ inline void Process() { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor maskLocal = inQueueMask.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(maskLocal, maskGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + inQueueMask.EnQue(maskLocal); + } + __aicore__ inline void Compute(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor maskLocal = inQueueMask.DeQue(); + + AscendC::LocalTensor yLocal = outQueue.AllocTensor(); + AscendC::LocalTensor sharedTmpBuffer = tmpBuf.AllocTensor(); + + AscendC::DropOut(yLocal, xLocal, maskLocal, sharedTmpBuffer, probValue, info); + + outQueue.EnQue(yLocal); + + inQueueX.FreeTensor(xLocal); + inQueueMask.FreeTensor(maskLocal); + tmpBuf.FreeTensor(sharedTmpBuffer); + } + __aicore__ inline void CopyOut(int32_t progress) { + AscendC::LocalTensor yLocal = outQueue.DeQue(); + AscendC::DataCopy(yGm[progress * this->tileLength], yLocal, this->tileLength); + outQueue.FreeTensor(yLocal); + } + +private: + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor maskGm; + + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueMask; + AscendC::TQue outQueue; + AscendC::TBuf tmpBuf; + + uint32_t srcSize = 0; + uint32_t tileNum = 0; + 
uint32_t blockLength = 0; + uint32_t tileLength = 0; + float probValue = 0.8; + AscendC::DropOutShapeInfo info; +}; + +#endif \ No newline at end of file diff --git a/examples/filter/dropout/run.sh b/examples/filter/dropout/run.sh new file mode 100644 index 00000000..76ad17bb --- /dev/null +++ b/examples/filter/dropout/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./dropout_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./dropout_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./dropout_direct_kernel_op +fi \ No newline at end of file diff --git a/examples/filter/dropout/scripts/gen_data.py b/examples/filter/dropout/scripts/gen_data.py new file mode 100644 index 00000000..3413169d --- /dev/null +++ b/examples/filter/dropout/scripts/gen_data.py @@ -0,0 +1,33 @@ +#!/usr/bin/python3 + +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [8, 2048] + + input_x = np.random.uniform(1, 100, input_shape).astype(dtype) + input_mask = np.random.randint(0, 2, size=input_shape, dtype=bool) + golden = np.where(input_mask, input_x / 0.8, np.zeros_like(input_x)) + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + input_mask.tofile("./input/input_mask.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() \ No newline at end of file diff --git a/examples/math/cos/CMakeLists.txt b/examples/math/cos/CMakeLists.txt new file mode 100644 index 00000000..188a5881 --- /dev/null +++ b/examples/math/cos/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/cos_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(cos_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/cos_custom_tiling.cpp +) + +target_compile_options(cos_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(cos_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(cos_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(cos_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS cos_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME 
DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/math/cos/README.md b/examples/math/cos/README.md new file mode 100644 index 00000000..25894c25 --- /dev/null +++ b/examples/math/cos/README.md @@ -0,0 +1,90 @@ + + +## 概述 + +本样例介绍了调用 Cos 高阶 API 实现 cos 算子,并按照核函数直调的方式分别给出了对应的端到端实现。 + +- 直调:使用核函数直调cos自定义算子。 + + 核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧、仿真侧三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品AI Core + +## 目录结构 + +| 目录 | 描述 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 +cos单算子,对输入tensor按元素做三角函数余弦运算。 + +- 算子规格: + + + + + + + + + + +
| 算子类型(OpType) | CosCustom | | | |
| ---------------- | ---------- | ------ | --------- | ------ |
| 算子输入 | name | shape | data type | format |
| | x | 8*2048 | float | ND |
| 算子输出 | y | 8*2048 | float | ND |
| 核函数名 | cos_custom | | | |
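下面给出一个逐元素余弦的参考示意(非本样例源码;函数名 `CosGolden` 为示意用的假设命名),用于说明上表从输入到输出的计算语义:

```cpp
// 示意代码:逐元素余弦的参考计算,与 scripts/gen_data.py 中用 np.cos 生成真值的语义一致
#include <cmath>
#include <cstddef>
#include <vector>

std::vector<float> CosGolden(const std::vector<float> &x)
{
    std::vector<float> y(x.size());
    for (std::size_t i = 0; i < x.size(); ++i) {
        y[i] = std::cos(x[i]);
    }
    return y;
}
```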
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[8][2048],输出y[8][2048]的cos算子。 + +- kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Cos高阶API接口完成cos计算,得到最终结果,再搬出到外部存储上。 + + cos算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行cos计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- tiling实现 + + cos算子的tiling实现流程如下:根据输入长度totalLength和所用核数量coreNum确定所需tiling参数。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/math/cos/cmake/cpu_lib.cmake b/examples/math/cos/cmake/cpu_lib.cmake new file mode 100644 index 00000000..693f15ac --- /dev/null +++ b/examples/math/cos/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/math/cos/cmake/npu_lib.cmake b/examples/math/cos/cmake/npu_lib.cmake new file mode 100644 index 00000000..8ad136f3 --- /dev/null +++ b/examples/math/cos/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/math/cos/main.cpp b/examples/math/cos/main.cpp new file mode 100644 index 00000000..4981c668 --- /dev/null +++ b/examples/math/cos/main.cpp @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#include "op_host/cos_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_cos_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void cos_custom(GM_ADDR x, GM_ADDR y, CosCustomTilingData tiling); +#endif + +constexpr uint32_t USED_CORE_NUM = 8; +constexpr uint32_t TILINGDATA_SIZE = 2; +constexpr uint32_t TOTAL_LENGTH = 8 * 2048; + +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t totalLength, const uint32_t coreNum); + +static bool CompareResult(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + size_t inputSize = TOTAL_LENGTH * sizeof(uint32_t); + size_t outputSize = inputSize; + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + + GenerateTilingData(tiling, TOTAL_LENGTH, USED_CORE_NUM); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(cos_custom, USED_CORE_NUM, x, y, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + WriteFile("../output/output.bin", y, outputSize); + + bool goldenResult = true; + goldenResult = CompareResult(y, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost, *tiling; + uint8_t *xDevice, *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)) + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, 
outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + + GenerateTilingData(tiling, TOTAL_LENGTH, USED_CORE_NUM); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(cos_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(yHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} diff --git a/examples/math/cos/op_host/cos_custom_tiling.cpp b/examples/math/cos/op_host/cos_custom_tiling.cpp new file mode 100644 index 00000000..352215a9 --- /dev/null +++ b/examples/math/cos/op_host/cos_custom_tiling.cpp @@ -0,0 +1,18 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "cos_custom_tiling.h" +#include "tiling/tiling_api.h" +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t totalLength, const uint32_t coreNum) { + CosCustomTilingData *tiling = reinterpret_cast(tilingBuf); + + tiling->totalLength = totalLength; + tiling->tileNum = coreNum; +} \ No newline at end of file diff --git a/examples/math/cos/op_host/cos_custom_tiling.h b/examples/math/cos/op_host/cos_custom_tiling.h new file mode 100644 index 00000000..1ba0775f --- /dev/null +++ b/examples/math/cos/op_host/cos_custom_tiling.h @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_MATH_COS_CUSTOM_TILING_H +#define EXAMPLES_MATH_COS_CUSTOM_TILING_H + +#include + +struct CosCustomTilingData { + uint32_t totalLength; + uint32_t tileNum; +}; + +#endif // EXAMPLES_MATH_COS_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/math/cos/op_kernel/cos_custom.cpp b/examples/math/cos/op_kernel/cos_custom.cpp new file mode 100644 index 00000000..10a86883 --- /dev/null +++ b/examples/math/cos/op_kernel/cos_custom.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "cos_custom_impl.h" +#include "kernel_operator.h" + +extern "C" __global__ __aicore__ void cos_custom(GM_ADDR x, GM_ADDR y, CosCustomTilingData tiling) { + if ASCEND_IS_AIC { + return; + } + MyCustomKernel::kernelCos op; + op.Init(x, y, tiling); + op.Process(); +} \ No newline at end of file diff --git a/examples/math/cos/op_kernel/cos_custom_impl.h b/examples/math/cos/op_kernel/cos_custom_impl.h new file mode 100644 index 00000000..b6caf3d4 --- /dev/null +++ b/examples/math/cos/op_kernel/cos_custom_impl.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_MATH_SORT_CUSTOM_H +#define EXAMPLES_MATH_SORT_CUSTOM_H +#include "../op_host/cos_custom_tiling.h" +#include "kernel_operator.h" + +namespace MyCustomKernel { +constexpr int32_t BUFFER_NUM = 2; + +template class kernelCos { +public: + __aicore__ inline kernelCos() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, CosCustomTilingData tilingData) { + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); + this->tileNum = tilingData.tileNum; + this->blockLength = tilingData.totalLength / AscendC::GetBlockNum(); + this->tileLength = this->blockLength / tileNum / BUFFER_NUM; + + xGm.SetGlobalBuffer((__gm__ T *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ T *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(T)); + pipe.InitBuffer(outQueueY, BUFFER_NUM, this->tileLength * sizeof(T)); + } + __aicore__ inline void Process() { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); + AscendC::Cos(yLocal, xLocal, this->tileLength); + outQueueY.EnQue(yLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) { + AscendC::LocalTensor yLocal = outQueueY.DeQue(); + AscendC::DataCopy(yGm[progress * this->tileLength], yLocal, this->tileLength); + outQueueY.FreeTensor(yLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueY; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + + uint32_t tileNum = 0; + uint32_t blockLength = 0; + uint32_t tileLength = 0; +}; +} // namespace MyCustomKernel + +#endif \ No newline at end of file diff --git a/examples/math/cos/run.sh b/examples/math/cos/run.sh new file mode 100644 index 00000000..2f8c76a4 --- /dev/null +++ b/examples/math/cos/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./cos_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./cos_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./cos_direct_kernel_op +fi \ No newline at end of file diff --git a/examples/math/cos/scripts/gen_data.py b/examples/math/cos/scripts/gen_data.py new file mode 100644 index 00000000..8c5886bf --- /dev/null +++ b/examples/math/cos/scripts/gen_data.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 + +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [8, 2048] + # generate value between [-65504, 65504] + input_x = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) + golden = np.cos(input_x).astype(dtype) + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/examples/readme.md b/examples/readme.md index d1ef2e05..b982b70c 100644 --- a/examples/readme.md +++ b/examples/readme.md @@ -117,6 +117,26 @@ welford_finalize Welford算法的后处理,一种在线计算均值和方差的方法。 + + math + cos + 对输入tensor做Cos计算。 + + + filter + dropout + 提供根据MaskTensor对SrcTensor(源操作数,输入Tensor)进行过滤的功能,得到DstTensor(目的操作数、输出Tensor)。 + + + reduce + sum + 获取最后一个维度的元素总和。 + + + transpose + confusion_transpose + 对输入数据进行数据排布及Reshape操作。 + pad broadcast diff --git a/examples/reduce/sum/CMakeLists.txt b/examples/reduce/sum/CMakeLists.txt new file mode 100644 index 00000000..db2524fc --- /dev/null +++ b/examples/reduce/sum/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/sum_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(sum_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/sum_custom_tiling.cpp +) + +target_compile_options(sum_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(sum_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(sum_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(sum_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS sum_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/reduce/sum/README.md b/examples/reduce/sum/README.md new file mode 100644 index 00000000..28be735a --- /dev/null +++ b/examples/reduce/sum/README.md @@ -0,0 +1,90 @@ + + +## 概述 + +本样例介绍了调用 Sum 高阶 API 实现 sum 算子,并按照核函数直调的方式分别给出了对应的端到端实现,关于 Sum 高阶 API 的具体内容请参考 [Ascend C 算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi) 中的 " 高阶 API > 归约操作 > Sum " 章节。 + +- 直调:使用核函数直调 sum 自定义算子。 + + 核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧、仿真侧三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品 AI Core + +## 目录结构 + +| 目录 | 描述 +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 +sum 算子,获取最后一个维度的元素总和。如果输入是向量,则在向量中对各元素相加;如果输入是矩阵,则沿最后一个维度对每行中元素求和。 + +- 算子规格: + + + + + + + + + + +
+| 算子类型(OpType) | SumCustom | | | |
+| ------------------ | ---------- | ------ | --------- | ------ |
+| 算子输入 | name | shape | data type | format |
+| | x | 7*2023 | float | ND |
+| 算子输出 | y | 8*1 | float | ND |
+| 核函数名 | sum_custom | | | |
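+
+以下为一段简短的 NumPy 示意(仅作原理说明,非本样例的实际实现),展示输出 shape 为 8\*1 的由来:7 个 float32 的求和结果共 28 字节,按 32 字节向上对齐后输出补齐为 8 个元素,其中前 7 个为有效值,最后一个为 padding。
+
+```python
+import numpy as np
+
+# 与上表规格一致的输入:7 行、每行 2023 个 float32
+x = np.random.uniform(-100, 100, (7, 2023)).astype(np.float32)
+row_sum = np.sum(x, axis=1)                       # 有效结果,shape (7,)
+
+ALIGN = 32                                        # 输出按 32 字节向上对齐
+out_elems = (row_sum.nbytes + ALIGN - 1) // ALIGN * ALIGN // row_sum.itemsize
+y = np.zeros(out_elems, dtype=np.float32)         # out_elems == 8
+y[:row_sum.size] = row_sum                        # 第 8 个元素为 padding
+print(y.shape)                                    # (8,)
+```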
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[7][2023],输出y[8]的sum算子,其中y中的有效值数量为7,最后一个为padding。 + +- kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Sum高阶API接口完成sum计算,得到最终结果,再搬出到外部存储上。 + + sum算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行sum计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- tiling实现 + + sum算子的tiling实现流程如下:根据输入数据 x 的内轴及内轴实际长度和外轴确定所需tiling参数例如输出内轴补齐后长度等,并调用GetSumMaxMinTmpSize接口获取tmpBuf的大小。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品 AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/reduce/sum/cmake/cpu_lib.cmake b/examples/reduce/sum/cmake/cpu_lib.cmake new file mode 100644 index 00000000..693f15ac --- /dev/null +++ b/examples/reduce/sum/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/reduce/sum/cmake/npu_lib.cmake b/examples/reduce/sum/cmake/npu_lib.cmake new file mode 100644 index 00000000..8ad136f3 --- /dev/null +++ b/examples/reduce/sum/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/reduce/sum/main.cpp b/examples/reduce/sum/main.cpp new file mode 100644 index 00000000..b1ecc6fc --- /dev/null +++ b/examples/reduce/sum/main.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). 
+ * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#include "./op_host/sum_custom_tiling.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_sum_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void sum_custom(GM_ADDR srcGm, GM_ADDR dstGm, SumCustomTilingData tiling); +#endif + +constexpr uint32_t USED_CORE_NUM = 1; +constexpr uint32_t TILINGDATA_SIZE = 5; +constexpr uint32_t M = 7; // 输入数据的外轴长度 +constexpr uint32_t N = 2023; // 输入数据内轴的实际元素个数 + +extern void GenerateTilingData(uint8_t *tilingBuf, const uint32_t M, const uint32_t N); + +static bool CompareResult(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + printf("wrongNum: %ld\n", wrongNum); + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + uint8_t *tiling = nullptr; + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost; + uint8_t *xDevice, *yDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&tiling), tilingSize)) +#endif + GenerateTilingData(tiling, M, N); + + auto tilingData = reinterpret_cast(tiling); + + size_t inputSize = tilingData->outter * tilingData->inner * sizeof(uint32_t); + size_t outputSize = tilingData->out_inner * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(sum_custom, USED_CORE_NUM, x, y, + *reinterpret_cast(tiling)); // use this macro for cpu debug + + WriteFile("../output/output.bin", y, outputSize); + + bool goldenResult = true; + goldenResult = CompareResult(y, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)tiling); 
+#else + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(sum_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, reinterpret_cast(tiling)); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(yHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(tiling)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.cpp b/examples/reduce/sum/op_host/sum_custom_tiling.cpp new file mode 100644 index 00000000..b0150f5a --- /dev/null +++ b/examples/reduce/sum/op_host/sum_custom_tiling.cpp @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "sum_custom_tiling.h" +#include "tiling/tiling_api.h" +#include + +void GenerateTilingData(uint8_t *tilingBuf, const uint32_t M, const uint32_t N) { + uint32_t minValue{}; + uint32_t maxValue{}; + + AscendC::GetSumMaxMinTmpSize(N, 4, false, maxValue, minValue); + + SumCustomTilingData *tiling = reinterpret_cast(tilingBuf); + + tiling->outter = M; + tiling->inner = (N * sizeof(uint32_t) + 32 - 1) / 32 * 32 / sizeof(uint32_t); + tiling->n = N; + tiling->tmpBufSize = minValue; + + // 输出如果未遵循框架对内存开辟的要求(开辟内存的大小满足31Byte对齐),即outter * + // sizeof(T)不是31Byte对齐时,需要向上进行32Byte对齐 + // https://www.hiascend.com/document/detail/zh/CANNCommunityEdition/81RC1alpha001/API/ascendcopapi/atlasascendc_api_07_0826.html + tiling->out_inner = (M * sizeof(uint32_t) + 32 - 1) / 32 * 32 / sizeof(uint32_t); +} \ No newline at end of file diff --git a/examples/reduce/sum/op_host/sum_custom_tiling.h b/examples/reduce/sum/op_host/sum_custom_tiling.h new file mode 100644 index 00000000..f952161a --- /dev/null +++ b/examples/reduce/sum/op_host/sum_custom_tiling.h @@ -0,0 +1,24 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. 
+ * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_REDUCE_SUM_COSTOM_TILING_H +#define EXAMPLES_REDUCE_SUM_COSTOM_TILING_H + +#include + +struct SumCustomTilingData { + uint32_t inner; // 输入内轴补齐后的长度 + uint32_t outter; // 输入外轴长度 + uint32_t n; // 输入内轴实际长度 + uint32_t tmpBufSize; // 临时空间大小 + uint32_t out_inner; // 输出内轴补齐后的长度 +}; + +#endif // EXAMPLES_REDUCE_SUM_COSTOM_TILING_H \ No newline at end of file diff --git a/examples/reduce/sum/op_kernel/sum_custom.cpp b/examples/reduce/sum/op_kernel/sum_custom.cpp new file mode 100644 index 00000000..ad2999b2 --- /dev/null +++ b/examples/reduce/sum/op_kernel/sum_custom.cpp @@ -0,0 +1,21 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "kernel_operator.h" +#include "sum_custom_impl.h" + +extern "C" __global__ __aicore__ void sum_custom(GM_ADDR srcGm, GM_ADDR dstGm, SumCustomTilingData tiling) { + if ASCEND_IS_AIC { + return; + } + KernelSum op; + op.Init(srcGm, dstGm, tiling); + op.Process(); +} \ No newline at end of file diff --git a/examples/reduce/sum/op_kernel/sum_custom_impl.h b/examples/reduce/sum/op_kernel/sum_custom_impl.h new file mode 100644 index 00000000..8ea2260a --- /dev/null +++ b/examples/reduce/sum/op_kernel/sum_custom_impl.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_REDUCE_SUM_COSTOM_H +#define EXAMPLES_REDUCE_SUM_COSTOM_H + +#include "../op_host/sum_custom_tiling.h" +#include "kernel_operator.h" + +template class KernelSum { +public: + __aicore__ inline KernelSum() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, SumCustomTilingData tilingData) { + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); + inner = tilingData.inner; + outter = tilingData.outter; + n = tilingData.n; + tmpBufSize = tilingData.tmpBufSize; + out_inner = tilingData.out_inner; + + params.inner = inner; + params.outter = outter; + params.n = n; + + xGm.SetGlobalBuffer((__gm__ T *)x); + yGm.SetGlobalBuffer((__gm__ T *)y); + pipe.InitBuffer(inQueue, 1, inner * outter * sizeof(T)); + pipe.InitBuffer(outQueue, 1, out_inner * sizeof(T)); + pipe.InitBuffer(tmpBuf, tmpBufSize * sizeof(uint8_t)); + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() { + AscendC::LocalTensor xLocal = inQueue.AllocTensor(); + AscendC::DataCopy(xLocal, xGm, inner * outter); + inQueue.EnQue(xLocal); + } + __aicore__ inline void Compute() { + AscendC::LocalTensor xLocal = inQueue.DeQue(); + AscendC::LocalTensor yLocal = outQueue.AllocTensor(); + AscendC::LocalTensor sharedTmpBuffer = tmpBuf.AllocTensor(); + + T scalar(0); + AscendC::Duplicate(yLocal, scalar, out_inner); + AscendC::Sum(yLocal, xLocal, sharedTmpBuffer, params); + + outQueue.EnQue(yLocal); + inQueue.FreeTensor(xLocal); + tmpBuf.FreeTensor(sharedTmpBuffer); + } + __aicore__ inline void CopyOut() { + AscendC::LocalTensor yLocal = outQueue.DeQue(); + AscendC::DataCopy(yGm, yLocal, out_inner); + outQueue.FreeTensor(yLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueue; + AscendC::TQue outQueue; + AscendC::TBuf tmpBuf; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + + uint32_t inner = 0; + uint32_t outter = 0; + uint32_t n = 0; + uint32_t tmpBufSize = 0; + uint32_t out_inner = 0; + AscendC::SumParams params; +}; + +#endif \ No newline at end of file diff --git a/examples/reduce/sum/run.sh b/examples/reduce/sum/run.sh new file mode 100644 index 00000000..796c0fd7 --- /dev/null +++ b/examples/reduce/sum/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./sum_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./sum_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./sum_direct_kernel_op +fi \ No newline at end of file diff --git a/examples/reduce/sum/scripts/gen_data.py b/examples/reduce/sum/scripts/gen_data.py new file mode 100644 index 00000000..ea012879 --- /dev/null +++ b/examples/reduce/sum/scripts/gen_data.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 + +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [7, 2024] + input_x = np.random.uniform(-100, 100, input_shape).astype(dtype) + # 对 input_x 进行切片,排除每行最后一个元素(保留前 2023 个) + golden = np.sum(input_x[:, :-1], axis=1).astype(dtype) + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/examples/transpose/confusion_transpose/CMakeLists.txt b/examples/transpose/confusion_transpose/CMakeLists.txt new file mode 100644 index 00000000..ef7fd4a9 --- /dev/null +++ b/examples/transpose/confusion_transpose/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. 
+# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/confusion_transpose_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(confusion_transpose_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/confusion_transpose_custom_tiling.cpp +) + +target_compile_options(confusion_transpose_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(confusion_transpose_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(confusion_transpose_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(confusion_transpose_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS confusion_transpose_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/README.md b/examples/transpose/confusion_transpose/README.md new file mode 100644 index 00000000..7f27f20a --- /dev/null +++ b/examples/transpose/confusion_transpose/README.md @@ -0,0 +1,93 @@ + + +## 概述 + +本样例介绍了调用 ConfusionTranspose 高阶 API 实现 confusionTranspose 算子,并按照核函数直调的方式分别给出了对应的端到端实现,关于 ConfusionTranspose 高阶 API 的具体内容请参考 [Ascend C 算子开发接口](https://hiascend.com/document/redirect/CannCommunityAscendCApi) 中的 " 高阶 API > 变形 > ConfusionTranspose " 章节。 + +- 直调:使用核函数直调 confusionTranspose 自定义算子。 + + 核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧、仿真侧三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas 推理系列产品 AI Core + +## 目录结构 + +| 目录 | 描述 +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| 
[scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 +confusionTranspose 算子,对输入数据进行数据排布及Reshape操作。 + +- 算子规格: + + + + + + + + + + +
+| 算子类型(OpType) | ConfusionTransposeCustom | | | |
+| ------------------ | -------------------------- | --------- | --------- | ------ |
+| 算子输入 | name | shape | data type | format |
+| | x | 1*2*64*32 | half | ND |
+| 算子输出 | y | 1*64*2*32 | half | ND |
+| 核函数名 | confusion_transpose_custom | | | |
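+
+以下为一段简短的 NumPy 示意(仅作原理说明,与 scripts/gen_data.py 的数据构造方式一致),展示输入的 NZ 数据排布与输出 ND 结果之间的关系:
+
+```python
+import numpy as np
+
+B, N, S, hn_div = 1, 2, 64, 32
+# 原始 ND 数据,origin_shape 为 [B, N, S, H/N]
+origin = np.random.uniform(-1000.0, 1000.0, (B, N, S, hn_div)).astype(np.float16)
+
+# ND -> NZ:按 16x16 分形重排,得到 kernel 输入,shape 为 [B, N, H/N/16, S/16, 16, 16]
+nz_input = origin.reshape(B, N, S // 16, 16, hn_div // 16, 16).transpose(0, 1, 4, 2, 3, 5)
+
+# NZ2ND_0213:kernel 输出等价于对原始 ND 数据交换 1、2 轴,shape 为 [B, S, N, H/N]
+golden = origin.transpose(0, 2, 1, 3)
+print(nz_input.shape, golden.shape)               # (1, 2, 2, 4, 16, 16) (1, 64, 2, 32)
+```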
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[1][2][64][32],输出y[1][64][2][32]的confusionTranspose算子,针对NZ2ND场景,实现1、2轴互换。 + +输入Tensor { shape:[B, N, H/N/16, S/16, 16, 16], origin_shape:[B, N, S, H/N], format:"NZ", origin_format:"ND"} +输出Tensor { shape:[B, S, N, H/N], origin_shape:[B, S, N, H/N], format:"ND", origin_format:"ND"} + +- kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用ConfusionTranspose高阶API接口完成confusionTranspose计算,得到最终结果,再搬出到外部存储上。 + + ConfusionTranspose算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行confusionTranspose计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- tiling实现 + + confusionTranspose算子的tiling实现流程如下:使用Ascend C提供 API GetConfusionTransposeTilingInfo,获取所需的Tiling参数,并调用GetConfusionTransposeMaxMinTmpSize接口获取tmpBuf的大小。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 + - Atlas 推理系列产品 AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/cmake/cpu_lib.cmake b/examples/transpose/confusion_transpose/cmake/cpu_lib.cmake new file mode 100644 index 00000000..693f15ac --- /dev/null +++ b/examples/transpose/confusion_transpose/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/cmake/npu_lib.cmake b/examples/transpose/confusion_transpose/cmake/npu_lib.cmake new file mode 100644 index 00000000..8ad136f3 --- /dev/null +++ b/examples/transpose/confusion_transpose/cmake/npu_lib.cmake @@ -0,0 +1,12 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} 
STATIC + ${KERNEL_FILES} +) \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/main.cpp b/examples/transpose/confusion_transpose/main.cpp new file mode 100644 index 00000000..3fcf6b75 --- /dev/null +++ b/examples/transpose/confusion_transpose/main.cpp @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +#include "aclrtlaunch_confusion_transpose_custom.h" +#include "tiling/platform/platform_ascendc.h" +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void confusion_transpose_custom(GM_ADDR x, GM_ADDR y, GM_ADDR tiling); +#endif + +constexpr uint32_t USED_CORE_NUM = 1; +constexpr uint32_t TILINGDATA_SIZE = 22; +constexpr uint32_t B = 1; +constexpr uint32_t N = 2; +constexpr uint32_t S = 64; +constexpr uint32_t hnDiv = 32; + +extern uint8_t *GenerateTiling(uint32_t B, uint32_t N, uint32_t S, uint32_t HNDiv); + +static bool CompareResult(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + printf("wrongNum: %ld\n", wrongNum); + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + size_t inputSize = B * S * N * hnDiv * sizeof(uint16_t); + size_t outputSize = B * S * N * hnDiv * sizeof(uint16_t); + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + + uint8_t *buf = GenerateTiling(B, N, S, hnDiv); + + memcpy_s(tiling, tilingSize, buf, tilingSize); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); + ICPU_RUN_KF(confusion_transpose_custom, USED_CORE_NUM, x, y, tiling); + + WriteFile("../output/output.bin", y, outputSize); + + free(buf); + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void 
*)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost; + uint8_t *xDevice, *yDevice, *tilingDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + + uint8_t *buf = GenerateTiling(B, N, S, hnDiv); + + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingSize, buf, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + ACLRT_LAUNCH_KERNEL(confusion_transpose_custom) + (USED_CORE_NUM, stream, xDevice, yDevice, tilingDevice); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Clean up memory + free(buf); + + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + return 0; +} diff --git a/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp new file mode 100644 index 00000000..64cd8736 --- /dev/null +++ b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.cpp @@ -0,0 +1,24 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#include "confusion_transpose_custom_tiling.h" + +uint8_t *GetTilingBuf(optiling::ConfusionTransposeCustomTilingData *tilingData) { + uint32_t tilingSize = sizeof(optiling::ConfusionTransposeCustomTilingData); + uint8_t *buf = (uint8_t *)malloc(tilingSize); + tilingData->SaveToBuffer(buf, tilingSize); + return buf; +} + +uint8_t *GenerateTiling(uint32_t B, uint32_t N, uint32_t S, uint32_t HNDiv) { + optiling::ConfusionTransposeCustomTilingData tiling; + ComputeTiling(B, N, S, HNDiv, tiling); + return GetTilingBuf(&tiling); +} \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h new file mode 100644 index 00000000..396c7dbc --- /dev/null +++ b/examples/transpose/confusion_transpose/op_host/confusion_transpose_custom_tiling.h @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_TILING_H +#define EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/platform/platform_ascendc.h" +#include "tiling/tiling_api.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(ConfusionTransposeCustomTilingData) + TILING_DATA_FIELD_DEF(uint32_t, B); + TILING_DATA_FIELD_DEF(uint32_t, N); + TILING_DATA_FIELD_DEF(uint32_t, S); + TILING_DATA_FIELD_DEF(uint32_t, HNDiv); + TILING_DATA_FIELD_DEF_STRUCT(ConfusionTransposeTiling, confusionTransposeTilingData); +END_TILING_DATA_DEF; +REGISTER_TILING_DATA_CLASS(ConfusionTransposeCustom, ConfusionTransposeCustomTilingData) +} // namespace optiling + +void ComputeTiling(uint32_t B, uint32_t N, uint32_t S, uint32_t hnDiv, + optiling::ConfusionTransposeCustomTilingData &tiling) { + tiling.set_B(B); + tiling.set_N(N); + tiling.set_S(S); + tiling.set_HNDiv(hnDiv); + + std::vector shapeVec = {B, N, S, hnDiv}; + ge::Shape srcShape(shapeVec); + + uint32_t maxValue = 0; + uint32_t minValue = 0; + AscendC::GetConfusionTransposeMaxMinTmpSize(srcShape, sizeof(uint16_t), 1, maxValue, minValue); + const uint32_t stackBufferSize = minValue; + AscendC::GetConfusionTransposeTilingInfo(srcShape, stackBufferSize, sizeof(uint16_t), 1, tiling.confusionTransposeTilingData); +} + +#endif // EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_CUSTOM_TILING_H diff --git a/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp new file mode 100644 index 00000000..c5f01523 --- /dev/null +++ b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom.cpp @@ -0,0 +1,32 @@ +/** + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. 
You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ +#include "confusion_transpose_custom_impl.h" +#include "kernel_operator.h" + +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling *tiling, GM_ADDR tilingGM) { + uint32_t *ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); + + for (uint32_t i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +extern "C" __global__ __aicore__ void confusion_transpose_custom(GM_ADDR x, GM_ADDR y, GM_ADDR tiling) { + if ASCEND_IS_AIC { + return; + } + MyCustomKernel::KernelConfusionTranspose op; + MyCustomKernel::VecTiling tilingData; + CopyTiling(&tilingData, tiling); + op.Init(x, y, tilingData); + op.Process(); +} \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h new file mode 100644 index 00000000..dc7bdbd8 --- /dev/null +++ b/examples/transpose/confusion_transpose/op_kernel/confusion_transpose_custom_impl.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */ + +#ifndef EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_COSTOM_H +#define EXAMPLES_TRANSPOSE_CONFUSION_TRANSPOSE_COSTOM_H + +#include "kernel_operator.h" + +namespace MyCustomKernel { +struct VecTiling { + uint32_t B; + uint32_t N; + uint32_t S; + uint32_t HNDiv; + ConfusionTransposeTiling confusionTransposeTilingData; +}; + +template class KernelConfusionTranspose { +public: + __aicore__ inline KernelConfusionTranspose() {} + __aicore__ inline void Init(__gm__ uint8_t *srcGm, __gm__ uint8_t *dstGm, const VecTiling &tilingData) { + this->B = tilingData.B; + this->N = tilingData.N; + this->S = tilingData.S; + this->hnDiv = tilingData.HNDiv; + + srcGlobal.SetGlobalBuffer((__gm__ T *)srcGm, B * N * S * hnDiv); + dstGlobal.SetGlobalBuffer((__gm__ T *)dstGm, B * N * S * hnDiv); + pipe.InitBuffer(inQueueSrcVecIn, 1, B * N * S * hnDiv * sizeof(T)); + pipe.InitBuffer(inQueueSrcVecOut, 1, B * N * S * hnDiv * sizeof(T)); + this->tiling = tilingData.confusionTransposeTilingData; + } + __aicore__ inline void Process() { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() { + AscendC::LocalTensor srcLocal = inQueueSrcVecIn.AllocTensor(); + AscendC::DataCopy(srcLocal, srcGlobal, B * N * S * hnDiv); + inQueueSrcVecIn.EnQue(srcLocal); + } + __aicore__ inline void Compute() { + AscendC::LocalTensor srcLocal = inQueueSrcVecIn.DeQue(); + AscendC::LocalTensor dstLocal = inQueueSrcVecOut.AllocTensor(); + AscendC::ConfusionTranspose(dstLocal, srcLocal, AscendC::TransposeType::TRANSPOSE_NZ2ND_0213, this->tiling); + inQueueSrcVecOut.EnQue(dstLocal); + inQueueSrcVecIn.FreeTensor(srcLocal); + } + __aicore__ inline void CopyOut() { + AscendC::LocalTensor dstLocal = inQueueSrcVecOut.DeQue(); + AscendC::DataCopy(dstGlobal, dstLocal, B * N * S * hnDiv); + inQueueSrcVecOut.FreeTensor(dstLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueSrcVecIn; + AscendC::TQue inQueueSrcVecOut; + AscendC::GlobalTensor srcGlobal; + AscendC::GlobalTensor dstGlobal; + uint32_t B = 0; + uint32_t N = 0; + uint32_t S = 0; + uint32_t hnDiv = 0; + ConfusionTransposeTiling tiling; +}; +} // namespace MyCustomKernel +#endif \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/run.sh b/examples/transpose/confusion_transpose/run.sh new file mode 100644 index 00000000..cddce28f --- /dev/null +++ b/examples/transpose/confusion_transpose/run.sh @@ -0,0 +1,61 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./confusion_transpose_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./confusion_transpose_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./confusion_transpose_direct_kernel_op +fi + +cd .. +python3 scripts/verify_data.py output/output.bin output/golden.bin diff --git a/examples/transpose/confusion_transpose/scripts/gen_data.py b/examples/transpose/confusion_transpose/scripts/gen_data.py new file mode 100644 index 00000000..d323b2c6 --- /dev/null +++ b/examples/transpose/confusion_transpose/scripts/gen_data.py @@ -0,0 +1,39 @@ +#!/usr/bin/python3 + +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. 
+# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float16 + + B, N, S, hnDiv = 1, 2, 64, 32 + + origin_shape = (B, N, S, hnDiv) + + origin_data = np.random.uniform(-1000.0, 1000.0, origin_shape).astype(dtype) + + reshape_shape = (B, N, S // 16, 16, hnDiv // 16, 16) + reshaped_data = origin_data.reshape(reshape_shape) + + transposed_data = np.transpose(reshaped_data, (0, 1, 4, 2, 3, 5)) + + golden = np.transpose(origin_data, (0, 2, 1, 3)) + + os.system("mkdir -p ./input") + transposed_data.tofile("./input/input_x.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() \ No newline at end of file diff --git a/examples/transpose/confusion_transpose/scripts/verify_data.py b/examples/transpose/confusion_transpose/scripts/verify_data.py new file mode 100644 index 00000000..da072f53 --- /dev/null +++ b/examples/transpose/confusion_transpose/scripts/verify_data.py @@ -0,0 +1,57 @@ +#!/usr/bin/python3 + +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +import sys +import numpy as np + +# for float16 +relative_tol = 1e-3 +absolute_tol = 1e-5 +error_tol = 1e-3 + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float16).reshape(-1) + golden = np.fromfile(golden, dtype=np.float16).reshape(-1) + + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolrence: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) \ No newline at end of file -- Gitee