From d86687cfe768643ae89636d72ba22399d00d3403 Mon Sep 17 00:00:00 2001 From: shinoda <10712972+zyc20010326@user.noreply.gitee.com> Date: Sun, 11 May 2025 11:33:33 +0800 Subject: [PATCH] feat: add math cos operator example. --- examples/math/cos/CMakeLists.txt | 82 +++++++++ examples/math/cos/README.md | 90 ++++++++++ examples/math/cos/cmake/cpu_lib.cmake | 26 +++ examples/math/cos/cmake/npu_lib.cmake | 19 +++ examples/math/cos/main.cpp | 157 ++++++++++++++++++ .../math/cos/op_host/cos_custom_tiling.cpp | 22 +++ examples/math/cos/op_host/cos_custom_tiling.h | 29 ++++ examples/math/cos/op_kernel/cos_custom.cpp | 41 +++++ examples/math/cos/op_kernel/cos_custom_impl.h | 79 +++++++++ examples/math/cos/run.sh | 58 +++++++ examples/math/cos/scripts/gen_data.py | 31 ++++ examples/readme.md | 5 + 12 files changed, 639 insertions(+) create mode 100644 examples/math/cos/CMakeLists.txt create mode 100644 examples/math/cos/README.md create mode 100644 examples/math/cos/cmake/cpu_lib.cmake create mode 100644 examples/math/cos/cmake/npu_lib.cmake create mode 100644 examples/math/cos/main.cpp create mode 100644 examples/math/cos/op_host/cos_custom_tiling.cpp create mode 100644 examples/math/cos/op_host/cos_custom_tiling.h create mode 100644 examples/math/cos/op_kernel/cos_custom.cpp create mode 100644 examples/math/cos/op_kernel/cos_custom_impl.h create mode 100644 examples/math/cos/run.sh create mode 100644 examples/math/cos/scripts/gen_data.py diff --git a/examples/math/cos/CMakeLists.txt b/examples/math/cos/CMakeLists.txt new file mode 100644 index 00000000..188a5881 --- /dev/null +++ b/examples/math/cos/CMakeLists.txt @@ -0,0 +1,82 @@ +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) +if(${RUN_MODE}) + set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +endif() +if (${SOC_VERSION}) + set(SOC_VERSION "Ascend910" CACHE STRING "system on chip type") +endif() + +set(ASCEND_CANN_PACKAGE_PATH "~/Ascend/ascend-toolkit/latest" CACHE STRING "ASCEND CANN package installation directory") + +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() + +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/op_kernel/cos_custom.cpp +) +set(CUSTOM_ASCEND310P_LIST "Ascend310P1" "Ascend310P3") + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() + +add_executable(cos_direct_kernel_op + ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/op_host/cos_custom_tiling.cpp +) + +target_compile_options(cos_direct_kernel_op PRIVATE + $:-g>> + -O2 + -std=c++17 + -D_GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_definitions(cos_direct_kernel_op PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_include_directories(cos_direct_kernel_op PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR} + $:${ASCEND_CANN_PACKAGE_PATH}/include>> + $:${ASCEND_CANN_PACKAGE_PATH}/runtime/include>> +) + +target_link_libraries(cos_direct_kernel_op PRIVATE + $,$>:host_intf_pub>> + $:tikicpulib::${SOC_VERSION}>> + $:ascendcl>> + $:c_sec>> + ascendc_kernels_${RUN_MODE} + tiling_api + register + platform + ascendalog + dl + graph_base +) + +install(TARGETS cos_direct_kernel_op + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) \ No newline at end of file diff --git a/examples/math/cos/README.md b/examples/math/cos/README.md new file mode 100644 index 00000000..96466bd5 --- /dev/null +++ b/examples/math/cos/README.md @@ -0,0 +1,90 @@ + + +## 概述 + +本样例介绍了调用Cos高阶API实现cos单算子,并按照核函数直调的方式分别给出了对应的端到端实现。 + +- 直调:使用核函数直调cos自定义算子。 + + 核函数的基础调用方式,开发者完成算子核函数的开发和Tiling实现后,即可通过AscendCL运行时接口,完成算子的调用。包含CPU侧、NPU侧、仿真侧三种运行验证方法。 + +## 样例支持的产品型号为: +- Atlas A2训练系列产品/Atlas 800I A2推理产品 +- Atlas 推理系列产品AI Core + +## 目录结构 + +| 目录 | 描述 | +| ------------------------------------------------------------ | ------------------------------------------------------------ | +| [cmake](./cmake) | 编译工程文件 | +| [op_host](./op_host) | 本样例tiling代码实现 | +| [op_kernel](./op_kernel) | 本样例kernel侧代码实现 | +| [scripts](./scripts) | 包含输入数据和真值数据生成脚本文件 | +| CMakeLists.txt | 编译工程文件 | +| main.cpp | 主函数,调用算子的应用程序,含CPU域及NPU域调用 | +| run.sh | 编译执行脚本 | + +## 算子描述 +- 算子功能 +cos单算子,对输入tensor按元素做三角函数余弦运算。 + +- 算子规格: + + + + + + + + + + +
算子类型(OpType)CosCustom
算子输入
nameshapedata typeformat
x8*2048floatND
算子输出
y8*2048floatND
核函数名cos_custom
+ +## 算子实现介绍 + +本样例中实现的是固定shape为输入x[8][2048],输出y[8][2048]的cos算子。 + +- kernel实现 + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用Cos高阶API接口完成cos计算,得到最终结果,再搬出到外部存储上。 + + cos算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor srcGm存储在srcLocal中,Compute任务负责对srcLocal执行cos计算,计算结果存储在dstLocal中,CopyOut任务负责将输出数据从dstLocal搬运至Global Memory上的输出Tensor dstGm。 + +- tiling实现 + + cos算子的tiling实现流程如下:根据输入长度totalLength和所用核数量coreNum确定所需tiling参数。 + +## 编译运行样例 + + - 配置环境变量 + + 这里的\$ASCEND_CANN_PACKAGE_PATH需要替换为CANN开发套件包安装后文件存储路径。例如:/usr/local/Ascend/ascend-toolkit/latest + ``` + export ASCEND_HOME_DIR=$ASCEND_CANN_PACKAGE_PATH + source $ASCEND_HOME_DIR/../set_env.sh + ``` + + - 生成输入和真值 + + 执行如下命令后,当前目录生成input和output目录存放输入数据和真值数据。 + ``` + python3 scripts/gen_data.py + ``` + + - 编译执行 + + ``` + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + 其中脚本参数说明如下: + - RUN_MODE :编译执行方式,可选择CPU调试,NPU仿真,NPU上板,对应参数分别为[cpu / sim / npu]。若需要详细了解NPU仿真相关内容,请参考[《算子开发工具msProf》](https://hiascend.com/document/redirect/CannCommunityToolMsProf)中的“工具使用”章节。 + - SOC_VERSION :昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + - Atlas 推理系列产品AI Core + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + ``` + bash run.sh -r cpu -v Ascendxxxyy + ``` \ No newline at end of file diff --git a/examples/math/cos/cmake/cpu_lib.cmake b/examples/math/cos/cmake/cpu_lib.cmake new file mode 100644 index 00000000..693f15ac --- /dev/null +++ b/examples/math/cos/cmake/cpu_lib.cmake @@ -0,0 +1,26 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED + ${KERNEL_FILES} +) + +target_link_libraries(ascendc_kernels_${RUN_MODE} PRIVATE + tikicpulib::${SOC_VERSION} +) + +target_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> +) + +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE + -g + -O0 + -std=c++17 +) + +install(TARGETS ascendc_kernels_${RUN_MODE} +DESTINATION ${CMAKE_INSTALL_LIBDIR} +) \ No newline at end of file diff --git a/examples/math/cos/cmake/npu_lib.cmake b/examples/math/cos/cmake/npu_lib.cmake new file mode 100644 index 00000000..98413a61 --- /dev/null +++ b/examples/math/cos/cmake/npu_lib.cmake @@ -0,0 +1,19 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +ascendc_library(ascendc_kernels_${RUN_MODE} STATIC + ${KERNEL_FILES} +) + +ascendc_compile_definitions(ascendc_kernels_${RUN_MODE} PRIVATE + $<$>:CUSTOM_ASCEND310P> + -DASCENDC_DUMP + -DHAVE_WORKSPACE + -DHAVE_TILING + ) diff --git a/examples/math/cos/main.cpp b/examples/math/cos/main.cpp new file mode 100644 index 00000000..b07584a8 --- /dev/null +++ b/examples/math/cos/main.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "../../common/data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void cos_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *x, uint8_t *y, uint8_t *workspace, + uint8_t *tiling); +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void cos_custom(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, GM_ADDR tiling); +#endif + +constexpr uint32_t USED_CORE_NUM = 8; +constexpr uint32_t TILINGDATA_SIZE = 2; +constexpr uint32_t TOTAL_LENGTH = 8 * 2048; + +extern void GenerateTilingData(const uint32_t totalLength, const uint32_t coreNum, const uint32_t tilingSize, + uint8_t *tilingBuffer); + +static bool CompareResult(const void *outputData, int64_t outSize) { + void *goldenData; +#ifdef ASCENDC_CPU_DEBUG + goldenData = (uint8_t *)AscendC::GmAlloc(outSize); +#else + CHECK_ACL(aclrtMallocHost((void **)(&goldenData), outSize)); +#endif + size_t goldenSize = outSize; + bool ret = ReadFile("../output/golden.bin", goldenSize, goldenData, goldenSize); + if (ret) { + printf("ReadFile golden.bin success!\n"); + } else { + printf("test failed!\n"); + return false; + } + constexpr float EPS = 1e-4; + int64_t wrongNum = 0; + + for (int i = 0; i < outSize / sizeof(float); i++) { + float a = (reinterpret_cast(outputData))[i]; + float b = (reinterpret_cast(goldenData))[i]; + float ae = std::abs(a - b); + float re = ae / abs(b); + if (ae > EPS && re > EPS) { + printf("CompareResult golden.bin failed output is %lf, golden is %lf\n", a, b); + wrongNum++; + } + } +#ifdef ASCENDC_CPU_DEBUG + AscendC::GmFree((void *)goldenData); +#else + CHECK_ACL(aclrtFreeHost(goldenData)); +#endif + if (wrongNum != 0) { + return false; + } else { + printf("CompareResult golden.bin success!\n"); + return true; + } +} + +int32_t main(int32_t argc, char *argv[]) { + size_t tilingSize = TILINGDATA_SIZE * sizeof(uint32_t); + size_t inputSize = TOTAL_LENGTH * sizeof(uint32_t); + size_t outputSize = inputSize; + +#ifdef ASCENDC_CPU_DEBUG + uint8_t *x = (uint8_t *)AscendC::GmAlloc(inputSize); + uint8_t *y = (uint8_t *)AscendC::GmAlloc(outputSize); + uint8_t *tiling = (uint8_t *)AscendC::GmAlloc(tilingSize); + + ReadFile("../input/input_x.bin", inputSize, x, inputSize); + + GenerateTilingData(TOTAL_LENGTH, USED_CORE_NUM, tilingSize, tiling); + + AscendC::SetKernelMode(KernelMode::AIV_MODE); // run in aiv mode + + ICPU_RUN_KF(cos_custom, USED_CORE_NUM, x, y, 0, tiling); // use this macro for cpu debug + + WriteFile("../output/output.bin", y, outputSize); + + bool goldenResult = true; + goldenResult = CompareResult(y, outputSize); + + AscendC::GmFree((void *)x); + AscendC::GmFree((void *)y); + AscendC::GmFree((void *)tiling); +#else + CHECK_ACL(aclInit(nullptr)); + aclrtContext context; + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + CHECK_ACL(aclrtCreateContext(&context, deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + uint8_t *xHost, *yHost, *tilingHost; + uint8_t *xDevice, *yDevice, *tilingDevice; + + CHECK_ACL(aclrtMallocHost((void **)(&xHost), inputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputSize)); + CHECK_ACL(aclrtMallocHost((void **)(&tilingHost), tilingSize)) + + CHECK_ACL(aclrtMalloc((void **)&xDevice, inputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&yDevice, outputSize, ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMalloc((void **)&tilingDevice, tilingSize, ACL_MEM_MALLOC_HUGE_FIRST)); + + ReadFile("../input/input_x.bin", inputSize, xHost, inputSize); + + GenerateTilingData(TOTAL_LENGTH, USED_CORE_NUM, tilingSize, tilingHost); + + // Copy host memory to device memory + CHECK_ACL(aclrtMemcpy(xDevice, inputSize, xHost, inputSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(tilingDevice, tilingSize, tilingHost, tilingSize, ACL_MEMCPY_HOST_TO_DEVICE)); + + // Execute the kernel + cos_custom_do(USED_CORE_NUM, nullptr, stream, xDevice, yDevice, nullptr, tilingDevice); + + // Wait for the stop event to complete + CHECK_ACL(aclrtSynchronizeStream(stream)); + + // Copy result to host memory and write to output file + CHECK_ACL(aclrtMemcpy(yHost, outputSize, yDevice, outputSize, ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile("../output/output.bin", yHost, outputSize); + + // Compare the result with the golden result + bool goldenResult = true; + goldenResult = CompareResult(yHost, outputSize); + + // Clean up memory + CHECK_ACL(aclrtFree(xDevice)); + CHECK_ACL(aclrtFree(yDevice)); + CHECK_ACL(aclrtFree(tilingDevice)); + CHECK_ACL(aclrtFreeHost(xHost)); + CHECK_ACL(aclrtFreeHost(yHost)); + CHECK_ACL(aclrtFreeHost(tilingHost)); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtDestroyContext(context)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#endif + + if (goldenResult) { + printf("test pass!\n"); + } else { + printf("test failed!\n"); + } + return 0; +} diff --git a/examples/math/cos/op_host/cos_custom_tiling.cpp b/examples/math/cos/op_host/cos_custom_tiling.cpp new file mode 100644 index 00000000..d5a25a87 --- /dev/null +++ b/examples/math/cos/op_host/cos_custom_tiling.cpp @@ -0,0 +1,22 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "cos_custom_tiling.h" +#include "tiling/tiling_api.h" + +void GenerateTilingData(const uint32_t totalLength, const uint32_t coreNum, const uint32_t tilingSize, + uint8_t *tilingBuffer) { + optiling::CosCustomTilingData tiling; + ComputeTiling(totalLength, coreNum, tiling); + + // copy tiling to tilingBuffer + tiling.SaveToBuffer(tilingBuffer, tilingSize); + return; +} \ No newline at end of file diff --git a/examples/math/cos/op_host/cos_custom_tiling.h b/examples/math/cos/op_host/cos_custom_tiling.h new file mode 100644 index 00000000..8f91cd0f --- /dev/null +++ b/examples/math/cos/op_host/cos_custom_tiling.h @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATH_COS_CUSTOM_TILING_H +#define EXAMPLES_MATH_COS_CUSTOM_TILING_H +#include "register/tilingdata_base.h" +#include "tiling/tiling_api.h" + +namespace optiling { +BEGIN_TILING_DATA_DEF(CosCustomTilingData) + TILING_DATA_FIELD_DEF(uint32_t, totalLength); + TILING_DATA_FIELD_DEF(uint32_t, tileNum); +END_TILING_DATA_DEF; +REGISTER_TILING_DATA_CLASS(CosCustom, CosCustomTilingData) +} // namespace optiling + +void ComputeTiling(const uint32_t totalLength, const uint32_t coreNum, optiling::CosCustomTilingData &tiling) { + tiling.set_totalLength(totalLength); + tiling.set_tileNum(coreNum); +} + +#endif // EXAMPLES_MATH_COS_CUSTOM_TILING_H \ No newline at end of file diff --git a/examples/math/cos/op_kernel/cos_custom.cpp b/examples/math/cos/op_kernel/cos_custom.cpp new file mode 100644 index 00000000..22c060d2 --- /dev/null +++ b/examples/math/cos/op_kernel/cos_custom.cpp @@ -0,0 +1,41 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#include "cos_custom_impl.h" +#include "kernel_operator.h" + +__aicore__ inline void CopyTiling(MyCustomKernel::VecTiling *tiling, GM_ADDR tilingGM) { + uint32_t *ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); + + for (uint32_t i = 0; i < sizeof(MyCustomKernel::VecTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +extern "C" __global__ __aicore__ void cos_custom(GM_ADDR x, GM_ADDR y, GM_ADDR workspace, GM_ADDR tiling) { + if ASCEND_IS_AIC { + return; + } + MyCustomKernel::kernelCos op; + MyCustomKernel::VecTiling tilingData; + CopyTiling(&tilingData, tiling); + op.Init(x, y, tilingData); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +// call of kernel function +void cos_custom_do(uint32_t blockDim, void *l2ctrl, void *stream, uint8_t *x, uint8_t *y, uint8_t *workspace, + uint8_t *tiling) { + cos_custom<<>>(x, y, workspace, tiling); +} +#endif \ No newline at end of file diff --git a/examples/math/cos/op_kernel/cos_custom_impl.h b/examples/math/cos/op_kernel/cos_custom_impl.h new file mode 100644 index 00000000..f6669bff --- /dev/null +++ b/examples/math/cos/op_kernel/cos_custom_impl.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2025 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +#ifndef EXAMPLES_MATH_SORT_CUSTOM_H +#define EXAMPLES_MATH_SORT_CUSTOM_H +#include "kernel_operator.h" + +namespace MyCustomKernel { +constexpr int32_t BUFFER_NUM = 2; + +struct VecTiling { + uint32_t totalLength = 0; + uint32_t tileNum = 0; +}; + +template class kernelCos { +public: + __aicore__ inline kernelCos() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, VecTiling tilingData) { + ASSERT(AscendC::GetBlockNum() != 0 && "block dim can not be zero!"); + this->tileNum = tilingData.tileNum; + this->blockLength = tilingData.totalLength / AscendC::GetBlockNum(); + this->tileLength = this->blockLength / tileNum / BUFFER_NUM; + + xGm.SetGlobalBuffer((__gm__ float *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ float *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(float)); + pipe.InitBuffer(outQueueY, BUFFER_NUM, this->tileLength * sizeof(float)); + } + __aicore__ inline void Process() { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + } + __aicore__ inline void Compute(int32_t progress) { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = outQueueY.AllocTensor(); + AscendC::Cos(yLocal, xLocal, this->tileLength); + outQueueY.EnQue(yLocal); + inQueueX.FreeTensor(xLocal); + } + __aicore__ inline void CopyOut(int32_t progress) { + AscendC::LocalTensor yLocal = outQueueY.DeQue(); + AscendC::DataCopy(yGm[progress * this->tileLength], yLocal, this->tileLength); + outQueueY.FreeTensor(yLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX; + AscendC::TQue outQueueY; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + + uint32_t tileNum = 0; + uint32_t blockLength = 0; + uint32_t tileLength = 0; +}; +} // namespace MyCustomKernel + +#endif \ No newline at end of file diff --git a/examples/math/cos/run.sh b/examples/math/cos/run.sh new file mode 100644 index 00000000..2f8c76a4 --- /dev/null +++ b/examples/math/cos/run.sh @@ -0,0 +1,58 @@ +#!/bin/bash + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +SHORT=r:,v:, +LONG=run-mode:,soc-version:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") +eval set -- "$OPTS" +while : +do + case "$1" in + (-r | --run-mode ) + RUN_MODE="$2" + shift 2;; + (-v | --soc-version ) + SOC_VERSION="$2" + shift 2;; + (--) + shift; + break;; + (*) + echo "[ERROR] Unexpected option: $1"; + break;; + esac +done + +rm -rf build +mkdir build +cd build + +# in case of running op in simulator, use stub so instead +if [ "${RUN_MODE}" = "sim" ]; then + export LD_LIBRARY_PATH=$(echo $LD_LIBRARY_PATH | sed 's/\/.*\/runtime\/lib64://g') + export LD_LIBRARY_PATH=$ASCEND_HOME_DIR/runtime/lib64/stub:$LD_LIBRARY_PATH +fi + +source $ASCEND_HOME_DIR/bin/setenv.bash +export LD_LIBRARY_PATH=${ASCEND_HOME_DIR}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH + +cmake -DRUN_MODE=${RUN_MODE} -DSOC_VERSION=${SOC_VERSION} -DASCEND_CANN_PACKAGE_PATH=${ASCEND_HOME_DIR} .. +make -j16 + +if [ "${RUN_MODE}" = "npu" ]; then + ./cos_direct_kernel_op +elif [ "${RUN_MODE}" = "sim" ]; then + export ASCEND_TOOLKIT_HOME=${ASCEND_HOME_DIR} + export ASCEND_HOME_PATH=${ASCEND_HOME_DIR} + msprof op simulator --application=./cos_direct_kernel_op +elif [ "${RUN_MODE}" = "cpu" ]; then + ./cos_direct_kernel_op +fi \ No newline at end of file diff --git a/examples/math/cos/scripts/gen_data.py b/examples/math/cos/scripts/gen_data.py new file mode 100644 index 00000000..8c5886bf --- /dev/null +++ b/examples/math/cos/scripts/gen_data.py @@ -0,0 +1,31 @@ +#!/usr/bin/python3 + +# coding=utf-8 + +# Copyright (c) 2025 Huawei Technologies Co., Ltd. +# This file is a part of the CANN Open Software. +# Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). +# Please refer to the License for details. You may not use this file except in compliance with the License. +# THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, +# INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. +# See LICENSE in the root of the software repository for the full text of the License. +# ====================================================================================================================== + +import os +import numpy as np + +def gen_golden_data_simple(): + dtype = np.float32 + + input_shape = [8, 2048] + # generate value between [-65504, 65504] + input_x = np.random.uniform(-65504, np.nextafter(65504, np.inf), input_shape).astype(dtype) + golden = np.cos(input_x).astype(dtype) + + os.system("mkdir -p ./input") + input_x.tofile("./input/input_x.bin") + os.system("mkdir -p ./output") + golden.tofile("./output/golden.bin") + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/examples/readme.md b/examples/readme.md index d034cbbb..c6cf4332 100644 --- a/examples/readme.md +++ b/examples/readme.md @@ -93,6 +93,11 @@ welford_finalize Welford算法的后处理,一种在线计算均值和方差的方法。 + + math + cos + 对输入tensor做Cos计算。 + pad broadcast -- Gitee