From afda895226cd2f8ad83fa82e4dfefe8cd0817447 Mon Sep 17 00:00:00 2001 From: "zhanghao (AS)" Date: Fri, 22 Aug 2025 16:27:00 +0800 Subject: [PATCH 01/13] add static_tensor_programming testcases --- .../KernelLaunch/CMakeLists.txt | 47 ++++ .../KernelLaunch/README.md | 88 ++++++++ .../KernelLaunch/add_custom_v1.cpp | 85 ++++++++ .../KernelLaunch/add_custom_v2.cpp | 90 ++++++++ .../KernelLaunch/cmake/cpu_lib.cmake | 9 + .../KernelLaunch/cmake/npu_lib.cmake | 11 + .../KernelLaunch/data_utils.h | 203 ++++++++++++++++++ .../KernelLaunch/main.cpp | 127 +++++++++++ .../KernelLaunch/run.sh | 113 ++++++++++ .../KernelLaunch/scripts/gen_data.py | 25 +++ .../KernelLaunch/scripts/verify_result.py | 53 +++++ .../README.md | 69 ++++++ 12 files changed, 920 insertions(+) create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/cpu_lib.cmake create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/npu_lib.cmake create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py create mode 
100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt new file mode 100644 index 000000000..392189fe1 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt @@ -0,0 +1,47 @@ +cmake_minimum_required(VERSION 3.16) +project(Ascend_c) + +set(RUN_MODE "npu" CACHE STRING "cpu/sim/npu") +set(SOC_VERSION "Ascend310P3" CACHE STRING "system on chip type") +set(ASCEND_CANN_PACKAGE_PATH "/usr/local/Ascend/ascend-toolkit/latest" + CACHE STRING "ASCEND CANN package installation directory" +) +if(NOT CMAKE_BUILD_TYPE) + set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "Build type Release/Debug (default Debug)" FORCE) +endif() +if(CMAKE_INSTALL_PREFIX STREQUAL /usr/local) + set(CMAKE_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/out" CACHE STRING "path for install()" FORCE) +endif() + +# ${KERNEL_FILES} are used to compile library, push files written by ascendc in ${KERNEL_FILES}. 
+# ref to cmake/npu.cmake ascendc_library, cmake/cpu.cmake add_library +file(GLOB KERNEL_FILES + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v1.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v2.cpp +) + +if("${RUN_MODE}" STREQUAL "cpu") + include(cmake/cpu_lib.cmake) +elseif("${RUN_MODE}" STREQUAL "sim" OR "${RUN_MODE}" STREQUAL "npu") + include(cmake/npu_lib.cmake) +else() + message("invalid RUN_MODE: ${RUN_MODE}") +endif() +add_executable(ascendc_kernels_bbit ${CMAKE_CURRENT_SOURCE_DIR}/main.cpp) + +target_compile_options(ascendc_kernels_bbit PRIVATE + $:-g>> + -O2 -std=c++17 -D_GLIBCXX_USE_CXX11_ABI=0 -Wall -Werror +) + +target_link_libraries(ascendc_kernels_bbit PRIVATE + $,$>:host_intf_pub>> + $:ascendcl>> + ascendc_kernels_${RUN_MODE} +) + +install(TARGETS ascendc_kernels_bbit + LIBRARY DESTINATION ${CMAKE_INSTALL_LIBDIR} + ARCHIVE DESTINATION ${CMAKE_INSTALL_LIBDIR} + RUNTIME DESTINATION ${CMAKE_INSTALL_BINDIR} +) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md new file mode 100644 index 000000000..f72b521cd --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md @@ -0,0 +1,88 @@ +## 目录结构介绍 + +``` +├── KernelLaunch +│ ├── cmake // 编译工程文件 +│ ├── scripts +│ │ ├── gen_data.py // 输入数据和真值数据生成脚本 +│ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── add_custom_v1.cpp // 算子kernel实现1:未优化前实现 +│ ├── add_custom_v2.cpp // 算子kernel实现2:优化地址分配,消除Bank冲突后的实现 +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 +│ └── run.sh // 编译运行算子的脚本 +``` + +## 代码实现介绍 + +本样例中实现的是固定shape为1*4096的Add算子。 + +- kernel实现 + + Add算子的数学表达式为: + + ``` + z = x + y + ``` + + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global 
Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。 + + 实现1:请参考[add_custom_v1.cpp](./add_custom_v1.cpp),xLocal地址为0,yLocal地址为0x4000,zLocal地址为0x8000。xLocal与yLocal存在读读冲突,xLocal与zLocal存在读写冲突。 + + 实现2:请参考[add_custom_v2.cpp](./add_custom_v2.cpp),为了避免Bank冲突,通过配置InitBuffer时的bufferSize来调整Tensor地址,xLocal地址为0,yLocal地址为0x4100,zLocal地址为0x10000。 +- 调用实现 + + 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; + 2. NPU侧运行验证主要通过使用ACLRT_LAUNCH_KERNEL内核调用宏来完成。 + + 应用程序通过ASCENDC_CPU_DEBUG 宏区分代码逻辑运行于CPU侧还是NPU侧。 + +## 运行样例算子 + +- 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + + ```bash + cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch + ``` +- 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` +- 样例执行 + + ```bash + bash run.sh -r [RUN_MODE] -v [SOC_VERSION] + ``` + + - RUN_MODE:编译方式,可选择CPU调试,NPU仿真,NPU上板。支持参数为[cpu /sim / npu] + - SOC_VERSION:昇腾AI处理器型号,如果无法确定具体的[SOC_VERSION],则在安装昇腾AI处理器的服务器执行npu-smi info命令进行查询,在查询到的“Name”前增加Ascend信息,例如“Name”对应取值为xxxyy,实际配置的[SOC_VERSION]值为Ascendxxxyy。支持以下产品型号: + - Atlas A2训练系列产品/Atlas 800I A2推理产品 + + 示例如下,Ascendxxxyy请替换为实际的AI处理器型号。 + + ```bash + bash run.sh -r cpu -v Ascendxxxyy + ``` + +## 更新说明 + + +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/07/01 | 新增本readme | diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp new file mode 100644 index 
000000000..471821d26 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp @@ -0,0 +1,85 @@ +/** + * @file add_custom_v1.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr uint32_t TOTAL_LENGTH = 4096 * 16; // total length of data +constexpr uint32_t TILE_LENGTH = 4096; +} + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + xGm.SetGlobalBuffer((__gm__ float *)x, TOTAL_LENGTH); + yGm.SetGlobalBuffer((__gm__ float *)y, TOTAL_LENGTH); + zGm.SetGlobalBuffer((__gm__ float *)z, TOTAL_LENGTH); + } + __aicore__ inline void Process() + { + uint32_t loopCount = TOTAL_LENGTH / TILE_LENGTH; + AscendC::LocalTensor xLocal(AscendC::TPosition::VECCALC, xAddr, TILE_LENGTH); + AscendC::LocalTensor yLocal(AscendC::TPosition::VECCALC, yAddr, TILE_LENGTH); + AscendC::LocalTensor zLocal(AscendC::TPosition::VECCALC, zAddr, TILE_LENGTH); + + for (int i = 0; i < loopCount; i++) { + // dependency of PIPE_V & PIPE_MTE2 caused by xLocal/yLocal between 2 sequential loops + if (i != 0) { + AscendC::WaitFlag(EVENT_ID0); + } + DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH); + DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH); + // dependency of PIPE_MTE2 & PIPE_V caused by xLocal/yLocal in one single loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + if (i != 0) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocal between 2 sequential loops + AscendC::WaitFlag(EVENT_ID0); + } + AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); + if (i != (loopCount - 1)) { + // dependency of PIPE_V & PIPE_MTE2 caused by 
xLocal/yLocal between 2 sequential loops + AscendC::SetFlag(EVENT_ID0); + } + // dependency of PIPE_V & PIPE_MTE3 caused by zLocal in one single loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH); + if (i != (loopCount - 1)) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocal between 2 sequential loops + AscendC::SetFlag(EVENT_ID0); + } + } + } + +private: + static constexpr uint32_t xAddr = 0; + static constexpr uint32_t yAddr = TILE_LENGTH * sizeof(float); + static constexpr uint32_t zAddr = TILE_LENGTH * sizeof(float) * 2; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; +}; + +extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, y, z); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +{ + add_custom_v1<<>>(x, y, z); +} +#endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp new file mode 100644 index 000000000..65e7dd7e5 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -0,0 +1,90 @@ +/** + * @file add_custom_v2.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr int32_t TOTAL_LENGTH = 4096; // total length of data +constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue +constexpr int32_t BANKGROUP_SIZE = 1024 * 64; // one bank size is 4KB, with 16 banks +constexpr int32_t ONE_REPEAT_SIZE = 256; // 256 bytes per repeat +} // namespace + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + { + xGm.SetGlobalBuffer((__gm__ float *)x, TOTAL_LENGTH); + yGm.SetGlobalBuffer((__gm__ float *)y, TOTAL_LENGTH); + zGm.SetGlobalBuffer((__gm__ float *)z, TOTAL_LENGTH); + // xLocal size add 256 to avoid rr conflict + pipe.InitBuffer(inQueueX, BUFFER_NUM, TOTAL_LENGTH * sizeof(float) + ONE_REPEAT_SIZE); + // yLocal size adjust to 64KB - xLocal size to avoid rw conflict + pipe.InitBuffer(inQueueY, BUFFER_NUM, BANKGROUP_SIZE - (TOTAL_LENGTH * sizeof(float) + ONE_REPEAT_SIZE)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, TOTAL_LENGTH * sizeof(float)); + } + __aicore__ inline void Process() + { + CopyIn(); + Compute(); + CopyOut(); + } + +private: + __aicore__ inline void CopyIn() + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm, TOTAL_LENGTH); + AscendC::DataCopy(yLocal, yGm, TOTAL_LENGTH); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute() + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, TOTAL_LENGTH); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut() + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm, zLocal, TOTAL_LENGTH); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe 
pipe; + AscendC::TQue inQueueX; + AscendC::TQue inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; +}; + +extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z) +{ + KernelAdd op; + op.Init(x, y, z); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +{ + add_custom_v2<<>>(x, y, z); +} +#endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/cpu_lib.cmake b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/cpu_lib.cmake new file mode 100644 index 000000000..5362c8b5a --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/cpu_lib.cmake @@ -0,0 +1,9 @@ +if(NOT DEFINED ENV{CMAKE_PREFIX_PATH}) + set(CMAKE_PREFIX_PATH ${ASCEND_CANN_PACKAGE_PATH}/tools/tikicpulib/lib/cmake) +endif() +find_package(tikicpulib REQUIRED) + +add_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) +target_link_libraries(ascendc_kernels_${RUN_MODE} PUBLIC tikicpulib::${SOC_VERSION}) +target_compile_options(ascendc_kernels_${RUN_MODE} PRIVATE -g -O0 -std=c++17) +install(TARGETS ascendc_kernels_${RUN_MODE} DESTINATION ${CMAKE_INSTALL_LIBDIR}) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/npu_lib.cmake b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/npu_lib.cmake new file mode 100644 index 000000000..f92b095d1 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/cmake/npu_lib.cmake @@ -0,0 +1,11 @@ +if(EXISTS ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/compiler/tikcpp/ascendc_kernel_cmake) +elseif(EXISTS 
${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) + set(ASCENDC_CMAKE_DIR ${ASCEND_CANN_PACKAGE_PATH}/tools/tikcpp/ascendc_kernel_cmake) +else() + message(FATAL_ERROR "ascendc_kernel_cmake does not exist ,please check whether the cann package is installed") +endif() +include(${ASCENDC_CMAKE_DIR}/ascendc.cmake) + +# ascendc_library use to add kernel file to generate ascendc library +ascendc_library(ascendc_kernels_${RUN_MODE} SHARED ${KERNEL_FILES}) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h new file mode 100644 index 000000000..09d906371 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h @@ -0,0 +1,203 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#include "acl/acl.h" + +typedef enum { + DT_UNDEFINED = -1, + FLOAT = 0, + HALF = 1, + INT8_T = 2, + INT32_T = 3, + UINT8_T = 4, + INT16_T = 6, + UINT16_T = 7, + UINT32_T = 8, + INT64_T = 9, + UINT64_T = 10, + DOUBLE = 11, + BOOL = 12, + STRING = 13, + COMPLEX64 = 16, + COMPLEX128 = 17, + BF16 = 27 +} printDataType; + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) 
fprintf(stdout, "[ERROR] " fmt "\n", ##args) +#define CHECK_ACL(x) \ + do { \ + aclError __ret = x; \ + if (__ret != ACL_ERROR_NONE) { \ + std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ + } \ + } while (0); + +/** + * @brief Read data from file + * @param [in] filePath: file path + * @param [out] fileSize: file size + * @return read result + */ +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. 
path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} + +template void DoPrintData(const T *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << data[i]; + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void DoPrintHalfData(const aclFloat16 *data, size_t count, size_t elementsPerRow) +{ + assert(elementsPerRow != 0); + for (size_t i = 0; i < count; ++i) { + std::cout << std::setw(10) << std::setprecision(6) << aclFloat16ToFloat(data[i]); + if (i % elementsPerRow == elementsPerRow - 1) { + std::cout << std::endl; + } + } +} + +void PrintData(const void *data, size_t count, printDataType dataType, size_t elementsPerRow = 16) +{ + if (data == nullptr) { + ERROR_LOG("Print data failed. data is nullptr"); + return; + } + + switch (dataType) { + case BOOL: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT8_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT16_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT32_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case INT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case UINT64_T: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + case HALF: + DoPrintHalfData(reinterpret_cast(data), count, elementsPerRow); + break; + case FLOAT: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + 
break; + case DOUBLE: + DoPrintData(reinterpret_cast(data), count, elementsPerRow); + break; + default: + ERROR_LOG("Unsupported type: %d", dataType); + } + std::cout << std::endl; +} +#endif // DATA_UTILS_H diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp new file mode 100644 index 000000000..8a65f8fa6 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp @@ -0,0 +1,127 @@ +/** + * @file main.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "data_utils.h" +#ifndef ASCENDC_CPU_DEBUG +#include "acl/acl.h" +extern void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); +extern void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); +using KernelEntry = void(*)(uint32_t, void *, uint8_t *, uint8_t *, uint8_t *); +#else +#include "tikicpulib.h" +extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z); +extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z); +using KernelEntry = void(*)(GM_ADDR, GM_ADDR, GM_ADDR); + +#endif + +struct ArgInfo { + std::string fileName; + size_t length; +}; + +#ifndef ASCENDC_CPU_DEBUG + +void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::vector &inputsInfo, + std::vector &outputsInfo) +{ + std::vector inputHost(inputsInfo.size()); + std::vector inputDevice(inputsInfo.size()); + std::vector outputHost(outputsInfo.size()); + std::vector outputDevice(outputsInfo.size()); + + for (uint32_t i = 0; i < inputsInfo.size(); i++) { + 
CHECK_ACL(aclrtMallocHost((void **)(&inputHost[i]), inputsInfo[i].length)); + CHECK_ACL(aclrtMalloc((void **)(&inputDevice[i]), inputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST)); + ReadFile(inputsInfo[i].fileName, inputsInfo[i].length, inputHost[i], inputsInfo[i].length); + CHECK_ACL(aclrtMemcpy(inputDevice[i], inputsInfo[i].length, inputHost[i], inputsInfo[i].length, + ACL_MEMCPY_HOST_TO_DEVICE)); + } + + for (uint32_t i = 0; i < outputsInfo.size(); i++) { + CHECK_ACL(aclrtMallocHost((void **)(&outputHost[i]), outputsInfo[i].length)); + CHECK_ACL(aclrtMalloc((void **)(&outputDevice[i]), outputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST)); + } + + kernelEntry(blockDim, stream, inputDevice[0], inputDevice[1], outputDevice[0]); + CHECK_ACL(aclrtSynchronizeStream(stream)); + for (uint32_t i = 0; i < outputsInfo.size(); i++) { + CHECK_ACL(aclrtMemcpy(outputHost[i], outputsInfo[i].length, outputDevice[i], outputsInfo[i].length, + ACL_MEMCPY_DEVICE_TO_HOST)); + WriteFile(outputsInfo[i].fileName, outputHost[i], outputsInfo[i].length); + CHECK_ACL(aclrtFree(outputDevice[i])); + CHECK_ACL(aclrtFreeHost(outputHost[i])); + } + + for (uint32_t i = 0; i < inputsInfo.size(); i++) { + CHECK_ACL(aclrtFree(inputDevice[i])); + CHECK_ACL(aclrtFreeHost(inputHost[i])); + } +} + +#else + +#define KernelCall(kernelEntry, blockDim, inputsInfo, outputsInfo) \ + { \ + std::vector input(inputsInfo.size()); \ + std::vector output(outputsInfo.size()); \ + \ + for (uint32_t i = 0; i < inputsInfo.size(); i++) { \ + input[i] = (uint8_t *)AscendC::GmAlloc(inputsInfo[i].length); \ + ReadFile(inputsInfo[i].fileName, inputsInfo[i].length, input[i], inputsInfo[i].length); \ + } \ + \ + for (uint32_t i = 0; i < outputsInfo.size(); i++) { \ + output[i] = (uint8_t *)AscendC::GmAlloc(outputsInfo[i].length); \ + } \ + \ + AscendC::SetKernelMode(KernelMode::AIV_MODE); \ + ICPU_RUN_KF(kernelEntry, blockDim, input[0], input[1], output[0]); \ + for (uint32_t i = 0; i < inputsInfo.size(); i++) { \ + 
AscendC::GmFree((void *)input[i]); \ + } \ + \ + for (uint32_t i = 0; i < outputsInfo.size(); i++) { \ + WriteFile(outputsInfo[i].fileName, output[i], outputsInfo[i].length); \ + AscendC::GmFree((void *)output[i]); \ + } \ + } + +#endif + +int32_t main(int32_t argc, char *argv[]) +{ + uint32_t blockDim = 1; + uint32_t dataLen = 4096; + size_t inputByteSize = dataLen * sizeof(float); + size_t outputByteSize = dataLen * sizeof(float); + + std::vector inputsInfo = {{"./input/input_x.bin", inputByteSize}, {"./input/input_y.bin", inputByteSize}}; + std::vector outputsV1Info = {{"./output/output_z_v1.bin", outputByteSize}}; + std::vector outputsV2Info = {{"./output/output_z_v2.bin", outputByteSize}}; + +#ifndef ASCENDC_CPU_DEBUG + CHECK_ACL(aclInit(nullptr)); + int32_t deviceId = 0; + CHECK_ACL(aclrtSetDevice(deviceId)); + aclrtStream stream = nullptr; + CHECK_ACL(aclrtCreateStream(&stream)); + + KernelCall(add_custom_do_v1, blockDim, stream, inputsInfo, outputsV1Info); + KernelCall(add_custom_do_v2, blockDim, stream, inputsInfo, outputsV2Info); + + CHECK_ACL(aclrtDestroyStream(stream)); + CHECK_ACL(aclrtResetDevice(deviceId)); + CHECK_ACL(aclFinalize()); +#else + KernelCall(add_custom_v1, blockDim, inputsInfo, outputsV1Info); + KernelCall(add_custom_v2, blockDim, inputsInfo, outputsV2Info); +#endif + return 0; +} diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh new file mode 100644 index 000000000..10a011174 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh @@ -0,0 +1,113 @@ +#!/bin/bash +CURRENT_DIR=$( + cd $(dirname ${BASH_SOURCE:-$0}) + pwd +) + +BUILD_TYPE="Debug" +INSTALL_PREFIX="${CURRENT_DIR}/out" + +SHORT=r:,v:,i:,b:,p:, +LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, +OPTS=$(getopt -a --options $SHORT --longoptions $LONG 
-- "$@") +eval set -- "$OPTS" +SOC_VERSION="Ascend310P3" + +while :; do + case "$1" in + -r | --run-mode) + RUN_MODE="$2" + shift 2 + ;; + -v | --soc-version) + SOC_VERSION="$2" + shift 2 + ;; + -i | --install-path) + ASCEND_INSTALL_PATH="$2" + shift 2 + ;; + -b | --build-type) + BUILD_TYPE="$2" + shift 2 + ;; + -p | --install-prefix) + INSTALL_PREFIX="$2" + shift 2 + ;; + --) + shift + break + ;; + *) + echo "[ERROR] Unexpected option: $1" + break + ;; + esac +done + +RUN_MODE_LIST="cpu sim npu" +if [[ " $RUN_MODE_LIST " != *" $RUN_MODE "* ]]; then + echo "ERROR: RUN_MODE error, This sample only support specify cpu, sim or npu!" + exit -1 +fi + +VERSION_LIST="Ascend910B1 Ascend910B2 Ascend910B3 Ascend910B4" +if [[ " $VERSION_LIST " != *" $SOC_VERSION "* ]]; then + echo "ERROR: SOC_VERSION should be in [$VERSION_LIST]" + exit -1 +fi + +if [ -n "$ASCEND_INSTALL_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_INSTALL_PATH +elif [ -n "$ASCEND_HOME_PATH" ]; then + _ASCEND_INSTALL_PATH=$ASCEND_HOME_PATH +else + if [ -d "$HOME/Ascend/ascend-toolkit/latest" ]; then + _ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + else + _ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + fi +fi + +export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} +export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} +echo "Current compile soc version is ${SOC_VERSION}" +source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash +if [ "${RUN_MODE}" = "sim" ]; then + # in case of running op in simulator, use stub .so instead + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +elif [ "${RUN_MODE}" = "cpu" ]; then + export LD_LIBRARY_PATH=${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib:${_ASCEND_INSTALL_PATH}/tools/tikicpulib/lib/${SOC_VERSION}:${_ASCEND_INSTALL_PATH}/tools/simulator/${SOC_VERSION}/lib:$LD_LIBRARY_PATH +fi + +set -e +rm -rf build out +mkdir -p build +cmake -B build \ + -DRUN_MODE=${RUN_MODE} \ + -DSOC_VERSION=${SOC_VERSION} \ + 
-DCMAKE_BUILD_TYPE=${BUILD_TYPE} \ + -DCMAKE_INSTALL_PREFIX=${INSTALL_PREFIX} \ + -DASCEND_CANN_PACKAGE_PATH=${_ASCEND_INSTALL_PATH} +cmake --build build -j +cmake --install build + +rm -f ascendc_kernels_bbit +cp ./out/bin/ascendc_kernels_bbit ./ +rm -rf input output +mkdir -p input output +python3 scripts/gen_data.py +( + export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --launch-count=2 --output=./prof ./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --launch-count=2 --output=./prof ./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi +) +md5sum output/*.bin +python3 scripts/verify_result.py output/output_z_v1.bin output/golden.bin +python3 scripts/verify_result.py output/output_z_v2.bin output/golden.bin diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py new file mode 100644 index 000000000..86bbba89d --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py @@ -0,0 +1,25 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import numpy as np + + +def gen_golden_data_simple(): + input_x = np.random.uniform(1, 100, [1, 4096]).astype(np.float32) + input_y = np.random.uniform(1, 100, [1, 4096]).astype(np.float32) + golden = (input_x + input_y).astype(np.float32) + + input_x.tofile("./input/input_x.bin") + input_y.tofile("./input/input_y.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data_simple() diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py new file mode 100644 index 000000000..6a38a3b2b --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py @@ -0,0 +1,53 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-4 +absolute_tol = 1e-5 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md new file mode 100644 index 000000000..4ea8f29b8 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md @@ -0,0 +1,69 @@ +## 概述 + +本样例介绍基于静态Tensor方式编程的场景下Add算子的实现方法,并提供核函数直调方法。 + +## 目录结构介绍 + +``` +├── 23_static_tensor_programming_kernel_launch // 使用核函数直调的方式调用Add自定义算子 +│ └── KernelLaunch // Kernel Launch方式调用核函数样例 +``` + +## 算子描述 + +算子实现的是固定shape为1×4096的Add算子。 + +Add的计算公式为: + +```python +z = x + y +``` + +- x:输入,形状为\[1, 4096],数据类型为float; +- 
y:输入,形状为\[1, 4096],数据类型为float; +- z:输出,形状为\[1, 4096],数据类型为float; + +## 算子规格描述 + + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x1 * 4096floatND
y1 * 4096floatND
算子输出z1 * 4096floatND
核函数名add_custom_v1 / add_custom_v2
+ +## 支持的产品型号 + +本样例支持如下产品型号: + +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + +## 编译运行样例算子 + +针对自定义算子工程,编译运行包含如下步骤: + +- 编译自定义算子工程; +- 调用执行自定义算子; + +详细操作如下所示。 + +### 1. 获取源码包 + +编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包。 + +### 2. 编译运行样例工程 + +- [KernelLaunch样例运行](./KernelLaunch/README.md) + +## 更新说明 + + +| 时间 | 更新事项 | +| ---------- | ---------------- | +| 2025/07/01 | 新增直调方式样例 | -- Gitee From dfa886a322e104e497c882f8b1ebd96cc5614488 Mon Sep 17 00:00:00 2001 From: "zhanghao (AS)" Date: Tue, 26 Aug 2025 14:03:57 +0800 Subject: [PATCH 02/13] add cases for low level programming --- .../KernelLaunch/CMakeLists.txt | 1 + .../KernelLaunch/add_custom_tiling.h | 17 ++ .../KernelLaunch/add_custom_v1.cpp | 28 ++-- .../KernelLaunch/add_custom_v2.cpp | 156 ++++++++++++------ .../KernelLaunch/add_custom_v3.cpp | 93 +++++++++++ .../KernelLaunch/main.cpp | 45 +++-- .../KernelLaunch/run.sh | 4 +- 7 files changed, 264 insertions(+), 80 deletions(-) create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt index 392189fe1..ab71257d0 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt @@ -18,6 +18,7 @@ endif() file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v2.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v3.cpp ) if("${RUN_MODE}" STREQUAL "cpu") diff --git 
a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h new file mode 100644 index 000000000..7c6d310b7 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h @@ -0,0 +1,17 @@ +/** + * @file add_custom_tiling.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADD_CUSTOM_TILING_H +#define ADD_CUSTOM_TILING_H +#include + +struct AddCustomTilingData { + uint32_t totalLength; +}; +#endif \ No newline at end of file diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp index 471821d26..bb6906014 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp @@ -7,37 +7,38 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
*/ +#include "add_custom_tiling.h" #include "kernel_operator.h" using AscendC::TPosition; namespace { -constexpr uint32_t TOTAL_LENGTH = 4096 * 16; // total length of data constexpr uint32_t TILE_LENGTH = 4096; } class KernelAdd { public: __aicore__ inline KernelAdd() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) { - xGm.SetGlobalBuffer((__gm__ float *)x, TOTAL_LENGTH); - yGm.SetGlobalBuffer((__gm__ float *)y, TOTAL_LENGTH); - zGm.SetGlobalBuffer((__gm__ float *)z, TOTAL_LENGTH); + xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); + yGm.SetGlobalBuffer((__gm__ float *)y, totalLength); + zGm.SetGlobalBuffer((__gm__ float *)z, totalLength); + loopCount = totalLength / TILE_LENGTH; } __aicore__ inline void Process() { - uint32_t loopCount = TOTAL_LENGTH / TILE_LENGTH; AscendC::LocalTensor xLocal(AscendC::TPosition::VECCALC, xAddr, TILE_LENGTH); AscendC::LocalTensor yLocal(AscendC::TPosition::VECCALC, yAddr, TILE_LENGTH); AscendC::LocalTensor zLocal(AscendC::TPosition::VECCALC, zAddr, TILE_LENGTH); + // one buffer for (int i = 0; i < loopCount; i++) { // dependency of PIPE_V & PIPE_MTE2 caused by xLocal/yLocal between 2 sequential loops if (i != 0) { AscendC::WaitFlag(EVENT_ID0); } - DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH); - DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH); // dependency of PIPE_MTE2 & PIPE_V caused by xLocal/yLocal in one single loop AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); @@ -53,7 +54,7 @@ public: // dependency of PIPE_V & PIPE_MTE3 caused by zLocal in one single loop AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH); + AscendC::DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH); if (i != (loopCount - 1)) { // dependency 
of PIPE_MTE3 & PIPE_V caused by zLocal between 2 sequential loops AscendC::SetFlag(EVENT_ID0); @@ -68,18 +69,19 @@ private: AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; AscendC::GlobalTensor zGm; + uint32_t loopCount; }; -extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z) +extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { KernelAdd op; - op.Init(x, y, z); + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); op.Process(); } #ifndef ASCENDC_CPU_DEBUG -void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling) { - add_custom_v1<<>>(x, y, z); + add_custom_v1<<>>(x, y, z, tiling); } #endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp index 65e7dd7e5..1f6f4df17 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -7,84 +7,138 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
*/ +#include "add_custom_tiling.h" #include "kernel_operator.h" using AscendC::TPosition; namespace { -constexpr int32_t TOTAL_LENGTH = 4096; // total length of data -constexpr int32_t BUFFER_NUM = 1; // tensor num for each queue -constexpr int32_t BANKGROUP_SIZE = 1024 * 64; // one bank size is 4KB, with 16 banks -constexpr int32_t ONE_REPEAT_SIZE = 256; // 256 bytes per repeat -} // namespace +constexpr uint32_t TILE_LENGTH = 4096; +} class KernelAdd { public: __aicore__ inline KernelAdd() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) { - xGm.SetGlobalBuffer((__gm__ float *)x, TOTAL_LENGTH); - yGm.SetGlobalBuffer((__gm__ float *)y, TOTAL_LENGTH); - zGm.SetGlobalBuffer((__gm__ float *)z, TOTAL_LENGTH); - // xLocal size add 256 to avoid rr conflict - pipe.InitBuffer(inQueueX, BUFFER_NUM, TOTAL_LENGTH * sizeof(float) + ONE_REPEAT_SIZE); - // yLocal size adjust to 64KB - xLocal size to avoid rw conflict - pipe.InitBuffer(inQueueY, BUFFER_NUM, BANKGROUP_SIZE - (TOTAL_LENGTH * sizeof(float) + ONE_REPEAT_SIZE)); - pipe.InitBuffer(outQueueZ, BUFFER_NUM, TOTAL_LENGTH * sizeof(float)); + xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); + yGm.SetGlobalBuffer((__gm__ float *)y, totalLength); + zGm.SetGlobalBuffer((__gm__ float *)z, totalLength); + loopCount = totalLength / TILE_LENGTH; } __aicore__ inline void Process() { - CopyIn(); - Compute(); - CopyOut(); - } + // ping + AscendC::LocalTensor xLocalPing(AscendC::TPosition::VECCALC, xAddrPing, TILE_LENGTH); + AscendC::LocalTensor yLocalPing(AscendC::TPosition::VECCALC, yAddrPing, TILE_LENGTH); + AscendC::LocalTensor zLocalPing(AscendC::TPosition::VECCALC, zAddrPing, TILE_LENGTH); + // pong + AscendC::LocalTensor xLocalPong(AscendC::TPosition::VECCALC, xAddrPong, TILE_LENGTH); + AscendC::LocalTensor yLocalPong(AscendC::TPosition::VECCALC, yAddrPong, TILE_LENGTH); + AscendC::LocalTensor 
zLocalPong(AscendC::TPosition::VECCALC, zAddrPong, TILE_LENGTH); -private: - __aicore__ inline void CopyIn() - { - AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); - AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); - AscendC::DataCopy(xLocal, xGm, TOTAL_LENGTH); - AscendC::DataCopy(yLocal, yGm, TOTAL_LENGTH); - inQueueX.EnQue(xLocal); - inQueueY.EnQue(yLocal); - } - __aicore__ inline void Compute() - { - AscendC::LocalTensor xLocal = inQueueX.DeQue(); - AscendC::LocalTensor yLocal = inQueueY.DeQue(); - AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); - AscendC::Add(zLocal, xLocal, yLocal, TOTAL_LENGTH); - outQueueZ.EnQue(zLocal); - inQueueX.FreeTensor(xLocal); - inQueueY.FreeTensor(yLocal); - } - __aicore__ inline void CopyOut() - { - AscendC::LocalTensor zLocal = outQueueZ.DeQue(); - AscendC::DataCopy(zGm, zLocal, TOTAL_LENGTH); - outQueueZ.FreeTensor(zLocal); + // double buffer + for (int i = 0; i < loopCount / 2; i++) { + // ping part + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing between 2 sequential loops + if (i != 0) { + AscendC::WaitFlag(EVENT_ID0); + } + AscendC::DataCopy(xLocalPing, xGm[2 * i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocalPing, yGm[2 * i * TILE_LENGTH], TILE_LENGTH); + // dependency of PIPE_MTE2 & PIPE_V caused by xLocalPing/yLocalPing in one single loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + if (i != 0) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing between 2 sequential loops + AscendC::WaitFlag(EVENT_ID0); + } + AscendC::Add(zLocalPing, xLocalPing, yLocalPing, TILE_LENGTH); + if (i != (loopCount - 1)) { + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing between 2 sequential loops + AscendC::SetFlag(EVENT_ID0); + } + // dependency of PIPE_V & PIPE_MTE3 caused by zLocalPing in one single loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::DataCopy(zGm[2 * i * TILE_LENGTH], zLocalPing, TILE_LENGTH); + if (i 
!= (loopCount - 1)) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing between 2 sequential loops + AscendC::SetFlag(EVENT_ID0); + } + + // pong part + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPong/yLocalPong between 2 sequential loops + if (i != 0) { + AscendC::WaitFlag(EVENT_ID1); + } + AscendC::DataCopy(xLocalPong, xGm[(2 * i + 1) * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocalPong, yGm[(2 * i + 1) * TILE_LENGTH], TILE_LENGTH); + // dependency of PIPE_MTE2 & PIPE_V caused by xLocalPong/yLocalPong in one single loop + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + if (i != 0) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPong between 2 sequential loops + AscendC::WaitFlag(EVENT_ID1); + } + AscendC::Add(zLocalPong, xLocalPong, yLocalPong, TILE_LENGTH); + if (i != (loopCount - 1)) { + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPong/yLocalPong between 2 sequential loops + AscendC::SetFlag(EVENT_ID1); + } + // dependency of PIPE_V & PIPE_MTE3 caused by zLocalPong in one single loop + AscendC::SetFlag(EVENT_ID1); + AscendC::WaitFlag(EVENT_ID1); + AscendC::DataCopy(zGm[(2 * i + 1) * TILE_LENGTH], zLocalPong, TILE_LENGTH); + if (i != (loopCount - 1)) { + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPong between 2 sequential loops + AscendC::SetFlag(EVENT_ID1); + } + } + + // tail block + if (loopCount % 2 != 0) { + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing with the front for loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::DataCopy(xLocalPing, xGm[(loopCount - 1) * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocalPing, yGm[(loopCount - 1) * TILE_LENGTH], TILE_LENGTH); + // dependency of PIPE_MTE2 & PIPE_V caused by xLocalPing/yLocalPing in one loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing with the front for loop + AscendC::SetFlag(EVENT_ID0); + 
AscendC::WaitFlag(EVENT_ID0); + AscendC::Add(zLocalPing, xLocalPing, yLocalPing, TILE_LENGTH); + // dependency of PIPE_V & PIPE_MTE3 caused by zLocalPing in one loop + AscendC::SetFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID0); + AscendC::DataCopy(zGm[(loopCount - 1) * TILE_LENGTH], zLocalPing, TILE_LENGTH); + } } private: - AscendC::TPipe pipe; - AscendC::TQue inQueueX; - AscendC::TQue inQueueY; - AscendC::TQue outQueueZ; + static constexpr uint32_t xAddrPing = 0; + static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float); + static constexpr uint32_t zAddrPing = TILE_LENGTH * sizeof(float) * 2; + static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 3; + static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 4; + static constexpr uint32_t zAddrPong = TILE_LENGTH * sizeof(float) * 5; AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; AscendC::GlobalTensor zGm; + uint32_t loopCount; }; -extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z) +extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { KernelAdd op; - op.Init(x, y, z); + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); op.Process(); } #ifndef ASCENDC_CPU_DEBUG -void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z) +void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling) { - add_custom_v2<<>>(x, y, z); + add_custom_v2<<>>(x, y, z, tiling); } #endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp new file mode 100644 index 000000000..f28463396 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp @@ -0,0 +1,93 @@ +/** + * @file 
add_custom_v3.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "add_custom_tiling.h" +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr uint32_t TILE_LENGTH = 4096; +} + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) + { + xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); + yGm.SetGlobalBuffer((__gm__ float *)y, totalLength); + zGm.SetGlobalBuffer((__gm__ float *)z, totalLength); + loopCount = totalLength / TILE_LENGTH; + } + + __aicore__ inline void Process() + { + // ping + AscendC::LocalTensor xLocalPing(AscendC::TPosition::VECCALC, xAddrPing, TILE_LENGTH); + AscendC::LocalTensor yLocalPing(AscendC::TPosition::VECCALC, yAddrPing, TILE_LENGTH); + AscendC::LocalTensor zLocalPing(AscendC::TPosition::VECCALC, zAddrPing, TILE_LENGTH); + // pong + AscendC::LocalTensor xLocalPong(AscendC::TPosition::VECCALC, xAddrPong, TILE_LENGTH); + AscendC::LocalTensor yLocalPong(AscendC::TPosition::VECCALC, yAddrPong, TILE_LENGTH); + AscendC::LocalTensor zLocalPong(AscendC::TPosition::VECCALC, zAddrPong, TILE_LENGTH); + + // double buffer + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int i = 0; i < loopCount; i++) { + int32_t eventID = (i % 2 == 0 ? EVENT_ID0 : EVENT_ID1); + AscendC::LocalTensor &xLocal = (i % 2 == 0 ? xLocalPing : xLocalPong); + AscendC::LocalTensor &yLocal = (i % 2 == 0 ? yLocalPing : yLocalPong); + AscendC::LocalTensor &zLocal = (i % 2 == 0 ? 
zLocalPing : zLocalPong); + // dependency of PIPE_MTE3 & PIPE_MTE2 caused by xLocal/yLocal between 2 sequential loops + AscendC::WaitFlag(eventID); + AscendC::DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH); + + // dependency of PIPE_MTE2 & PIPE_V caused by xLocal/yLocal in one single loop + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // dependency of PIPE_V & PIPE_MTE3 caused by zLocal in one single loop + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + AscendC::DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH); + // dependency of PIPE_MTE3 & PIPE_MTE2 caused by zLocal between 2 sequential loops + AscendC::SetFlag(eventID); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + } + +private: + static constexpr uint32_t xAddrPing = 0; + static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float); + static constexpr uint32_t zAddrPing = TILE_LENGTH * sizeof(float) * 2; + static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 3; + static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 4; + static constexpr uint32_t zAddrPong = TILE_LENGTH * sizeof(float) * 5; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t loopCount; +}; + +extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) +{ + KernelAdd op; + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void add_custom_do_v3(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling) +{ + add_custom_v3<<>>(x, y, z, tiling); +} +#endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp 
index 8a65f8fa6..287c33418 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp @@ -7,17 +7,20 @@ * but WITHOUT ANY WARRANTY; without even the implied warranty of * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. */ +#include "add_custom_tiling.h" #include "data_utils.h" #ifndef ASCENDC_CPU_DEBUG #include "acl/acl.h" -extern void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); -extern void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z); -using KernelEntry = void(*)(uint32_t, void *, uint8_t *, uint8_t *, uint8_t *); +extern void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); +extern void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); +extern void add_custom_do_v3(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); +using KernelEntry = void (*)(uint32_t, void *, uint8_t *, uint8_t *, uint8_t *, uint8_t *); #else #include "tikicpulib.h" -extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z); -extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z); -using KernelEntry = void(*)(GM_ADDR, GM_ADDR, GM_ADDR); +extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); +extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); +extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); +using KernelEntry = void (*)(GM_ADDR, GM_ADDR, GM_ADDR, GM_ADDR); #endif @@ -29,12 +32,17 @@ struct ArgInfo { #ifndef ASCENDC_CPU_DEBUG void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, 
std::vector &inputsInfo, - std::vector &outputsInfo) + std::vector &outputsInfo, uint8_t *tiling) { std::vector inputHost(inputsInfo.size()); std::vector inputDevice(inputsInfo.size()); std::vector outputHost(outputsInfo.size()); std::vector outputDevice(outputsInfo.size()); + uint8_t *tilingDevice; + + CHECK_ACL(aclrtMalloc((void **)(&tilingDevice), sizeof(AddCustomTilingData), ACL_MEM_MALLOC_HUGE_FIRST)); + CHECK_ACL(aclrtMemcpy(tilingDevice, sizeof(AddCustomTilingData), tiling, sizeof(AddCustomTilingData), + ACL_MEMCPY_HOST_TO_DEVICE)); for (uint32_t i = 0; i < inputsInfo.size(); i++) { CHECK_ACL(aclrtMallocHost((void **)(&inputHost[i]), inputsInfo[i].length)); @@ -49,8 +57,10 @@ void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::v CHECK_ACL(aclrtMalloc((void **)(&outputDevice[i]), outputsInfo[i].length, ACL_MEM_MALLOC_HUGE_FIRST)); } - kernelEntry(blockDim, stream, inputDevice[0], inputDevice[1], outputDevice[0]); + kernelEntry(blockDim, stream, inputDevice[0], inputDevice[1], outputDevice[0], tilingDevice); CHECK_ACL(aclrtSynchronizeStream(stream)); + + CHECK_ACL(aclrtFree(tilingDevice)); for (uint32_t i = 0; i < outputsInfo.size(); i++) { CHECK_ACL(aclrtMemcpy(outputHost[i], outputsInfo[i].length, outputDevice[i], outputsInfo[i].length, ACL_MEMCPY_DEVICE_TO_HOST)); @@ -67,7 +77,7 @@ void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::v #else -#define KernelCall(kernelEntry, blockDim, inputsInfo, outputsInfo) \ +#define KernelCall(kernelEntry, blockDim, inputsInfo, outputsInfo, tiling) \ { \ std::vector input(inputsInfo.size()); \ std::vector output(outputsInfo.size()); \ @@ -82,7 +92,7 @@ void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::v } \ \ AscendC::SetKernelMode(KernelMode::AIV_MODE); \ - ICPU_RUN_KF(kernelEntry, blockDim, input[0], input[1], output[0]); \ + ICPU_RUN_KF(kernelEntry, blockDim, input[0], input[1], output[0], tiling); \ for (uint32_t i = 0; i < 
inputsInfo.size(); i++) { \ AscendC::GmFree((void *)input[i]); \ } \ @@ -98,13 +108,16 @@ void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::v int32_t main(int32_t argc, char *argv[]) { uint32_t blockDim = 1; - uint32_t dataLen = 4096; + uint32_t dataLen = 4096 * 3; size_t inputByteSize = dataLen * sizeof(float); size_t outputByteSize = dataLen * sizeof(float); + AddCustomTilingData tiling; + tiling.totalLength = dataLen; std::vector inputsInfo = {{"./input/input_x.bin", inputByteSize}, {"./input/input_y.bin", inputByteSize}}; std::vector outputsV1Info = {{"./output/output_z_v1.bin", outputByteSize}}; std::vector outputsV2Info = {{"./output/output_z_v2.bin", outputByteSize}}; + std::vector outputsV3Info = {{"./output/output_z_v3.bin", outputByteSize}}; #ifndef ASCENDC_CPU_DEBUG CHECK_ACL(aclInit(nullptr)); @@ -113,15 +126,17 @@ int32_t main(int32_t argc, char *argv[]) aclrtStream stream = nullptr; CHECK_ACL(aclrtCreateStream(&stream)); - KernelCall(add_custom_do_v1, blockDim, stream, inputsInfo, outputsV1Info); - KernelCall(add_custom_do_v2, blockDim, stream, inputsInfo, outputsV2Info); + KernelCall(add_custom_do_v1, blockDim, stream, inputsInfo, outputsV1Info, (uint8_t *)&tiling); + KernelCall(add_custom_do_v2, blockDim, stream, inputsInfo, outputsV2Info, (uint8_t *)&tiling); + KernelCall(add_custom_do_v3, blockDim, stream, inputsInfo, outputsV3Info, (uint8_t *)&tiling); CHECK_ACL(aclrtDestroyStream(stream)); CHECK_ACL(aclrtResetDevice(deviceId)); CHECK_ACL(aclFinalize()); #else - KernelCall(add_custom_v1, blockDim, inputsInfo, outputsV1Info); - KernelCall(add_custom_v2, blockDim, inputsInfo, outputsV2Info); + KernelCall(add_custom_v1, blockDim, inputsInfo, outputsV1Info, (uint8_t *)&tiling); + KernelCall(add_custom_v2, blockDim, inputsInfo, outputsV2Info, (uint8_t *)&tiling); + KernelCall(add_custom_v3, blockDim, inputsInfo, outputsV3Info, (uint8_t *)&tiling); #endif return 0; } diff --git 
a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh index 10a011174..eb5f327ea 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh @@ -101,7 +101,8 @@ python3 scripts/gen_data.py ( export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH if [ "${RUN_MODE}" = "npu" ]; then - msprof op --launch-count=2 --output=./prof ./ascendc_kernels_bbit + # msprof op --launch-count=3 --output=./prof ./ascendc_kernels_bbit + ./ascendc_kernels_bbit elif [ "${RUN_MODE}" = "sim" ]; then msprof op simulator --launch-count=2 --output=./prof ./ascendc_kernels_bbit elif [ "${RUN_MODE}" = "cpu" ]; then @@ -111,3 +112,4 @@ python3 scripts/gen_data.py md5sum output/*.bin python3 scripts/verify_result.py output/output_z_v1.bin output/golden.bin python3 scripts/verify_result.py output/output_z_v2.bin output/golden.bin +python3 scripts/verify_result.py output/output_z_v3.bin output/golden.bin -- Gitee From a25c4cc9926336476c6bf58de37a7525ac276033 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Thu, 4 Sep 2025 11:18:03 +0800 Subject: [PATCH 03/13] fix code --- .../KernelLaunch/add_custom_v1.cpp | 6 +++--- .../KernelLaunch/add_custom_v2.cpp | 6 +++--- .../KernelLaunch/add_custom_v3.cpp | 6 +++--- 3 files changed, 9 insertions(+), 9 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp index bb6906014..77dbc459e 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp +++ 
b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp @@ -15,9 +15,9 @@ namespace { constexpr uint32_t TILE_LENGTH = 4096; } -class KernelAdd { +class KernelAddV1 { public: - __aicore__ inline KernelAdd() {} + __aicore__ inline KernelAddV1() {} __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) { xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); @@ -74,7 +74,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { - KernelAdd op; + KernelAddV1 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); op.Process(); } diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp index 1f6f4df17..c60601cc9 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -15,9 +15,9 @@ namespace { constexpr uint32_t TILE_LENGTH = 4096; } -class KernelAdd { +class KernelAddV2 { public: - __aicore__ inline KernelAdd() {} + __aicore__ inline KernelAddV2() {} __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) { xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); @@ -131,7 +131,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { - KernelAdd op; + KernelAddV2 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); op.Process(); } diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp index 
f28463396..4a51f78dd 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp @@ -15,9 +15,9 @@ namespace { constexpr uint32_t TILE_LENGTH = 4096; } -class KernelAdd { +class KernelAddV3 { public: - __aicore__ inline KernelAdd() {} + __aicore__ inline KernelAddV3() {} __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) { xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); @@ -80,7 +80,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { - KernelAdd op; + KernelAddV3 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); op.Process(); } -- Gitee From a868a6a06d024d06dfce83f197b7f8733b6f7eba Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Thu, 4 Sep 2025 11:47:18 +0800 Subject: [PATCH 04/13] fix loop event --- .../KernelLaunch/add_custom_v2.cpp | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp index c60601cc9..831a873dd 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -53,7 +53,7 @@ public: AscendC::WaitFlag(EVENT_ID0); } AscendC::Add(zLocalPing, xLocalPing, yLocalPing, TILE_LENGTH); - if (i != (loopCount - 1)) { + if (i != (loopCount / 2 - 1)) { // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing between 2 sequential loops AscendC::SetFlag(EVENT_ID0); } @@ -61,7 +61,7 @@ public: AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); 
AscendC::DataCopy(zGm[2 * i * TILE_LENGTH], zLocalPing, TILE_LENGTH); - if (i != (loopCount - 1)) { + if (i != (loopCount / 2 - 1)) { // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing between 2 sequential loops AscendC::SetFlag(EVENT_ID0); } @@ -81,7 +81,7 @@ public: AscendC::WaitFlag(EVENT_ID1); } AscendC::Add(zLocalPong, xLocalPong, yLocalPong, TILE_LENGTH); - if (i != (loopCount - 1)) { + if (i != (loopCount / 2 - 1)) { // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPong/yLocalPong between 2 sequential loops AscendC::SetFlag(EVENT_ID1); } @@ -89,7 +89,7 @@ public: AscendC::SetFlag(EVENT_ID1); AscendC::WaitFlag(EVENT_ID1); AscendC::DataCopy(zGm[(2 * i + 1) * TILE_LENGTH], zLocalPong, TILE_LENGTH); - if (i != (loopCount - 1)) { + if (i != (loopCount / 2 - 1)) { // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPong between 2 sequential loops AscendC::SetFlag(EVENT_ID1); } @@ -97,7 +97,7 @@ public: // tail block if (loopCount % 2 != 0) { - // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing with the front for loop + // dependency of PIPE_V & PIPE_MTE2 caused by xLocalPing/yLocalPing with the previous for loop AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); AscendC::DataCopy(xLocalPing, xGm[(loopCount - 1) * TILE_LENGTH], TILE_LENGTH); @@ -105,7 +105,7 @@ public: // dependency of PIPE_MTE2 & PIPE_V caused by xLocalPing/yLocalPing in one loop AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); - // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing with the front for loop + // dependency of PIPE_MTE3 & PIPE_V caused by zLocalPing with the previous for loop AscendC::SetFlag(EVENT_ID0); AscendC::WaitFlag(EVENT_ID0); AscendC::Add(zLocalPing, xLocalPing, yLocalPing, TILE_LENGTH); -- Gitee From 77e1ed5aee8a6d75866e1f2c1caa3df666f37df6 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 10:54:02 +0800 Subject: [PATCH 05/13] add multi cores --- .../KernelLaunch/add_custom_tiling.h | 2 +- 
.../KernelLaunch/add_custom_v1.cpp | 12 ++++++------ .../KernelLaunch/add_custom_v2.cpp | 12 ++++++------ .../KernelLaunch/add_custom_v3.cpp | 12 ++++++------ .../KernelLaunch/main.cpp | 6 +++--- 5 files changed, 22 insertions(+), 22 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h index 7c6d310b7..278a6e336 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_tiling.h @@ -12,6 +12,6 @@ #include struct AddCustomTilingData { - uint32_t totalLength; + uint32_t singleCoreLength; }; #endif \ No newline at end of file diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp index 77dbc459e..c7aa904f3 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp @@ -18,12 +18,12 @@ constexpr uint32_t TILE_LENGTH = 4096; class KernelAddV1 { public: __aicore__ inline KernelAddV1() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) { - xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); - yGm.SetGlobalBuffer((__gm__ float *)y, totalLength); - zGm.SetGlobalBuffer((__gm__ float *)z, totalLength); - loopCount = totalLength / TILE_LENGTH; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + 
yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + loopCount = singleCoreLength / TILE_LENGTH; } __aicore__ inline void Process() { @@ -75,7 +75,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { KernelAddV1 op; - op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); } diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp index 831a873dd..6871ba70d 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -18,12 +18,12 @@ constexpr uint32_t TILE_LENGTH = 4096; class KernelAddV2 { public: __aicore__ inline KernelAddV2() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) { - xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); - yGm.SetGlobalBuffer((__gm__ float *)y, totalLength); - zGm.SetGlobalBuffer((__gm__ float *)z, totalLength); - loopCount = totalLength / TILE_LENGTH; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + loopCount = singleCoreLength / TILE_LENGTH; } __aicore__ inline void Process() { @@ -132,7 
+132,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { KernelAddV2 op; - op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); } diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp index 4a51f78dd..e91fd9bdb 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp @@ -18,12 +18,12 @@ constexpr uint32_t TILE_LENGTH = 4096; class KernelAddV3 { public: __aicore__ inline KernelAddV3() {} - __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) { - xGm.SetGlobalBuffer((__gm__ float *)x, totalLength); - yGm.SetGlobalBuffer((__gm__ float *)y, totalLength); - zGm.SetGlobalBuffer((__gm__ float *)z, totalLength); - loopCount = totalLength / TILE_LENGTH; + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + loopCount = singleCoreLength / TILE_LENGTH; } __aicore__ inline void Process() @@ -81,7 +81,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { KernelAddV3 op; - op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->totalLength); + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); 
} diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp index 287c33418..8b6b2f1e0 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp @@ -107,12 +107,12 @@ void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::v int32_t main(int32_t argc, char *argv[]) { - uint32_t blockDim = 1; - uint32_t dataLen = 4096 * 3; + uint32_t blockDim = 8; + uint32_t dataLen = 4096 * 9 * blockDim; size_t inputByteSize = dataLen * sizeof(float); size_t outputByteSize = dataLen * sizeof(float); AddCustomTilingData tiling; - tiling.totalLength = dataLen; + tiling.singleCoreLength = dataLen / blockDim; std::vector inputsInfo = {{"./input/input_x.bin", inputByteSize}, {"./input/input_y.bin", inputByteSize}}; std::vector outputsV1Info = {{"./output/output_z_v1.bin", outputByteSize}}; -- Gitee From f341d2207f71f223b5faf40b103449be75571df7 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 10:59:58 +0800 Subject: [PATCH 06/13] fix typos --- .../KernelLaunch/main.cpp | 1 + .../KernelLaunch/scripts/gen_data.py | 4 ++-- 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp index 8b6b2f1e0..6591e50da 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp @@ -108,6 +108,7 @@ void KernelCall(KernelEntry kernelEntry, uint32_t blockDim, void *stream, std::v int32_t main(int32_t argc, char *argv[]) 
{ uint32_t blockDim = 8; + // set data length, in this case we use 8 cores and length of each core is 4096 * 9 uint32_t dataLen = 4096 * 9 * blockDim; size_t inputByteSize = dataLen * sizeof(float); size_t outputByteSize = dataLen * sizeof(float); diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py index 86bbba89d..a77774b9a 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py @@ -12,8 +12,8 @@ import numpy as np def gen_golden_data_simple(): - input_x = np.random.uniform(1, 100, [1, 4096]).astype(np.float32) - input_y = np.random.uniform(1, 100, [1, 4096]).astype(np.float32) + input_x = np.random.uniform(1, 100, [8 * 9, 4096]).astype(np.float32) + input_y = np.random.uniform(1, 100, [8 * 9, 4096]).astype(np.float32) golden = (input_x + input_y).astype(np.float32) input_x.tofile("./input/input_x.bin") -- Gitee From a900629f4d4a1935c958154fcdb5879454324313 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 11:31:10 +0800 Subject: [PATCH 07/13] add local mem allocator --- .../KernelLaunch/CMakeLists.txt | 1 + .../KernelLaunch/add_custom_v3.cpp | 20 ++-- .../KernelLaunch/add_custom_v4.cpp | 93 +++++++++++++++++++ .../KernelLaunch/main.cpp | 5 + 4 files changed, 107 insertions(+), 12 deletions(-) create mode 100644 operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt index ab71257d0..e31108ae1 100644 --- 
a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/CMakeLists.txt @@ -19,6 +19,7 @@ file(GLOB KERNEL_FILES ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v1.cpp ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v2.cpp ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v3.cpp + ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_v4.cpp ) if("${RUN_MODE}" STREQUAL "cpu") diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp index e91fd9bdb..0f9aa7777 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp @@ -28,14 +28,16 @@ public: __aicore__ inline void Process() { + // use local memory allocator to simplify memor allocation + AscendC::LocalMemAllocator ubAllocator; // ping - AscendC::LocalTensor xLocalPing(AscendC::TPosition::VECCALC, xAddrPing, TILE_LENGTH); - AscendC::LocalTensor yLocalPing(AscendC::TPosition::VECCALC, yAddrPing, TILE_LENGTH); - AscendC::LocalTensor zLocalPing(AscendC::TPosition::VECCALC, zAddrPing, TILE_LENGTH); + AscendC::LocalTensor xLocalPing = ubAllocator.Alloc(); + AscendC::LocalTensor yLocalPing = ubAllocator.Alloc(); + AscendC::LocalTensor zLocalPing = ubAllocator.Alloc(); // pong - AscendC::LocalTensor xLocalPong(AscendC::TPosition::VECCALC, xAddrPong, TILE_LENGTH); - AscendC::LocalTensor yLocalPong(AscendC::TPosition::VECCALC, yAddrPong, TILE_LENGTH); - AscendC::LocalTensor zLocalPong(AscendC::TPosition::VECCALC, zAddrPong, TILE_LENGTH); + AscendC::LocalTensor xLocalPong = ubAllocator.Alloc(); + AscendC::LocalTensor yLocalPong = ubAllocator.Alloc(); + AscendC::LocalTensor zLocalPong = 
ubAllocator.Alloc(); // double buffer AscendC::SetFlag(EVENT_ID0); @@ -66,12 +68,6 @@ public: } private: - static constexpr uint32_t xAddrPing = 0; - static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float); - static constexpr uint32_t zAddrPing = TILE_LENGTH * sizeof(float) * 2; - static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 3; - static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 4; - static constexpr uint32_t zAddrPong = TILE_LENGTH * sizeof(float) * 5; AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; AscendC::GlobalTensor zGm; diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp new file mode 100644 index 000000000..ef4aa0cc2 --- /dev/null +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp @@ -0,0 +1,93 @@ +/** + * @file add_custom_v4.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 
+ */ +#include "add_custom_tiling.h" +#include "kernel_operator.h" + +using AscendC::TPosition; +namespace { +constexpr uint32_t TILE_LENGTH = 4096; +} + +class KernelAddV4 { +public: + __aicore__ inline KernelAddV4() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) + { + xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + yGm.SetGlobalBuffer((__gm__ float *)y + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + zGm.SetGlobalBuffer((__gm__ float *)z + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); + loopCount = singleCoreLength / TILE_LENGTH; + } + + __aicore__ inline void Process() + { + // ping + AscendC::LocalTensor xLocalPing(AscendC::TPosition::VECCALC, xAddrPing, TILE_LENGTH); + AscendC::LocalTensor yLocalPing(AscendC::TPosition::VECCALC, yAddrPing, TILE_LENGTH); + AscendC::LocalTensor zLocalPing(AscendC::TPosition::VECCALC, zAddrPing, TILE_LENGTH); + // pong + AscendC::LocalTensor xLocalPong(AscendC::TPosition::VECCALC, xAddrPong, TILE_LENGTH); + AscendC::LocalTensor yLocalPong(AscendC::TPosition::VECCALC, yAddrPong, TILE_LENGTH); + AscendC::LocalTensor zLocalPong(AscendC::TPosition::VECCALC, zAddrPong, TILE_LENGTH); + + // double buffer + AscendC::SetFlag(EVENT_ID0); + AscendC::SetFlag(EVENT_ID1); + for (int i = 0; i < loopCount; i++) { + int32_t eventID = (i % 2 == 0 ? EVENT_ID0 : EVENT_ID1); + AscendC::LocalTensor &xLocal = (i % 2 == 0 ? xLocalPing : xLocalPong); + AscendC::LocalTensor &yLocal = (i % 2 == 0 ? yLocalPing : yLocalPong); + AscendC::LocalTensor &zLocal = (i % 2 == 0 ? 
zLocalPing : zLocalPong); + // dependency of PIPE_MTE3 & PIPE_MTE2 caused by xLocal/yLocal between 2 sequential loops + AscendC::WaitFlag(eventID); + AscendC::DataCopy(xLocal, xGm[i * TILE_LENGTH], TILE_LENGTH); + AscendC::DataCopy(yLocal, yGm[i * TILE_LENGTH], TILE_LENGTH); + + // dependency of PIPE_MTE2 & PIPE_V caused by xLocal/yLocal in one single loop + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + AscendC::Add(zLocal, xLocal, yLocal, TILE_LENGTH); + // dependency of PIPE_V & PIPE_MTE3 caused by zLocal in one single loop + AscendC::SetFlag(eventID); + AscendC::WaitFlag(eventID); + AscendC::DataCopy(zGm[i * TILE_LENGTH], zLocal, TILE_LENGTH); + // dependency of PIPE_MTE3 & PIPE_MTE2 caused by zLocal between 2 sequential loops + AscendC::SetFlag(eventID); + } + AscendC::WaitFlag(EVENT_ID0); + AscendC::WaitFlag(EVENT_ID1); + } + +private: + static constexpr uint32_t xAddrPing = 0; + static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float); + static constexpr uint32_t zAddrPing = TILE_LENGTH * sizeof(float) * 2; + static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 3; + static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 4; + static constexpr uint32_t zAddrPong = TILE_LENGTH * sizeof(float) * 5; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t loopCount; +}; + +extern "C" __global__ __aicore__ void add_custom_v4(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) +{ + KernelAddV4 op; + op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); + op.Process(); +} + +#ifndef ASCENDC_CPU_DEBUG +void add_custom_do_v4(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling) +{ + add_custom_v4<<>>(x, y, z, tiling); +} +#endif diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp 
b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp index 6591e50da..a64889808 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/main.cpp @@ -14,12 +14,14 @@ extern void add_custom_do_v1(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); extern void add_custom_do_v2(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); extern void add_custom_do_v3(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); +extern void add_custom_do_v4(uint32_t blockDim, void *stream, uint8_t *x, uint8_t *y, uint8_t *z, uint8_t *tiling); using KernelEntry = void (*)(uint32_t, void *, uint8_t *, uint8_t *, uint8_t *, uint8_t *); #else #include "tikicpulib.h" extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); +extern "C" __global__ __aicore__ void add_custom_v4(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling); using KernelEntry = void (*)(GM_ADDR, GM_ADDR, GM_ADDR, GM_ADDR); #endif @@ -119,6 +121,7 @@ int32_t main(int32_t argc, char *argv[]) std::vector outputsV1Info = {{"./output/output_z_v1.bin", outputByteSize}}; std::vector outputsV2Info = {{"./output/output_z_v2.bin", outputByteSize}}; std::vector outputsV3Info = {{"./output/output_z_v3.bin", outputByteSize}}; + std::vector outputsV4Info = {{"./output/output_z_v4.bin", outputByteSize}}; #ifndef ASCENDC_CPU_DEBUG CHECK_ACL(aclInit(nullptr)); @@ -130,6 +133,7 @@ int32_t main(int32_t argc, char *argv[]) KernelCall(add_custom_do_v1, blockDim, stream, inputsInfo, outputsV1Info, (uint8_t 
*)&tiling); KernelCall(add_custom_do_v2, blockDim, stream, inputsInfo, outputsV2Info, (uint8_t *)&tiling); KernelCall(add_custom_do_v3, blockDim, stream, inputsInfo, outputsV3Info, (uint8_t *)&tiling); + KernelCall(add_custom_do_v4, blockDim, stream, inputsInfo, outputsV4Info, (uint8_t *)&tiling); CHECK_ACL(aclrtDestroyStream(stream)); CHECK_ACL(aclrtResetDevice(deviceId)); @@ -138,6 +142,7 @@ int32_t main(int32_t argc, char *argv[]) KernelCall(add_custom_v1, blockDim, inputsInfo, outputsV1Info, (uint8_t *)&tiling); KernelCall(add_custom_v2, blockDim, inputsInfo, outputsV2Info, (uint8_t *)&tiling); KernelCall(add_custom_v3, blockDim, inputsInfo, outputsV3Info, (uint8_t *)&tiling); + KernelCall(add_custom_v4, blockDim, inputsInfo, outputsV4Info, (uint8_t *)&tiling); #endif return 0; } -- Gitee From 6d9ad98008e3b0de81a3b946767b94a6221317bb Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 11:40:09 +0800 Subject: [PATCH 08/13] add case 4 in run.sh --- .../KernelLaunch/run.sh | 1 + 1 file changed, 1 insertion(+) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh index eb5f327ea..e5ab94cab 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh @@ -113,3 +113,4 @@ md5sum output/*.bin python3 scripts/verify_result.py output/output_z_v1.bin output/golden.bin python3 scripts/verify_result.py output/output_z_v2.bin output/golden.bin python3 scripts/verify_result.py output/output_z_v3.bin output/golden.bin +python3 scripts/verify_result.py output/output_z_v4.bin output/golden.bin -- Gitee From e9196989785fc9456163dc15129022e449b88ef5 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 14:53:19 +0800 Subject: [PATCH 09/13] edit case4 --- 
.../KernelLaunch/add_custom_v4.cpp | 17 +++++++++++------ .../KernelLaunch/run.sh | 19 +++++++++++++------ 2 files changed, 24 insertions(+), 12 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp index ef4aa0cc2..708a303f7 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp @@ -13,6 +13,7 @@ using AscendC::TPosition; namespace { constexpr uint32_t TILE_LENGTH = 4096; +constexpr uint32_t DST_START_ADDRESS = 0x20000; } class KernelAddV4 { @@ -66,12 +67,16 @@ public: } private: - static constexpr uint32_t xAddrPing = 0; - static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float); - static constexpr uint32_t zAddrPing = TILE_LENGTH * sizeof(float) * 2; - static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 3; - static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 4; - static constexpr uint32_t zAddrPong = TILE_LENGTH * sizeof(float) * 5; + // according to bank conflict rule: + // rr conflict happened when 2 read requests are in the same bank group + // rw conflict happened when read and write requests are in the same bank + // so we adjust the address to avoid bank conflicts + static constexpr uint32_t xAddrPing = 0x0; + static constexpr uint32_t yAddrPing = TILE_LENGTH * sizeof(float) + 256; + static constexpr uint32_t zAddrPing = DST_START_ADDRESS; + static constexpr uint32_t xAddrPong = TILE_LENGTH * sizeof(float) * 2 + 256; + static constexpr uint32_t yAddrPong = TILE_LENGTH * sizeof(float) * 3 + 512; + static constexpr uint32_t zAddrPong = DST_START_ADDRESS + TILE_LENGTH * sizeof(float); AscendC::GlobalTensor xGm; AscendC::GlobalTensor yGm; AscendC::GlobalTensor 
zGm; diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh index e5ab94cab..435f708fa 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh @@ -100,15 +100,22 @@ mkdir -p input output python3 scripts/gen_data.py ( export LD_LIBRARY_PATH=$(pwd)/out/lib:$(pwd)/out/lib64:${_ASCEND_INSTALL_PATH}/lib64:$LD_LIBRARY_PATH - if [ "${RUN_MODE}" = "npu" ]; then - # msprof op --launch-count=3 --output=./prof ./ascendc_kernels_bbit - ./ascendc_kernels_bbit - elif [ "${RUN_MODE}" = "sim" ]; then - msprof op simulator --launch-count=2 --output=./prof ./ascendc_kernels_bbit - elif [ "${RUN_MODE}" = "cpu" ]; then + if [[ "$RUN_WITH_TOOLCHAIN" -eq 1 ]]; then + if [ "${RUN_MODE}" = "npu" ]; then + msprof op --launch-count=4 --output=./prof ./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "sim" ]; then + msprof op simulator --launch-count=4 --output=./prof ./ascendc_kernels_bbit + elif [ "${RUN_MODE}" = "cpu" ]; then + ./ascendc_kernels_bbit + fi + else ./ascendc_kernels_bbit fi ) +# tidy folder by delete log files +if [ "${RUN_MODE}" = "sim" ]; then + rm -f *.log *.dump *.vcd *.toml *_log +fi md5sum output/*.bin python3 scripts/verify_result.py output/output_z_v1.bin output/golden.bin python3 scripts/verify_result.py output/output_z_v2.bin output/golden.bin -- Gitee From 54ef230855865a7bc34e4e17bc50ccdac31d819f Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 15:41:46 +0800 Subject: [PATCH 10/13] fix typos and readme --- .../KernelLaunch/README.md | 19 ++++++++++++------ .../KernelLaunch/data_utils.h | 2 +- .../KernelLaunch/run.sh | 2 -- .../KernelLaunch/scripts/gen_data.py | 2 +- .../KernelLaunch/scripts/verify_result.py | 2 +- .../README.md | 20 +++++++++---------- 
6 files changed, 26 insertions(+), 21 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md index f72b521cd..84edca8e7 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md @@ -6,8 +6,11 @@ │ ├── scripts │ │ ├── gen_data.py // 输入数据和真值数据生成脚本 │ │ └── verify_result.py // 验证输出数据和真值数据是否一致的验证脚本 +│ ├── add_custom_tiling.h // tiling结构体 │ ├── add_custom_v1.cpp // 算子kernel实现1:未优化前实现 -│ ├── add_custom_v2.cpp // 算子kernel实现2:优化地址分配,消除Bank冲突后的实现 +│ ├── add_custom_v2.cpp // 算子kernel实现2:基于实现1,实现double buffer +│ ├── add_custom_v3.cpp // 算子kernel实现3:优化double buffer实现,简化判断逻辑,并使用LocalMemAllocator简化代码 +│ ├── add_custom_v4.cpp // 算子kernel实现4:基于add_custom_v3,修改地址分配逻辑,消除bank冲突 │ ├── CMakeLists.txt // 编译工程文件 │ ├── data_utils.h // 数据读入写出函数 │ ├── main.cpp // 主函数,调用算子的应用程序,含CPU域及NPU域调用 @@ -16,7 +19,7 @@ ## 代码实现介绍 -本样例中实现的是固定shape为1*4096的Add算子。 +本样例中实现的是固定shape为8*9*4096的Add算子。 - kernel实现 @@ -30,9 +33,13 @@ Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。 - 实现1:请参考[add_custom_v1.cpp](./add_custom_v1.cpp),xLocal地址为0,yLocal地址为0x4000,zLocal地址为0x8000。xLocal与yLocal存在读读冲突,xLocal与zLocal存在读写冲突。 + 实现1:请参考[add_custom_v1.cpp](./add_custom_v1.cpp),使用静态Tensor编程方法,进行add算子的编程。 - 实现2:请参考[add_custom_v2.cpp](./add_custom_v2.cpp),为了避免Bank冲突,通过配置InitBuffer时的bufferSize来调整Tensor地址,xLocal地址为0,yLocal地址为0x4100,zLocal地址为0x10000。 + 实现2:请参考[add_custom_v2.cpp](./add_custom_v2.cpp),优化性能,使用double buffer进行流水排布。 + + 实现3:请参考[add_custom_v3.cpp](./add_custom_v3.cpp),优化add_custom_v2中的反向同步及判断逻辑,同时使用LocalMemAllocator进行线性内存分配。 + + 
实现4:请参考[add_custom_v4.cpp](./add_custom_v4.cpp),基于add_custom_v3的实现,优化地址分配消除Bank冲突。 - 调用实现 1. CPU侧运行验证主要通过ICPU_RUN_KF CPU调测宏等CPU调测库提供的接口来完成; @@ -46,7 +53,7 @@ 以命令行方式下载样例代码,master分支为例。 ```bash - cd ${git_clone_path}/samples/operator/ascendc/4_best_practices/4_bank_conflict/KernelLaunch + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch ``` - 配置环境变量 @@ -85,4 +92,4 @@ | 时间 | 更新事项 | | ---------- | ------------ | -| 2025/07/01 | 新增本readme | +| 2025/09/06 | 新增本readme | diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h index 09d906371..9d3445780 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/data_utils.h @@ -1,7 +1,7 @@ /** * @file data_utils.h * - * Copyright (C) 2024. Huawei Technologies Co., Ltd. All rights reserved. + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. 
* * This program is distributed in the hope that it will be useful, * but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh index 435f708fa..6c691801e 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/run.sh @@ -11,7 +11,6 @@ SHORT=r:,v:,i:,b:,p:, LONG=run-mode:,soc-version:,install-path:,build-type:,install-prefix:, OPTS=$(getopt -a --options $SHORT --longoptions $LONG -- "$@") eval set -- "$OPTS" -SOC_VERSION="Ascend310P3" while :; do case "$1" in @@ -72,7 +71,6 @@ fi export ASCEND_TOOLKIT_HOME=${_ASCEND_INSTALL_PATH} export ASCEND_HOME_PATH=${_ASCEND_INSTALL_PATH} -echo "Current compile soc version is ${SOC_VERSION}" source ${_ASCEND_INSTALL_PATH}/bin/setenv.bash if [ "${RUN_MODE}" = "sim" ]; then # in case of running op in simulator, use stub .so instead diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py index a77774b9a..b8f7ccb5b 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/gen_data.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # coding=utf-8 # -# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. 
# # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py index 6a38a3b2b..a5019f30f 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/scripts/verify_result.py @@ -1,7 +1,7 @@ #!/usr/bin/python3 # coding=utf-8 # -# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. # # This program is distributed in the hope that it will be useful, # but WITHOUT ANY WARRANTY; without even the implied warranty of diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md index 4ea8f29b8..c3448a15f 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md @@ -6,12 +6,12 @@ ``` ├── 23_static_tensor_programming_kernel_launch // 使用核函数直调的方式调用Add自定义算子 -│ └── KernelLaunch // Kernel Launch方式调用核函数样例 +│ └── KernelLaunch // Kernel Launch方式调用核函数样例 ``` ## 算子描述 -算子实现的是固定shape为1×4096的Add算子。 +算子实现的是固定shape为8x9×4096的Add算子。 Add的计算公式为: @@ -19,9 +19,9 @@ Add的计算公式为: z = x + y ``` -- x:输入,形状为\[1, 4096],数据类型为float; -- y:输入,形状为\[1, 4096],数据类型为float; -- z:输出,形状为\[1, 4096],数据类型为float; +- x:输入,形状为\[72, 4096],数据类型为float; +- y:输入,形状为\[72, 4096],数据类型为float; +- z:输出,形状为\[72, 4096],数据类型为float; ## 算子规格描述 @@ -29,13 +29,13 @@ z = x + y 算子类型(OpType)Add 算子输入nameshapedata typeformat -x1 * 
4096floatND -y1 * 4096floatND +x72 * 4096floatND +y72 * 4096floatND -算子输出y1 * 4096floatND +算子输出y72 * 4096floatND -核函数名add_custom_v1 / add_custom_v2 +核函数名add_custom_v1 / add_custom_v2 / add_custom_v3 / add_custom_v4 ## 支持的产品型号 @@ -66,4 +66,4 @@ z = x + y | 时间 | 更新事项 | | ---------- | ---------------- | -| 2025/07/01 | 新增直调方式样例 | +| 2025/09/06 | 新增直调方式样例 | -- Gitee From 51f72166cf627e3246cda15600b28bb49673339e Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 15:52:45 +0800 Subject: [PATCH 11/13] fix typos --- .../KernelLaunch/README.md | 2 +- .../23_static_tensor_programming_kernel_launch/README.md | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md index 84edca8e7..f58858378 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/README.md @@ -19,7 +19,7 @@ ## 代码实现介绍 -本样例中实现的是固定shape为8*9*4096的Add算子。 +本样例中实现的是固定shape为72*4096的Add算子。 - kernel实现 diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md index c3448a15f..818d7960f 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/README.md @@ -11,7 +11,7 @@ ## 算子描述 -算子实现的是固定shape为8x9×4096的Add算子。 +算子实现的是固定shape为72×4096的Add算子。 Add的计算公式为: -- Gitee From 832a76e8737c523ccbb76ce0ed025af0af840536 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Sat, 6 Sep 2025 17:35:13 +0800 Subject: [PATCH 12/13] add soc state init --- .../KernelLaunch/add_custom_v1.cpp | 1 + .../KernelLaunch/add_custom_v2.cpp | 1 + 
.../KernelLaunch/add_custom_v3.cpp | 1 + .../KernelLaunch/add_custom_v4.cpp | 1 + 4 files changed, 4 insertions(+) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp index c7aa904f3..c642b9b86 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp @@ -74,6 +74,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v1(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { + AscendC::InitSocState(); KernelAddV1 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp index 6871ba70d..857b2bbbc 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -131,6 +131,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v2(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { + AscendC::InitSocState(); KernelAddV2 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp index 0f9aa7777..ba088efd7 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp +++ 
b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp @@ -76,6 +76,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v3(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { + AscendC::InitSocState(); KernelAddV3 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp index 708a303f7..e3c05bfd2 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v4.cpp @@ -85,6 +85,7 @@ private: extern "C" __global__ __aicore__ void add_custom_v4(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR tiling) { + AscendC::InitSocState(); KernelAddV4 op; op.Init(x, y, z, ((__gm__ AddCustomTilingData *)tiling)->singleCoreLength); op.Process(); -- Gitee From 1e2c2fe6670123b53ac536216a058545be57cf44 Mon Sep 17 00:00:00 2001 From: zhanghao0689 Date: Mon, 8 Sep 2025 09:43:24 +0800 Subject: [PATCH 13/13] fix construction func --- .../KernelLaunch/add_custom_v1.cpp | 2 +- .../KernelLaunch/add_custom_v2.cpp | 2 +- .../KernelLaunch/add_custom_v3.cpp | 2 +- 3 files changed, 3 insertions(+), 3 deletions(-) diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp index c642b9b86..e670aa2a4 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v1.cpp @@ -17,7 +17,7 @@ constexpr uint32_t
TILE_LENGTH = 4096; class KernelAddV1 { public: - __aicore__ inline KernelAddV1() {} + __aicore__ inline KernelAddV1() = default; __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) { xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp index 857b2bbbc..fca2299b1 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v2.cpp @@ -17,7 +17,7 @@ constexpr uint32_t TILE_LENGTH = 4096; class KernelAddV2 { public: - __aicore__ inline KernelAddV2() {} + __aicore__ inline KernelAddV2() = default; __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) { xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, singleCoreLength); diff --git a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp index ba088efd7..4369fe980 100644 --- a/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp +++ b/operator/ascendc/0_introduction/23_static_tensor_programming_kernel_launch/KernelLaunch/add_custom_v3.cpp @@ -17,7 +17,7 @@ constexpr uint32_t TILE_LENGTH = 4096; class KernelAddV3 { public: - __aicore__ inline KernelAddV3() {} + __aicore__ inline KernelAddV3() = default; __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t singleCoreLength) { xGm.SetGlobalBuffer((__gm__ float *)x + AscendC::GetBlockIdx() * singleCoreLength, 
singleCoreLength); -- Gitee