From 5a2a674bf30e23eaa2372208159d85a9198542e2 Mon Sep 17 00:00:00 2001 From: ApeiriaNode_Booker Date: Mon, 22 Sep 2025 10:39:43 +0800 Subject: [PATCH 1/2] add simple matmul leakyrelu --- .../24_simple_hello_world/README.md | 6 +- .../0_introduction/25_simple_add/README.md | 6 +- .../26_simple_matmulleakyrelu/CMakeLists.txt | 22 ++ .../26_simple_matmulleakyrelu/README.md | 101 +++++ .../26_simple_matmulleakyrelu/data_utils.h | 96 +++++ .../matmul_leakyrelu.cpp | 350 ++++++++++++++++++ .../26_simple_matmulleakyrelu/run.sh | 8 + .../scripts/gen_data.py | 35 ++ .../scripts/verify_result.py | 55 +++ 9 files changed, 673 insertions(+), 6 deletions(-) create mode 100644 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt create mode 100644 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md create mode 100644 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/data_utils.h create mode 100644 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp create mode 100755 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh create mode 100644 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/gen_data.py create mode 100644 operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/verify_result.py diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/README.md b/operator/ascendc/0_introduction/24_simple_hello_world/README.md index 53ab1d8cc..20c1ee4c8 100644 --- a/operator/ascendc/0_introduction/24_simple_hello_world/README.md +++ b/operator/ascendc/0_introduction/24_simple_hello_world/README.md @@ -10,14 +10,14 @@ ## 支持的产品型号 本样例支持如下产品型号: -- Atlas A2训练系列产品/Atlas 800I A2推理产品 - +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas A3 训练系列产品/Atlas A3 推理系列产品 ## 运行样例算子 - 打开样例目录 以命令行方式下载样例代码,master分支为例。 ```bash - cd ${git_clone_path}/samples/operator/ascendc/0_introduction/24_simple_helloworld/ + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/24_simple_helloworld ``` - 配置环境变量 diff --git a/operator/ascendc/0_introduction/25_simple_add/README.md b/operator/ascendc/0_introduction/25_simple_add/README.md index ffa237fc2..6f6600a15 100644 --- a/operator/ascendc/0_introduction/25_simple_add/README.md +++ b/operator/ascendc/0_introduction/25_simple_add/README.md @@ -41,14 +41,14 @@ z = x + y ## 支持的产品型号 本样例支持如下产品型号: -- Atlas A2训练系列产品/Atlas 800I A2推理产品 - +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas A3 训练系列产品/Atlas A3 推理系列产品 ## 运行样例算子 - 打开样例目录 以命令行方式下载样例代码,master分支为例。 ```bash - cd ${git_clone_path}/samples/operator/ascendc/0_introduction/25_simple_add/ + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/25_simple_add ``` - 配置环境变量 diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt new file mode 100644 index 000000000..c833ed1fd --- /dev/null +++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt @@ -0,0 +1,22 @@ +cmake_minimum_required(VERSION 3.16) + +set(SOC_VERSION "Ascend910B1" CACHE STRING "soc version") + +find_package(ASC REQUIRED) + +project(kernel_samples LANGUAGES ASC CXX) + +set_source_files_properties( + matmul_leakyrelu.cpp PROPERTIES LANGUAGE ASC +) + +add_executable(demo + matmul_leakyrelu.cpp +) + +target_link_libraries(demo PRIVATE + tiling_api + register + platform + m +) \ No newline at end of file diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md new file mode 100644 index 000000000..a8b9dbd70 --- /dev/null +++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md @@ -0,0 +1,101 @@ +## 简化MatmulLeakyRelu算子直调样例 +本样例以MatmulLeakyRelu算子为示例,展示了一种更为简单的算子编译流程,支持main函数和Kernel函数在同一个cpp文件中实现。 +> ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。 + +## 目录结构介绍 +``` +├── 26_simple_matmulleakyrelu +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── data_utils.h // 数据读入写出函数 +│ ├── matmul_leakyrelu.cpp // 算子实现及测试 +│ ├── run.sh // 运行脚本 +│ └── scripts +│ ├── gen_data.py // 输入数据和真值数据生成脚本文件 +│ └── verify_result.py // 真值对比文件 +``` + +## 算子描述 +算子使用了MatmulLeakyRelu高阶API,实现了快速的MatmulLeakyRelu矩阵乘法的运算操作。 + +MatmulLeakyRelu的计算公式为: + +``` +C = A * B + Bias +C = C > 0 ? C : C * 0.001 +``` + +- A、B为源操作数,A为左矩阵,形状为\[M, K];B为右矩阵,形状为\[K, N]。 +- C为目的操作数,存放矩阵乘结果的矩阵,形状为\[M, N]。 +- Bias为矩阵乘偏置,形状为\[N]。对A*B结果矩阵的每一行都采用该Bias进行偏置。 + +## 算子规格描述 + + + + + + + + + + + + +
算子类型(OpType)MatmulLeakyRelu
算子输入nameshapedata typeformat
a1024 * 256float16ND
b256 * 640float16ND
bias640floatND
算子输出c1024 * 640floatND
核函数名matmul_leakyrelu_custom
+ +## 代码实现介绍 +本样例中实现的是[m, n, k]固定为[1024, 640, 256]的MatmulLeakyRelu算子。 +- kernel实现 + MatmulLeakyRelu算子的数学表达式为: + ``` + C = A * B + Bias + C = C > 0 ? C : C * 0.001 + ``` + 其中A的形状为[1024, 256],B的形状为[256, 640],C的形状为[1024, 640],Bias的形状为[640]。具体请参考[matmul_leakyrelu.cpp](./matmul_leakyrelu.cpp)。 + +- 调用实现 + 使用内核调用符<<<>>>调用核函数。 + +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 +- Atlas A3 训练系列产品/Atlas A3 推理系列产品 + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/26_simple_matmulleakyrelu + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + 配置按安装径后,执行以下命令统一配置环境变量。 + ```bash + # 配置CANN环境变量 + source ${ASCEND_INSTALL_PATH}/bin/setenv.bash + # 添加AscendC CMake Module搜索路径至环境变量 + export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH + ``` + - 样例执行 + ```bash + bash run.sh # 编译并执行样例 + ``` + + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/09/22 | 新增本readme | \ No newline at end of file diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/data_utils.h b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/data_utils.h new file mode 100644 index 000000000..582fbf68f --- /dev/null +++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/data_utils.h @@ -0,0 +1,96 @@ +/** + * @file data_utils.h + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef DATA_UTILS_H +#define DATA_UTILS_H +#include +#include +#include + +#include +#include +#include +#include +#include +#include +#include + +#define INFO_LOG(fmt, args...) fprintf(stdout, "[INFO] " fmt "\n", ##args) +#define WARN_LOG(fmt, args...) fprintf(stdout, "[WARN] " fmt "\n", ##args) +#define ERROR_LOG(fmt, args...) fprintf(stdout, "[ERROR] " fmt "\n", ##args) + +bool ReadFile(const std::string &filePath, size_t &fileSize, void *buffer, size_t bufferSize) +{ + struct stat sBuf; + int fileStatus = stat(filePath.data(), &sBuf); + if (fileStatus == -1) { + ERROR_LOG("failed to get file"); + return false; + } + if (S_ISREG(sBuf.st_mode) == 0) { + ERROR_LOG("%s is not a file, please enter a file", filePath.c_str()); + return false; + } + + std::ifstream file; + file.open(filePath, std::ios::binary); + if (!file.is_open()) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + std::filebuf *buf = file.rdbuf(); + size_t size = buf->pubseekoff(0, std::ios::end, std::ios::in); + if (size == 0) { + ERROR_LOG("file size is 0"); + file.close(); + return false; + } + if (size > bufferSize) { + ERROR_LOG("file size is larger than buffer size"); + file.close(); + return false; + } + buf->pubseekpos(0, std::ios::in); + buf->sgetn(static_cast(buffer), size); + fileSize = size; + file.close(); + return true; +} + +/** + * @brief Write data to file + * @param [in] filePath: file path + * @param [in] buffer: data to write to file + * @param [in] size: size to write + * @return write result + */ +bool WriteFile(const std::string &filePath, const void *buffer, size_t size) +{ + if (buffer == nullptr) { + ERROR_LOG("Write file failed. buffer is nullptr"); + return false; + } + + int fd = open(filePath.c_str(), O_RDWR | O_CREAT | O_TRUNC, S_IRUSR | S_IWRITE); + if (fd < 0) { + ERROR_LOG("Open file failed. path = %s", filePath.c_str()); + return false; + } + + size_t writeSize = write(fd, buffer, size); + (void)close(fd); + if (writeSize != size) { + ERROR_LOG("Write file Failed."); + return false; + } + + return true; +} +#endif // DATA_UTILS_H diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp new file mode 100644 index 000000000..6c1e6136a --- /dev/null +++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp @@ -0,0 +1,350 @@ +/** + * @file matmul_leakyrelu_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "data_utils.h" +#include "kernel_tiling/kernel_tiling.h" +#include "tiling/platform/platform_ascendc.h" +#include "acl/acl.h" +#include "tiling/tiling_api.h" +#include "kernel_operator.h" +#include "lib/matmul_intf.h" + +using namespace matmul; + +__aicore__ inline uint32_t Ceiling(uint32_t a, uint32_t b) +{ + return (a + b - 1) / b; +} + +/** + * @brief Copy tiling data to TCubeTiling ptr from tiling gm addr. + * @param tiling: TCubeTiling ptr which needs to copy tiling data. + * @param tilingGM: tiling gm addr. + * @retval None + */ +__aicore__ inline void CopyTiling(TCubeTiling *tiling, GM_ADDR tilingGM) +{ + uint32_t *ptr = reinterpret_cast(tiling); + auto tiling32 = reinterpret_cast<__gm__ uint32_t *>(tilingGM); + + for (uint32_t i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, ptr++) { + *ptr = *(tiling32 + i); + } + return; +} + +template class MatmulLeakyKernel { +public: + __aicore__ inline MatmulLeakyKernel(){}; + __aicore__ inline void Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, GM_ADDR workspace, + const TCubeTiling &tiling, AscendC::TPipe *pipe); + __aicore__ inline void Process(AscendC::TPipe *pipe); + + __aicore__ inline void MatmulCompute(); + __aicore__ inline void LeakyReluCompute(); + __aicore__ inline void CopyOut(uint32_t count); + __aicore__ inline void CalcOffset(int32_t blockIdx, const TCubeTiling &tiling, int32_t &offsetA, int32_t &offsetB, + int32_t &offsetC, int32_t &offsetBias); + + Matmul, MatmulType, + MatmulType, MatmulType> + matmulObj; + + AscendC::GlobalTensor aGlobal; + AscendC::GlobalTensor bGlobal; + AscendC::GlobalTensor cGlobal; + AscendC::GlobalTensor biasGlobal; + AscendC::LocalTensor reluOutLocal; + TCubeTiling tiling; + AscendC::TQue reluOutQueue_; +}; + +/** + * @brief Set matmulLeaky input and output gm addr of current core. + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias gm addr. + * @param c: C matrix gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tiling: matmul tiling data. + * @param pipe: Global memory and sync management TPipe object. + * @retval None + */ +template +__aicore__ inline void MatmulLeakyKernel::Init(GM_ADDR a, GM_ADDR b, GM_ADDR bias, + GM_ADDR c, GM_ADDR workspace, + const TCubeTiling &tiling, AscendC::TPipe *pipe) +{ + this->tiling = tiling; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ aType *>(a), tiling.M * tiling.Ka); + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ bType *>(b), tiling.Kb * tiling.N); + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ cType *>(c), tiling.M * tiling.N); + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ biasType *>(bias), tiling.N); + + int32_t offsetA, offsetB, offsetC, offsetBias; + CalcOffset(AscendC::GetBlockIdx(), tiling, offsetA, offsetB, offsetC, offsetBias); // Calculate the gm offset based on the blockidx. + aGlobal = aGlobal[offsetA]; + bGlobal = bGlobal[offsetB]; + cGlobal = cGlobal[offsetC]; + biasGlobal = biasGlobal[offsetBias]; + pipe->InitBuffer(reluOutQueue_, 1, tiling.baseM * tiling.baseN * sizeof(cType)); // Init output buffer. +} + +/** + * @brief Main process of matmul calculation + * @param pipe: Global memory and sync management TPipe object. + * @retval None + */ +template +__aicore__ inline void MatmulLeakyKernel::Process(AscendC::TPipe *pipe) +{ + uint32_t computeRound = 0; + + matmulObj.SetTensorA(aGlobal); + matmulObj.SetTensorB(bGlobal); + matmulObj.SetBias(biasGlobal); + while (matmulObj.template Iterate()) { // Once Iterate, compute baseM * baseN, sync is set true here. + MatmulCompute(); // Get matmul compute result. + LeakyReluCompute(); // Compute leakyRelu. + CopyOut(computeRound); // Copy leakyRelu out result to GM. + computeRound++; + } + matmulObj.End(); +} + +template +__aicore__ inline void MatmulLeakyKernel::MatmulCompute() +{ + reluOutLocal = reluOutQueue_.AllocTensor(); + matmulObj.template GetTensorC(reluOutLocal, false, true); +} + +template +__aicore__ inline void MatmulLeakyKernel::LeakyReluCompute() +{ + LeakyRelu(reluOutLocal, reluOutLocal, (cType)0.001, tiling.baseM * tiling.baseN); + reluOutQueue_.EnQue(reluOutLocal); +} + +/** + * @brief Copy leakyRelu out result to GM. + * @param count: Iterate count(once Iterate, compute baseM * baseN). + * @retval None + */ +template +__aicore__ inline void MatmulLeakyKernel::CopyOut(uint32_t count) +{ + reluOutQueue_.DeQue(); + const uint32_t roundM = tiling.singleCoreM / tiling.baseM; + const uint32_t roundN = tiling.singleCoreN / tiling.baseN; + uint32_t startOffset = (count % roundM * tiling.baseM * tiling.N + count / roundM * tiling.baseN); + AscendC::DataCopyParams copyParam = {(uint16_t)tiling.baseM, (uint16_t)(tiling.baseN * sizeof(cType) / AscendC::DEFAULT_C0_SIZE), 0, + (uint16_t)((tiling.N - tiling.baseN) * sizeof(cType) / AscendC::DEFAULT_C0_SIZE)}; + DataCopy(cGlobal[startOffset], reluOutLocal, copyParam); + reluOutQueue_.FreeTensor(reluOutLocal); +} + +/** + * @brief Calculate the gm offset based on the blockidx. + * @param blockIdx: Current Core blockidx. + * @param tiling: Matmul tiling data. + * @param offsetA: Gm offset of A matrix. + * @param offsetB: Gm offset of B matrix. + * @param offsetC: Gm offset of C matrix. + * @param offsetBias: Gm offset of Bias matrix. + * @retval None + */ +template +__aicore__ inline void +MatmulLeakyKernel::CalcOffset(int32_t blockIdx, const TCubeTiling &tiling, + int32_t &offsetA, int32_t &offsetB, int32_t &offsetC, + int32_t &offsetBias) +{ + auto mSingleBlocks = Ceiling(tiling.M, tiling.singleCoreM); + auto mCoreIndx = blockIdx % mSingleBlocks; + auto nCoreIndx = blockIdx / mSingleBlocks; + + offsetA = mCoreIndx * tiling.Ka * tiling.singleCoreM; + offsetB = nCoreIndx * tiling.singleCoreN; + offsetC = mCoreIndx * tiling.N * tiling.singleCoreM + nCoreIndx * tiling.singleCoreN; + offsetBias = nCoreIndx * tiling.singleCoreN; +} + +/** + * @brief matmul_leakyrelu kernel function entry + * @param a: A matrix gm addr. + * @param b: B matrix gm addr. + * @param bias: Bias gm addr. + * @param c: Out gm addr. + * @param workspace: Temporary gm space addr required by matmul calc. + * @param tilingGm: Tiling data addr. + * @retval None + */ +__global__ __aicore__ void matmul_leakyrelu_custom(GM_ADDR a, GM_ADDR b, GM_ADDR bias, GM_ADDR c, + GM_ADDR workspace, GM_ADDR tilingGm) +{ + AscendC::TPipe pipe; + TCubeTiling tiling; + CopyTiling(&tiling, tilingGm); + + MatmulLeakyKernel matmulLeakyKernel; + matmulLeakyKernel.Init(a, b, bias, c, workspace, tiling, &pipe); + REGIST_MATMUL_OBJ(&pipe, GetSysWorkSpacePtr(), matmulLeakyKernel.matmulObj, &matmulLeakyKernel.tiling); // Initialize the matmul object. + matmulLeakyKernel.Process(&pipe); +} + +/** + * @brief Generate matmul tiling. + * @param socVersion: Platform socversion. + * @param tilingBuf data buffer. + */ +void GenerateTiling(const char *socVersion, uint8_t *tilingBuf) +{ + using TPosition = matmul_tiling::TPosition; + using CubeFormat = matmul_tiling::CubeFormat; + using DataType = matmul_tiling::DataType; + using namespace std; + int M = 1024; + int N = 640; + int K = 256; + + TPosition leftPosition = TPosition::GM; + CubeFormat leftFormat = CubeFormat::ND; + DataType leftDtype = DataType::DT_FLOAT16; + bool isTransA = false; + + TPosition rightPosition = TPosition::GM; + CubeFormat rightFormat = CubeFormat::ND; + DataType rightDtype = DataType::DT_FLOAT16; + bool isTransB = false; + + TPosition resultPosition = TPosition::GM; + CubeFormat resultFormat = CubeFormat::ND; + DataType resultDtype = DataType::DT_FLOAT; + + TPosition biasPosition = TPosition::GM; + CubeFormat biasFormat = CubeFormat::ND; + DataType biasDtype = DataType::DT_FLOAT; + bool isBias = true; + + int usedCoreNum = 2; + int baseM = 256; + int baseN = 128; + + optiling::TCubeTiling tilingData; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); + matmul_tiling::MultiCoreMatmulTiling tilingApi(*ascendcPlatform); + + tilingApi.SetDim(usedCoreNum); // Set the number of cores that participate in multi-core computaion is 2. + tilingApi.SetAType(leftPosition, leftFormat, leftDtype, isTransA); + tilingApi.SetBType(rightPosition, rightFormat, rightDtype, isTransB); + tilingApi.SetCType(resultPosition, resultFormat, resultDtype); + tilingApi.SetBiasType(biasPosition, biasFormat, biasDtype); + + tilingApi.SetOrgShape(M, N, K); + tilingApi.SetShape(M, N, K); + tilingApi.SetBias(isBias); + tilingApi.SetTraverse(matmul_tiling::MatrixTraverse::FIRSTM); // Set the matmul travse is FIRSTM. + tilingApi.SetFixSplit(baseM, baseN, -1); // Set the fixed baseM=128, baseN=256. + tilingApi.SetBufferSpace(-1, -1, -1); + + int64_t res = tilingApi.GetTiling(tilingData); // Get matmul tiling data. + tilingData.set_stepM(1); // Set the matmul tiling stepM=1. + tilingData.set_stepN(1); // Set the matmul tiling stepN=1. + if (res == -1) { + std::cout << "gen tiling failed" << std::endl; + } + uint32_t tcubeTilingSize = tilingData.GetDataSize(); + tilingData.SaveToBuffer(tilingBuf, tcubeTilingSize); + return; +} + +int32_t main(int32_t argc, char *argv[]) +{ + const char *socVersion = "Ascend910B1"; + auto ascendcPlatform = platform_ascendc::PlatformAscendCManager::GetInstance(socVersion); + size_t aFileSize = 262144 * sizeof(int16_t); + size_t bFileSize = 163840 * sizeof(int16_t); + size_t cFileSize = 655360 * sizeof(float); + size_t biasFileSize = 640 * sizeof(float); + size_t tilingFileSize = sizeof(TCubeTiling); + size_t userWorkspaceSize = 0; + size_t systemWorkspaceSize = static_cast(ascendcPlatform->GetLibApiWorkSpaceSize()); + size_t workspaceSize = userWorkspaceSize + systemWorkspaceSize; + uint8_t *tilingBuf = (uint8_t *)malloc(tilingFileSize); + GenerateTiling(socVersion, tilingBuf); + uint32_t blockDim = 1; + + aclInit(nullptr); + int32_t deviceId = 0; + aclrtSetDevice(deviceId); + aclrtStream stream = nullptr; + aclrtCreateStream(&stream); + + uint8_t *inputAHost; + uint8_t *inputADevice; + aclrtMallocHost((void **)(&inputAHost), aFileSize); + aclrtMalloc((void **)&inputADevice, aFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + ReadFile("./input/x1_gm.bin", aFileSize, inputAHost, aFileSize); + aclrtMemcpy(inputADevice, aFileSize, inputAHost, aFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + uint8_t *inputBHost; + uint8_t *inputBDevice; + aclrtMallocHost((void **)(&inputBHost), bFileSize); + aclrtMalloc((void **)&inputBDevice, bFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + ReadFile("./input/x2_gm.bin", bFileSize, inputBHost, bFileSize); + aclrtMemcpy(inputBDevice, bFileSize, inputBHost, bFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + uint8_t *outputCHost; + uint8_t *outputCDevice; + aclrtMallocHost((void **)(&outputCHost), cFileSize); + aclrtMalloc((void **)&outputCDevice, cFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + + uint8_t *inputBiasHost; + uint8_t *inputBiasDevice; + aclrtMallocHost((void **)(&inputBiasHost), biasFileSize); + aclrtMalloc((void **)&inputBiasDevice, biasFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + ReadFile("./input/bias.bin", biasFileSize, inputBiasHost, biasFileSize); + aclrtMemcpy(inputBiasDevice, biasFileSize, inputBiasHost, biasFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + uint8_t *tilingHost; + uint8_t *tilingDevice; + aclrtMallocHost((void **)(&tilingHost), tilingFileSize); + aclrtMalloc((void **)&tilingDevice, tilingFileSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMemcpy(tilingHost, tilingFileSize, tilingBuf, tilingFileSize, ACL_MEMCPY_HOST_TO_HOST); + aclrtMemcpy(tilingDevice, tilingFileSize, tilingHost, tilingFileSize, ACL_MEMCPY_HOST_TO_DEVICE); + + uint8_t *workspaceDevice; + aclrtMalloc((void **)&workspaceDevice, workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST); + + matmul_leakyrelu_custom<<>>(inputADevice, inputBDevice, inputBiasDevice, outputCDevice, + workspaceDevice, tilingDevice); + + aclrtSynchronizeStream(stream); + + aclrtFree(inputADevice); + aclrtFreeHost(inputAHost); + aclrtFree(inputBDevice); + aclrtFreeHost(inputBHost); + aclrtMemcpy(outputCHost, cFileSize, outputCDevice, cFileSize, ACL_MEMCPY_DEVICE_TO_HOST); + WriteFile("./output/output.bin", outputCHost, cFileSize); + aclrtFree(outputCDevice); + aclrtFreeHost(outputCHost); + aclrtFree(inputBiasDevice); + aclrtFreeHost(inputBiasHost); + aclrtFree(tilingDevice); + aclrtFreeHost(tilingHost); + aclrtFree(workspaceDevice); + + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + free(tilingBuf); + return 0; +} \ No newline at end of file diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh new file mode 100755 index 000000000..be900eb02 --- /dev/null +++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh @@ -0,0 +1,8 @@ +rm -rf build +mkdir -p build +cd build +cmake .. +make -j +python3 ../scripts/gen_data.py +./demo +python3 ../scripts/verify_result.py output/output.bin output/golden.bin diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/gen_data.py b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/gen_data.py new file mode 100644 index 000000000..e03d4359c --- /dev/null +++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/gen_data.py @@ -0,0 +1,35 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import numpy as np +import os + + +def gen_golden_data(): + M = 1024 + N = 640 + K = 256 + + input_a = np.random.randint(1, 10, [M, K]).astype(np.float16) + input_b = np.random.randint(1, 10, [K, N]).astype(np.float16) + input_bias = np.random.randint(1, 10, [N]).astype(np.float32) + alpha = 0.001 + golden = (np.matmul(input_a.astype(np.float32), input_b.astype(np.float32)) + input_bias).astype(np.float32) + golden = np.where(golden >= 0, golden, golden * alpha) + os.system("mkdir -p input") + os.system("mkdir -p output") + input_a.tofile("./input/x1_gm.bin") + input_b.tofile("./input/x2_gm.bin") + input_bias.tofile("./input/bias.bin") + golden.tofile("./output/golden.bin") + + +if __name__ == "__main__": + gen_golden_data() diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/verify_result.py b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/verify_result.py new file mode 100644 index 000000000..7a7e27ffa --- /dev/null +++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/scripts/verify_result.py @@ -0,0 +1,55 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import sys +import numpy as np + +# for float32 +relative_tol = 1e-6 +absolute_tol = 1e-9 +error_tol = 1e-4 + + +def verify_result(output, golden): + output = np.fromfile(output, dtype=np.float32).reshape(-1) + golden = np.fromfile(golden, dtype=np.float32).reshape(-1) + print("golden : ", golden) + print("output : ", output) + different_element_results = np.isclose(output, + golden, + rtol=relative_tol, + atol=absolute_tol, + equal_nan=True) + different_element_indexes = np.where(different_element_results == False)[0] + for index in range(len(different_element_indexes)): + real_index = different_element_indexes[index] + golden_data = golden[real_index] + output_data = output[real_index] + print( + "data index: %06d, expected: %-.9f, actual: %-.9f, rdiff: %-.6f" % + (real_index, golden_data, output_data, + abs(output_data - golden_data) / golden_data)) + if index == 100: + break + error_ratio = float(different_element_indexes.size) / golden.size + print("error ratio: %.4f, tolerance: %.4f" % (error_ratio, error_tol)) + return error_ratio <= error_tol + + +if __name__ == '__main__': + try: + res = verify_result(sys.argv[1], sys.argv[2]) + if not res: + raise ValueError("[ERROR] result error") + else: + print("test pass") + except Exception as e: + print(e) + sys.exit(1) -- Gitee From d8a10e40b7d8849cf93ef9a9a8f7338373140271 Mon Sep 17 00:00:00 2001 From: ApeiriaNode_Booker Date: Mon, 22 Sep 2025 14:27:12 +0800 Subject: [PATCH 2/2] add simple pybind samples --- .../24_simple_hello_world/CMakeLists.txt | 6 +- .../24_simple_hello_world/README.md | 5 +- .../{hello_world.cpp => hello_world.asc} | 2 +- .../25_simple_add/CMakeLists.txt | 6 +- .../0_introduction/25_simple_add/README.md | 5 +- .../{add_custom.cpp => add_custom.asc} | 2 +- .../26_simple_matmulleakyrelu/CMakeLists.txt | 6 +- .../26_simple_matmulleakyrelu/README.md | 5 +- ...mul_leakyrelu.cpp => matmul_leakyrelu.asc} | 2 +- .../26_simple_matmulleakyrelu/run.sh | 8 +- .../CMakeLists.txt | 63 ++++++++++ .../27_simple_add_cpp_extensions/README.md | 102 ++++++++++++++++ .../add_custom.asc | 111 ++++++++++++++++++ .../add_custom_test.py | 38 ++++++ operator/ascendc/0_introduction/README.md | 7 +- 15 files changed, 334 insertions(+), 34 deletions(-) rename operator/ascendc/0_introduction/24_simple_hello_world/{hello_world.cpp => hello_world.asc} (97%) rename operator/ascendc/0_introduction/25_simple_add/{add_custom.cpp => add_custom.asc} (99%) rename operator/ascendc/0_introduction/26_simple_matmulleakyrelu/{matmul_leakyrelu.cpp => matmul_leakyrelu.asc} (99%) create mode 100644 operator/ascendc/0_introduction/27_simple_add_cpp_extensions/CMakeLists.txt create mode 100644 operator/ascendc/0_introduction/27_simple_add_cpp_extensions/README.md create mode 100644 operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom.asc create mode 100644 operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom_test.py diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt b/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt index 590f26516..559929405 100644 --- a/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt +++ b/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt @@ -6,10 +6,6 @@ find_package(ASC REQUIRED) project(kernel_samples LANGUAGES ASC CXX) -set_source_files_properties( - hello_world.cpp PROPERTIES LANGUAGE ASC -) - add_executable(demo - hello_world.cpp + hello_world.asc ) \ No newline at end of file diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/README.md b/operator/ascendc/0_introduction/24_simple_hello_world/README.md index 20c1ee4c8..ddcaf96e9 100644 --- a/operator/ascendc/0_introduction/24_simple_hello_world/README.md +++ b/operator/ascendc/0_introduction/24_simple_hello_world/README.md @@ -5,13 +5,12 @@ ``` ├── 24_simple_helloworld │ ├── CMakeLists.txt // 编译工程文件 -│ └── hello_world.cpp // 算子实现及测试 +│ └── hello_world.asc // AscendC算子实现 & 调用样例 ``` ## 支持的产品型号 本样例支持如下产品型号: -- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 -- Atlas A3 训练系列产品/Atlas A3 推理系列产品 +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品 ## 运行样例算子 - 打开样例目录 diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp b/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.asc similarity index 97% rename from operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp rename to operator/ascendc/0_introduction/24_simple_hello_world/hello_world.asc index cb28dec3d..e67f32663 100644 --- a/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp +++ b/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.asc @@ -1,5 +1,5 @@ /** - * @file hello_world.cpp + * @file hello_world.asc * * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. * diff --git a/operator/ascendc/0_introduction/25_simple_add/CMakeLists.txt b/operator/ascendc/0_introduction/25_simple_add/CMakeLists.txt index b3e88f157..08689321b 100644 --- a/operator/ascendc/0_introduction/25_simple_add/CMakeLists.txt +++ b/operator/ascendc/0_introduction/25_simple_add/CMakeLists.txt @@ -6,10 +6,6 @@ find_package(ASC REQUIRED) project(kernel_samples LANGUAGES ASC CXX) -set_source_files_properties( - add_custom.cpp PROPERTIES LANGUAGE ASC -) - add_executable(demo - add_custom.cpp + add_custom.asc ) \ No newline at end of file diff --git a/operator/ascendc/0_introduction/25_simple_add/README.md b/operator/ascendc/0_introduction/25_simple_add/README.md index 6f6600a15..470c3edef 100644 --- a/operator/ascendc/0_introduction/25_simple_add/README.md +++ b/operator/ascendc/0_introduction/25_simple_add/README.md @@ -5,7 +5,7 @@ ``` ├── 25_simple_add │ ├── CMakeLists.txt // 编译工程文件 -│ └── add_custom.cpp // 算子实现及测试 +│ └── add_custom.asc // AscendC算子实现 & 调用样例 ``` ## 算子描述 @@ -41,8 +41,7 @@ z = x + y ## 支持的产品型号 本样例支持如下产品型号: -- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 -- Atlas A3 训练系列产品/Atlas A3 推理系列产品 +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品 ## 运行样例算子 - 打开样例目录 diff --git a/operator/ascendc/0_introduction/25_simple_add/add_custom.cpp b/operator/ascendc/0_introduction/25_simple_add/add_custom.asc similarity index 99% rename from operator/ascendc/0_introduction/25_simple_add/add_custom.cpp rename to operator/ascendc/0_introduction/25_simple_add/add_custom.asc index d2b5cb112..544e134f3 100644 --- a/operator/ascendc/0_introduction/25_simple_add/add_custom.cpp +++ b/operator/ascendc/0_introduction/25_simple_add/add_custom.asc @@ -1,5 +1,5 @@ /** - * @file add_custom.cpp + * @file add_custom.asc * * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. * diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt index c833ed1fd..2958d3a02 100644 --- a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt +++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/CMakeLists.txt @@ -6,12 +6,8 @@ find_package(ASC REQUIRED) project(kernel_samples LANGUAGES ASC CXX) -set_source_files_properties( - matmul_leakyrelu.cpp PROPERTIES LANGUAGE ASC -) - add_executable(demo - matmul_leakyrelu.cpp + matmul_leakyrelu.asc ) target_link_libraries(demo PRIVATE diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md index a8b9dbd70..da5b62293 100644 --- a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md +++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/README.md @@ -7,7 +7,7 @@ ├── 26_simple_matmulleakyrelu │ ├── CMakeLists.txt // 编译工程文件 │ ├── data_utils.h // 数据读入写出函数 -│ ├── matmul_leakyrelu.cpp // 算子实现及测试 +│ ├── matmul_leakyrelu.asc // AscendC算子实现 & 调用样例 │ ├── run.sh // 运行脚本 │ └── scripts │ ├── gen_data.py // 输入数据和真值数据生成脚本文件 @@ -58,8 +58,7 @@ C = C > 0 ? C : C * 0.001 ## 支持的产品型号 本样例支持如下产品型号: -- Atlas A2 训练系列产品/Atlas 800I A2 推理产品/A200I A2 Box 异构组件 -- Atlas A3 训练系列产品/Atlas A3 推理系列产品 +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品 ## 运行样例算子 - 打开样例目录 diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.asc similarity index 99% rename from operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp rename to operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.asc index 6c1e6136a..430d62328 100644 --- a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.cpp +++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/matmul_leakyrelu.asc @@ -1,5 +1,5 @@ /** - * @file matmul_leakyrelu_custom.cpp + * @file matmul_leakyrelu_custom.asc * * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. * diff --git a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh index be900eb02..fe03bb21f 100755 --- a/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh +++ b/operator/ascendc/0_introduction/26_simple_matmulleakyrelu/run.sh @@ -1,8 +1,6 @@ -rm -rf build -mkdir -p build -cd build -cmake .. -make -j +#!/bin/bash +rm -rf build; mkdir -p build; cd build +cmake ..; make -j python3 ../scripts/gen_data.py ./demo python3 ../scripts/verify_result.py output/output.bin output/golden.bin diff --git a/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/CMakeLists.txt b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/CMakeLists.txt new file mode 100644 index 000000000..96c61cc34 --- /dev/null +++ b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/CMakeLists.txt @@ -0,0 +1,63 @@ +cmake_minimum_required(VERSION 3.16) + +set(SOC_VERSION "Ascend910B1" CACHE STRING "soc version") + +find_package(ASC REQUIRED) + +execute_process(COMMAND python3 -c "import os; import torch; print(os.path.dirname(torch.__file__))" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE TORCH_PATH +) +message("TORCH_PATH is ${TORCH_PATH}") + +execute_process(COMMAND python3 -c "import os; import torch_npu; print(os.path.dirname(torch_npu.__file__))" + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE TORCH_NPU_PATH +) +message("TORCH_NPU_PATH is ${TORCH_NPU_PATH}") + +execute_process(COMMAND python3 -m pybind11 --includes + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE PYBIND11_INC +) +string(REPLACE " " ";" PYBIND11_INC ${PYBIND11_INC}) + +execute_process(COMMAND python3-config --extension-suffix + OUTPUT_STRIP_TRAILING_WHITESPACE + OUTPUT_VARIABLE PYBIND11_SUFFIX +) + +project(kernel_samples LANGUAGES ASC CXX) + +add_library(pybind11_lib SHARED + add_custom.asc +) + +target_link_libraries(pybind11_lib PRIVATE + torch_npu +) + +target_link_directories(pybind11_lib PRIVATE + ${TORCH_PATH}/lib + ${TORCH_NPU_PATH}/lib +) + +target_include_directories(pybind11_lib PRIVATE + ${TORCH_NPU_PATH}/include + ${TORCH_PATH}/include + ${TORCH_PATH}/include/torch/csrc/api/include +) + +target_compile_definitions(pybind11_lib PRIVATE + _GLIBCXX_USE_CXX11_ABI=0 +) + +target_compile_options(pybind11_lib PRIVATE + ${PYBIND11_INC} + -fPIC +) + +set_target_properties(pybind11_lib PROPERTIES + OUTPUT_NAME add_custom${PYBIND11_SUFFIX} + PREFIX "" SUFFIX "" +) \ No newline at end of file diff --git a/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/README.md b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/README.md new file mode 100644 index 000000000..3769caeb1 --- /dev/null +++ b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/README.md @@ -0,0 +1,102 @@ +## 简化Pybind算子直调样例 +本样例使用pybind方式调用核函数,以带有Tiling的Add算子为示例,展示了一种更为简单的算子编译流程,支持main函数和Kernel函数在同一个cpp文件中实现。 +> ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。 + +## 目录结构介绍 +``` +├── 27_simple_add_cpp_extensions +│ ├── CMakeLists.txt // 编译工程文件 +│ ├── add_custom_test.py // python调用脚本 +│ ├── add_custom.asc // AscendC算子实现 & Pybind封装 +│ └── run.sh // 编译运行算子的脚本 +``` +## 代码实现介绍 +- kernel实现 + Add算子的数学表达式为: + ``` + z = x + y + ``` + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。具体请参考[add_custom.asc](./add_custom.asc)。 + +- 调用实现 + 通过PyTorch框架进行模型的训练、推理时,会调用到很多算子进行计算,调用方式也和kernel编译流程相关。对于自定义算子工程,需要使用PyTorch Ascend Adapter中的OP-Plugin算子插件对功能进行扩展,让torch可以直接调用自定义算子包中的算子;对于简化KernelLaunch开放式算子编程的方式,也可以使用pytorch调用,此样例演示的就是这种算子调用方式。 + + pybind11.cpp文件是一个C++的代码示例,使用了pybind11库来将C++代码封装成Python模块。该代码实现中定义了一个名为m的pybind11模块,其中包含一个名为run_add_custom的函数。该函数与my_add::run_add_custom函数相同,用于将C++函数转成Python函数。在函数实现中,通过c10_npu::getCurrentNPUStream() 的函数获取当前NPU上的流,通过内核调用符<<<>>>调用自定义的Kernel函数add_custom,在NPU上执行算子。 + + 在add_custom_test.py调用脚本中,通过导入自定义模块add_custom,调用自定义模块add_custom中的run_add_custom函数,在NPU上执行x和y的加法操作,并将结果保存在变量z中。 + +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas A2 训练系列产品/Atlas 800I A2 推理产品 + +## 运行样例算子 + - 安装pytorch (这里使用2.1.0版本为例) + + **aarch64:** + + ```bash + pip3 install torch==2.1.0 + ``` + + **x86:** + + ```bash + pip3 install torch==2.1.0+cpu --index-url https://download.pytorch.org/whl/cpu + ``` + + - 安装torch-npu (以Pytorch2.1.0、python3.9、CANN版本8.0.RC1.alpha002为例) + + ```bash + git clone https://gitee.com/ascend/pytorch.git -b v6.0.rc1.alpha002-pytorch2.1.0 + cd pytorch/ + bash ci/build.sh --python=3.9 + pip3 install dist/*.whl + ``` + + 安装pybind11 + ```bash + pip3 install pybind11 + ``` + + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/27_simple_add_cpp_extensions + ``` + + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + 配置按安装径后,执行以下命令统一配置环境变量。 + ```bash + # 配置CANN环境变量 + source ${ASCEND_INSTALL_PATH}/bin/setenv.bash + # 添加AscendC CMake Module搜索路径至环境变量 + export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH + ``` + + - 样例执行 + ```bash + rm -rf build; mkdir -p build; cd build # 创建并进入build目录 + cmake ..; make -j # 编译算子so + python3 ../add_custom_test.py # 执行样例 + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/09/22 | 新增本readme | diff --git a/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom.asc b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom.asc new file mode 100644 index 000000000..d4a076832 --- /dev/null +++ b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom.asc @@ -0,0 +1,111 @@ +/** + * @file add_custom.asc + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include +#include + +#include "torch_npu/csrc/core/npu/NPUStream.h" +#include "kernel_operator.h" + +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) + { + this->blockLength = totalLength / AscendC::GetBlockNum(); + this->tileNum = 8; + this->tileLength = this->blockLength / this->tileNum / BUFFER_NUM; + xGm.SetGlobalBuffer((__gm__ half *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ half *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + zGm.SetGlobalBuffer((__gm__ half *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(half)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(half)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(half)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX, inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t blockLength; + uint32_t tileNum; + uint32_t tileLength; +}; + +__global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength) +{ + KernelAdd op; + op.Init(x, y, z, totalLength); + op.Process(); +} + +namespace my_add { +at::Tensor run_add_custom(const at::Tensor &x, const at::Tensor &y) +{ + auto aclStream = c10_npu::getCurrentNPUStream().stream(false); + at::Tensor z = at::empty_like(x); + uint32_t blockDim = 8; + uint32_t totalLength = 1; + for (uint32_t size : x.sizes()) { + totalLength *= size; + } + auto xGm = static_cast(const_cast(x.storage().data())); + auto yGm = static_cast(const_cast(y.storage().data())); + auto zGm = static_cast(const_cast(z.storage().data())); + add_custom<<>>(xGm, yGm, zGm, totalLength); + return z; +} +} // namespace my_add + +PYBIND11_MODULE(add_custom, m) +{ + m.doc() = "add_custom pybind11 interfaces"; // optional module docstring + m.def("run_add_custom", &my_add::run_add_custom, ""); +} diff --git a/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom_test.py b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom_test.py new file mode 100644 index 000000000..b5d63dee7 --- /dev/null +++ b/operator/ascendc/0_introduction/27_simple_add_cpp_extensions/add_custom_test.py @@ -0,0 +1,38 @@ +#!/usr/bin/python3 +# coding=utf-8 +# +# Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. +# +# This program is distributed in the hope that it will be useful, +# but WITHOUT ANY WARRANTY; without even the implied warranty of +# MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. +# =============================================================================== + +import torch +import torch_npu +from torch_npu.testing.testcase import TestCase, run_tests +import sys, os + +sys.path.append(os.getcwd()) +import add_custom + +torch.npu.config.allow_internal_format = False + + +class TestCustomAdd(TestCase): + + def test_add_custom_ops(self): + length = [8, 2048] + x = torch.rand(length, device='cpu', dtype=torch.float16) + y = torch.rand(length, device='cpu', dtype=torch.float16) + + x_npu = x.npu() + y_npu = y.npu() + output = add_custom.run_add_custom(x_npu, y_npu) + cpuout = torch.add(x, y) + + self.assertRtolEqual(output, cpuout) + + +if __name__ == "__main__": + run_tests() diff --git a/operator/ascendc/0_introduction/README.md b/operator/ascendc/0_introduction/README.md index 44a722d77..0b9383e71 100644 --- a/operator/ascendc/0_introduction/README.md +++ b/operator/ascendc/0_introduction/README.md @@ -38,7 +38,10 @@ | [21_vectoradd_kernellaunch](./21_vectoradd_kernellaunch) | 基于Ascend C的Add多场景自定义Vector算子的KernelLaunch调用样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | [22_baremix_kernellaunch](./22_baremix_kernellaunch) | 通过更底层的编码方式,实现MatmulLeayrelu融合算子的样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 | [23_static_tensor_programming_kernellaunch](./23_static_tensor_programming_kernellaunch) | 通过静态Tensor编程方式,实现Add算子的样例 | Atlas A2训练系列产品/Atlas 800I A2推理产品 - +| [24_simple_hello_world](./24_simple_hello_world) | Ascend C异构混合编程样例, 实现Hello World算子及调用, 支持host/device代码混合编程 | Atlas A2训练系列产品/Atlas 800I A2推理产品 +| [25_simple_add](./25_simple_add) | Ascend C异构混合编程样例, 实现Add自定义Vector算子及调用, 支持host/device代码混合编程 | Atlas A2训练系列产品/Atlas 800I A2推理产品 +| [26_simple_matmulleakyrelu](./26_simple_matmulleakyrelu) | Ascend C异构混合编程样例, 实现MatmulLeakyRelu自定义Cube+Vector算子及调用, 支持host/device代码混合编程 | Atlas A2训练系列产品/Atlas 800I A2推理产品 +| [27_simple_add_cpp_extensions](./27_simple_add_cpp_extensions) | Ascend C异构混合编程样例, 实现Add自定义Vector算子动态库及pybind调用, 支持host/device代码混合编程 | Atlas A2训练系列产品/Atlas 800I A2推理产品 ## 获取样例代码 可以使用以下两种方式下载,请选择其中一种进行源码准备。 @@ -72,4 +75,4 @@ | 2025/01/06 | 新增21_vectoradd_kernellaunch样例 | | 2025/07/22 | 新增8_library_frameworklaunch样例 | | 2025/7/28 | 新增22_baremix_kernellaunch | - +| 2025/9/22 | 新增Ascend C异构混合编程样例24-27 | \ No newline at end of file -- Gitee