diff --git a/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt b/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..b3e88f1572b89f6f9d2a2862accd54baf0951847 --- /dev/null +++ b/operator/ascendc/0_introduction/23_simple_add/CMakeLists.txt @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 3.16) + +set(SOC_VERSION "Ascend910B1" CACHE STRING "soc version") + +find_package(ASC REQUIRED) + +project(kernel_samples LANGUAGES ASC CXX) + +set_source_files_properties( + add_custom.cpp PROPERTIES LANGUAGE ASC +) + +add_executable(demo + add_custom.cpp +) \ No newline at end of file diff --git a/operator/ascendc/0_introduction/23_simple_add/README.md b/operator/ascendc/0_introduction/23_simple_add/README.md new file mode 100644 index 0000000000000000000000000000000000000000..4b3dce29fda7257aabe0a2bdee6b8c0d806faefb --- /dev/null +++ b/operator/ascendc/0_introduction/23_simple_add/README.md @@ -0,0 +1,86 @@ +## 简化Add算子直调样例 +本样例以Add算子为示例,展示了一种更为简单的算子编译流程,支持main函数和Kernel函数在同一个cpp文件中实现。 +> ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。 +## 目录结构介绍 +``` +├── 23_simple_add +│ ├── CMakeLists.txt // 编译工程文件 +│ └── add_custom.cpp // 算子实现及测试 +``` + +## 算子描述 +Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为: +``` +z = x + y +``` +## 算子规格描述 + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x8 * 2048floatND
y8 * 2048floatND
算子输出z8 * 2048floatND
核函数名add_custom
+ +## 代码实现介绍 +- kernel实现 + Add算子的数学表达式为: + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。 +- tiling实现 + TilingData参数设计,TilingData参数本质上是和并行数据切分相关的参数,本示例算子使用了2个tiling参数:totalLength、tileNum。totalLength是指需要计算的数据量大小,tileNum是指每个核上总计算数据分块个数。比如,totalLength这个参数传递到kernel侧后,可以通过除以参与计算的核数,得到每个核上的计算量,这样就完成了多核数据的切分。 + +- 调用实现 + 使用内核调用符<<<>>>调用核函数。 + +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/23_simple_add/ + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + 配置按安装径后,执行以下命令统一配置环境变量。 + ```bash + # 配置CANN环境变量 + source ${ASCEND_INSTALL_PATH}/bin/setenv.bash + # 添加AscendC CMake Module搜索路径至环境变量 + export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH + ``` + + - 样例执行 + ```bash + mkdir -p build && cd build; # 创建并进入build目录 + cmake ..;make -j; # 编译工程 + ./demo # 执行样例 + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/09/15 | 新增本readme | \ No newline at end of file diff --git a/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp b/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp new file mode 100644 index 0000000000000000000000000000000000000000..d2b5cb112b1457f6a9a007b66ff3683e0dc09704 --- /dev/null +++ b/operator/ascendc/0_introduction/23_simple_add/add_custom.cpp @@ -0,0 +1,180 @@ +/** + * @file add_custom.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ + +#include +#include +#include +#include +#include +#include "acl/acl.h" +#include "kernel_operator.h" + +constexpr uint32_t BUFFER_NUM = 2; // tensor num for each queue + +struct AddCustomTilingData +{ + uint32_t totalLength; + uint32_t tileNum; +}; + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum) + { + this->blockLength = totalLength / AscendC::GetBlockNum(); + this->tileNum = tileNum; + this->tileLength = this->blockLength / tileNum / BUFFER_NUM; + xGm.SetGlobalBuffer((__gm__ float *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ float *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + zGm.SetGlobalBuffer((__gm__ float *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(float)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(float)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(float)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX, inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t blockLength; + uint32_t tileNum; + uint32_t tileLength; +}; + +__global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, AddCustomTilingData tiling) +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); + KernelAdd op; + op.Init(x, y, z, tiling.totalLength, tiling.tileNum); + op.Process(); +} + +std::vector kernel_add(std::vector &x, std::vector &y) +{ + constexpr uint32_t blockDim = 8; + uint32_t totalLength = x.size(); + size_t totalByteSize = totalLength * sizeof(float); + int32_t deviceId = 0; + aclrtStream stream = nullptr; + AddCustomTilingData tiling = {/*totalLength:*/totalLength, /*tileNum:*/8}; + uint8_t *xHost = reinterpret_cast(x.data()); + uint8_t *yHost = reinterpret_cast(y.data()); + uint8_t *zHost = nullptr; + uint8_t *xDevice = nullptr; + uint8_t *yDevice = nullptr; + uint8_t *zDevice = nullptr; + + aclInit(nullptr); + aclrtSetDevice(deviceId); + aclrtCreateStream(&stream); + + aclrtMallocHost((void **)(&zHost), totalByteSize); + aclrtMalloc((void **)&xDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&yDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST); + aclrtMalloc((void **)&zDevice, totalByteSize, ACL_MEM_MALLOC_HUGE_FIRST); + + aclrtMemcpy(xDevice, totalByteSize, xHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE); + aclrtMemcpy(yDevice, totalByteSize, yHost, totalByteSize, ACL_MEMCPY_HOST_TO_DEVICE); + + add_custom<<>>(xDevice, yDevice, zDevice, tiling); + aclrtSynchronizeStream(stream); + + aclrtMemcpy(zHost, totalByteSize, zDevice, totalByteSize, ACL_MEMCPY_DEVICE_TO_HOST); + std::vector z((float *)zHost, (float *)(zHost + totalLength)); + + aclrtFree(xDevice); + aclrtFree(yDevice); + aclrtFree(zDevice); + aclrtFreeHost(zHost); + + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + + return z; +} + +uint32_t VerifyResult(std::vector &output, std::vector &golden) +{ + auto printTensor = [](std::vector &tensor, const char *name) { + constexpr size_t maxPrintSize = 20; + std::cout << name << ": "; + std::copy(tensor.begin(), tensor.begin() + std::min(tensor.size(), maxPrintSize), + std::ostream_iterator(std::cout, " ")); + if (tensor.size() > maxPrintSize) { + std::cout << "..."; + } + std::cout << std::endl; + }; + printTensor(output, "Output"); + printTensor(golden, "Golden"); + if (std::equal(output.begin(), output.end(), golden.begin())) { + std::cout << "[Success] Case accuracy is verification passed." << std::endl; + return 0; + } else { + std::cout << "[Failed] Case accuracy is verification failed!" << std::endl; + return 1; + } + return 0; +} + +int32_t main(int32_t argc, char *argv[]) +{ + constexpr uint32_t totalLength = 8 * 2048; + constexpr float valueX = 1.2f; + constexpr float valueY = 2.3f; + std::vector x(totalLength, valueX); + std::vector y(totalLength, valueY); + + std::vector output = kernel_add(x, y); + + std::vector golden(totalLength, valueX + valueY); + return VerifyResult(output, golden); +} \ No newline at end of file diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt b/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..590f26516ae6a3fb1703bb7741bb3c9c19b15651 --- /dev/null +++ b/operator/ascendc/0_introduction/24_simple_hello_world/CMakeLists.txt @@ -0,0 +1,15 @@ +cmake_minimum_required(VERSION 3.16) + +set(SOC_VERSION "Ascend910B1" CACHE STRING "soc version") + +find_package(ASC REQUIRED) + +project(kernel_samples LANGUAGES ASC CXX) + +set_source_files_properties( + hello_world.cpp PROPERTIES LANGUAGE ASC +) + +add_executable(demo + hello_world.cpp +) \ No newline at end of file diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/README.md b/operator/ascendc/0_introduction/24_simple_hello_world/README.md new file mode 100644 index 0000000000000000000000000000000000000000..53ab1d8cc81ef6441bec25928132f2512a5df9a0 --- /dev/null +++ b/operator/ascendc/0_introduction/24_simple_hello_world/README.md @@ -0,0 +1,54 @@ +## 简化HelloWorld算子直调样例 +本样例通过使用<<<>>>内核调用符来完成算子核函数在NPU侧运行验证的基础流程,核函数内通过printf打印输出结果。 +> ⚠️ **注意** 该样例将在未来的`CANN 8.3`开始支持。 +## 目录结构介绍 +``` +├── 24_simple_helloworld +│ ├── CMakeLists.txt // 编译工程文件 +│ └── hello_world.cpp // 算子实现及测试 +``` + +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + + +## 运行样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/24_simple_helloworld/ + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + 配置按安装径后,执行以下命令统一配置环境变量。 + ```bash + # 配置CANN环境变量 + source ${ASCEND_INSTALL_PATH}/bin/setenv.bash + # 添加AscendC CMake Module搜索路径至环境变量 + export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH + ``` + - 样例执行 + ```bash + mkdir -p build && cd build; # 创建并进入build目录 + cmake ..;make -j; # 编译工程 + ./demo # 执行样例 + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/09/15 | 新增本readme | \ No newline at end of file diff --git a/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp b/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cb28dec3d6dd069cc9d1bbfd2484045839d9e3f5 --- /dev/null +++ b/operator/ascendc/0_introduction/24_simple_hello_world/hello_world.cpp @@ -0,0 +1,35 @@ +/** + * @file hello_world.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "kernel_operator.h" +#include "acl/acl.h" + +__global__ __aicore__ void hello_world() +{ + KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIC_ONLY); + AscendC::printf("Hello World!!!\n"); +} + +int32_t main(int argc, char const *argv[]) +{ + aclInit(nullptr); + int32_t deviceId = 0; + aclrtSetDevice(deviceId); + aclrtStream stream = nullptr; + aclrtCreateStream(&stream); + + constexpr uint32_t blockDim = 1; + hello_world<<>>(); + aclrtSynchronizeStream(stream); + + aclrtDestroyStream(stream); + aclrtResetDevice(deviceId); + aclFinalize(); + return 0; +} \ No newline at end of file