diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/CMakeLists.txt b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..38e13a85eed9af9b79c212856b89364c4aa69e55 --- /dev/null +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/CMakeLists.txt @@ -0,0 +1,45 @@ +cmake_minimum_required(VERSION 3.16.0) +project(opp) + +set(ASCEND_COMPUTE_UNIT ascend910b) +find_package(ASC REQUIRED) + +npu_op_package(${vendor_name} + TYPE RUN +) + +file(GLOB host_ops_srcs ${CMAKE_CURRENT_SOURCE_DIR}/add_custom_host.cpp) +npu_op_code_gen( + SRC ${host_ops_srcs} + PACKAGE ${vendor_name} + OUT_DIR ${ASCEND_AUTOGEN_PATH} + OPTIONS + OPS_PRODUCT_NAME ${ASCEND_COMPUTE_UNIT} +) + +file(GLOB autogen_aclnn_srcs ${ASCEND_AUTOGEN_PATH}/aclnn_*.cpp) +set_source_files_properties(${autogen_aclnn_srcs} PROPERTIES GENERATED TRUE) +npu_op_library(cust_opapi ACLNN + ${autogen_aclnn_srcs} +) + +npu_op_library(cust_optiling TILING + ${host_ops_srcs} +) + +npu_op_kernel_library(ascendc_kernels + SRC_BASE ${CMAKE_SOURCE_DIR}/ + TILING_LIBRARY cust_optiling +) + +npu_op_kernel_sources(ascendc_kernels + OP_NAME AddCustom + KERNEL_FILE add_custom_kernel.cpp +) + +npu_op_package_add(${vendor_name} + LIBRARY + cust_opapi + cust_optiling + ascendc_kernels +) diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/README.md b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/README.md new file mode 100644 index 0000000000000000000000000000000000000000..d0e4472c18a306b8123191506a23805fbe4e4a91 --- /dev/null +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/README.md @@ -0,0 +1,86 @@ +## 简化Add算子直调样例 +本样例以Add算子为示例,展示了简单、灵活的算子编译流程。 +**注意:本样例仅支持CANN8.3及以上版本。** +## 目录结构介绍 +``` +├── AddCustomTiny +│ ├── add_cutsom_host.cpp // host侧编译源码文件 +│ ├── add_custom_tiling.h // host侧编译tiling头文件 +│ ├── add_custom_kernel.cpp // kernel侧编译源码文件 +│ ├── CMakeLists.txt // 编译工程文件 +│ └── readme.md // 算子实现及测试 +``` + +## 算子描述 +Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为: +``` +z = x + y +``` +## 算子规格描述 + + + + + + + + + + + +
算子类型(OpType)Add
算子输入nameshapedata typeformat
x8 * 2048floatND
y8 * 2048floatND
算子输出z8 * 2048floatND
核函数名add_custom
+ +## 代码实现介绍 +- kernel实现 + Add算子的数学表达式为: + 计算逻辑是:Ascend C提供的矢量计算接口的操作元素都为LocalTensor,输入数据需要先搬运进片上存储,然后使用计算接口完成两个输入参数相加,得到最终结果,再搬出到外部存储上。 + + Add算子的实现流程分为3个基本任务:CopyIn,Compute,CopyOut。CopyIn任务负责将Global Memory上的输入Tensor xGm和yGm搬运到Local Memory,分别存储在xLocal、yLocal,Compute任务负责对xLocal、yLocal执行加法操作,计算结果存储在zLocal中,CopyOut任务负责将输出数据从zLocal搬运至Global Memory上的输出Tensor zGm中。 +- tiling实现 + TilingData参数设计,TilingData参数本质上是和并行数据切分相关的参数,本示例算子使用了2个tiling参数:totalLength、tileNum。totalLength是指需要计算的数据量大小,tileNum是指每个核上总计算数据分块个数。比如,totalLength这个参数传递到kernel侧后,可以通过除以参与计算的核数,得到每个核上的计算量,这样就完成了多核数据的切分。 + + +## 支持的产品型号 +本样例支持如下产品型号: +- Atlas A2训练系列产品/Atlas 800I A2推理产品 + + +## 编译样例算子 + - 打开样例目录 + 以命令行方式下载样例代码,master分支为例。 + ```bash + cd ${git_clone_path}/samples/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/ + ``` + - 配置环境变量 + + 请根据当前环境上CANN开发套件包的[安装方式](https://hiascend.com/document/redirect/CannCommunityInstSoftware),选择对应配置环境变量的命令。 + - 默认路径,root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=/usr/local/Ascend/ascend-toolkit/latest + ``` + - 默认路径,非root用户安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=$HOME/Ascend/ascend-toolkit/latest + ``` + - 指定路径install_path,安装CANN软件包 + ```bash + export ASCEND_INSTALL_PATH=${install_path}/ascend-toolkit/latest + ``` + 配置按安装径后,执行以下命令统一配置环境变量。 + ```bash + # 配置CANN环境变量 + source ${ASCEND_INSTALL_PATH}/bin/setenv.bash + # 添加AscendC CMake Module搜索路径至环境变量 + export CMAKE_PREFIX_PATH=${ASCEND_INSTALL_PATH}/compiler/tikcpp/ascendc_kernel_cmake:$CMAKE_PREFIX_PATH + ``` + + - 样例执行 + ```bash + rm -rf build && mkdir build && cd build + cmake .. && make -j binary package + ``` + +## 更新说明 +| 时间 | 更新事项 | +| ---------- | ------------ | +| 2025/09/16 | 新增readme | \ No newline at end of file diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_host.cpp b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_host.cpp new file mode 100644 index 0000000000000000000000000000000000000000..1cc87f4fbb443ee61a27df4bcfa2278a44927d51 --- /dev/null +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_host.cpp @@ -0,0 +1,56 @@ +/** + * @file add_custom_host.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "add_custom_tiling.h" +#include "register/op_def_registry.h" +#include "tiling/tiling_api.h" + +namespace optiling { +const uint32_t BLOCK_DIM = 8; +const uint32_t TILE_NUM = 8; +static ge::graphStatus TilingFunc(gert::TilingContext *context) +{ + AddCustomTilingData *tiling = context->GetTilingData(); + uint32_t totalLength = context->GetInputShape(0)->GetOriginShape().GetShapeSize(); + context->SetBlockDim(BLOCK_DIM); + tiling->totalLength = totalLength; + tiling->tileNum = TILE_NUM; + return ge::GRAPH_SUCCESS; +} +} // namespace optiling + + +namespace ops { +class AddCustom : public OpDef { +public: + explicit AddCustom(const char *name) : OpDef(name) + { + this->Input("x") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND}); + this->Input("y") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND}); + this->Output("z") + .ParamType(REQUIRED) + .DataType({ge::DT_FLOAT16}) + .Format({ge::FORMAT_ND}); + + this->AICore() + .SetTiling(optiling::TilingFunc) + .AddConfig("ascend910") + .AddConfig("ascend310p") + .AddConfig("ascend310b") + .AddConfig("ascend910b"); + } +}; +OP_ADD(AddCustom); +} // namespace ops diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_kernel.cpp b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..22a9876fe71df2663494a2eaebda496963f791d3 --- /dev/null +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_kernel.cpp @@ -0,0 +1,86 @@ +/** + * @file add_custom_kernel.cpp + * + * Copyright (C) 2025. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#include "kernel_operator.h" +#include "add_custom_tiling.h" +constexpr int32_t BUFFER_NUM = 2; // tensor num for each queue + +class KernelAdd { +public: + __aicore__ inline KernelAdd() {} + __aicore__ inline void Init(GM_ADDR x, GM_ADDR y, GM_ADDR z, uint32_t totalLength, uint32_t tileNum) + { + this->blockLength = totalLength / AscendC::GetBlockNum(); + this->tileNum = tileNum; + this->tileLength = this->blockLength / tileNum / BUFFER_NUM; + + xGm.SetGlobalBuffer((__gm__ DTYPE_X *)x + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + yGm.SetGlobalBuffer((__gm__ DTYPE_Y *)y + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + zGm.SetGlobalBuffer((__gm__ DTYPE_Z *)z + this->blockLength * AscendC::GetBlockIdx(), this->blockLength); + pipe.InitBuffer(inQueueX, BUFFER_NUM, this->tileLength * sizeof(DTYPE_X)); + pipe.InitBuffer(inQueueY, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Y)); + pipe.InitBuffer(outQueueZ, BUFFER_NUM, this->tileLength * sizeof(DTYPE_Z)); + } + __aicore__ inline void Process() + { + int32_t loopCount = this->tileNum * BUFFER_NUM; + for (int32_t i = 0; i < loopCount; i++) { + CopyIn(i); + Compute(i); + CopyOut(i); + } + } + +private: + __aicore__ inline void CopyIn(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.AllocTensor(); + AscendC::LocalTensor yLocal = inQueueY.AllocTensor(); + AscendC::DataCopy(xLocal, xGm[progress * this->tileLength], this->tileLength); + AscendC::DataCopy(yLocal, yGm[progress * this->tileLength], this->tileLength); + inQueueX.EnQue(xLocal); + inQueueY.EnQue(yLocal); + } + __aicore__ inline void Compute(int32_t progress) + { + AscendC::LocalTensor xLocal = inQueueX.DeQue(); + AscendC::LocalTensor yLocal = inQueueY.DeQue(); + AscendC::LocalTensor zLocal = outQueueZ.AllocTensor(); + AscendC::Add(zLocal, xLocal, yLocal, this->tileLength); + outQueueZ.EnQue(zLocal); + inQueueX.FreeTensor(xLocal); + inQueueY.FreeTensor(yLocal); + } + __aicore__ inline void CopyOut(int32_t progress) + { + AscendC::LocalTensor zLocal = outQueueZ.DeQue(); + AscendC::DataCopy(zGm[progress * this->tileLength], zLocal, this->tileLength); + outQueueZ.FreeTensor(zLocal); + } + +private: + AscendC::TPipe pipe; + AscendC::TQue inQueueX, inQueueY; + AscendC::TQue outQueueZ; + AscendC::GlobalTensor xGm; + AscendC::GlobalTensor yGm; + AscendC::GlobalTensor zGm; + uint32_t blockLength; + uint32_t tileNum; + uint32_t tileLength; +}; + +extern "C" __global__ __aicore__ void add_custom(GM_ADDR x, GM_ADDR y, GM_ADDR z, GM_ADDR workspace, GM_ADDR tiling) +{ + REGISTER_TILING_DEFAULT(AddCustomTilingData); + GET_TILING_DATA(tilingData, tiling); + KernelAdd op; + op.Init(x, y, z, tilingData.totalLength, tilingData.tileNum); + op.Process(); +} \ No newline at end of file diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_tiling.h b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_tiling.h new file mode 100644 index 0000000000000000000000000000000000000000..d80ecbee486afe11590d21fbc4d03efe5eeab98e --- /dev/null +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/AddCustomTiny/add_custom_tiling.h @@ -0,0 +1,19 @@ +/** + * @file add_custom_tiling.h + * + * Copyright (C) 2023-2024. Huawei Technologies Co., Ltd. All rights reserved. + * + * This program is distributed in the hope that it will be useful, + * but WITHOUT ANY WARRANTY; without even the implied warranty of + * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. + */ +#ifndef ADD_CUSTOM_TILING_H +#define ADD_CUSTOM_TILING_H +#include + +struct AddCustomTilingData { + uint32_t totalLength; + uint32_t tileNum; +}; + +#endif // ADD_CUSTOM_TILING_H diff --git a/operator/ascendc/0_introduction/1_add_frameworklaunch/README.md b/operator/ascendc/0_introduction/1_add_frameworklaunch/README.md index de724d24f341be6587ab849494ceee48a0db72ee..5f1223f77bd39a3115f8c4374e79f53772ba84bf 100644 --- a/operator/ascendc/0_introduction/1_add_frameworklaunch/README.md +++ b/operator/ascendc/0_introduction/1_add_frameworklaunch/README.md @@ -9,6 +9,7 @@ │ ├── AclOfflineModel // 通过aclopExecuteV2调用的方式调用AddCustom算子 │ ├── AclOnlineModel // 通过aclopCompile调用的方式调用AddCustom算子 │ ├── AddCustom // AddCustom算子工程 +│ ├── AddCustomTiny // AddCustom自定义算子工程极简样例 │ ├── PytorchInvocation // 通过pytorch调用的方式调用AddCustom算子 │ ├── TensorflowInvocation // 通过tensorflow调用的方式调用AddCustom算子 │ ├── CppExtensionInvocation // 通过CppExtension调用的方式调用AddCustom算子 @@ -150,3 +151,4 @@ CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通 | 2024/11/11 | 样例目录调整 | | 2024/11/18 | 算子工程改写为由msOpGen生成 | | 2025/01/17 | 新增CppExtensionInvocation样例 | +| 2025/9/17 | 新增AddCustomTiny极简工程样例 | \ No newline at end of file