diff --git a/atvc/README.md b/atvc/README.md index 4fe55d6c4f8b6d1c04ac826a75964e7dc767a419..c7f47b655b1899d7a8a1e852235597b695bb6c6e 100644 --- a/atvc/README.md +++ b/atvc/README.md @@ -56,8 +56,8 @@ git clone https://gitee.com/ascend/ascendc-api-adv.git - 执行add用例 ```bash -$ cd ./atvc/tests -$ bash run_test.sh add +$ cd ./atvc/examples +$ bash run_examples.sh add ... Generate golden data successfully. ... @@ -74,18 +74,17 @@ Accuracy verification passed. # 样例介绍 | 样例名 | 描述 | 类型| | ------------------------------------------------------------ | ------------------------------------------------------------ |------------------------------------------------------------ | -| [add](./examples/add/add.cpp) | 使用ATVC的Elementwise模板实现Add算子以及调用样例 | 直调算子 | -| [sinh_custom](./examples/sinh_custom/sinh_custom.cpp) | 临时Tensor参与计算的自定义Elementwise类算子以及调用样例 | 直调算子 | -| [add_with_scalar](./examples/add_with_scalar/add_with_scalar.cpp) | 输入带标量的自定义Elementwise类算子以及调用样例 | 直调算子 | -| [reduce_sum](./examples/reduce_sum/reduce_sum.cpp) | 使用ATVC的Reduce模板实现自定义ReduceSum算子以及调用样例 | 直调算子 | -| [broadcast_to](./examples/broadcast_to/broadcast_to.cpp) | 使用ATVC的Broadcast模板实现自定义BroadcastTo算子以及调用样例 | 直调算子 | -| [tanh_grad](./examples/tanh_grad/tanh_grad.cpp) | 使用Tiling超参进行算子性能调优的ElementWise类算子调用样例 | 直调算子 | -| [ops_aclnn](./examples/ops_aclnn) | 使用ATVC基于自定义工程算子的实现以及调用样例 | 自定义工程算子 | -| [ops_pytorch](./examples/ops_pytorch) | 使用ATVC基于[pytorch](https://gitee.com/ascend/pytorch)算子的实现以及调用样例 | pytorch算子 |:wq -| [add_with_broadcast](./examples/add_with_broadcast) |使用ATVC的Elementwise和Broadcast组合模板实现Add算子以及调用样例 | 直调算子 | - -更多算子类型介绍和如何选取模板参见参阅[快速入门](./docs/1_quick_start.md),其中add、sinh_custom、add_with_scalar、reduce_sum、broadcast_to、tanh_grad、add_with_broadcast是ATVC的直调样例,ops_aclnn为基于ATVC对接aclnn工程的算子目录,ops_pytorch为基于ATVC对接pytorch工程的算子目录。其中,ops_aclnn和ops_pytorch样例需要进入到example路径下按照README.md描述执行。 - +| [add](./examples/add/add.cpp) | 使用ATVC的Elementwise模板实现Add算子以及调用样例 | Kernel直调 | +| [sinh_custom](./examples/sinh_custom/sinh_custom.cpp) | 临时Tensor参与计算的自定义Elementwise类算子以及调用样例 | Kernel直调 | +| [add_with_scalar](./examples/add_with_scalar/add_with_scalar.cpp) | 输入带标量的自定义Elementwise类算子以及调用样例 | Kernel直调 | +| [reduce_sum](./examples/reduce_sum/reduce_sum.cpp) | 使用ATVC的Reduce模板实现自定义ReduceSum算子以及调用样例 | Kernel直调 | +| [broadcast_to](./examples/broadcast_to/broadcast_to.cpp) | 使用ATVC的Broadcast模板实现自定义BroadcastTo算子以及调用样例 | Kernel直调 | +| [tanh_grad](./examples/tanh_grad/tanh_grad.cpp) | 使用Tiling超参进行算子性能调优的ElementWise类算子调用样例 | Kernel直调 | +| [ops_aclnn](./examples/ops_aclnn) | 使用ATVC基于自定义工程算子的实现以及调用样例 | 单算子API调用 | +| [ops_pytorch](./examples/ops_pytorch) | 使用ATVC开发自定义算子,并实现从[PyTorch](https://gitee.com/ascend/pytorch)框架调用的样例 | PyTorch框架调用 | +| [add_with_broadcast](./examples/add_with_broadcast) |使用ATVC的Elementwise和Broadcast组合模板实现Add算子以及调用样例 | Kernel直调 | + +更多算子类型介绍和如何选取模板参见参阅[快速入门](./docs/01_quick_start.md),其中add、sinh_custom、add_with_scalar、reduce_sum、broadcast_to、tanh_grad、add_with_broadcast是ATVC的直调样例,ops_aclnn为基于ATVC对接aclnn工程的算子目录,ops_pytorch为基于ATVC对接PyTorch工程的算子目录。其中,ops_aclnn和ops_pytorch样例需要进入到example路径下按照README.md描述执行。 # 支持场景 diff --git a/atvc/docs/01_quick_start.md b/atvc/docs/01_quick_start.md index 51c24a860f7cf3a2edd3113d526045d6843be990..066a14b4bf2f51cad52d95f0fff4fe8a4303133d 100644 --- a/atvc/docs/01_quick_start.md +++ b/atvc/docs/01_quick_start.md @@ -114,23 +114,23 @@ AddCustom<<>>(aDevice, bDevice, cDevice ## 算子编译&执行 完成算子代码编写后,调用以下命令编译代码并执行: ```bash -cd ./atvc/tests/ -bash run_test.sh add +$ cd ./atvc/examples +$ bash run_examples.sh add ``` 其他样例执行命令如下: 
```bash
-bash run_test.sh sinh_custom        # 执行sinh_custom样例
-bash run_test.sh reduce_sum         # 执行reduce_sum样例
-bash run_test.sh add_with_scalar    # 执行add_with_scalar样例
-bash run_test.sh broadcast_to       # 执行broadcast样例
+bash run_examples.sh sinh_custom    # 执行sinh_custom样例
+bash run_examples.sh reduce_sum     # 执行reduce_sum样例
+bash run_examples.sh add_with_scalar    # 执行add_with_scalar样例
+bash run_examples.sh broadcast_to       # 执行broadcast_to样例
```

## 完整样例
完整代码样例请参照[examples/add/add.cpp](../examples/add/add.cpp) # 模板选择 -用户根据待开发的Vector算子定义特征,选择以下三种匹配的模板及其配套的tiling算法,若自定义算子不在当前模板库的范围内,建议使用基本Ascend C API 手写算子。 +用户根据待开发的Vector算子定义特征,选择以下三种匹配的模板及其配套的tiling算法,若自定义算子不在当前模板库的范围内,建议使用Ascend C API 手写算子。 ## Elementwise类算子 Elementwise类算子通常是指对张量进行元素级别的操作的函数或方法,包括但不限于加、减、乘、除及指数、对数、三角函数等数学函数。这类算子的特点是会逐元素进行计算操作,而不会改变输入数据的形状。常见的Elementwise算子有Add、Sub、Exp、Log、Sin、Sqrt等。 ## Reduce类算子 diff --git a/atvc/docs/02_developer_guide.md b/atvc/docs/02_developer_guide.md index c24e416bcf26965c4914e6b598ff423e78b2597d..a7de01196286dcfa65ff95cf8890c06aef6ccaf8 100644 --- a/atvc/docs/02_developer_guide.md +++ b/atvc/docs/02_developer_guide.md @@ -297,184 +297,7 @@ if (!ATVC::Host::CalcEleWiseTiling(eleNum, param)) { ### 2.1.6 完整样例 -通过ATVC框架实现的完整SinhCustom算子定义&调用[代码](../examples/sinh_custom/sinh_custom.cpp)如下: - -```cpp -#include -#include -#include -#include -#include -#include -#include "acl/acl.h" -#include "elewise/elewise_host.h" -#include "elewise/elewise_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) - -namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - -// 描述算子的输入输出以及临时计算资源 -using SinhOpTraits = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; - -// 传入编译态参数ATVC::OpTraits -template -// 开发自定义函数名/类名 -struct SinhComputeFunc { - // DataType模板参数,根据实际数据类型个数填写 - template - // 重载operator公有接口,提供给`ATVC::Kernel::EleWiseOpTemplate`调用 - __aicore__ inline void operator()(AscendC::LocalTensor x, - AscendC::LocalTensor y, - AscendC::LocalTensor tempBuffer1, - AscendC::LocalTensor tempBuffer2) - { - // 开发调用AscendC Api自行实现计算仿函数 - uint32_t tiledCnt = y.GetSize(); // 进行单次基块计算的元素个数 - AscendC::Muls(tempBuffer1, x, static_cast(-1), tiledCnt); // tempBuffer1 = -1 * x - AscendC::Exp(tempBuffer1, tempBuffer1, tiledCnt); // tempbuffer1 = exp(-x) - AscendC::Exp(tempBuffer2, x, tiledCnt); // tempbuffer2 = exp(x) - AscendC::Sub(y, tempBuffer2, tempBuffer1, tiledCnt); // y = exp(x) - exp(-x) - AscendC::Muls(y, y, static_cast(0.5), tiledCnt); // y = (e^(x) - e^(-x)) / 2 - } -}; - -void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &golden) -{ - std::random_device rd; - std::mt19937 gen(rd()); - std::uniform_real_distribution dis(1.0f, 10.0f); - - for (int i = 0; i < eleNum; ++i) { - inputX[i] = dis(gen); - golden[i] = std::sinh(inputX[i]); - } -} - -bool VerifyResults(const std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("Accuracy verification failed! 
The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclInit(nullptr)); - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - CHECK_ACL(aclrtCreateStream(&stream)); -} - -void CleanACL(aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} -} - -/* - * 该函数为SinhCustom算子核函数入口 - * x Device上的gm地址,指向SinhCustom算子第一个输入 - * y Device上的gm地址,指向SinhCustom算子第一个输出 - * param 指向运行态ATVC::EleWiseParam数据 -*/ -template -__global__ __aicore__ void SinhCustom(GM_ADDR x, GM_ADDR y, ATVC::EleWiseParam param) -{ - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); // 控制算子执行时只启动Vector核 - auto op = ATVC::Kernel::EleWiseOpTemplate>(); - op.Run(x, y, ¶m); // 按照输入、输出、param的顺序传入Run函数中;OpTraits内部的ATVC::OpTemps将由EleWiseOpTemplate内部申请资源,开发无需关注 -} - -int main() -{ - // init data - int32_t eleNum = 8 * 2048; - size_t inputByteSize = static_cast(eleNum) * sizeof(float); - size_t outputByteSize = static_cast(eleNum) * sizeof(float); - - std::vector inputX(eleNum); - std::vector golden(eleNum); - InitializeData(eleNum, inputX, golden); - - ATVC::EleWiseParam param; - - // 计算输入为8*2048个float元素的sinh算子的运行态参数param - if (!ATVC::Host::CalcEleWiseTiling(eleNum, param)) { - printf("Elewise tiling error."); - return -1; - }; - // 初始化Acl资源与数据 - aclrtContext context; - aclrtStream stream = nullptr; - int32_t deviceId = 0; - InitializeACL(context, stream, deviceId); - - uint8_t *yHost; - uint8_t *xDevice; - uint8_t *yDevice; - uint8_t *paramDevice; - CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); - - CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); - CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - - // 将tiling计算的运行时参数EleWiseParam param传到Device侧 - auto elementParamSize = sizeof(param); - CHECK_ACL(aclrtMalloc((void**)¶mDevice, elementParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, - reinterpret_cast(¶m), elementParamSize, - ACL_MEMCPY_HOST_TO_DEVICE)); - - // 调用自定义的Kernel API, <<<>>>的BlockNum参数可通过param的TilingData获取 - SinhCustom<<>>(xDevice, yDevice, paramDevice); - - CHECK_ACL(aclrtSynchronizeStream(stream)); - CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); - std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + eleNum); - - // 释放资源 - CHECK_ACL(aclrtFree(xDevice)); - CHECK_ACL(aclrtFree(yDevice)); - CHECK_ACL(aclrtFree(paramDevice)); - CHECK_ACL(aclrtFreeHost(yHost)); - - CleanACL(stream, deviceId); - - if (!VerifyResults(golden, outputY)) { - return -1; - } - printf("Accuracy verification passed.\n"); - return 0; -} -``` +通过ATVC框架实现的完整SinhCustom算子定义&调用,完整样例代码见链接[样例代码](../examples/sinh_custom/sinh_custom.cpp) ## 2.2 Reduce算子开发 ATVC框架提供的Reduce算子模板类的模块之间的交互如下(ATVC框架提供的模板及接口用黄色表示;开发自定义的模块用蓝色表示): @@ -546,10 +369,10 @@ Reduce计算模板类将在数据计算阶段被`ReduceOpTemplate`算子模板 - 该模板类在实例化时固定传入ATVC::OpTraits类型的结构体作为模板参数,如`ATVC::OpTraits,ATVC::OpOutputs`。 - 开发必须完成以下公有API的内部实现: - 1. 计算单数据基块的Reduce结果 `\_\_aicore\_\_ inline void Compute(...)` - 2. 计算单UB内不同数据基块的计算结果 `\_\_aicore\_\_ inline void UpdateCache(...)` - 3. 
计算多核之间&同一核内的多次UB结果 `\_\_aicore\_\_ inline void ReduceBetweenUB(...)` - 4. 返回非对齐场景不参与计算的尾部数据的填充值 `\_\_aicore\_\_ inline U GetPaddingValue()` + 1. 计算单数据基块的Reduce结果 `__aicore__ inline void Compute(...)` + 2. 计算单UB内不同数据基块的计算结果 `__aicore__ inline void UpdateCache(...)` + 3. 计算多核之间&同一核内的多次UB结果 `__aicore__ inline void ReduceBetweenUB(...)` + 4. 返回非对齐场景不参与计算的尾部数据的填充值 `__aicore__ inline U GetPaddingValue()` ### 2.2.3 内置Reduce算子模板 @@ -765,203 +588,7 @@ using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs -#include -#include -#include -#include -#include -#include "acl/acl.h" -#include "reduce/reduce_host.h" -#include "reduce/reduce_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) - -namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - -// ReduceSum算子的描述:一个输入,一个输出,类型均为float -using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; - -bool VerifyResults(const std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} -} - -/* - * 该函数为ReduceCustom算子核函数入口 - * x Device上的gm地址,指向ReduceCustom算子第一个输入 - * y Device上的gm地址,指向ReduceCustom算子第一个输出 - * reduceParam 指向运行态ATVC::ReduceParam数据 -*/ -template -__global__ __aicore__ void ReduceCustom(GM_ADDR x, GM_ADDR y, ATVC::ReduceParam reduceParam) -{ - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 - // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 - auto op = ATVC::Kernel::ReduceOpTemplate, Policy>(); - op.Run(x, y, &reduceParam); -} - -// 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 -template -void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::ReducePolicy &policy, aclrtStream& stream) -{ - // 申请临时空间workspace,并将其与ReduceTilingData一同传到Device侧 - uint8_t *paramDevice; - uint8_t *workspaceDevice; - CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); - param.workspaceAddr = reinterpret_cast(workspaceDevice); - auto reduceParamSize = sizeof(param); - CHECK_ACL(aclrtMalloc((void**)¶mDevice, reduceParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, reduceParamSize, - reinterpret_cast(¶m), reduceParamSize, - ACL_MEMCPY_HOST_TO_DEVICE)); - // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 - if (policy == ATVC::REDUCE_POLICY0) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY1) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY2) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY3) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY4) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY5) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY6) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY7) { - ReduceCustom<<>>(x, y, paramDevice); - } else if 
(policy == ATVC::REDUCE_POLICY8) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY9) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY10) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY11) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY12) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY13) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY14) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY15) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY16) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY17) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY18) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY19) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY20) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY21) { - ReduceCustom<<>>(x, y, paramDevice); - } else if (policy == ATVC::REDUCE_POLICY22) { - ReduceCustom<<>>(x, y, paramDevice); - } else { - printf("[ERROR] Cannot find any matched policy.\n"); - } - // 流同步后释放申请的param内存 - CHECK_ACL(aclrtSynchronizeStream(stream)); - CHECK_ACL(aclrtFree(workspaceDevice)); - CHECK_ACL(aclrtFree(paramDevice)); -} - -int32_t main(int32_t argc, char* argv[]) -{ - int32_t eleNum = 8 * 1024; - int32_t outEleNum = 1 * 1024; - size_t inputByteSize = static_cast(eleNum) * sizeof(float); - size_t outputByteSize = static_cast(outEleNum) * sizeof(float); - std::vector dim{0}; // 对第0轴执行reduce操作 - std::vector shape{8, 1024}; // 测试输入shape - std::vector inputX(eleNum, 1.0f); - std::vector golden(outEleNum, 8.0f); - printf("Generate golden data successfully.\n"); - // 初始化Acl资源 - CHECK_ACL(aclInit(nullptr)); - aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); - uint8_t *yHost; - uint8_t *xDevice; - uint8_t *yDevice; - - CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); - CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - - CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); - - ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 - ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 - // Host侧调用Tiling API完成相关运行态参数的运算 - if (!ATVC::Host::CalcReduceTiling(shape, dim, &policy, ¶m)) { - printf("Reduce tiling error.\n"); - return -1; - }; - - // 调用Adapter调度接口,完成核函数的模板调用 - ReduceOpAdapter(xDevice, yDevice, param, policy, stream); - - CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); - std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); - - // 释放Acl资源 - CHECK_ACL(aclrtFree(xDevice)); - CHECK_ACL(aclrtFree(yDevice)); - CHECK_ACL(aclrtFreeHost(yHost)); - - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); - - if (!VerifyResults(golden, outputY)) { - return -1; - } - - printf("Accuracy 
verification passed.\n"); - return 0; -} -``` +通过ATVC框架实现的完整ReduceSum算子定义&调用,完整样例代码见链接[样例代码](../examples/reduce_sum/reduce_sum.cpp) ## 2.3 Broadcast算子开发 ATVC框架提供的Broadcast算子模板类的模块之间的交互如下(ATVC框架提供的模板及接口用黄色表示;开发自定义的模块用蓝色表示): @@ -1242,145 +869,7 @@ int32_t main(int32_t argc, char* argv[]) ### 2.3.6 完整样例 -通过ATVC框架实现的完整BroadcastCustom算子[样例代码](../examples/broadcast_to/broadcast_to.cpp) -```cpp -#include -#include -#include -#include -#include -#include -#include "acl/acl.h" -#include "atvc.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) - -namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) { - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} -} - -// BroadcastTo算子的描述:一个输入,一个输出,类型均为float -using BroadcastOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; - -/* - * 该函数为BroadcastCustom算子核函数入口 - * x Device上的gm地址,指向BroadcastTo算子第一个输入 - * y Device上的gm地址,指向BroadcastTo算子第一个输出 - * broadcastParam 指向运行态ATVC::BroadcastParam数据 -*/ -template -__global__ __aicore__ void BroadcastCustom(GM_ADDR x, GM_ADDR y, ATVC::BroadcastParam broadcastParam) -{ - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); // 设置算子执行时只启动Vector核 - // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 - auto op = ATVC::Kernel::BroadcastOpTemplate, Policy>(); - op.Run(x, y, &broadcastParam); -} - - -// 负责Broadcast类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 -template -void BroadcastOpAdapter(uint8_t* x, uint8_t* y, ATVC::BroadcastParam ¶m, ATVC::BroadcastPolicy &policy, aclrtStream& stream) -{ - // 申请临时空间workspace,并将其与BroadcastTilingData一同传到Device侧 - uint8_t *paramDevice; - uint8_t *workspaceDevice; - CHECK_ACL(aclrtMalloc((void **)&workspaceDevice, param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST)); - param.workspaceAddr = reinterpret_cast(workspaceDevice); - auto broadcastParamSize = sizeof(param); - CHECK_ACL(aclrtMalloc((void**)¶mDevice, broadcastParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, broadcastParamSize, reinterpret_cast(¶m), broadcastParamSize, ACL_MEMCPY_HOST_TO_DEVICE)); - // 将tiling api计算出的BroadcastPolicy转化为编译态参数并实例化相应的核函数 - if (policy == ATVC::BROADCAST_POLICY0) { - BroadcastCustom<<>>(x, y, paramDevice); - }else if (policy == ATVC::BROADCAST_POLICY1) { - BroadcastCustom<<>>(x, y, paramDevice); - } else { - printf("[ERROR] Cannot find any matched policy.\n"); - } - // 流同步后释放申请的param内存 - CHECK_ACL(aclrtSynchronizeStream(stream)); - CHECK_ACL(aclrtFree(workspaceDevice)); - CHECK_ACL(aclrtFree(paramDevice)); -} - -int32_t main(int32_t argc, char* argv[]) -{ - int32_t eleNum = 1 * 1024; - int32_t outEleNum = 8 * 1024; - size_t inputByteSize = static_cast(eleNum) * sizeof(float); - size_t outputByteSize = static_cast(outEleNum) * sizeof(float); - std::vector shapeIn{1, 1024}; // 测试输入shape - std::vector shapeOut{8, 1024}; // 测试输入shape - std::vector inputX(eleNum, 1.0f); - std::vector golden(outEleNum, 1.0f); - printf("Generate golden data successfully.\n"); - // 初始化Acl资源 - CHECK_ACL(aclInit(nullptr)); - aclrtContext context; - int32_t deviceId = 0; - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - aclrtStream stream = nullptr; - CHECK_ACL(aclrtCreateStream(&stream)); - uint8_t *yHost; - uint8_t *xDevice; - uint8_t *yDevice; - - 
CHECK_ACL(aclrtMallocHost((void **)(&yHost), outputByteSize)); - CHECK_ACL(aclrtMalloc((void **)&xDevice, inputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMalloc((void **)&yDevice, outputByteSize, ACL_MEM_MALLOC_HUGE_FIRST)); - - CHECK_ACL(aclrtMemcpy(xDevice, inputByteSize, inputX.data(), inputByteSize, ACL_MEMCPY_HOST_TO_DEVICE)); - - ATVC::BroadcastParam param; // Broadcast运行态参数,包含TilingData以及临时空间的相关信息 - ATVC::BroadcastPolicy policy = {-1, -1, -1}; // Broadcast运行态参数,负责映射最适合的Broadcast模板实现 - // Host侧调用Tiling API完成相关运行态参数的运算 - if (!ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, ¶m)) { - printf("Broadcast tiling error.\n"); - return -1; - }; - - // 调用Adapter调度接口,完成核函数的模板调用 - BroadcastOpAdapter(xDevice, yDevice, param, policy, stream); - - CHECK_ACL(aclrtMemcpy(yHost, outputByteSize, yDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); - std::vector outputY(reinterpret_cast(yHost), reinterpret_cast(yHost) + outEleNum); - - // 释放Acl资源 - CHECK_ACL(aclrtFree(xDevice)); - CHECK_ACL(aclrtFree(yDevice)); - CHECK_ACL(aclrtFreeHost(yHost)); - - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); - - for (int32_t i = 0; i < outEleNum; i++) { - if (!IsClose(golden[i], outputY[i])) { - printf("Accuracy verification failed! The expected value of element in index [%d] is %f, but actual value is %f.\n", i, golden[i], outputY[i]); - return -1; - } - } - printf("Accuracy verification passed.\n"); - return 0; -} -``` +通过ATVC框架实现的完整BroadcastCustom算子定义&调用,完整样例代码见链接[样例代码](../examples/broadcast_to/broadcast_to.cpp) ## 2.4 组合算子开发 ATVC框架支持Broadcast与Elementwise组合的算子通过扩展BroadcastOpTemplate的模板参数对用户提供接口,开发者可以根据算子实际需求来定制组合,框架支持以下组合:Elementwise + Broadcast、Broadcast + Elementwise、Elementwise + Broadcast + Elementwise。下面以Broadcast与Elementwise组合为例进行详细讲解。 @@ -1600,7 +1089,6 @@ ATVC::Host::CalcBroadcastTiling(shapeIn, shapeOut, &policy, & ### 2.4.6 完整样例 参考sample [add_with_broadcast](../examples/add_with_broadcast/README.md) - # 3 ATVC的调试调优功能 为了用户在使用ATVC进行算子开发时能快速进行精度调试和性能调优,ATVC支持多种调试调优能力。 ## 3.1 OpTraits校验接口 @@ -1622,6 +1110,13 @@ enum class TemplateType { BROADCAST, // Broadcast模板的校验类型 }; ``` +DebugCheck主要校验项如下: +| 模板类型 | OpTraits校验项 | +| ----------- | --------------- | +| ELE_WISE | 输入输出非空 | +| REDUCE | 输入输出个数均为1, 输入输出数据类型相同 | +| BROADCAST | 输入输出个数均为1, 输入输出数据类型相同 | + 接口使用示例: ```cpp using AddOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; @@ -1630,13 +1125,13 @@ ATVC::Host::DebugCheck(); ``` 完整的DebugCheck调用接口样例可参考tanh_grad算子[样例代码](../examples/tanh_grad/tanh_grad.cpp)。 ## 3.2 使用调试调优模式运行算子 -样例执行脚本run_test.sh支持可选入参`--run-mode`进行不同调试调优运行模式的选择。 +样例执行脚本run_examples.sh支持可选入参`--run-mode`进行不同调试调优运行模式的选择。 当前支持`debug_print`和`profiling`两种模式。 - `--run-mode=debug_print`:DFX信息打印模式,打开kernel侧的模板内置关键节点的信息打印和异常退出时的打印功能。 - `--run-mode=profiling`:Profiling性能采集模式,运行时打开profiling性能数据采集功能。 - 未设置`--run-mode`:默认模式,正常上板,无kernel侧的dfx信息打印, 未开启profiling性能采集功能。 ## 3.2.1 DFX信息打印模式 -通过运行run_test.sh脚本时加上可选参数`--run-mode=debug_print`打开本功能。 +通过运行run_examples.sh脚本时加上可选参数`--run-mode=debug_print`打开本功能。 DFX信息打印格式按照 [日志级别(`ERROR`/`INFO`)]:[`ATVC`][`Module`](可选:[`CopyIn`/`CopyOut`等])的标准进行打印。 - 日志级别: ERROR是异常打印信息,INFO是模板内部重要信息打印 - `ATVC`: 标识是ATVC模板库内置的DFX信息打印 @@ -1652,12 +1147,12 @@ __aicore__ inline void DebugPrintf(__gm__ const char* fmt, Args&&... 
args); } } // 调用示例 -ATVC::Kernel::DebugPrintf("[ERROR]: [ATVC][EleWise] Input Count can not be 0!\n"); -ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][EleWise][CopyIn] Offset is %u, copy count is %u.\n", curCoreStartCnt_ + offsetCnt_, calcCnt_); +ATVC::Kernel::DebugPrintf("[ERROR]:[ATVC][EleWise] Input Count can not be 0!\n"); +ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][EleWise][CopyIn] Offset is %u, copy count is %u.\n", curCoreStartCnt_ + offsetCnt_, calcCnt_); ``` ## 3.2.2 开启Profiling性能调优功能 -通过运行run_test.sh脚本时加上可选参数`--run-mode=profiling`打开本功能。 +通过运行run_examples.sh脚本时加上可选参数`--run-mode=profiling`打开本功能。 为了增加Profiling采集性能数据的稳定性,建议用户在开启profiling时,运行时重复多次调用kernel,可实现一次性采集多次上板的性能数据,消除抖动。 ```cpp TanhGrad<<>>(dyDevice, yDevice, zDevice, paramDevice); @@ -1698,7 +1193,8 @@ struct EleWiseTilingData { | ----------- | -------------- | ----------- | ----------- |---| | singleCoreBaseLine | uint32_t | 单核数据量基线 | [256, 128 * 1024] | 512| | ubSizeLimitThreshold | float | UB内存使用上限,决定了basicBlock最大值 | [0.5, 0.96] | 0.95 | -| splitDataShape | uint32_t[3]| 单核内数据量的3个分段节点,表示数据量分为4段| {node_0, node_1, node_2} | {1024, 32*1024, 64*1024}| +| nBufferNum | uint32_t | 用于并行流水的buffer数量 | [1, 2] | 2 | +| splitDataShape | uint32_t[3]| 单核内数据量的3个分段节点,表示数据量分为4段| {node_0, node_1, node_2} | {1024, 32 * 1024, 64 * 1024}| | dataSplitFactor | uint32_t[4]| 单核内4个数据段的切分系数, 决定不同数据段的切分基本块的大小| {factor_0, factor_1, factor_2, factor_3} 均需在范围[1, 32]| {4, 4, 8, 6}| | rsvLiveCnt | uint32_t| 预留的空间大小为rsvLiveCnt * (inputBuffer + outputBuffer)|[0, 1]| 0| @@ -1775,7 +1271,7 @@ struct ReduceTilingData { | ----------- | -------------- | ----------- | ----------- |---| | basicBlock | uint32_t | Reduce 基本块内存大小 | 不能超过UB内存的1/3, 192K内存 建议在48K-54K之间设置 | 54 * 1024| | maxInnerA | uint32_t |AR切轴内A轴的最大数据量 | [128, 256] | 128 | -| balanceThreshHold | double| 多核均衡的阈值水平, 阈值越高,切分后每个核处理的数据量越均衡 | [0.85, 0.95]| 0.95 | +| balanceThreshHold | double| 多核均衡的阈值水平, 阈值越高,切分后每个核处理的数据量越均衡 | [0.8, 0.95]| 0.85 | 对应的超参`ReduceTilingHyperParam`结构定义如下: ```cpp diff --git a/atvc/docs/03_code_organization.md b/atvc/docs/03_code_organization.md index 47641b0cb98d098b19a918a1145b5d546ade92a2..f6f60e5f7fa51e794c4cf2fd48fb4f98c714c570 100644 --- a/atvc/docs/03_code_organization.md +++ b/atvc/docs/03_code_organization.md @@ -32,24 +32,37 @@ include/ examples文件夹下提供了算子代码样例,包含算子实现的源码文件和测试用例配置和执行脚本。 ``` examples +├── run_examples.sh // 执行脚本 ├── add // EleWise Add算子样例 │ ├── README.md │ └── add.cpp ├── add_with_scalar // EleWise + Scalar算子样例 │ ├── README.md │ └── add_with_scalar.cpp +├── add_with_broadcast // BroadcastTo + EleWise算子样例 +│ ├── README.md +│ └── add_with_scalar.cpp ├── broadcast_to // BroadcastTo算子样例 │ ├── README.md │ └── broadcast_to.cpp ├── reduce_sum // ReduceSum算子样例 │ ├── README.md │ └── reduce_sum.cpp -└── sinh_custom // SinhCustom算子样例 - ├── README.md - └── sinh_custom.cpp -└── tanh_grad // Tah_Grad算子样例 - ├── README.md - └── tanh_grad.cpp +├── sinh_custom // SinhCustom算子样例 +│ ├── README.md +│ └── sinh_custom.cpp +├── tanh_grad // TanhGrad算子样例 +│ ├── README.md +│ └── tanh_grad.cpp +├── ops_aclnn // 单算子API调用样例 +| ├── README.md +│ ├── add +│ └── reduce_sum +├── ops_pytorch // PyTorch框架调用样例 +│ ├── README.md +│ ├── add +│ └── reduce_sum +└── common // 算子样例公共接口 ``` ## 3. 
docs diff --git a/atvc/docs/images/broadcast_fusion_dataflow.png b/atvc/docs/images/broadcast_fusion_dataflow.png new file mode 100644 index 0000000000000000000000000000000000000000..7785bedf67ea053ddac9b1f636c5768e583dd4f3 Binary files /dev/null and b/atvc/docs/images/broadcast_fusion_dataflow.png differ diff --git a/atvc/examples/add/README.md b/atvc/examples/add/README.md index ae40bdae244485e50549b70a9482384d907926fe..532af72d4c8ae159fd862f46e6b092e36acab23e 100644 --- a/atvc/examples/add/README.md +++ b/atvc/examples/add/README.md @@ -38,8 +38,8 @@ Add算子规格: ## 算子运行 在ascendc-api-adv代码仓目录下执行: ```bash -$ cd ./atvc/tests/ -$ bash run_test.sh add +$ cd ./atvc/examples +$ bash run_examples.sh add ... Generate golden data successfully. ... diff --git a/atvc/examples/add/add.cpp b/atvc/examples/add/add.cpp index a3f1e65443f1c0bd3551f573f1cd49df4063fd9b..863413a7be47e47c7e30099afaa90d1b2d9ab3ca 100644 --- a/atvc/examples/add/add.cpp +++ b/atvc/examples/add/add.cpp @@ -17,27 +17,9 @@ #include "acl/acl.h" #include "elewise/elewise_host.h" #include "elewise/elewise_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) +#include "example_common.h" namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &inputY, std::vector &golden) { std::random_device rd; @@ -51,37 +33,6 @@ void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("[ERROR]: Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclInit(nullptr)); - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - CHECK_ACL(aclrtCreateStream(&stream)); -} - -void CleanACL(aclrtStream &stream, aclrtContext &context, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} - void CleanUp(uint8_t *&zHost, uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&zDevice) { CHECK_ACL(aclrtFree(xDevice)); diff --git a/atvc/examples/add_with_broadcast/README.md b/atvc/examples/add_with_broadcast/README.md index f49500d66f3d7b495549cdb9d09bb3530860e097..cb3620c084db824358dddd3df0df5250665e846f 100644 --- a/atvc/examples/add_with_broadcast/README.md +++ b/atvc/examples/add_with_broadcast/README.md @@ -38,8 +38,8 @@ Add算子规格: ## 算子运行 在ascendc-api-adv代码仓目录下执行: ```bash -$ cd ./atvc/tests/ -$ bash run_test.sh add_with_broadcast +$ cd ./atvc/examples +$ bash run_examples.sh add_with_broadcast ... Generate golden data successfully. ... 
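为便于理解上述直调样例的组织方式,这里补充一个精简的Host侧主流程示意(非样例原文,main函数体仅作演示;CHECK_ACL、IsClose、VerifyResults、InitializeACL、CleanACL等公共接口来自后文新增的examples/common/example_common.h,标准库头文件按各样例的做法由使用方先行包含),省略了Device内存申请、Tiling计算与核函数调用等与具体算子相关的步骤:

```cpp
// 精简示意:抽取公共头文件example_common.h后,各直调样例的Host侧骨架
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <iostream>
#include <vector>
#include "acl/acl.h"
#include "example_common.h"   // CHECK_ACL / IsClose / VerifyResults / InitializeACL / CleanACL

int main()
{
    // aclInit、绑定Device、创建Context与Stream(实现见example_common.h)
    aclrtContext context;
    aclrtStream stream = nullptr;
    int32_t deviceId = 0;
    InitializeACL(context, stream, deviceId);

    std::vector<float> golden;   // CPU参考结果,由各样例自行生成
    std::vector<float> output;   // 核函数计算结果拷回Host后填充

    // ……申请Device内存、计算Tiling、<<<>>>调用核函数、把结果拷回Host(各样例不同,此处省略)……

    // 销毁Stream/Context、复位Device并结束acl;随后按相对/绝对误差逐元素比对结果
    CleanACL(context, stream, deviceId);
    return VerifyResults(golden, output) ? 0 : -1;
}
```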
diff --git a/atvc/examples/add_with_broadcast/add_with_broadcast.cpp b/atvc/examples/add_with_broadcast/add_with_broadcast.cpp index 2a58d49b93040a9091e931da49242e21ad4cbb77..5ae9b3a2f6f940ff9d19abd8b0ff5675580dfd36 100644 --- a/atvc/examples/add_with_broadcast/add_with_broadcast.cpp +++ b/atvc/examples/add_with_broadcast/add_with_broadcast.cpp @@ -15,29 +15,12 @@ #include #include #include -#include "acl/acl.h" #include "broadcast/broadcast_host.h" +#include "example_common.h" +#include "acl/acl.h" #include "add_with_broadcast.h" -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) - namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) { - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - // AddWithBroadcast算子的描述:两个输入,一个输出,类型均为float using BroadcastOpTraits = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; @@ -64,21 +47,6 @@ void BroadcastOpAdapter(uint8_t* x, uint8_t* y, uint8_t* z, ATVC::BroadcastParam CHECK_ACL(aclrtDestroyStream(stream)); CHECK_ACL(aclrtFree(workspaceDevice)); } - -bool VerifyResults(const std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} } @@ -92,7 +60,7 @@ int32_t main(int32_t argc, char* argv[]) size_t inputByteSize = static_cast(eleNum) * sizeof(float); size_t outputByteSize = static_cast(outEleNum) * sizeof(float); std::random_device rd; - std::mt19937 gen(rd()); + std::mt19937 gen(static_cast(rd())); std::uniform_real_distribution disX(1.0f, 9.0f); std::uniform_real_distribution disY(1.0f, 9.0f); diff --git a/atvc/examples/add_with_scalar/README.md b/atvc/examples/add_with_scalar/README.md index 9dc434099cc48dd67227de4eed73386da187f88b..9c5164006010f246c49872570c3ecc57bb188d15 100644 --- a/atvc/examples/add_with_scalar/README.md +++ b/atvc/examples/add_with_scalar/README.md @@ -40,8 +40,8 @@ ## 算子运行 在ascendc-api-adv代码仓目录下执行: ```bash -$ cd ./atvc/tests -$ bash run_test.sh add_with_scalar +$ cd ./atvc/examples +$ bash run_examples.sh add_with_scalar ... Generate golden data successfully. ... 
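结合前文开发指南中Elementwise模板的用法,这里给出一个带标量参与计算的仿函数与核函数示意(非add_with_scalar样例原文:AddWithScalarCompute、AddWithScalarKernel等命名以及“标量取编译期常量OFFSET”均为演示性假设,OpTraits的模板实参按前文文字说明补全;样例中标量的实际传入方式请以add_with_scalar.cpp为准):

```cpp
// 仅为示意:在EleWise计算仿函数中引入标量,完成 z = x + y + OFFSET
#include "elewise/elewise_host.h"
#include "elewise/elewise_device.h"

// 两个float输入、一个float输出
using AddScalarTraits = ATVC::OpTraits<ATVC::OpInputs<float, float>, ATVC::OpOutputs<float>>;

template <typename Traits>
struct AddWithScalarCompute {
    template <typename DataType>
    __aicore__ inline void operator()(AscendC::LocalTensor<DataType> x, AscendC::LocalTensor<DataType> y,
                                      AscendC::LocalTensor<DataType> z)
    {
        constexpr DataType OFFSET = static_cast<DataType>(2.0);  // 演示用标量
        uint32_t tiledCnt = z.GetSize();          // 单次基块计算的元素个数
        AscendC::Add(z, x, y, tiledCnt);          // z = x + y
        AscendC::Adds(z, z, OFFSET, tiledCnt);    // z = z + OFFSET(矢量加标量)
    }
};

// 核函数入口:写法与前文AddCustom一致,把仿函数交给EleWiseOpTemplate完成搬运与调度
template <typename Traits>
__global__ __aicore__ void AddWithScalarKernel(GM_ADDR x, GM_ADDR y, GM_ADDR z, ATVC::EleWiseParam param)
{
    KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY);  // 只启动Vector核
    auto op = ATVC::Kernel::EleWiseOpTemplate<AddWithScalarCompute<Traits>>();
    op.Run(x, y, z, &param);                         // 按输入、输出、param的顺序传入
}
```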
diff --git a/atvc/examples/add_with_scalar/add_with_scalar.cpp b/atvc/examples/add_with_scalar/add_with_scalar.cpp index 24c4707032a33538bc727257ea340ec574cb51a3..d56bc6bdddb06dccee9a599ea32bfa25a55bc63a 100644 --- a/atvc/examples/add_with_scalar/add_with_scalar.cpp +++ b/atvc/examples/add_with_scalar/add_with_scalar.cpp @@ -17,27 +17,9 @@ #include "acl/acl.h" #include "elewise/elewise_host.h" #include "elewise/elewise_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) +#include "example_common.h" namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - using OP_TRAITS = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; template @@ -73,36 +55,6 @@ void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclInit(nullptr)); - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - CHECK_ACL(aclrtCreateStream(&stream)); -} - -void CleanACL(aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} - void CleanUp(uint8_t *&zHost, uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&zDevice) { CHECK_ACL(aclrtFree(xDevice)); @@ -176,7 +128,7 @@ int main() std::vector outputZ(reinterpret_cast(zHost), reinterpret_cast(zHost) + eleNum); CleanUp(zHost, xDevice, yDevice, zDevice); - CleanACL(stream, deviceId); + CleanACL(context, stream, deviceId); if (!VerifyResults(golden, outputZ)) { return -1; diff --git a/atvc/examples/broadcast_to/README.md b/atvc/examples/broadcast_to/README.md index 2699cb07d468b0e041680394b22e069207f2e18a..cfb7c0c8d9caaedd860a99cc03efc26dbac3e8e7 100644 --- a/atvc/examples/broadcast_to/README.md +++ b/atvc/examples/broadcast_to/README.md @@ -37,8 +37,8 @@ BroadcastTo算子规格: ## 算子运行 在ascendc-api-adv代码仓目录下执行: ```bash -$ cd ./atvc/tests -$ bash run_test.sh broadcast_to +$ cd ./atvc/examples +$ bash run_examples.sh broadcast_to ... Generate golden data successfully. ... 
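下面再给出Broadcast类样例Host侧的Tiling计算与Policy分派示意(非broadcast_to样例原文:LaunchBroadcastTo为演示性命名,blockNum由调用方从Tiling结果中取得,CalcBroadcastTiling与BroadcastCustom的模板实参均按前文文字说明补全,shape元素类型以样例源码为准),思路与前文开发指南2.3节的BroadcastOpAdapter一致:

```cpp
// 精简示意:Host侧完成Tiling计算,把得到的BroadcastPolicy转成编译期参数去实例化核函数
#include <cstdint>
#include <cstdio>
#include <vector>
#include "acl/acl.h"
#include "broadcast/broadcast_host.h"
#include "example_common.h"   // CHECK_ACL

// 一个float输入、一个float输出
using BroadcastOpTraits = ATVC::OpTraits<ATVC::OpInputs<float>, ATVC::OpOutputs<float>>;

void LaunchBroadcastTo(uint8_t *xDevice, uint8_t *yDevice, std::vector<int64_t> &shapeIn,
                       std::vector<int64_t> &shapeOut, uint32_t blockNum, aclrtStream stream)
{
    ATVC::BroadcastParam param;                   // 运行态参数:TilingData与workspace信息
    ATVC::BroadcastPolicy policy = {-1, -1, -1};  // 由Tiling API回填的最优模板策略
    if (!ATVC::Host::CalcBroadcastTiling<BroadcastOpTraits>(shapeIn, shapeOut, &policy, &param)) {
        printf("Broadcast tiling error.\n");
        return;
    }
    // 申请workspace并把param拷贝到Device,与broadcast_to.cpp中的做法一致
    uint8_t *workspaceDevice = nullptr;
    uint8_t *paramDevice = nullptr;
    CHECK_ACL(aclrtMalloc(reinterpret_cast<void **>(&workspaceDevice), param.workspaceSize, ACL_MEM_MALLOC_HUGE_FIRST));
    param.workspaceAddr = reinterpret_cast<decltype(param.workspaceAddr)>(workspaceDevice);
    CHECK_ACL(aclrtMalloc(reinterpret_cast<void **>(&paramDevice), sizeof(param), ACL_MEM_MALLOC_HUGE_FIRST));
    CHECK_ACL(aclrtMemcpy(paramDevice, sizeof(param), reinterpret_cast<void *>(&param), sizeof(param),
                          ACL_MEMCPY_HOST_TO_DEVICE));
    // 将运行期选出的Policy映射为编译期模板实参(BroadcastCustom为broadcast_to.cpp中定义的核函数)
    if (policy == ATVC::BROADCAST_POLICY0) {
        BroadcastCustom<BroadcastOpTraits, ATVC::BROADCAST_POLICY0><<<blockNum, nullptr, stream>>>(xDevice, yDevice, paramDevice);
    } else if (policy == ATVC::BROADCAST_POLICY1) {
        BroadcastCustom<BroadcastOpTraits, ATVC::BROADCAST_POLICY1><<<blockNum, nullptr, stream>>>(xDevice, yDevice, paramDevice);
    } else {
        printf("[ERROR] Cannot find any matched policy.\n");
    }
    // 流同步后释放临时内存
    CHECK_ACL(aclrtSynchronizeStream(stream));
    CHECK_ACL(aclrtFree(workspaceDevice));
    CHECK_ACL(aclrtFree(paramDevice));
}
```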
diff --git a/atvc/examples/broadcast_to/broadcast_to.cpp b/atvc/examples/broadcast_to/broadcast_to.cpp index c7a4b16baeac5dfc80ced66e199ea5cb52c5c56b..7bd28495b8e3d4d0f3acb54d6c8b638ca31e97b4 100644 --- a/atvc/examples/broadcast_to/broadcast_to.cpp +++ b/atvc/examples/broadcast_to/broadcast_to.cpp @@ -18,57 +18,9 @@ #include "acl/acl.h" #include "broadcast/broadcast_host.h" #include "broadcast/broadcast_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) +#include "example_common.h" namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) { - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - -bool VerifyResults(const std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("[ERROR]: Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void InitializeACL(aclrtStream &stream, aclrtContext &context, int32_t deviceId) -{ - CHECK_ACL(aclInit(nullptr)); - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - CHECK_ACL(aclrtCreateStream(&stream)); -} - -void CleanACL(aclrtStream &stream, aclrtContext &context, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} - void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) { CHECK_ACL(aclrtFree(xDevice)); @@ -136,7 +88,7 @@ int32_t main(int32_t argc, char* argv[]) aclrtContext context; int32_t deviceId = 0; aclrtStream stream = nullptr; - InitializeACL(stream, context, deviceId); + InitializeACL(context, stream, deviceId); uint8_t *yHost; uint8_t *xDevice; uint8_t *yDevice; @@ -163,7 +115,7 @@ int32_t main(int32_t argc, char* argv[]) // 释放Acl资源 CleanUp(xDevice, yDevice, yHost); - CleanACL(stream, context, deviceId); + CleanACL(context, stream, deviceId); for (int32_t i = 0; i < outEleNum; i++) { if (!IsClose(golden[i], outputY[i])) { diff --git a/atvc/examples/common/example_common.h b/atvc/examples/common/example_common.h new file mode 100644 index 0000000000000000000000000000000000000000..b10155e55bdb5814c22321e7113bea457796b155 --- /dev/null +++ b/atvc/examples/common/example_common.h @@ -0,0 +1,67 @@ +/** + * Copyright (c) Huawei Technologies Co., Ltd. 2025. All rights reserved. + * + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. 
+ */
+
+#ifndef ATVC_EXAMPLE_COMMON_H
+#define ATVC_EXAMPLE_COMMON_H
+#include "acl/acl.h"
+
+namespace {
+#define CHECK_ACL(x) \
+    do { \
+        aclError __ret = x; \
+        if (__ret != ACL_ERROR_NONE) { \
+            std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \
+        } \
+    } while (0)
+
+static constexpr float REL_TOL = 1e-3f;
+static constexpr float ABS_TOL = 1e-5f;
+
+bool IsClose(float a, float b)
+{
+    const float eps = 1e-40f;
+    float diff = std::abs(a - b);
+    return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps));
+}
+
+bool VerifyResults(const std::vector<float> &golden, const std::vector<float> &output)
+{
+    for (int32_t i = 0; i < golden.size(); i++) {
+        if (!IsClose(golden[i], output[i])) {
+            printf("[ERROR]: Accuracy verification failed! The expected value of element "
+                   "in index [%d] is %f, but actual value is %f.\n",
+                   i,
+                   golden[i],
+                   output[i]);
+            return false;
+        }
+    }
+    return true;
+}
+
+void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId)
+{
+    CHECK_ACL(aclInit(nullptr));
+    CHECK_ACL(aclrtSetDevice(deviceId));
+    CHECK_ACL(aclrtCreateContext(&context, deviceId));
+    CHECK_ACL(aclrtCreateStream(&stream));
+}
+
+void CleanACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId)
+{
+    CHECK_ACL(aclrtDestroyStream(stream));
+    CHECK_ACL(aclrtDestroyContext(context));
+    CHECK_ACL(aclrtResetDevice(deviceId));
+    CHECK_ACL(aclFinalize());
+}
+} // namespace
+
+#endif
\ No newline at end of file
diff --git a/atvc/examples/ops_aclnn/README.md b/atvc/examples/ops_aclnn/README.md
index 802586c91f6cb52d69430f79563c2e3463c46a4e..f317fdaad7b312cd44a4a17a779ba2c402ae98a3 100644
--- a/atvc/examples/ops_aclnn/README.md
+++ b/atvc/examples/ops_aclnn/README.md
@@ -1,43 +1,11 @@
## 概述
-使用ATVC对接ACLNN工程简单的示例,适合初学者。
-
-## 自定义算子样例说明
-样例通过Ascend C编程语言实现了ATVC框架对接自定义算子工程,并按照算子调用方式分别给出了对应的端到端实现。
+使用ATVC框架开发自定义算子,并实现单算子API调用的样例。
## 算子开发样例
| 目录名称 | 功能描述 |
| ------------------------------------------------------------ | ---------------------------------------------------- |
-| [add](./add) | 基于ATVC框架的Add自定义Vector算子及AclNNInvocation调用样例 |
-| [reduce_sum](./reduce_sum) | 基于ATVC框架的reduce_sum自定义Vector算子及AclNNInvocation调用样例 |
-
-## 快速上手
- - 快速执行example用例,更详细的流程请参阅[add算子](../add/README.md)。
-
-- 下载ATVC代码及环境配置,参考[快速入门](../../../docs/01_quick_start.md)。
-
-- 导入ATVC环境变量
-    ```bash
-    # 如果不导入,默认使用./atvc/include路径
-    export ATVC_PATH=${atvc}/include
-    ```
-
- - 执行add用例
-
-    ```bash
-    # 基于ATVC编译自定义Add算子
-    $ cd ./atvc/examples/ops_aclnn/add
-    # 以910B1为例,运行命令如下:
-    $ bash install.sh -v Ascend910B1
-    # 安装custom包
-    $ cd CustomOp/build_out
-    $ ./custom_opp*.run
-    # 样例运行
-    $ cd ../../AclNNInvocationNaive
-    $ bash run.sh
-    ...
-    test pass
-    ```
+| [add](./add) | 使用ATVC框架开发自定义算子Add,并实现单算子API调用的样例。 |
+| [reduce_sum](./reduce_sum) | 使用ATVC框架开发自定义算子ReduceSum,并实现单算子API调用的样例。 |

## 基于ATVC框架支持自定义算子
### 步骤1. 生成自定义工程基础目录及文件
@@ -296,4 +264,4 @@
### 步骤5. 
调用执行算子工程 - 算子文件编写完成,参考[aclnn调用AddCustom算子工程(代码简化)](https://gitee.com/ascend/samples/blob/master/operator/ascendc/0_introduction/1_add_frameworklaunch/AclNNInvocationNaive/README.md)进行编译验证 \ No newline at end of file + 算子文件编写完成,参考[aclnn调用AddCustom算子工程(代码简化)](https://gitee.com/ascend/samples/blob/master/operator/ascendc/0_introduction/1_add_frameworklaunch/AclNNInvocationNaive/README.md)进行编译验证。 \ No newline at end of file diff --git a/atvc/examples/ops_aclnn/add/README.md b/atvc/examples/ops_aclnn/add/README.md index 42020ecffbbf614de35adac72434339bc0d7492b..d2909e71e9d1339e71cf75b35eb30eaeda1cba7f 100644 --- a/atvc/examples/ops_aclnn/add/README.md +++ b/atvc/examples/ops_aclnn/add/README.md @@ -29,10 +29,6 @@ z = x + y 核函数名add_custom -## 支持的产品型号 -本样例支持如下产品型号: -- Atlas A2训练系列产品 - ## 算子工程介绍 其中,算子工程目录AddCustom包含算子的实现文件,如下所示: ``` @@ -61,6 +57,11 @@ CANN软件包中提供了工程创建工具msOpGen,AddCustom算子工程可通 编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 ### 2. 生成自定义算子工程,复制host和kernel实现并编译算子 + - 导入ATVC环境变量 + ```bash + # 如果不导入,默认使用./atvc/include路径 + $ export ATVC_PATH=${atvc}/include + ``` - 切换到msOpGen脚本install.sh所在目录 ```bash # 若开发者以git命令行方式clone了master分支代码,并切换目录 diff --git a/atvc/examples/ops_aclnn/reduce_sum/README.md b/atvc/examples/ops_aclnn/reduce_sum/README.md index 0ae318021bc6f2a8f55d60ec27f640b480f52b47..04a21d4955a9c204200cd257191ad405bebe1933 100644 --- a/atvc/examples/ops_aclnn/reduce_sum/README.md +++ b/atvc/examples/ops_aclnn/reduce_sum/README.md @@ -26,10 +26,6 @@ ReduceSum是对输入tensor的指定轴进行规约累加的计算并输出结 核函数名reduce_sum_custom -## 支持的产品型号 -本样例支持如下产品型号: -- Atlas A2训练系列产品 - ## 算子工程介绍 其中,算子工程目录ReduceSumCustom包含算子的实现文件,如下所示: ``` @@ -58,6 +54,11 @@ CANN软件包中提供了工程创建工具msOpGen,ReduceSumCustom算子工程 编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 ### 2. 生成自定义算子工程,复制host和kernel实现并编译算子 + - 导入ATVC环境变量 + ```bash + # 如果不导入,默认使用./atvc/include路径 + $ export ATVC_PATH=${atvc}/include + ``` - 切换到msOpGen脚本install.sh所在目录 ```bash # 若开发者以git命令行方式clone了master分支代码,并切换目录 diff --git a/atvc/examples/ops_pytorch/README.md b/atvc/examples/ops_pytorch/README.md index 5f85b795bcd0b3014831f618424ceef9ca28cc22..20e1299dbd881a4bb632f3da2a504fee4bbfd848 100644 --- a/atvc/examples/ops_pytorch/README.md +++ b/atvc/examples/ops_pytorch/README.md @@ -1,8 +1,5 @@ - ## 概述 -使用ATVC对接pytorch工程简单的示例,适合初学者。 - -## pytorch算子样例说明 -样例通过Ascend C编程语言实现了ATVC框架对接pytorch算子,并按照算子调用方式分别给出了对应的端到端实现。 +## 概述 +使用ATVC开发自定义算子,并实现从PyTorch框架调用的样例。 ## 算子开发样例 | 目录名称 | 功能描述 | @@ -10,183 +7,50 @@ | [add](./add) | 基于ATVC框架的Add自定义Vector算子 | | [reduce_sum](./reduce_sum) | 基于ATVC框架的reduce_sum自定义Vector算子 | -## 快速上手 - - 快速执行example用例,更详细的流程请参阅[add算子](./add/README.md)。 - -- 下载ATVC代码及环境配置,参考[快速入门](../../../docs/01_quick_start.md)。 - -- 导入ATVC环境变量 - ```bash - # 如果不导入,默认使用./atvc/include路径 - export ATVC_PATH=${atvc}/include - ``` - - - 执行add用例 - ```bash - # 基于ATVC编译pytorch Add算子 - $ cd ./atvc/examples/ops_pytorch/add - $ bash run.sh - ... - OK - ``` - -## 基于pytorch算子对接ATVC框架 - -### 步骤1. 定义算子描述,参考[add_custom_impl.h](./add/add_custom_impl.h) - - 首先通过ATVC提供的`ATVC::OpTraits`模板结构体来描述Add算子的输入输出信息,定义如下: -```cpp - // Add算子中有两个输入,一个输出。类型均为float - using AddOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; -``` - -### 步骤2. 
实现算子计算逻辑,参考[add_custom_impl.h](./add/add_custom_impl.h) - - 用户需要通过AscendC API来搭建Add算子的核心计算逻辑,在ATVC框架中,这类算子的核心计算逻辑是通过定义一个结构体的仿函数来实现。它需要`ATVC::OpTraits`作为固定模板参数,并重载`operator()`来被提供的Kernel层算子模板类调用。 +## 基于PyTorch算子对接ATVC框架 + 不同的算子类型可参考[快速入门](../../docs/01_quick_start.md)中的模版选择模版进行选择,用户在此处通过`<<<>>>`的方式调用核函数,更多ATVC的用法可参考atvc的[开发指南](../../docs/02_developer_guide.md)。 + + - 引入头文件。需要注意的是,需要保护对应核函数调用接口声明所在的头文件{kernel_name}_impl.h,kernel_name为算子的核函数名称。 ```cpp - // 头文件引入 - #include "elewise/elewise_host.h" - #include "elewise/elewise_device.h" - - // 传入编译态参数ATVC::OpTraits - template - struct AddComputeFunc { - /* - 函数说明: z = x + y - 参数说明: - x : 参与运算的输入 - y : 参与运算的输入 - z : 参与运算的输出 - */ - template - // 重载operator,提供给算子模板类调用 - __aicore__ inline void operator()(AscendC::LocalTensor x, AscendC::LocalTensor y, AscendC::LocalTensor z) { - AscendC::Add(z, x, y, z.GetSize()); // 开发调用AscendC Api自行实现计算逻辑, 通过c.GetSize()获取单次计算的元素数量 - } - }; -``` -### 步骤3. 实现核函数,参考[add_custom_impl.h](./add/add_custom_impl.h) - - ATVC提供的`ATVC::Kernel::EleWiseOpTemplate`算子模板类实现了核内的数据搬运、资源申请和计算调度功能。它将计算仿函数作为模板参数传入来完成构造实例化,用户可通过调用`ATVC::Kernel::EleWiseOpTemplate`算子模板类的`Run(Args&&... args)`接口完成算子的功能计算,完成完整核函数的实现。 - -```cpp - /* - * 该函数为Add算子核函数入口 - * a Device上的gm地址,指向Add算子第一个输入 - * b Device上的gm地址,指向Add算子第二个输入 - * c Device上的gm地址,指向Add算子第一个输出 - * param ATVC::EleWiseParam数据 - */ - template - __global__ __aicore__ void AddCustom(GM_ADDR a, GM_ADDR b, GM_ADDR c, ATVC::EleWiseParam param) - { - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); - - auto op = ATVC::Kernel::EleWiseOpTemplate, ATVC::EleWiseParam*>(); // 将AddComputeFunc仿函数作为模板参数传入,实例化EleWiseOpTemplate模板类,并指明param的数据类型 - op.Run(x, y, z, ¶m); // 按照输入、输出、param的顺序传入Run函数 - } -``` - - ### 步骤4. 编写torch入口[pytorch_ascendc_extension.cpp](./add/pytorch_ascendc_extension.cpp) - - 不同的算子类型可参考[快速入门](../../docs/01_quick_start.md)中的模版选择模版进行选择,用户在此处通过`<<<>>>`的方式调用核函数,更多ATVC的用法可参考atvc的[开发指南](../../docs/02_developer_guide.md)。 - -``` cpp // 头文件引入 #include #include "torch_npu/csrc/core/npu/NPUStream.h" #include "add_custom_impl.h" - +``` +- 应用程序框架编写,需要注意的是,本样例输入x,y的内存是在python调用脚本[run_op.py](./add/run_op.py)中分配的。 +```cpp namespace ascendc_elewise_ops { - at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) - { - auto stream = c10_npu::getCurrentNPUStream().stream(false); - at::Tensor z = at::empty_like(x); - // 创建ATVC框架[elewise]所需要的数据,算子数据的长度 - int32_t totalLength = 1; - for (int32_t size : x.sizes()) { - totalLength *= size; - } - // 声明运行态参数param - ATVC::EleWiseParam param; - // Host侧调用Tiling API完成相关运行态参数的运算 - (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); - // 调用核函数 - AddCustom<<>>( - (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); - - return z; + at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) { } - // 加载算子模版 - TORCH_LIBRARY(ascendc_ops, m) - { - // torch的自定义算子接口 - m.def("add", &ascendc_elewise_ops::op_add_custom); } - } -``` -### 步骤5. 
编写测试用例,参考[run_op.py](./add/run_op.py) -```python - # 导入torch torch所需依赖 - import torch - import torch_npu - # 导入测试用例依赖 - from torch_npu.testing.testcase import TestCase, run_tests - torch.npu.config.allow_internal_format = False - # 加载bishengcc编译出的二进制文件 - torch.ops.load_library('./libascendc_pytorch.so') - - # 测试用例编写 - class TestAscendCOps(TestCase): - # 测试用例 - def test_add_custom_ops_float(self): - length = [8, 2048] - # 生成随机数 - x = torch.rand(length, device='cpu', dtype=torch.float32) - y = torch.rand(length, device='cpu', dtype=torch.float32) - # 调用torch的自定义算子接口 - npuout = torch.ops.ascendc_ops.add(x.npu(), y.npu()) - cpuout = torch.add(x, y) - self.assertRtolEqual(npuout, cpuout) - - if __name__ == '__main__': - run_tests() ``` -### 步骤6. 基于atvc框架pytorch的编译和测试脚本[run.sh](./add/run.sh) -```bash - # 获取torch、torch_npu、python的lib和include路径和atvc的路径 - torch_location=... - torch_npu_location=... - python_include=... - python_lib=... - atvc_path=... - - # 使用bishengcc进行编译pytorch算子 - bishengcc pytorch_ascendc_extension.cpp \ - -arch Ascend910B1 \ - -I${torch_location}/include \ - -I${torch_location}/include/torch/csrc/api/include \ - -I${python_include} \ - -I${atvc_path} \ - -I${torch_npu_location}/include \ - -L${torch_location}/lib \ - -L${torch_npu_location}/lib \ - -L${python_lib} \ - -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python \ - -o libascendc_pytorch.so \ - -shared - - # 执行测试用例 - python3 run_op.py -``` - -### 步骤7. 算子编译&执行 -```bash - # 基于ATVC编译pytorch Add算子 - $ cd ./atvc/examples/ops_pytorch/add - $ bash run.sh - ... - OK +- NPU侧运行验证。通过`<<<>>>`的方式调用核函数完成指定的运算。 +```cpp + // 运行资源申请,通过c10_npu::getCurrentNPUStream()的函数获取当前NPU上的流 + auto stream = c10_npu::getCurrentNPUStream().stream(false); + // 分配Device侧输出内存 + at::Tensor z = at::empty_like(x); + // totalLength是算子输入的元素个数 + int32_t totalLength = 1; + for (int32_t size : x.sizes()) { + totalLength *= size; + } + // 声明运行态参数param + ATVC::EleWiseParam param; + // Host侧调用Tiling API完成相关运行态参数的运算 + (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); + // 使用<<<>>方式调用核函数完成指定的运算 + AddCustom<<>>( + (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); + // 将Device上的运算结果拷贝回Host并释放申请的资源 + return z; ``` + - 定义PyTorch算子的调用接口。 +``` cpp + // 加载算子模版 + TORCH_LIBRARY(ascendc_ops, m) { // 模块名ascendc_ops,模块对象m + m.def("add", &ascendc_elewise_ops::op_add_custom); // 将函数add和PyTorch进行绑定 + } + +``` diff --git a/atvc/examples/ops_pytorch/add/README.md b/atvc/examples/ops_pytorch/add/README.md index 39366061aee1a8aa7cc0495815ab77278adad311..8cb593d4f92a50a3a3cb2c18f4838767785faac1 100644 --- a/atvc/examples/ops_pytorch/add/README.md +++ b/atvc/examples/ops_pytorch/add/README.md @@ -1,12 +1,12 @@ ## 概述 -本样例基于AddCustom算子工程,介绍了基于ATVC的单算子工程、单算子调用。 +本样例基于AddCustom算子工程,介绍了基于ATVC的PyTorch工程及调用。 ## 目录结构介绍 ``` ├── add -│ ├── add_custom_impl.h // 通过pytroch调用的方式调用Add算子 -│ ├── pytorch_ascendc_extension.cpp // pytorch调用入口 -│ ├── run_op.py // pytorch的测试用例 +│ ├── add_custom_impl.h // 通过PyTorch调用的方式调用Add算子 +│ ├── pytorch_ascendc_extension.cpp // PyTorch调用入口 +│ ├── run_op.py // PyTorch的测试用例 │ └── run.sh // 脚本,编译需要的二进制文件,并测试 ``` @@ -20,86 +20,53 @@ z = x + y 算子类型(OpType)Add 算子输入nameshapedata typeformat -x8 * 2048float,intND -y8 * 2048float,intND +x8 * 2048int32_t、floatND +y8 * 2048int32_t、floatND -算子输出z8 * 2048float,intND +算子输出z8 * 2048int32_t、floatND 核函数名AddCustom -## 支持的产品型号 -本样例支持如下产品型号: -- Atlas A2训练系列产品 - ## 编译运行样例算子 -针对pytorch算子,编译运行包含如下步骤: -- 完成算子pytorch入口和impl文件的实现; -- 编译pytorch算子的二进制文件; -- 调用执行pytorch算子; 
+针对PyTorch算子,编译运行包含如下步骤: +- 完成算子PyTorch入口和impl文件的实现; +- 编译PyTorch算子的二进制文件; +- 调用执行PyTorch算子; 详细操作如下所示。 ### 1. 获取源码包及环境配置 编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 -### 2. 安装pytorch环境 +### 2. 安装PyTorch环境 参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境 -### 3. 基于ATVC编写pytorch算子的实现 - - 编写kernel侧函数,参考[add_custom_impl.h](./add_custom_impl.h) - ```cpp - // 引入头文件 - #include "elewise/elewise_host.h" - #include "elewise/elewise_device.h" +### 3. 基于ATVC编写PyTorch算子的实现 + - 算子kernel侧实现 + + 编写kernel侧函数,完成指定的运算。参考[add_custom_impl.h](./add_custom_impl.h)和[开发指南](../../../docs/02_developer_guide.md)完成核函数的实现。 - // 首先通过ATVC提供的ATVC::OpTraits模板结构体来描述Add算子的输入输出信息,定义如下 - using AddOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; - using AddOpTraitsInt = ATVC::OpTraits, ATVC::OpOutputs>; - - // 实现算子计算逻辑 - // 传入编译态参数ATVC::OpTraits - template - struct AddComputeFunc { - /* - 函数说明: z = x + y - 参数说明: - x : 参与运算的输入 - y : 参与运算的输入 - z : 参与运算的输出 - */ - template - // 重载operator,提供给算子模板类调用 - __aicore__ inline void operator()(AscendC::LocalTensor x, AscendC::LocalTensor y, AscendC::LocalTensor z) { - AscendC::Add(z, x, y, z.GetSize()); // 开发调用AscendC Api自行实现计算逻辑, 通过z.GetSize()获取单次计算的元素数量 - } - }; - - //实现核函数 - template - __global__ __aicore__ void AddCustom(GM_ADDR x, GM_ADDR y, GM_ADDR z, ATVC::EleWiseParam param) - { - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); - // 将AddComputeFunc仿函数作为模板参数传入,实例化EleWiseOpTemplate模板类 - auto op = ATVC::Kernel::EleWiseOpTemplate>(); - op.Run(x, y, z, ¶m); - } - ``` - - 编写pytorch入口函数,并调用核函数,参考[pytorch_ascendc_extension.cpp](./pytorch_ascendc_extension.cpp) + - 编写PyTorch入口函数,并通过`<<<>>>`调用核函数,参考[pytorch_ascendc_extension.cpp](./pytorch_ascendc_extension.cpp) ```cpp at::Tensor op_add_custom(const at::Tensor &x, const at::Tensor &y) { + // 运行资源申请,通过c10_npu::getCurrentNPUStream()的函数获取当前NPU上的流 auto stream = c10_npu::getCurrentNPUStream().stream(false); + // 分配Device侧输出内存 + at::Tensor z = at::empty_like(x); ATVC::EleWiseParam param; int32_t totalLength = 1; for (int32_t size : x.sizes()) { totalLength *= size; } (void)ATVC::Host::CalcEleWiseTiling(totalLength, param); + // 使用<<<>>方式调用核函数完成指定的运算 AddCustom<<>>( (uint8_t *)(x.storage().data()), (uint8_t *)(y.storage().data()), (uint8_t *)(z.storage().data()), param); + return z; } ``` - - 编写python调用函数,并调用pytorch入口函数,参考[run_op.py](./run_op.py) + - 编写python调用函数,并调用pytorch入口函数,参考[run_op.py](./run_op.py) ```py # 引入头文件 @@ -107,16 +74,18 @@ z = x + y import torch_npu import numpy as np from torch_npu.testing.testcase import TestCase, run_tests - # 加载二进制 + # 加载二进制文件 torch.npu.config.allow_internal_format = False torch.ops.load_library('./libascendc_pytorch.so') class TestAscendCOps(TestCase): # 测试用例 def test_add_custom_ops_float(self): + # 分配Host侧输入内存,并进行数据的初始化 length = [8, 2048] x = torch.rand(length, device='cpu', dtype=torch.float32) y = torch.rand(length, device='cpu', dtype=torch.float32) + # 分配Device侧内存,并将数据从Host上拷贝到Device上 npuout = torch.ops.ascendc_ops.add(x.npu(), y.npu()) cpuout = torch.add(x, y) self.assertRtolEqual(npuout, cpuout) @@ -125,9 +94,46 @@ z = x + y if __name__ == '__main__': run_tests() ``` + - 编译和测试脚本,参考[run.sh](./run.sh) + + ```sh + # 获取torch、torch_npu、python的lib和include路径和atvc的路径 + torch_location=... + torch_npu_location=... + python_include=... + python_lib=... 
+ if [ -z "$ATVC_PATH" ]; then + atvc_path=$(realpath ../../../include) + else + atvc_path=$ATVC_PATH + fi + + # 使用bishengcc进行编译PyTorch算子 + bishengcc pytorch_ascendc_extension.cpp \ + -arch Ascend910B1 \ + -I${torch_location}/include \ + -I${torch_location}/include/torch/csrc/api/include \ + -I${python_include} \ + -I${atvc_path} \ + -I${torch_npu_location}/include \ + -L${torch_location}/lib \ + -L${torch_npu_location}/lib \ + -L${python_lib} \ + -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python \ + -o libascendc_pytorch.so \ + -shared -### 4. 基于ATVC编写pytorch算子的调用验证 - - 调用脚本,生成pytorch算子,并运行测试用例 + # 执行测试用例 + python3 run_op.py + ``` + +### 4. 基于ATVC编写PyTorch算子的调用验证 + - 导入ATVC环境变量 + ```bash + # 如果不导入,默认使用./atvc/include路径 + $ export ATVC_PATH=${atvc}/include + ``` + - 调用脚本,生成PyTorch算子,并运行测试用例 ```bash $ cd ./atvc/examples/ops_pytorch/add $ bash run.sh diff --git a/atvc/examples/ops_pytorch/reduce_sum/README.md b/atvc/examples/ops_pytorch/reduce_sum/README.md index 23175eb3de04e4f1a31ecab53173f9c59999663a..d154ffb5cdf69abe77c83bdd57c7c832e08b0a39 100644 --- a/atvc/examples/ops_pytorch/reduce_sum/README.md +++ b/atvc/examples/ops_pytorch/reduce_sum/README.md @@ -1,86 +1,58 @@ ## 概述 -本样例基于ReduceSum算子,介绍了基于ATVC的单算子工程、单算子调用。 +本样例基于ReduceSum算子,介绍了基于ATVC的PyTorch工程及调用。 ## 目录结构介绍 ``` ├── reduce_sum -│ ├── reduce_sum_impl.h // 通过pytroch调用的方式调用ReduceSum算子 -│ ├── pytorch_ascendc_extension.cpp // pytorch调用入口 -│ ├── run_op.py // pytorch的测试用例 +│ ├── reduce_sum_impl.h // 通过PyTorch调用的方式调用ReduceSum算子 +│ ├── pytorch_ascendc_extension.cpp // PyTorch调用入口 +│ ├── run_op.py // PyTorch的测试用例 │ └── run.sh // 脚本,编译需要的二进制文件,并测试 ``` ## 算子描述 -Add算子实现了两个数据相加,返回相加结果的功能。对应的数学表达式为: -``` -z = x + y -``` +ReduceSum是对输入tensor的指定轴进行规约累加的计算并输出结果的Reduce类算子。 + ## 算子规格描述 - + - +
<table>
<tr><td>算子类型(OpType)</td><td colspan="4">ReduceSum</td></tr>
<tr><td>算子输入</td><td>name</td><td>shape</td><td>data type</td><td>format</td></tr>
-<tr><td></td><td>x</td><td>8 * 2048</td><td>float,int</td><td>ND</td></tr>
+<tr><td></td><td>x</td><td>8 * 2048</td><td>int32_t、float</td><td>ND</td></tr>
-<tr><td>算子输出</td><td>y</td><td>8 * 2048</td><td>float,int</td><td>ND</td></tr>
+<tr><td>算子输出</td><td>y</td><td>8 * 2048</td><td>int32_t、float</td><td>ND</td></tr>
<tr><td>核函数名</td><td colspan="4">ReduceSumCustom</td></tr>
</table>
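+
+下面给出一个最小的PyTorch参考示例(仅用于说明计算语义,非本样例代码),展示对第0轴做规约累加的预期结果,可作为CPU侧的对比基准:
+
+```python
+import torch
+
+x = torch.rand(8, 2048, dtype=torch.float32)
+# 对第0轴累加,得到与ReduceSum算子对应的CPU参考结果
+golden = torch.sum(x, dim=(0,))
+```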
-## 支持的产品型号 -本样例支持如下产品型号: -- Atlas A2训练系列产品 - ## 编译运行样例算子 -针对pytorch算子,编译运行包含如下步骤: -- 完成算子pytorch入口和impl文件的实现; -- 编译pytorch算子的二进制文件; -- 调用执行pytorch算子; +针对PyTorch算子,编译运行包含如下步骤: +- 完成算子PyTorch入口和impl文件的实现; +- 编译PyTorch算子的二进制文件; +- 调用执行PyTorch算子; 详细操作如下所示。 ### 1. 获取源码包及环境配置 -编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 -### 2. 安装pytorch环境 -参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境 + 编译运行此样例前,请参考[准备:获取样例代码](../README.md#codeready)获取源码包及环境变量的准备。 +### 2. 安装PyTorch环境 + 参考[torch的安装](https://gitee.com/ascend/pytorch)进行安装torch、torch_npu环境 -### 3. 基于ATVC编写pytorch算子的实现 - - 编写kernel侧函数,参考[reduce_sum_impl.h](./reduce_sum_impl.h) - ```cpp - // 引入头文件 - #include "reduce/reduce_host.h" - #include "reduce/reduce_device.h" - - // 首先通过ATVC提供的ATVC::OpTraits模板结构体来描述Add算子的输入输出信息,定义如下: - using ReduceOpTraitsFloat = ATVC::OpTraits, ATVC::OpOutputs>; - using ReduceOpTraitsInt = ATVC::OpTraits, ATVC::OpOutputs>; - - /* - * 该函数为ReduceSumCustom算子核函数入口 - * x Device上的gm地址,指向Add算子第一个输入 - * y Device上的gm地址,指向Add算子第一个输出 - * reduceParam ATVC::ReduceParam - */ - template - __global__ __aicore__ void ReduceSumCustom(GM_ADDR x, GM_ADDR y, GM_ADDR z, ATVC::ReduceParam reduceParam) - { - KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_MIX_AIV_1_0); // 使用了多核控制指令,设置算子执行时只启动Vector核 - // 将计算模板类模板定义作为模板参数传入,Policy由Host层的策略分派API给出 - // ATVC::ReduceParam* 为tiling的类型 - auto op = ATVC::Kernel::ReduceOpTemplate, - Policy>(); - op.Run(x, y, &reduceParam); - } - ``` - - 编写pytorch入口函数,并调用kernel侧函数,参考[pytorch_ascendc_extension.cpp](./pytorch_ascendc_extension.cpp) +### 3. 基于ATVC编写PyTorch算子的实现 + - 算子kernel侧实现 + + 编写kernel侧函数,完成指定的运算。参考[reduce_sum_impl.h](./reduce_sum_impl.h)和[开发指南](../../../docs/02_developer_guide.md)完成核函数的实现。 + + - 编写PyTorch入口函数,并通过`<<<>>>`调用核函数,参考[pytorch_ascendc_extension.cpp](./pytorch_ascendc_extension.cpp) ```cpp - // pytorch 入口函数 at::Tensor op_reduce_sum(const at::Tensor &x, const std::vector &dim) - { + { std::vector shapeIn; std::vector shapeOut; - ATVC::ReduceParam param; // Reduce运行态参数,包含TilingData以及临时空间的相关信息 - ATVC::ReducePolicy policy = {-1, -1, -1}; // Reduce运行态参数,负责映射最适合的Reduce模板实现 + // Reduce运行态参数,包含TilingData以及临时空间的相关信息 + ATVC::ReduceParam param; + // Reduce运行态参数,负责映射最适合的Reduce模板实现 + ATVC::ReducePolicy policy = {-1, -1, -1}; for (int32_t size : x.sizes()) { shapeIn.push_back(size); shapeOut.push_back(size); @@ -89,6 +61,7 @@ z = x + y shapeOut[i] = 1; } auto options = torch::TensorOptions().dtype(x.scalar_type()).device(x.device()); + // 分配Device侧输出内存 at::Tensor y = at::empty(shapeOut, options); if (x.scalar_type() == at::kFloat) { // Host侧调用Tiling API完成相关运行态参数的运算 @@ -109,6 +82,7 @@ z = x + y // 负责Reduce类算子的调度,选择对应的Policy最佳策略并执行Kernel函数 void ReduceOpAdapter(uint8_t *x, uint8_t *y, ATVC::ReduceParam param, ATVC::ReducePolicy &policy) { + // 运行资源申请,通过c10_npu::getCurrentNPUStream()的函数获取当前NPU上的流 auto stream = c10_npu::getCurrentNPUStream().stream(false); // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 @@ -121,7 +95,7 @@ z = x + y } // namespace reduce } // namespace AscendC ``` - - 编写python调用函数,并调用pytorch入口函数,参考[run_op.py](./run_op.py) + - 编写python调用函数,并调用PyTorch入口函数,参考[run_op.py](./run_op.py) ```python # 引入头文件 @@ -136,8 +110,10 @@ z = x + y class TestAscendCOps(TestCase): # 测试用例 def test_reduce_sum_ops_float(self): + # 分配Host侧输入内存,并进行数据的初始化 length = [8, 2048] x = torch.rand(length, device='cpu', dtype=torch.float32) + # 分配Device侧内存,并将数据从Host上拷贝到Device上 npuout = torch.ops.ascendc_ops.sum(x.npu(), (0,)) cpuout = torch.sum(x, (0,)) self.assertRtolEqual(npuout.reshape(cpuout.shape), 
cpuout) @@ -146,9 +122,46 @@ z = x + y if __name__ == '__main__': run_tests() ``` + - 编译和测试脚本,参考[run.sh](./run.sh) -### 4. 基于ATVC编写pytorch算子的调用验证 - - 调用脚本,生成pytorch算子,并运行测试用例 + ```sh + # 获取torch、torch_npu、python的lib和include路径和atvc的路径 + torch_location=... + torch_npu_location=... + python_include=... + python_lib=... + if [ -z "$ATVC_PATH" ]; then + atvc_path=$(realpath ../../../include) + else + atvc_path=$ATVC_PATH + fi + + # 使用bishengcc进行编译PyTorch算子 + bishengcc pytorch_ascendc_extension.cpp \ + -arch Ascend910B1 \ + -I${torch_location}/include \ + -I${torch_location}/include/torch/csrc/api/include \ + -I${python_include} \ + -I${atvc_path} \ + -I${torch_npu_location}/include \ + -L${torch_location}/lib \ + -L${torch_npu_location}/lib \ + -L${python_lib} \ + -ltorch -ltorch_cpu -lc10 -ltorch_npu -lpython3 -ltorch_python \ + -o libascendc_pytorch.so \ + -shared + + # 执行测试用例 + python3 run_op.py + ``` + +### 4. 基于ATVC编写PyTorch算子的调用验证 + - 导入ATVC环境变量 + ```bash + # 如果不导入,默认使用./atvc/include路径 + $ export ATVC_PATH=${atvc}/include + ``` + - 调用脚本,生成PyTorch算子,并运行测试用例 ```bash $ cd ./atvc/examples/ops_pytorch/reduce_sum $ bash run.sh diff --git a/atvc/examples/reduce_sum/README.md b/atvc/examples/reduce_sum/README.md index 85f555c302e8176cc2bfa9a64133fa21ab417459..ca4cc75534f7951d0be23ca6e115babd31aa91ef 100644 --- a/atvc/examples/reduce_sum/README.md +++ b/atvc/examples/reduce_sum/README.md @@ -37,8 +37,8 @@ ReduceSum算子规格: ## 算子运行 在ascendc-api-adv代码仓目录下执行: ```bash -$ cd ./atvc/tests -$ bash run_test.sh reduce_sum +$ cd ./atvc/examples +$ bash run_examples.sh reduce_sum ... Generate golden data successfully. ... diff --git a/atvc/examples/reduce_sum/reduce_sum.cpp b/atvc/examples/reduce_sum/reduce_sum.cpp index cf1fe4cf5203a5a4130f73a4bb7163735ac378c2..d00164b051fd832c3ec557bee6a4c0796cf45cc6 100644 --- a/atvc/examples/reduce_sum/reduce_sum.cpp +++ b/atvc/examples/reduce_sum/reduce_sum.cpp @@ -18,53 +18,12 @@ #include "acl/acl.h" #include "reduce/reduce_host.h" #include "reduce/reduce_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) +#include "example_common.h" namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - // ReduceSum算子的描述:一个输入,一个输出,类型均为float using ReduceOpTraits = ATVC::OpTraits, ATVC::OpOutputs>; -bool VerifyResults(const std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("Accuracy verification failed! 
The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void CleanACL(aclrtStream &stream, aclrtContext &context, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtDestroyContext(context)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} - void CleanUp(uint8_t *&xDevice, uint8_t *&yDevice, uint8_t *&yHost) { CHECK_ACL(aclrtFree(xDevice)); diff --git a/atvc/tests/run_test.sh b/atvc/examples/run_examples.sh similarity index 84% rename from atvc/tests/run_test.sh rename to atvc/examples/run_examples.sh index 925f8de5d23ea4510703a4f44581779c3dd98fcd..e5a6282fd368fefa57a9a9e8a0c1f89ff4b02106 100644 --- a/atvc/tests/run_test.sh +++ b/atvc/examples/run_examples.sh @@ -23,9 +23,9 @@ else fi ATVC_HOME_DIR=$CURRENT_DIR/../ -TEST_CASE_LIST=$(ls $ATVC_HOME_DIR/examples|xargs) +TEST_CASE_LIST=$(ls $ATVC_HOME_DIR/examples | grep -v '^run_examples.sh$' | grep -v '^ops_*' | xargs) if [ $# -lt 1 ]; then - echo "This script requires an input as the test case name. Execution example: 'bash run_test.sh [$TEST_CASE_LIST]'" + echo "This script requires an input as the test case name. Execution example: 'bash run_examples.sh [$TEST_CASE_LIST]'" exit 1 fi TEST_NAME=$1 @@ -46,16 +46,16 @@ function compile_operator(){ cd $ATVC_HOME_DIR/examples/$TEST_NAME if [ -z "$RUN_MODE" ]; then echo "Executing with npu mode" - ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME --include-path ${ATVC_HOME_DIR}/include + ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common elif [ "$RUN_MODE" = "debug_print" ]; then echo "Executing with debug_print mode" - ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME --include-path ${ATVC_HOME_DIR}/include -DATVC_DEBUG_MODE=1 + ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common -DATVC_DEBUG_MODE=1 elif [ "$RUN_MODE" = "profiling" ]; then echo "Executing with profiling mode" - ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME --include-path ${ATVC_HOME_DIR}/include -DATVC_DEBUG_MODE=2 + ${COMPILE_TOOL} -arch Ascend910B1 $TEST_NAME.cpp -o $TEST_NAME -I ${ATVC_HOME_DIR}/include -I ${CURRENT_DIR}/common -DATVC_DEBUG_MODE=2 else echo "--npu-mode is an optional parameter and can be left unset. If set, the value must be debug_print or profiling." - echo "Execution example: 'bash run_test.sh $TEST_NAME --run-mode=debug_print'" + echo "Execution example: 'bash run_examples.sh $TEST_NAME --run-mode=debug_print'" exit 1 fi } @@ -81,6 +81,6 @@ if [[ " $TEST_CASE_LIST " == *" ${TEST_NAME} "* ]]; then fi cd ${ATVC_HOME_DIR} else - echo "Error: Cannot find '$TEST_NAME' in ${ATVC_HOME_DIR}examples. Execution example: 'bash run_test.sh [$TEST_CASE_LIST]'" + echo "Error: Cannot find '$TEST_NAME' in ${ATVC_HOME_DIR}examples. Execution example: 'bash run_examples.sh [$TEST_CASE_LIST]'" exit 1 fi \ No newline at end of file diff --git a/atvc/examples/sinh_custom/README.md b/atvc/examples/sinh_custom/README.md index ec7f8a45713ff32f9cef327769257d5dbaae3d56..4c007f46568343801abcbb550b2e97adfc040799 100644 --- a/atvc/examples/sinh_custom/README.md +++ b/atvc/examples/sinh_custom/README.md @@ -37,8 +37,8 @@ SinhCustom算子规格: ## 算子运行 在ascendc-api-adv代码仓目录下执行: ```bash -$ cd ./atvc/tests -$ bash run_test.sh sinh_custom +$ cd ./atvc/examples +$ bash run_examples.sh sinh_custom ... 
Generate golden data successfully. ... diff --git a/atvc/examples/sinh_custom/sinh_custom.cpp b/atvc/examples/sinh_custom/sinh_custom.cpp index b8a18a08b849b7805574b5d5eabc27d7c6fa6c42..d8e0914e03cc23e8169a5413b4a65d3eca0238b8 100644 --- a/atvc/examples/sinh_custom/sinh_custom.cpp +++ b/atvc/examples/sinh_custom/sinh_custom.cpp @@ -18,27 +18,9 @@ #include "acl/acl.h" #include "elewise/elewise_host.h" #include "elewise/elewise_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) +#include "example_common.h" namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - // 描述算子的输入输出以及临时计算资源 using SinhOpTraits = ATVC::OpTraits, ATVC::OpOutputs, ATVC::OpTemps>; @@ -75,36 +57,6 @@ void InitializeData(int32_t eleNum, std::vector &inputX, std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclInit(nullptr)); - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - CHECK_ACL(aclrtCreateStream(&stream)); -} - -void CleanACL(aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} } /* @@ -166,7 +118,7 @@ int main() CHECK_ACL(aclrtFree(yDevice)); CHECK_ACL(aclrtFreeHost(yHost)); - CleanACL(stream, deviceId); + CleanACL(context, stream, deviceId); if (!VerifyResults(golden, outputY)) { return -1; diff --git a/atvc/examples/tanh_grad/README.md b/atvc/examples/tanh_grad/README.md index 5136b68b090b646c08912bbc2d3f24e91d3f9674..147de3cf108c767a189e18e5d29277ec591140b9 100644 --- a/atvc/examples/tanh_grad/README.md +++ b/atvc/examples/tanh_grad/README.md @@ -33,10 +33,10 @@ Tanh算子规格: ## 算子基本功能验证 -在代码仓目录下执行: +执行命令如下: ```bash -$ cd ./atvc/tests/ -$ bash run_test.sh tanh_grad +$ cd ./atvc/examples +$ bash run_examples.sh tanh_grad ... Accuracy verification passed. ``` @@ -45,10 +45,10 @@ Accuracy verification passed. 样例提供的主要调试调优方式如下: - 使用`ATVC::Host::EleWiseTilingHyperParam`构建超参对`ATVC::Host::CalcEleWiseTiling()`接口实现Tiling调优 - 使用`--run-mode=debug_print`进行DFX信息打印: -在代码仓目录下执行: +执行命令如下: ```bash -$ cd ./atvc/tests/ -$ bash run_test.sh tanh_grad --run-mode=debug_print +$ cd ./atvc/examples +$ bash run_examples.sh tanh_grad --run-mode=debug_print ... [INFO]:[ATVC][EleWise]Start to run Template Fuction. ... @@ -62,15 +62,15 @@ Accuracy verification passed. ``` - 使用`--run-mode=profiling`开启Profiling,获取性能数据: -在代码仓目录下执行: +执行命令如下: ```bash -$ cd ./atvc/tests/ -$ bash run_test.sh tanh_grad --run-mode=profiling +$ cd ./atvc/examples +$ bash run_examples.sh tanh_grad --run-mode=profiling ... [INFO] Start Profiling ... ... [INFO] Process profiling data complete, Data is saved in /xxx_path -Accuracy verification passed. +... 
``` -更多详细的调试调优介绍参考[ATVC开发指南](../../docs/2_developer_guide.md)的`ATVC的调试调优功能`章节 \ No newline at end of file +更多详细的调试调优介绍参考[ATVC开发指南](../../docs/02_developer_guide.md)的`ATVC的调试调优功能`章节 \ No newline at end of file diff --git a/atvc/examples/tanh_grad/tanh_grad.cpp b/atvc/examples/tanh_grad/tanh_grad.cpp index 7f8693d6050f1355ef641e09bbcbaa7120a8548e..c444909fa1c31e95c378933c718a7c2adae80aab 100644 --- a/atvc/examples/tanh_grad/tanh_grad.cpp +++ b/atvc/examples/tanh_grad/tanh_grad.cpp @@ -17,19 +17,9 @@ #include "acl/acl.h" #include "elewise/elewise_host.h" #include "elewise/elewise_device.h" - -#define CHECK_ACL(x) \ - do { \ - aclError __ret = x; \ - if (__ret != ACL_ERROR_NONE) { \ - std::cerr << __FILE__ << ":" << __LINE__ << " aclError:" << __ret << std::endl; \ - } \ - } while (0) +#include "example_common.h" namespace { -static constexpr float REL_TOL = 1e-3f; -static constexpr float ABS_TOL = 1e-5f; - struct MemoryPtrs { uint8_t *zHost; uint8_t *dyDevice; @@ -38,14 +28,6 @@ struct MemoryPtrs { uint8_t *paramDevice; }; -// 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) -{ - const float eps = 1e-40f; // 防止分母为零 - float diff = std::abs(a - b); - return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); -} - void InitializeData(int32_t eleNum, std::vector &inputDy, std::vector &inputY, std::vector &golden) { std::random_device rd; @@ -62,36 +44,6 @@ void InitializeData(int32_t eleNum, std::vector &inputDy, std::vector &golden, const std::vector &output) -{ - for (int32_t i = 0; i < golden.size(); i++) { - if (!IsClose(golden[i], output[i])) { - printf("[ERROR]: Accuracy verification failed! The expected value of element " - "in index [%d] is %f, but actual value is %f.\n", - i, - golden[i], - output[i]); - return false; - } - } - return true; -} - -void InitializeACL(aclrtContext &context, aclrtStream &stream, int32_t deviceId) -{ - CHECK_ACL(aclInit(nullptr)); - CHECK_ACL(aclrtSetDevice(deviceId)); - CHECK_ACL(aclrtCreateContext(&context, deviceId)); - CHECK_ACL(aclrtCreateStream(&stream)); -} - -void CleanACL(aclrtStream &stream, aclrtContext &context, int32_t deviceId) -{ - CHECK_ACL(aclrtDestroyStream(stream)); - CHECK_ACL(aclrtResetDevice(deviceId)); - CHECK_ACL(aclFinalize()); -} - void CleanUp(uint8_t *&zHost, uint8_t *&dyDevice, uint8_t *&yDevice, uint8_t *&zDevice) { CHECK_ACL(aclrtFree(dyDevice)); diff --git a/atvc/include/broadcast/broadcast_op_template.h b/atvc/include/broadcast/broadcast_op_template.h index 8aa2755f5e13c14939e7966fdf4fe4ea66736fd6..1ec0d6b1c285545dbe596d01a93ea52e5072215f 100644 --- a/atvc/include/broadcast/broadcast_op_template.h +++ b/atvc/include/broadcast/broadcast_op_template.h @@ -369,7 +369,7 @@ private: return; } AscendC::DataCopyPad(input, srcGlobal_[copyInOffset], copyInParams, padParams); - ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast][CopyIn] Offset is %u, block len is %u " + ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast][CopyIn] Offset is %u, block len is %u " "block count is %u.\n", copyInOffset, copyInParams.blockLen, copyInParams.blockCount); } @@ -402,7 +402,7 @@ private: return; } AscendC::DataCopyPad(dstGlobal_[copyOutOffset], output, copyOutParams); - ATVC::Kernel::DebugPrintf("[INFO]: [ATVC][Broadcast][CopyOut] Offset is %u, block len is %u block count is %u.\n", + ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Broadcast][CopyOut] Offset is %u, block len is %u block count is %u.\n", copyOutOffset, copyOutParams.blockLen, copyOutParams.blockCount); } diff --git a/atvc/include/reduce/common/reduce_common.h 
b/atvc/include/reduce/common/reduce_common.h index 44ceb71e17fcaa9aa12a1cb83d4f433f49d441ca..480c04d2bfa53a537cc15a400d3d84265716ecb1 100644 --- a/atvc/include/reduce/common/reduce_common.h +++ b/atvc/include/reduce/common/reduce_common.h @@ -25,8 +25,8 @@ enum ShapeDim { DIM_7, DIM_8, DIM_9, - DIM_REDUCE, // Reduce轴 - DIM_BROADCAST // Broadcast轴 + DIM_REDUCE, // Reduce axis + DIM_BROADCAST // Broadcast axis }; namespace AR_PATTERN { diff --git a/atvc/include/reduce/reduce_host.h b/atvc/include/reduce/reduce_host.h index 0ab6bc9532c6f43f27e5f5d99b65e10cfd9c9260..93348539ae7f76c006edbe9d216d271fda8ffa4d 100644 --- a/atvc/include/reduce/reduce_host.h +++ b/atvc/include/reduce/reduce_host.h @@ -20,10 +20,10 @@ namespace ATVC { namespace Host { -/** - * @brief Validate the legitimacy of reduce tiling hyper param - * @param [in] hyperParam, reduce tiling hyper param - * @return bool result, return true if the hyper param is valid, otherwise return false. +/*! + * \brief Validate the legitimacy of reduce tiling hyper param + * \param [in] hyperParam, reduce tiling hyper param + * \return bool result, return true if the hyper param is valid, otherwise return false. */ bool CheckReduceHyperParam(const ATVC::Host::ReduceTilingHyperParam &hyperParam) { @@ -31,6 +31,7 @@ bool CheckReduceHyperParam(const ATVC::Host::ReduceTilingHyperParam &hyperParam) constexpr uint32_t MIN_BASE_LINE = 48 * 1024U; constexpr float MAX_INNER_A = 256; constexpr float MIN_INNER_A = 128; + constexpr float EPSILON = 1e-6f; constexpr float MIN_THRESH_HOLD = 0.8f; constexpr float MAX_THRESH_HOLD = 0.95f; if(hyperParam.basicBlock > MAX_BASE_LINE || hyperParam.basicBlock < MIN_BASE_LINE) { @@ -43,8 +44,8 @@ bool CheckReduceHyperParam(const ATVC::Host::ReduceTilingHyperParam &hyperParam) "maxInnerA(%u) must be in [128, 256].\n", hyperParam.maxInnerA); return false; } - if(hyperParam.balanceThreshHold > MAX_THRESH_HOLD || hyperParam.balanceThreshHold < MIN_THRESH_HOLD) { - printf("[ERROR]: [ATVC][EleWise] Tiling hyperParam is invalid: nBufferNum(%f) must be in [0.8, 0.96].\n", + if(hyperParam.balanceThreshHold > MAX_THRESH_HOLD + EPSILON || hyperParam.balanceThreshHold + EPSILON < MIN_THRESH_HOLD) { + printf("[ERROR]: [ATVC][Reduce] Tiling hyperParam is invalid: balanceThreshHold(%f) must be in [0.8, 0.95].\n", hyperParam.balanceThreshHold); return false; } @@ -52,13 +53,13 @@ bool CheckReduceHyperParam(const ATVC::Host::ReduceTilingHyperParam &hyperParam) return true; } -/** - * @brief 计算Reduce的TilingData和策略参数 - * @param inputShape 输入张量的形状。 - * @param reduceDim 需要进行Reduce操作的具体维度。 - * @param policy 输出参数。 - * @param param 输出参数。 - * @return bool 返回true表示计算成功,false表示失败。 +/*! + * \brief Calculate the TilingData and policy parameters for Reduce. + * \param[in] inputShape, shape of the tensor. + * \param[in] reduceDim, The dim that requires a Reduce operation. + * \param[out] policy, static policy of Reduce Template + * \param[out] param, dynamic param of Reduce Template + * \return bool Return true to indicate calculation success, false to indicate failure. */ template bool CalcReduceTiling(std::vector inputShape, diff --git a/atvc/include/reduce/reduce_op_template.h b/atvc/include/reduce/reduce_op_template.h index 1c77be6f25c579b15500a54feb65c9d5f5629c28..fe466a0999aa0b16dad341a00b6d50fe00134ebe 100644 --- a/atvc/include/reduce/reduce_op_template.h +++ b/atvc/include/reduce/reduce_op_template.h @@ -30,6 +30,12 @@ namespace ATVC { namespace Kernel { +/*! + * ReduceOpTemplate Generic Reduce operator template. 
+ * Reduce operators usually refer to operators that perform reduction operations on elements in tensors, + * such as summation and averaging. They can specify several dimensions for reduction calculations, + * or reduce all elements to a scalar. + */ template class ReduceOpTemplate { @@ -49,19 +55,25 @@ public: constexpr static uint32_t PROMOTE_BUF_SIZE = KernelUtils::GetComputeCount(); AscendC::LocalTensor tempBuf_; AscendC::LocalTensor computeRes_; - // 算子开发者传入的计算对象 + // The calculation object passed in by user ReduceCompute compute_; public: __aicore__ inline ReduceOpTemplate() {} - // 按照输入、输出、ReduceParam、其他标量的顺序传入 - // 内部根据ReduceParam进行数据调度并调用ReduceOpTemplate完成计算后搬出到GM + /*! + * \brief The input order is: input tensor, output tensor, ReduceParam, Other scalars. + * Internally schedule data based on ReduceParam and call ReduceOpTemplate to complete + * the calculation before moving it out to GM. + * \param[in] x, GM address of the input tensor. + * \param[in] y, GM address of the output tensor. + * \param[in] param, tiling data and policy. + * \return void. + */ template __aicore__ inline void Run(GM_ADDR x, GM_ADDR y, ReduceParam* param) { ATVC::Kernel::DebugPrintf("[INFO]:[ATVC][Reduce] Start to run template function.\n"); param_ = param; - // 完成入参校验打印 KernelUtils::PrintParam(param_); if (!KernelUtils::CheckParam(param_)) { return; @@ -72,6 +84,11 @@ public: } public: + /*! + * \brief Initialise all pipes, queues and buffers. + * \param[in] workspace, GM address for the workspace buffer. + * \param[in] args, GM addresses for input/output tensors. + */ template __aicore__ inline void Init(GM_ADDR workspace, Args... args) { @@ -90,18 +107,32 @@ public: pipe_->InitBuffer(tempUbQue_, BLOCK_SIZE_BYTE); } + /*! + * \brief Allocate a tensor from the internal buffer pool. + * \tparam IsInput, true – tensor will be used as input (read-only) + * false – tensor will be used as output (read-write) + * \tparam needDup, if true the buffer is duplicated (for double-buffering). + * \param[in] tensor, LocalTensor reference that receives the allocation. + */ template __aicore__ inline void AllocTensorAux(AscendC::LocalTensor& tensor) { bufPool_.AllocTensor(tensor); } + /*! + * \brief Release a tensor back to the buffer pool. + * \param[in] tensor, LocalTensor to free. + */ template __aicore__ inline void FreeTensorAux(const AscendC::LocalTensor& tensor) { bufPool_.FreeTensor(tensor); } + /*! + * \brief Execute the reduction schedule. + */ template __aicore__ inline void Process(Args... args) { @@ -118,7 +149,7 @@ public: SchTypeA op(this, input_, output_, &(this->param_)->tilingData); op.Process(args...); } else { - // 完成第一阶段的Reduce + // Complete the first phase of Reduce using SchTypeR = KernelUtils::Reduce::ReduceBlockAux< ATVC::ReduceTilingData, &SCH_LOOP_INFO, std::remove_reference_t, DataType, @@ -127,10 +158,10 @@ public: op.Process(args...); bufPool_.ResetEvent(); - // 全核同步 + // Full nuclear synchronization AscendC::SyncAll(); - // 完成第二阶段的Reduce + // Complete the second phase of Reduce bufPool_.template ResetInputSize(3); constexpr static ReduceSchLoopInfo groupSchLoopInfo = KernelUtils::Reduce::GetGroupSchLoopInfo(); ATVC::ReduceTilingData groupTiling; @@ -143,6 +174,11 @@ public: } } + /*! + * \brief Populate tiling data for the second (group) reduction phase. + * \param[in] groupTiling, Tiling structure to be filled. + * \return void. 
+ */ __aicore__ inline void SetGroupTiling(ATVC::ReduceTilingData& groupTiling) { groupTiling.ubFactorA = ELEMENT_ONE_REPEAT_COMPUTE; @@ -162,6 +198,15 @@ public: static_cast(64)); // 按照64核计算,需要tiling传 } + /*! + * \brief Copy input tensor to UB with optional padding. + * \tparam isPadding, true – perform padding using PreCompute::GetPaddingValue + * false – no padding + * \param[in] src, GlobalTensor source in GM. + * \param[in] view, View descriptor describing the copy geometry. + * \param[in] shape, Shape descriptor (modified when padding). + * \param[in] ubTensor, LocalTensor destination in UB. + */ template __aicore__ inline void CopyInAux(const AscendC::GlobalTensor &src, ViewDescT &view, ShapeDescT &shape, @@ -199,8 +244,9 @@ public: static_cast(view.axis[CONST6].srcStride)}; int32_t total = 1; - for (int32_t i = 0; i < CONST6; ++i) + for (int32_t i = 0; i < CONST6; ++i) { total *= repeats[i]; + } for (int32_t idx = 0; idx < total; ++idx) { int32_t tmp = idx; @@ -216,6 +262,11 @@ public: } } + /*! + * \brief Copy input tensor directly to output when no reduction is required. + * \param void. + * \return void. + */ __aicore__ inline void CopyInput2Output() { uint32_t shapeSize = 1; @@ -318,7 +369,7 @@ protected: } private: - ATVC::ReduceParam* param_; // CalcReduceTiling API计算出的运行态参数 + ATVC::ReduceParam* param_; // The runtime parameters calculated by CalcReduceTiling API AscendC::TPipe* pipe_; AscendC::TBuf<> oriVecQue_; AscendC::TBuf<> tempResQue_; diff --git a/atvc/include/reduce/reduce_sum.h b/atvc/include/reduce/reduce_sum.h index 36c71a8c43c03d82b41181431802cd08114d49eb..08a24bf08d4101cf99dcf6a3aebe1b77b4e89e68 100644 --- a/atvc/include/reduce/reduce_sum.h +++ b/atvc/include/reduce/reduce_sum.h @@ -31,9 +31,14 @@ struct ReduceARParam { } namespace ATVC { -// OpTraits: 算子描述的ATVC::OpTraits结构体 +/*! + * ReduceSumCompute This class provides the core arithmetic required to reduce + * tensors along either the inner-most (AR) or outer-most (RA) axis after + * the tensor has been copied to the Unified Buffer (UB). Data movement between + * Global Memory (GM) and UB is not handled here; it is the responsibility of + * the surrounding scheduling template. + */ template -// 计算模板,不感知数据从GM到UB上的搬运 class ReduceSumCompute { public: // 从OpTraits中萃取算子输入描述信息 @@ -42,22 +47,34 @@ public: using PrompteDtype = typename KernelUtils::GetPromoteType::T; __aicore__ inline ReduceSumCompute() {} + /*! + * \brief Perform the actual reduction on a tile already resident in UB. + * \tparam needMask, True when UB alignment introduced invalid lanes. + * \tparam Pattern, One of ReducePattern::AR or ReducePattern::RA. + * \param[in] shape, {dimA, dimR} in elements; dimR may be padded. + * \param[out] dst, Destination tensor (length == dimA) + * \param[in] src, Source tensor (length == dimA * dimR) + */ template __aicore__ inline void Compute(KernelUtils::Shape<2> &shape, const AscendC::LocalTensor &dst, const AscendC::LocalTensor &src) { - // AR场景,硬件限制,R轴需要做UB上32B对齐,对齐方式有2种: - // 1. 高性能对齐(补充元素值不确定), 后续累加计算只能计算实际有效的元素个数 - // 2. 补0对齐(补值是由用户实现的GetPaddingValue()接口决定的) + // AR scenario, hardware limitations, R-axis requires 32B alignment on UB, with 2 alignment methods available: + // 1. High performance alignment (with uncertain supplementary element values), subsequent cumulative + // calculations can only calculate the actual number of effective elements + // 2. 
Alignment with zero padding (padding value is determined by the GetPaddingValue() interface
+        //    implemented by the user)
        if (std::is_same::value) {
-            if constexpr (needMask) { // 1. 高性能对齐模式
-                // MainR(int64_t dimR, bool isAR): 框架提供的计算R轴二分长度(元素个数), dimR为原始的元素个数
+            if constexpr (needMask) { // 1. High performance alignment mode
+                // MainR(int64_t dimR, bool isAR): framework helper that computes the binary-split length
+                // of the R axis (in elements), where dimR is the original number of elements
                int16_t mainR = KernelUtils::Reduce::MainR(shape.oriBurstLen, true);
                ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.oriBurstLen);
            } else {
-                // MainR:框架提供的计算R轴二分长度(元素个数),dimR为补齐后的元素个数
+                // MainR: framework helper that computes the binary-split length of the R axis (in elements),
+                // where dimR is the number of elements after padding
                int16_t mainR = KernelUtils::Reduce::MainR(shape.value[1], true);
                ReduceAR(dst, src, shape.value[0], shape.value[1], mainR, shape.value[1]);
            }
@@ -68,6 +85,14 @@ public:
        }
    }

+    /*!
+     * \brief RA-pattern reduction: reduce along the outer-most (slowest-varying) axis.
+     * \param[out] dst, Output tensor (length == dimA)
+     * \param[in] src, Input tensor (length == dimR * dimA), already resident in UB
+     * \param[in] dimA, Length of the non-reduced axis (A)
+     * \param[in] dimR, Length of the reduced axis (R)
+     * \param[in] mainR, Largest power-of-two ≤ dimR (computed by the caller)
+     */
    __aicore__ inline void ReduceRA(const AscendC::LocalTensor &dst,
                                    const AscendC::LocalTensor &src,
                                    uint16_t dimA,
@@ -77,43 +102,45 @@
        uint32_t mainNum = dimA * mainR;
        constexpr uint32_t dtypeSize = sizeof(PrompteDtype);
        uint32_t tailNum = totalNum - mainNum;
-        // add mask最大值为256 bytes 且要满足32bytes对齐
+        // The Add mask covers at most 256 bytes and must be 32-byte aligned
        constexpr uint32_t maskAddNum = UB_ALIGN_256 / dtypeSize / UB_ALIGN_32 * UB_ALIGN_32;
-        // 处理tail
        uint16_t repeatTimes = tailNum / maskAddNum;
        uint16_t repeatNum = repeatTimes * maskAddNum;
        uint16_t repTailNum = tailNum - repeatNum;
-        uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32; // 不同迭代间同一datablock步长
+        // Same data block step size between different iterations
+        uint32_t repStride = dtypeSize * maskAddNum / UB_ALIGN_32;
        // dstBlkStride, src0BlkStride,src1BlkStride, dstRepStride, src0RepStride, src1RepStride
        AscendC::BinaryRepeatParams repeatParams(1, 1, 1, repStride, repStride, repStride);
        if (repeatTimes > 0) {
            AscendC::Add(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams);
        }
        if (repTailNum > 0) {
-            repStride = dtypeSize * repTailNum / UB_ALIGN_32; // 不同迭代间同一datablock步长
+            // Same data block step size between different iterations
+            repStride = dtypeSize * repTailNum / UB_ALIGN_32;
            repeatParams.dstRepStride = repStride;
            repeatParams.src0RepStride = repStride;
            repeatParams.src1RepStride = repStride;
            AscendC::Add(src[repeatNum], src[repeatNum + mainNum], src[repeatNum], repTailNum, 1, repeatParams);
        }
        AscendC::PipeBarrier();
-        // 二分主体
        uint16_t loopRNum = mainR;
        while (loopRNum > 1) {
            loopRNum = loopRNum >> 1;
-            mainNum = loopRNum * dimA; // LoopR的前半部分数据量
+            mainNum = loopRNum * dimA; // Element count of the first half of the loopR range
            repeatTimes = mainNum / maskAddNum;
            repeatNum = repeatTimes * maskAddNum;
            repTailNum = mainNum - repeatNum;
            if (repeatTimes > 0) {
-                repStride = dtypeSize * maskAddNum / UB_ALIGN_32; // 不同迭代间同一datablock步长
+                // Same data block step size between different iterations
+                repStride = dtypeSize * maskAddNum / UB_ALIGN_32;
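+                // Keep dst/src0/src1 repeat strides identical so every repeat advances by one maskAddNum-element block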
repeatParams.dstRepStride = repStride; repeatParams.src0RepStride = repStride; repeatParams.src1RepStride = repStride; AscendC::Add(src, src[mainNum], src, maskAddNum, repeatTimes, repeatParams); } if (repTailNum > 0) { - repStride = dtypeSize * repTailNum / UB_ALIGN_32; // 不同迭代间同一datablock步长 + // Same data block step size between different iterations + repStride = dtypeSize * repTailNum / UB_ALIGN_32; repeatParams.dstRepStride = repStride; repeatParams.src0RepStride = repStride; repeatParams.src1RepStride = repStride; @@ -124,6 +151,15 @@ public: AscendC::DataCopy(dst, src, dimA); } + /*! + * \brief AR-pattern reduction: reduce along the inner-most (fastest-varying) axis. + * \param[out] dstTensor, Output tensor (length == dimA) + * \param[in] srcTensor, Input tensor (length == dimR * dimA), already resident in UB + * \param[in] dimA, Length of the non-reduced axis (A) + * \param[in] dimR, Padded length of the reduced axis (R) + * \param[in] mainR, Largest power-of-two ≤ original R length + * \param[in] oriBurstLen, Original (un-padded) R length used to compute tail + */ __aicore__ inline void ReduceAR(const AscendC::LocalTensor &dstTensor, const AscendC::LocalTensor &srcTensor, uint16_t dimA, @@ -150,13 +186,12 @@ public: PerformInitialAdd(srcTensor, param); } - // 二分计算 param.loopRNum = mainR; while (param.loopRNum > maskAddRNum) { - param.loopRNum = param.loopRNum / 2; // 除2二分 + param.loopRNum = param.loopRNum / 2U; PerformBinaryReduction(srcTensor, param); } - if (param.loopRNum == 0) { // small shape, 直接reduce + if (param.loopRNum == 0) { // small shape, directly reduce param.loopRNum = tailR; } PerformFinalReduction(dstTensor, srcTensor, param); @@ -170,17 +205,16 @@ public: int64_t dimA = Pattern::TailA ? shape.value[1] : shape.value[0]; int32_t element_one_repeat = Platform::GetVRegSize() / sizeof(PrompteDtype); int64_t stride = OpsUtils::CeilDiv(dimA, static_cast(element_one_repeat)) * element_one_repeat; - // count A轴的大小 * VL uint16_t outerLoopTimes = OpsUtils::CeilDiv( static_cast(dimA * sizeof(PrompteDtype)), static_cast(Platform::GetVRegSize())); uint16_t innerLoopTimes = cacheID; uint32_t outerLoopStride = element_one_repeat; - uint32_t innerLoopStride = stride; // cacahe的每一个idex的块的大小, A轴的大小 + uint32_t innerLoopStride = stride; // The size of each idex block in cacahe and the size of the A-axis AscendC::LocalTensor dstTensor = tempBuf; AscendC::LocalTensor srcTensor = computeRes; uint32_t cah = cacheID * stride; - for (uint16_t i = 0; i < outerLoopTimes; ++i) { // outerLoopTimes是dimA的大小 + for (uint16_t i = 0; i < outerLoopTimes; ++i) { // OuterLoopTimes is the size of dimA uint32_t srcIdx = i * outerLoopStride; for (uint16_t j = 0; j < innerLoopTimes; ++j) { AscendC::Add(srcTensor[srcIdx], srcTensor[srcIdx], @@ -192,6 +226,13 @@ public: } } + /*! + * \brief Binary reduction between two UB buffers. + * \ Used for inter-core result merging when workspace staging is required. + * \param[in] ubTensorLeft, Left operand (in-place result). + * \param[in] ubTensorRight, Right operand (read-only). + * \param[in] calCount, Number of elements to reduce. + */ __aicore__ inline void ReduceBetweenUB(const AscendC::LocalTensor &ubTensorLeft, const AscendC::LocalTensor &ubTensorRight, @@ -200,10 +241,18 @@ public: Add(ubTensorRight, ubTensorRight, ubTensorLeft, calCount); } + /*! + * \brief Return the value used for padding when UB alignment is required. + * For SUM-reduction the neutral element is 0. + * \tparam U, Scalar type identical to DataType or PromoteDataType. 
+ * \return The padding value (always 0). + */ template - __aicore__ inline U GetPaddingValue() // 设置框架内每一次搬入UB的数据对齐补充的值 + __aicore__ inline U GetPaddingValue() { - U paddingValue = 0; // 由于ReduceSum是累加R轴数据,补齐的元素值设为0,才能保证累加的结果不受影响 + // Due to the fact that ReduceSum accumulates R-axis data, the values of the supplemented elements + // are set to 0 to ensure that the accumulated result is not affected + U paddingValue = 0; return paddingValue; } @@ -216,7 +265,7 @@ private: AscendC::BinaryRepeatParams repeatParams(1, 1, 1, param.repStride, param.repStride, param.repStride); if (param.repStride > UB_ALIGN_255) { - for (uint16_t i = 0; i < param.dimMax; i += param.mainR) { + for (uint16_t i = 0; i < param.dimMax; i += param.dimR) { AscendC::Add(srcTensor[i], srcTensor[i], srcTensor[i + param.mainR], param.tailR); } } else { @@ -268,7 +317,7 @@ private: if constexpr (AscendC::IsSameType::value || AscendC::IsSameType::value) { uint16_t reduceLoopTimes = UB_ALIGN_255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; - // WholeReduceSum repeattime最大值为255 255附近为了dimA需要分多次 + // WholeReduceSum repeat-time limit is 255; split dimA into chunks for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; AscendC::WholeReduceSum( @@ -277,15 +326,14 @@ private: AscendC::PipeBarrier(); } else if constexpr (AscendC::IsSameType::value || AscendC::IsSameType::value) { - // 尽量二分add到最后32bytes - // int32 -> float 都是4字,一把cast 用CAST_NONE + // Cast to float for higher-precision accumulation AscendC::LocalTensor interpreSrc = srcTensor.template ReinterpretCast(); AscendC::LocalTensor interpreDst = dstTensor.template ReinterpretCast(); AscendC::Cast(interpreSrc, srcTensor, AscendC::RoundMode::CAST_NONE, param.dimA * param.dimR); AscendC::PipeBarrier(); uint16_t reduceLoopTimes = 255 * param.dtypeSize / UB_ALIGN_32 * UB_ALIGN_32 / param.dtypeSize; - // WholeReduceSum repeattime最大值为255 255附近为了dimA需要分多次 + // WholeReduceSum repeat-time limit is 255; split dimA into chunks for (uint16_t dimAIdx = 0; dimAIdx < param.dimA; dimAIdx += reduceLoopTimes) { uint16_t curDimA = (dimAIdx + reduceLoopTimes < param.dimA) ? reduceLoopTimes : param.dimA - dimAIdx; AscendC::WholeReduceSum( diff --git a/atvc/include/reduce/tiling/reduce_tiling.h b/atvc/include/reduce/tiling/reduce_tiling.h index ba9673558bbb104cb3929c5eab6449ca8da077c9..bcded4d2f83151dbf0a0dacc2bcb42372fcab759 100644 --- a/atvc/include/reduce/tiling/reduce_tiling.h +++ b/atvc/include/reduce/tiling/reduce_tiling.h @@ -21,32 +21,50 @@ #include "tiling_common.h" namespace OpTiling { +/*! + * ReduceOpTiling: High-level tiling engine for Ascend Reduce kernels. + * Computes cache-friendly UB blocks, multi-core split factors, loop counts and required workspace size. + * Writes everything into the user-supplied `ReduceParam` and `ReducePolicy` structures so the runtime + * can launch the kernel. + */ class ReduceOpTiling { public: -ReduceOpTiling(ReduceTilingInputParam& inputParam, - ATVC::ReducePolicy* policy, ATVC::ReduceParam* param, ATVC::Host::ReduceTilingHyperParam& hyperParam) - : param_(param), policy_(policy), opInput_(inputParam), hyperParam_(hyperParam) { + /*! + * \brief constructor that binds the engine to user-supplied output structures and input descriptor. 
+ * \param[in] inputParam, compile-time description of the Reduce op + * \param[out] policy, static policy of Reduce Template + * \param[out] param, dynamic param of Reduce Template + */ + ReduceOpTiling(ReduceTilingInputParam &inputParam, ATVC::ReducePolicy *policy, ATVC::ReduceParam *param, + ATVC::Host::ReduceTilingHyperParam &hyperParam) + : param_(param), policy_(policy), opInput_(inputParam), hyperParam_(hyperParam) + { compileInfo_ = ATVC::GetOpCompileInfo(); }; -int32_t Run() -{ - MakeWrapDim(opInput_.reduceShape, opInput_.reduceDim); - if (IsAxesValid(opInput_.reduceShape, opInput_.reduceDim) == -1) { - return -1; - } - std::vector newShape(ATVC::MAX_DIM, 1); - int32_t newShapeSize = 0; - EliminateOne(opInput_.reduceShape, opInput_.reduceDim, newShape, newShapeSize); - MergeAxis(opInput_.reduceDim, newShape, newShapeSize); - if (!DoTiling(newShape, newShapeSize)) { - printf("Do tiling failed!\n"); - return -1; + /*! + * \brief Orchestrates the full tiling workflow; returns 0 on success, ‑1 otherwise. + * \param void + * \return 0 success, 1 failure + */ + int32_t Run() + { + MakeWrapDim(opInput_.reduceShape, opInput_.reduceDim); + if (IsAxesValid(opInput_.reduceShape, opInput_.reduceDim) == -1) { + return -1; + } + std::vector newShape(ATVC::MAX_DIM, 1); + int32_t newShapeSize = 0; + EliminateOne(opInput_.reduceShape, opInput_.reduceDim, newShape, newShapeSize); + MergeAxis(opInput_.reduceDim, newShape, newShapeSize); + if (!DoTiling(newShape, newShapeSize)) { + printf("Do tiling failed!\n"); + return -1; + } + CalcWorkSpace(); + printf("ReduceOpTiling Run success!\n"); + return 0; } - CalcWorkSpace(); - printf("ReduceOpTiling Run success!\n"); - return 0; -} private: template @@ -111,9 +129,9 @@ void CalcWorkSpace() void EliminateOne(const std::vector& oriShape, std::vector& axes, std::vector& shape, int32_t& shapeSize) { - int32_t dstIdx = 1; // shape中第一个数给了1, 跳过第一个数 + int32_t dstIdx = 1; // The first number in the shape is given as 1, so the first number is skipped. for (size_t i = 0; i < axes.size(); i++) { - // 前面补了一维,所有的axes需要加1 + // The front dimension is filled, and all axes need to be increased by 1 axes[i] = axes[i] + 1; } int32_t eraseNum = 0; @@ -150,7 +168,7 @@ void MergeAxis(std::vector& axes, std::vector& shape, int32_t } s *= shape[j]; if (isRAxis1) { - // 连续的R轴, 需要擦除后续R轴的索引 + // Continuous R-axis, need to erase the index of subsequent R-axis axes.erase(iter1); } } @@ -171,7 +189,7 @@ bool DoTiling(std::vector& shape, int32_t shapeSize) switch (shapeSize) { case ATVC::CONST1: return ComputeTiling(shape); - param_->tilingData.coreNum = 1; // A 場景:不用reduce 只用1個core copydata + param_->tilingData.coreNum = 1; // Scenario A: No reduce, only one core copydata case ATVC::CONST2: return ComputeTiling(shape); case ATVC::CONST3: @@ -424,7 +442,7 @@ void ComputeUnitA(const std::vector& shape) const bool isPatternA = (Pattern::ID == ATVC::ReducePattern::PATTERN_A); const uint64_t patternABlockSize = basicBlock_ / ge::GetSizeByDataType(opInput_.inputDtype); uint64_t maxInnerA = isPatternA ? patternABlockSize : maxCacheA; - uint64_t stepLen = Pattern::ID == ATVC::ReducePattern::PATTERN_A ? A_STEP_LEN : 1; // 纯A的步长为4, 减少循环次数 + uint64_t stepLen = Pattern::ID == ATVC::ReducePattern::PATTERN_A ? A_STEP_LEN : 1; bool basicSplitA = (axisInCacheLine + (Pattern::FirstA ? 
1 : 0)) % ATVC::CONST2; uint64_t bBlockNum = basicBlock_ / ge::GetSizeByDataType(opInput_.inputDtype); uint64_t step = 1; diff --git a/atvc/include/reduce/tiling/tiling_common.h b/atvc/include/reduce/tiling/tiling_common.h index 7226dfb27b9ccbe52aaf31adf11edb25e3d52485..6d5c6dcfde5565aad549b175f371bfa2304861d6 100644 --- a/atvc/include/reduce/tiling/tiling_common.h +++ b/atvc/include/reduce/tiling/tiling_common.h @@ -9,12 +9,12 @@ * See LICENSE in the root of the software repository for the full text of the License. */ - #ifndef ATVC_TILING_COMMON_H +#ifndef ATVC_TILING_COMMON_H #define ATVC_TILING_COMMON_H #include "reduce/common/patterns.h" namespace OpTiling { -constexpr static int32_t CACHE_SIZE = 16 * 1024; // cahce size for ub reduce +constexpr static int32_t CACHE_SIZE = 16 * 1024; // cahce size for ub reduce constexpr static int32_t A_STEP_LEN = 4; struct ReduceTilingUnit { @@ -44,12 +44,13 @@ struct ReduceTilingInputParam { std::vector reduceShape = {}; ge::DataType inputDtype = ge::DataType::DT_UNDEFINED; ge::DataType promoteDtpye = ge::DataType::DT_UNDEFINED; - ReduceTilingInputParam(std::vector reduceDim_, std::vector reduceShape_, - ge::DataType inputDtype_, ge::DataType promoteDtpye_): - reduceDim(reduceDim_), reduceShape(reduceShape_), inputDtype(inputDtype_), promoteDtpye(promoteDtpye_){} + ReduceTilingInputParam(std::vector reduceDim_, std::vector reduceShape_, ge::DataType inputDtype_, + ge::DataType promoteDtpye_) + : reduceDim(reduceDim_), reduceShape(reduceShape_), inputDtype(inputDtype_), promoteDtpye(promoteDtpye_) + {} }; -void MakeWrapDim(const std::vector& shape, std::vector& axes) +void MakeWrapDim(const std::vector &shape, std::vector &axes) { // EnsureNotScalar at least return 1-D Tensor, so shapeSize cannot be 0 size_t shapeSize = shape.size(); @@ -70,7 +71,7 @@ bool IsAxisA(int32_t idx) } } -int32_t IsAxesValid(const std::vector& shape, const std::vector& axes) +int32_t IsAxesValid(const std::vector &shape, const std::vector &axes) { size_t shapeSize = shape.size(); size_t axesSize = axes.size(); @@ -89,7 +90,7 @@ int32_t IsAxesValid(const std::vector& shape, const std::vector -bool IsEmtpyTensor(const std::vector& shape) +bool IsEmtpyTensor(const std::vector &shape) { for (int32_t i = 0; i < Pattern::Dim; i++) { if (shape[i] == 0) { @@ -99,16 +100,18 @@ bool IsEmtpyTensor(const std::vector& shape) return false; } -}; // namespace OpTiling +}; // namespace OpTiling namespace ATVC { namespace Host { // Hyper param for reduce tiling. struct ReduceTilingHyperParam { - uint32_t basicBlock = 48 * 1024; // 设置Reduce基本块内存大小, 一般不能超过内存的1/3, 建议在[48k-54k]之间设置 - uint32_t maxInnerA = 128; // [128, 256] - double balanceThreshHold = 0.85; // 多核均衡的阈值水平 [0.8-0.95] + // Set the basic block memory size for Reduce, generally not exceeding 1/3 of the memory. 
It is recommended to set
+    // it between [48k-54k]
+    uint32_t basicBlock = 48 * 1024;
+    uint32_t maxInnerA = 128; // [128, 256]
+    double balanceThreshHold = 0.85; // Threshold for multi-core load balancing [0.8-0.95]
};
-}
-}
-#endif // ATVC_TILING_COMMON_H
\ No newline at end of file
+} // namespace Host
+} // namespace ATVC
+#endif // ATVC_TILING_COMMON_H
\ No newline at end of file
diff --git a/atvc/include/reduce/utils/reduce_block_aux.h b/atvc/include/reduce/utils/reduce_block_aux.h
index 2ffce9b6ded9180384392a8efb78661f3de96bda..a68a169fa900a1aaf21d907b525ab95973d3d6a7 100644
--- a/atvc/include/reduce/utils/reduce_block_aux.h
+++ b/atvc/include/reduce/utils/reduce_block_aux.h
@@ -75,7 +75,7 @@ public:
    struct {
        uint64_t start = 0;
-        uint64_t stride = 1; // 拷贝步长
+        uint64_t stride = 1; // Copy step size
    } iterAddr_[DIM];
    const DataType* tiling_;
@@ -127,7 +127,8 @@ public:
        int32_t rAxisIdx = LoopInfo->loopRCount - 1;
        int32_t rAxis = LoopInfo->loopRAxis[rAxisIdx];
-        loopRAxisStep_ = OpsUtils::CeilDiv(tiling_->shape[rAxis], tiling_->ubFactorR); // 切分轴Rfactor的个数
+        // Number of ubFactorR-sized steps along the split R axis
+        loopRAxisStep_ = OpsUtils::CeilDiv(tiling_->shape[rAxis], tiling_->ubFactorR);
        if constexpr (LoopInfo->loopACount > 0) {
            int32_t aAxisIdx = LoopInfo->loopACount - 1;
@@ -143,12 +144,12 @@ public:
    __aicore__ inline void Process(Args... args)
    {
        SetLoopRange();
-        // 构造UB内轴index数组
-        // 1、尾轴
-        // 2、非循环轴
-        // 3、核外A循环轴最内轴且UbFactorA > 1
-        // 4、核外R循环轴最内轴且UbFactorR > 1
-        // 5、核内R循环轴最内轴且UbFactorR > 1
+        // Construct the axis index array used inside UB:
+        // 1. Tail axis
+        // 2. Non-loop axes
+        // 3. Innermost axis of the outside-core A loop when UbFactorA > 1
+        // 4. Innermost axis of the outside-core R loop when UbFactorR > 1
+        // 5. Innermost axis of the in-core R loop when UbFactorR > 1
        if constexpr (LoopInfo->loopRCount == 0) {
            rCount_ = tiling_->factorRCntPerCore;
        } else {
@@ -232,7 +233,7 @@ public:
            SetEvent(AscendC::HardEvent::MTE2_V);
            computeTensor = ubTensor;
        } else {
-            // AllocComputeTensorAux 的index 外部不需要感知
+            // The index used by AllocComputeTensorAux does not need to be visible to the caller
            op_->ReduceOp::template AllocTensorAux(computeTensor);
            CopyIn(view, shape, ubTensor);
            SetEvent(AscendC::HardEvent::MTE2_V);
@@ -257,7 +258,8 @@ public:
            PrePareReduce<(!InnerPattern::TailA), true>(i, view, shape, tensorRight, computeRight);
            ComputeMerge(shape, computeLeft, computeRight, args...);
-            // fp32 tensorLeft和computeLeft是同一个tensor,fp16 computeLeft在free时不用index
+            // For FP32, tensorLeft and computeLeft are the same tensor;
+            // for FP16, computeLeft does not need an index when freed
            op_->ReduceOp::template FreeTensorAux(computeRight);
            op_->compute_.template UpdateCache(i, shape, op_->tempBuf_, op_->computeRes_);
        }
@@ -282,7 +284,7 @@ public:
        if constexpr (LoopInnerRIdx != 0) {
            constexpr auto axis = LoopInfo->loopInnerRAxis[LoopInnerRIdx - 1];
            if constexpr (LoopInnerRIdx == LoopInfo->loopInnerRCount) {
-                // 最内层循环
+                // innermost loop
                auto cur = basicBlockIdx % this->loopRAxisStep_;
                this->iterAddr_[axis].start = cur * this->ubFactorR_;
                this->iterAddr_[axis].stride = tiling_->shape[axis] - this->iterAddr_[axis].start;
@@ -304,7 +306,7 @@ public:
        if constexpr (LoopAIdx != 0) {
            constexpr auto axis = LoopInfo->loopAAxis[LoopAIdx - 1];
            if constexpr (LoopAIdx == LoopInfo->loopACount) {
-                // 切分轴
+                // Splitting axis
                auto cur = step % this->loopAAxisStep_;
                this->iterAddr_[axis].start = cur * this->ubFactorA_;
                this->iterAddr_[axis].stride = tiling_->shape[axis] - this->iterAddr_[axis].start;
@@ -365,8 +367,8 @@
public: for (int32_t i = 0; i < DIM; i++) { addrOffset += iterAddr_[i].start * tiling_->stride[i]; } - constexpr static auto burstLenAxis = DIM - 1; // 获取第一个循环轴 - view.addr = addrOffset; // 搬运地址 + constexpr static auto burstLenAxis = DIM - 1; // Get the first loop axis + view.addr = addrOffset; // Address to be moved view.burstLen = GetBurstLen(iterAddr_, tiling_); view.axisSize = 0; if constexpr (burstLenAxis > 0) { @@ -441,7 +443,7 @@ public: template __aicore__ inline void CopyIn(U& view, V& shape, const AscendC::LocalTensor& ubTensor) { - // 计算在UB的 View + // Calculate the View in UB op_->ReduceOp::template CopyInAux(this->input_[0], view, shape, ubTensor); } @@ -456,7 +458,7 @@ public: __aicore__ inline void ComputeMerge(V& shape, const AscendC::LocalTensor& ubTensorLeft, const AscendC::LocalTensor& ubTensorRight, Args... args) { - // Ub间Reduce + // Reduce between UB op_->compute_.ReduceBetweenUB(ubTensorLeft, ubTensorRight, shape.value[0] * shape.value[1]); op_->ReduceOp::template FreeTensorAux(ubTensorLeft); op_->ReduceOp::template ComputeAux(shape, ubTensorRight, args...); diff --git a/atvc/include/reduce/utils/reduce_block_aux_util.h b/atvc/include/reduce/utils/reduce_block_aux_util.h index bfffed420deef8fabd861d2daae34aca764a8325..52499e51dd7300e811c286b115bb84e9fc1f54fa 100644 --- a/atvc/include/reduce/utils/reduce_block_aux_util.h +++ b/atvc/include/reduce/utils/reduce_block_aux_util.h @@ -160,7 +160,7 @@ __aicore__ inline bool IsAxisA(int32_t idx) { template __aicore__ inline int32_t GetInnerA(T& iterAddr) { - // 从后往前遍历到A的最内切分轴 + // Traverse from back to front to the innermost dividing axis of A. int32_t startAxisA = 0; if constexpr (TailA) { startAxisA = Dim - 1;