diff --git a/atvc/examples/add/add.cpp b/atvc/examples/add/add.cpp index 4e0ef39cb028c7f3db06f6404c8c76022883daa5..04ea4a22766471183cb00a790918d83d7c443c16 100644 --- a/atvc/examples/add/add.cpp +++ b/atvc/examples/add/add.cpp @@ -30,7 +30,8 @@ static constexpr float REL_TOL = 1e-3f; static constexpr float ABS_TOL = 1e-5f; // 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) { +bool IsClose(float a, float b) +{ const float eps = 1e-40f; // 防止分母为零 float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); @@ -49,7 +50,7 @@ struct AddComputeFunc { b : 参与运算的输入 c : 参与运算的输出 */ - template + template // 重载operator,提供给算子模板类调用 __aicore__ inline void operator()(AscendC::LocalTensor a, AscendC::LocalTensor b, AscendC::LocalTensor c) { AscendC::Add(c, a, b, c.GetSize()); // 开发调用AscendC Api自行实现计算逻辑, 通过c.GetSize()获取单次计算的元素数量 @@ -67,7 +68,8 @@ template __global__ __aicore__ void AddCustom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR param) { KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); - auto op = ATVC::Kernel::EleWiseOpTemplate>(); // 将AddComputeFunc仿函数作为模板参数传入,实例化EleWiseOpTemplate模板类 + // 将AddComputeFunc仿函数作为模板参数传入,实例化EleWiseOpTemplate模板类 + auto op = ATVC::Kernel::EleWiseOpTemplate>(); op.Run(a, b, c, param); // 按照输入、输出、param的顺序传入Run函数,实现GM->GM的数据计算 } @@ -126,7 +128,9 @@ int main() auto elementParamSize = sizeof(param); CHECK_ACL(aclrtMalloc((void**)¶mDevice, elementParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, reinterpret_cast(¶m), elementParamSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, + reinterpret_cast(¶m), elementParamSize, + ACL_MEMCPY_HOST_TO_DEVICE)); uint32_t blockNum = param.tilingData.blockNum; // 调用核函数 AddCustom<<>>(xDevice, yDevice, zDevice, paramDevice); @@ -148,7 +152,9 @@ int main() for (int32_t i = 0; i < eleNum; i++) { if (!IsClose(golden[i], outputZ[i])) { - printf("Accuracy verification failed! The expected value of element in index [%d] is %f, but actual value is %f.\n", i, golden[i], outputZ[i]); + printf("Accuracy verification failed! 
The expected value of element " + "in index [%d] is %f, but actual value is %f.\n", + i, golden[i], outputZ[i]); return -1; } } diff --git a/atvc/examples/add_with_scalar/add_with_scalar.cpp b/atvc/examples/add_with_scalar/add_with_scalar.cpp index b0eeeedc89b8d7a2e1cdc4432976592857d228a0..c7aa38993b6f87bfae09e4f5928b62034857ec5c 100644 --- a/atvc/examples/add_with_scalar/add_with_scalar.cpp +++ b/atvc/examples/add_with_scalar/add_with_scalar.cpp @@ -30,7 +30,8 @@ static constexpr float REL_TOL = 1e-3f; static constexpr float ABS_TOL = 1e-5f; // 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) { +bool IsClose(float a, float b) +{ const float eps = 1e-40f; // 防止分母为零 float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); @@ -42,7 +43,7 @@ using OP_TRAITS = ATVC::OpTraits, ATVC::OpOutputs< template struct AddComputeFunc { template - __aicore__ inline void operator()(AscendC::LocalTensor a, AscendC::LocalTensor b, + __aicore__ inline void operator()(AscendC::LocalTensor a, AscendC::LocalTensor b, AscendC::LocalTensor c, AscendC::LocalTensor temp, bool conditionVal) { if (conditionVal) { AscendC::Add(temp, a, a, c.GetSize()); // temp = a + a @@ -66,8 +67,8 @@ template __global__ __aicore__ void AddCustom(GM_ADDR a, GM_ADDR b, GM_ADDR c, GM_ADDR param, bool conditionVal) { KERNEL_TASK_TYPE_DEFAULT(KERNEL_TYPE_AIV_ONLY); - auto op = ATVC::Kernel::EleWiseOpTemplate>(); //传入计算仿函数, 实例化算子 - op.Run(a, b, c, param, conditionVal); //调用Run函数, 执行算子 + auto op = ATVC::Kernel::EleWiseOpTemplate>(); // 传入计算仿函数, 实例化算子 + op.Run(a, b, c, param, conditionVal); // 调用Run函数, 执行算子 } int main() @@ -127,9 +128,12 @@ int main() auto elementParamSize = sizeof(param); CHECK_ACL(aclrtMalloc((void**)¶mDevice, elementParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, reinterpret_cast(¶m), elementParamSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, + reinterpret_cast(¶m), elementParamSize, + ACL_MEMCPY_HOST_TO_DEVICE)); - AddCustom<<>>(xDevice, yDevice, zDevice, paramDevice, conditionVal); + AddCustom<<>>( + xDevice, yDevice, zDevice, paramDevice, conditionVal); CHECK_ACL(aclrtSynchronizeStream(stream)); CHECK_ACL(aclrtMemcpy(zHost, outputByteSize, zDevice, outputByteSize, ACL_MEMCPY_DEVICE_TO_HOST)); @@ -148,7 +152,9 @@ int main() for (int32_t i = 0; i < eleNum; i++) { if (!IsClose(golden[i], outputZ[i])) { - printf("Accuracy verification failed! The expected value of element in index [%d] is %f, but actual value is %f.\n", i, golden[i], outputZ[i]); + printf("Accuracy verification failed! 
The expected value of element " + "in index [%d] is %f, but actual value is %f.\n", + i, golden[i], outputZ[i]); return -1; } } diff --git a/atvc/examples/reduce_sum/reduce_sum.cpp b/atvc/examples/reduce_sum/reduce_sum.cpp index 81b84274cf6f133635962637a7eaaac66adf8d33..6f19ab4964668cc9cd63744c46c2541b469fe20c 100644 --- a/atvc/examples/reduce_sum/reduce_sum.cpp +++ b/atvc/examples/reduce_sum/reduce_sum.cpp @@ -31,7 +31,8 @@ static constexpr float REL_TOL = 1e-3f; static constexpr float ABS_TOL = 1e-5f; // 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) { +bool IsClose(float a, float b) +{ const float eps = 1e-40f; // 防止分母为零 float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); @@ -67,11 +68,13 @@ void ReduceOpAdapter(uint8_t* x, uint8_t* y, ATVC::ReduceParam ¶m, ATVC::Red param.workspaceAddr = reinterpret_cast(workspaceDevice); auto reduceParamSize = sizeof(param); CHECK_ACL(aclrtMalloc((void**)¶mDevice, reduceParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, reduceParamSize, reinterpret_cast(¶m), reduceParamSize, ACL_MEMCPY_HOST_TO_DEVICE)); + CHECK_ACL(aclrtMemcpy(paramDevice, reduceParamSize, + reinterpret_cast(¶m), reduceParamSize, + ACL_MEMCPY_HOST_TO_DEVICE)); // 将tiling api计算出的ReducePolicy转化为编译态参数并实例化相应的核函数 if (policy == ATVC::REDUCE_POLICY0) { ReduceCustom<<>>(x, y, paramDevice); - }else if (policy == ATVC::REDUCE_POLICY1) { + } else if (policy == ATVC::REDUCE_POLICY1) { ReduceCustom<<>>(x, y, paramDevice); } else if (policy == ATVC::REDUCE_POLICY2) { ReduceCustom<<>>(x, y, paramDevice); @@ -177,8 +180,10 @@ int32_t main(int32_t argc, char* argv[]) for (int32_t i = 0; i < outEleNum; i++) { if (!IsClose(golden[i], outputY[i])) { - printf("Accuracy verification failed! The expected value of element in index [%d] is %f, but actual value is %f.\n", i, golden[i], outputY[i]); - return -1; + printf("Accuracy verification failed! 
The expected value of element " + "in index [%d] is %f, but actual value is %f.\n", + i, golden[i], outputY[i]); + return -1; } } printf("Accuracy verification passed.\n"); diff --git a/atvc/examples/sinh_custom/sinh_custom.cpp b/atvc/examples/sinh_custom/sinh_custom.cpp index 5b6a31ed054dfc3f4221bd69185f3c31228d1758..b339ccbbcc8e0d741ea84d195274fc8918521277 100644 --- a/atvc/examples/sinh_custom/sinh_custom.cpp +++ b/atvc/examples/sinh_custom/sinh_custom.cpp @@ -31,7 +31,8 @@ static constexpr float REL_TOL = 1e-3f; static constexpr float ABS_TOL = 1e-5f; // 判断两个浮点数是否足够接近 -bool IsClose(float a, float b) { +bool IsClose(float a, float b) +{ const float eps = 1e-40f; // 防止分母为零 float diff = std::abs(a - b); return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps)); @@ -48,14 +49,18 @@ struct SinhComputeFunc { // DataType模板参数,根据实际数据类型个数填写 template // 重载operator公有接口,提供给`ATVC::Kernel::EleWiseOpTemplate`调用 - __aicore__ inline void operator()(AscendC::LocalTensor x, AscendC::LocalTensor y, AscendC::LocalTensor tempBuffer1, AscendC::LocalTensor tempBuffer2) { + __aicore__ inline void operator()(AscendC::LocalTensor x, + AscendC::LocalTensor y, + AscendC::LocalTensor tempBuffer1, + AscendC::LocalTensor tempBuffer2) + { // 开发调用AscendC Api自行实现计算仿函数 uint32_t tiledCnt = y.GetSize(); // 进行单次基块计算的元素个数 AscendC::Muls(tempBuffer1, x, static_cast(-1), tiledCnt); // tempBuffer1 = -1 * x AscendC::Exp(tempBuffer1, tempBuffer1, tiledCnt); // tempbuffer1 = exp(-x) AscendC::Exp(tempBuffer2, x, tiledCnt); // tempbuffer2 = exp(x) AscendC::Sub(y, tempBuffer2, tempBuffer1, tiledCnt); // y = exp(x) - exp(-x) - AscendC::Muls(y, y, static_cast(0.5), tiledCnt); // y = (e^(x) - e^(-x)) / 2 + AscendC::Muls(y, y, static_cast(0.5), tiledCnt); // y = (e^(x) - e^(-x)) / 2 } }; @@ -123,8 +128,10 @@ int main() // 将tiling计算的运行时参数EleWiseParam param传到Device侧 auto elementParamSize = sizeof(param); CHECK_ACL(aclrtMalloc((void**)¶mDevice, elementParamSize, ACL_MEM_MALLOC_HUGE_FIRST)); - CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, reinterpret_cast(¶m), elementParamSize, ACL_MEMCPY_HOST_TO_DEVICE)); - + CHECK_ACL(aclrtMemcpy(paramDevice, elementParamSize, + reinterpret_cast(¶m), elementParamSize, + ACL_MEMCPY_HOST_TO_DEVICE)); + // 调用自定义的Kernel API, <<<>>>的BlockNum参数可通过param的TilingData获取 SinhCustom<<>>(xDevice, yDevice, paramDevice); @@ -144,8 +151,10 @@ int main() for (int32_t i = 0; i < eleNum; i++) { if (!IsClose(golden[i], outputY[i])) { - printf("Accuracy verification failed! The expected value of element in index [%d] is %f, but actual value is %f.\n", i, golden[i], outputY[i]); - return -1; + printf("Accuracy verification failed! 
The expected value of element " + "in index [%d] is %f, but actual value is %f.\n", + i, golden[i], outputY[i]); + return -1; } } printf("Accuracy verification passed.\n"); diff --git a/atvc/include/broadcast/broadcast_compute.h b/atvc/include/broadcast/broadcast_compute.h index 6d380b29c7f3449dc779a4289a24e01bd3b2945a..90388f328b0f7be2405d66ad5caf64705d7959df 100644 --- a/atvc/include/broadcast/broadcast_compute.h +++ b/atvc/include/broadcast/broadcast_compute.h @@ -23,8 +23,11 @@ public: using DataType = typename ATVC::TypeListGet::Type; template - __aicore__ inline void Compute(AscendC::LocalTensor &src, uint32_t inputOffset, - AscendC::LocalTensor &dst, uint32_t dimA, uint32_t dimB) + __aicore__ inline void Compute(AscendC::LocalTensor &src, + uint32_t inputOffset, + AscendC::LocalTensor &dst, + uint32_t dimA, + uint32_t dimB) { if (patternID == ATVC::AB_PATTERN::ABA) { ComputeBA(src, inputOffset, dst, dimA, dimB); diff --git a/atvc/include/broadcast/broadcast_host.h b/atvc/include/broadcast/broadcast_host.h index e6035a42950ce803ec0ca6cbfcf2c3a59f5801bd..80fba92814ae79e8be737276c88b7266f6d95ee2 100644 --- a/atvc/include/broadcast/broadcast_host.h +++ b/atvc/include/broadcast/broadcast_host.h @@ -49,7 +49,10 @@ void PrintParam(BroadcastPolicy* policy, BroadcastParam* param) } template -bool CalcBroadcastTiling(std::vector shapeIn, std::vector shapeOut, BroadcastPolicy* policy, BroadcastParam* param) +bool CalcBroadcastTiling(std::vector shapeIn, + std::vector shapeOut, + BroadcastPolicy* policy, + BroadcastParam* param) { if(policy == nullptr || param == nullptr) { printf("[ERROR] Invalid input: policy or param is null pointer!\n"); @@ -64,7 +67,7 @@ bool CalcBroadcastTiling(std::vector shapeIn, std::vector shap auto inputDtype = GetOriInputType(); BroadcastTilingInputParam opInput = {shapeIn, shapeOut, inputDtype}; OpTiling::BroadcastOpTiling tiling(opInput, policy, param); - if(!tiling.Run()) { + if (!tiling.Run()) { printf("[ERROR] Tiling Error\n"); return false; } diff --git a/atvc/include/broadcast/broadcast_op_template.h b/atvc/include/broadcast/broadcast_op_template.h index 8dff4f5d10cb14fd28c800911826a451febf30a7..f9d777f5199fd1677c95affa8db46631af470d38 100644 --- a/atvc/include/broadcast/broadcast_op_template.h +++ b/atvc/include/broadcast/broadcast_op_template.h @@ -15,8 +15,7 @@ #include "common/const_def.h" #include "broadcast/broadcast_utils/broadcast_buf_pool.h" namespace ATVC { -struct BroadcastDataView -{ +struct BroadcastDataView { uint32_t dimASize; uint32_t dimBSize; uint32_t inShape[ATVC::MAX_DIM]; @@ -35,7 +34,7 @@ template class BroadcastOpTemplate { public: using DataType = typename BroadcastCompute::DataType; - __aicore__ inline BroadcastOpTemplate(){} + __aicore__ inline BroadcastOpTemplate() {} /* BroadcastOpTemplate对外运行接口,主要完成资源初始化、数据搬入、计算调度、数据搬出操作 @param src: 输入数据的gm指针 diff --git a/atvc/include/broadcast/broadcast_utils/broadcast_buf_pool.h b/atvc/include/broadcast/broadcast_utils/broadcast_buf_pool.h index ae96b0a645440885e0e41cea316959fab91499b5..0888c084dc0f58f2f43a9152eaec66b212bd1a38 100644 --- a/atvc/include/broadcast/broadcast_utils/broadcast_buf_pool.h +++ b/atvc/include/broadcast/broadcast_utils/broadcast_buf_pool.h @@ -31,17 +31,6 @@ struct BrcPoolManagerUnit { class BroadcastBufPool { constexpr static int32_t MAX_INPUT_SIZE = 10; -private: - BrcPoolManagerUnit inputUnit_; - BrcPoolManagerUnit computeUnit_; - event_t vecEventId_[MAX_INPUT_SIZE]; - event_t outEventId_[MAX_INPUT_SIZE]; - bool isBusyOut_[MAX_INPUT_SIZE] = {false}; - AscendC::TBuf<> 
qQue_; - AscendC::TPipe* pipe_; - int32_t inputNum_; - int32_t outputNum_; - public: __aicore__ inline BroadcastBufPool(){}; @@ -139,12 +128,14 @@ public: return idx; } - __aicore__ inline const void ResetEvent() { + __aicore__ inline const void ResetEvent() + { pipe_->Reset(); } private: - __aicore__ inline int32_t GetComputeTensorId() { + __aicore__ inline int32_t GetComputeTensorId() + { uint32_t loopCnt = 0; do { computeUnit_.idx = (computeUnit_.idx + 1) % computeUnit_.bufNum; @@ -158,10 +149,21 @@ private: return computeUnit_.idx; } - __aicore__ inline int32_t GetInputTensorId() { + __aicore__ inline int32_t GetInputTensorId() + { inputUnit_.idx = (inputUnit_.idx + 1) % inputUnit_.bufNum; return inputUnit_.idx; } + + BrcPoolManagerUnit inputUnit_; + BrcPoolManagerUnit computeUnit_; + event_t vecEventId_[MAX_INPUT_SIZE]; + event_t outEventId_[MAX_INPUT_SIZE]; + bool isBusyOut_[MAX_INPUT_SIZE] = {false}; + AscendC::TBuf<> qQue_; + AscendC::TPipe* pipe_; + int32_t inputNum_; + int32_t outputNum_; }; } // namespace KernelUtils } // namespace ATVC diff --git a/atvc/include/broadcast/common/broadcast_common.h b/atvc/include/broadcast/common/broadcast_common.h index 56179401e6aafe16da3e39d6b99a366f94ffad2f..3f51b4e808a3188cedd287fc36df9d6a9b8e3d9e 100644 --- a/atvc/include/broadcast/common/broadcast_common.h +++ b/atvc/include/broadcast/common/broadcast_common.h @@ -28,11 +28,12 @@ namespace AB_PATTERN { }; struct BroadcastPolicy { - public: +public: int32_t patternID = -1; int32_t loopABCount = -1; int32_t loopInnerABCount = -1; - bool operator==(const BroadcastPolicy& rhs) const { + bool operator==(const BroadcastPolicy& rhs) const + { return this->patternID == rhs.patternID && this->loopABCount == rhs.loopABCount &&\ this->loopInnerABCount == rhs.loopInnerABCount; } diff --git a/atvc/include/broadcast/tiling/broadcast_tiling.h b/atvc/include/broadcast/tiling/broadcast_tiling.h index 5cc2a3c7e389fc0f36a8ddb13de696532e5f12c0..3e62c61b27006d3c2ae0f2c50951d351a4e5ba6f 100644 --- a/atvc/include/broadcast/tiling/broadcast_tiling.h +++ b/atvc/include/broadcast/tiling/broadcast_tiling.h @@ -41,7 +41,7 @@ class BroadcastOpTiling { public: BroadcastOpTiling(ATVC::BroadcastTilingInputParam& inputParam, ATVC::BroadcastPolicy* policy, ATVC::BroadcastParam* param) - : opInput_(inputParam), param_(param), policy_(policy) + : opInput_(inputParam), param_(param), policy_(policy) { compileInfo_ = ATVC::GetOpCompileInfo(); } @@ -137,8 +137,10 @@ private: return true; } - bool EliminateOne(std::vector& oriShapeIn, std::vector& oriShapeOut, std::vector& shapeIn, - std::vector& shapeOut) + bool EliminateOne(std::vector &oriShapeIn, + std::vector &oriShapeOut, + std::vector &shapeIn, + std::vector &shapeOut) { bool isCurB = false; bool haveA = false; diff --git a/atvc/include/common/atvc_opdef.h b/atvc/include/common/atvc_opdef.h index c7be99f893e7d0b78f5fe0977e2121e6c512da39..bac3dfc7ad99145e44e9f72bc70c6ed8e393b838 100644 --- a/atvc/include/common/atvc_opdef.h +++ b/atvc/include/common/atvc_opdef.h @@ -21,7 +21,7 @@ enum class ParamType { }; template -struct ParamTypes{ +struct ParamTypes { using types = ATVC::TypeList; static constexpr ParamType usage = paramType_; }; diff --git a/atvc/include/common/dtype_utils.h b/atvc/include/common/dtype_utils.h index 349e314d32bd09941ffc550e9854a3ed653204e4..c9e430557d5bf462c83563d7939767ef1aef3b79 100644 --- a/atvc/include/common/dtype_utils.h +++ b/atvc/include/common/dtype_utils.h @@ -14,7 +14,8 @@ #include "graph/types.h" namespace ATVC { template -inline 
ge::DataType GetOriInputType() { +inline ge::DataType GetOriInputType() +{ if constexpr (std::is_same::value) { return ge::DataType::DT_FLOAT; } else if constexpr (std::is_same::value) { @@ -32,8 +33,9 @@ inline ge::DataType GetOriInputType() { } }; -inline ge::DataType GetPromoteDataType(ge::DataType dtype) { - switch(dtype) { +inline ge::DataType GetPromoteDataType(ge::DataType dtype) +{ + switch (dtype) { case ge::DataType::DT_BF16: case ge::DataType::DT_FLOAT16: case ge::DataType::DT_FLOAT: diff --git a/atvc/include/common/kernel_utils.h b/atvc/include/common/kernel_utils.h index c4f31ad45df0950c06474ba4bd556c788acd12c0..810aeff7e30c7d97744ffd4d95cf14b69cbe16f9 100644 --- a/atvc/include/common/kernel_utils.h +++ b/atvc/include/common/kernel_utils.h @@ -19,13 +19,15 @@ namespace ATVC { __BLOCK_LOCAL__ static AscendC::TPipe g_pipe; template -__aicore__ inline void SetEvent(AscendC::HardEvent evt) { +__aicore__ inline void SetEvent(AscendC::HardEvent evt) +{ event_t eventId = static_cast(GetTPipePtr()->FetchEventID(evt)); AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); } template -__aicore__ inline void SyncDataQueue() { +__aicore__ inline void SyncDataQueue() +{ event_t eventId = static_cast(GetTPipePtr()->FetchEventID(EVENT)); AscendC::SetFlag(eventId); AscendC::WaitFlag(eventId); @@ -76,7 +78,8 @@ struct GetPromoteType { using T = float; }; -__aicore__ inline int64_t FindNearestPower2(const int64_t value) { +__aicore__ inline int64_t FindNearestPower2(const int64_t value) +{ if (value == 0) { return 0; } else if (value <= CONST2) { @@ -90,7 +93,8 @@ __aicore__ inline int64_t FindNearestPower2(const int64_t value) { } } -__aicore__ inline int64_t CalLog2(int64_t value) { +__aicore__ inline int64_t CalLog2(int64_t value) +{ int64_t res = 0; while (value > 1) { value = value >> 1; diff --git a/atvc/include/common/ops_utils.h b/atvc/include/common/ops_utils.h index b5239f7f1d059e4a35a548d87ec36a68a84069d4..efb33063196c3d9d098a42b8abd82e62dc103b9a 100644 --- a/atvc/include/common/ops_utils.h +++ b/atvc/include/common/ops_utils.h @@ -63,7 +63,7 @@ __host_aicore__ inline T Aligned(T value, T alignment) */ template __host_aicore__ inline typename std::enable_if ::value, T>::type FloorAlign(T x, U align) { - return align == 0 ? 0 : x / align * align; + return align == 0 ? 
0 : x / align * align; } } diff --git a/atvc/include/common/platform.h b/atvc/include/common/platform.h index af02863fcdae831085706f53251c2a77c0173e4a..ad27408858cfbde869e122611c75be751a93a874 100644 --- a/atvc/include/common/platform.h +++ b/atvc/include/common/platform.h @@ -18,7 +18,7 @@ #include "kernel_operator.h" #include "kernel_tiling/kernel_tiling.h" -namespace ATVC{ +namespace ATVC { namespace Platform { /** * Get the block size of unified buffer in bytes diff --git a/atvc/include/common/tensor_info.h b/atvc/include/common/tensor_info.h index 5a7a84058f47b7eeace90aca1ff20f52dbd6fd94..c2338b91d64df776d2353964e7813f14fcf6b0e6 100644 --- a/atvc/include/common/tensor_info.h +++ b/atvc/include/common/tensor_info.h @@ -26,7 +26,7 @@ namespace ATVC { // 记录一个LocalTensor在In/Out/Local中的偏移位置,以及自己的类型 template struct TensorInfo { - __aicore__ inline TensorInfo(){} + __aicore__ inline TensorInfo() {} using Dtype = T; AscendC::GlobalTensor gmTensor; int32_t local_offset; diff --git a/atvc/include/elewise/elewise_host.h b/atvc/include/elewise/elewise_host.h index fbd676d8fef0b5d6c0446e6473f760c1ffeb868f..cffb7f99ea4a7dd4f4fd9b04bea721ed6359c67b 100644 --- a/atvc/include/elewise/elewise_host.h +++ b/atvc/include/elewise/elewise_host.h @@ -33,7 +33,8 @@ struct EleWiseTilingHyperParam { int32_t nBufferNum = 2; // 每个Queue中的Tensor数量 }; -int32_t GetEleWiseBasicCnt(const EleWiseTilingHyperParam &hiperParam, int32_t totalCnt, uint32_t blockNum, uint32_t ubufLimitCnt) +int32_t GetEleWiseBasicCnt(const EleWiseTilingHyperParam &hiperParam, + int32_t totalCnt, uint32_t blockNum, uint32_t ubufLimitCnt) { uint32_t basicCnt = hiperParam.basicCntBase; // 基本块初始值 if (blockNum == 0) { @@ -50,7 +51,7 @@ int32_t GetEleWiseBasicCnt(const EleWiseTilingHyperParam &hiperParam, int32_t to basicCnt = basicCnt / hiperParam.nBufferNum; } } - if(basicCnt > ubufLimitCnt) { + if (basicCnt > ubufLimitCnt) { basicCnt = ubufLimitCnt / basicCntMin * basicCntMin; } return basicCnt; @@ -94,7 +95,8 @@ bool CalcEleWiseTiling(int32_t totalCnt, ATVC::EleWiseParam ¶m) if (blockNum > compileInfo.vectorCoreNum) { blockNum = compileInfo.vectorCoreNum; } - uint32_t ubufLimitCnt = ubSize / ((inTensorSumBytes + outTensorSumBytes) * hiperParam.nBufferNum + tempTensorSumBytes) / TILE_CONTROL; + uint32_t ubufLimitCnt = ubSize / ((inTensorSumBytes + outTensorSumBytes) * hiperParam.nBufferNum + + tempTensorSumBytes) / TILE_CONTROL; if (tempTensorSumBytes == 0) { // 未声明tempbuffer时,预留 1 / (bufferNum + 1)的空间给AscendC高阶API内部临时空间使用 ubufLimitCnt = ubSize / ((inTensorSumBytes + outTensorSumBytes) * (hiperParam.nBufferNum + 1)); diff --git a/atvc/include/elewise/elewise_op_template.h b/atvc/include/elewise/elewise_op_template.h index 451f121305eb901dc9f11ac3e554150090038d9b..ff1d05604181a42f1c725c740e84fd9f0eafeec9 100644 --- a/atvc/include/elewise/elewise_op_template.h +++ b/atvc/include/elewise/elewise_op_template.h @@ -39,7 +39,7 @@ class EleWiseOpTemplate { static constexpr size_t tempTensorSumBytes = ATVC::TypeListReduce, SumSizes>::Type::value; public: - __aicore__ inline EleWiseOpTemplate(){} + __aicore__ inline EleWiseOpTemplate() {} template __aicore__ inline void Run(Args&&... 
args) { g_pipe.Reset(); @@ -69,10 +69,12 @@ private: if (curBlockId < param_->tilingData.tailBlockCnt) { this->curCoreCnt_ = (param_->tilingData.numPerBlock + 1) * param_->tilingData.tiledCnt; - this->curCoreStartCnt_ = (param_->tilingData.numPerBlock + 1) * curBlockId * param_->tilingData.tiledCnt; + this->curCoreStartCnt_ = (param_->tilingData.numPerBlock + 1) * + curBlockId * param_->tilingData.tiledCnt; } else { this->curCoreCnt_ = param_->tilingData.numPerBlock * param_->tilingData.tiledCnt; - this->curCoreStartCnt_ = (curBlockId * param_->tilingData.numPerBlock + param_->tilingData.tailBlockCnt) * param_->tilingData.tiledCnt; + this->curCoreStartCnt_ = (curBlockId * param_->tilingData.numPerBlock + + param_->tilingData.tailBlockCnt) * param_->tilingData.tiledCnt; } if (curBlockId + 1 == param_->tilingData.blockNum) { this->curCoreCnt_ += param_->tilingData.tailElemCnt; @@ -88,7 +90,8 @@ private: private: // 申请LocalTensor等资源,初始化本核计算的GlobalTensor - __aicore__ inline void Init() { + __aicore__ inline void Init() + { // in/out/temp各自使用一个pipe进行管理,每个pipe里面管理的是ub地址连续的多个tensor if constexpr (InputCount > 0) { g_pipe.InitBuffer(inQueue, param_->nBufferNum, param_->tilingData.tiledCnt * inTensorSumBytes); @@ -133,10 +136,10 @@ private: { caclCnt_ = tailCnt; CopyIn(inTensors, ATVC::MakeIndexSequence{}); - Compute(inTensors, outTensors, tempTensors, - ATVC::MakeIndexSequence{}, - ATVC::MakeIndexSequence{}, - ATVC::MakeIndexSequence{}, + Compute(inTensors, outTensors, tempTensors, + ATVC::MakeIndexSequence{}, + ATVC::MakeIndexSequence{}, + ATVC::MakeIndexSequence{}, ATVC::Forward(args)...); CopyOut(outTensors, ATVC::MakeIndexSequence{}); } @@ -154,7 +157,7 @@ private: if (alignMainCnt > 0) { AscendC::DataCopy(inLocal_i, tensorInfo.gmTensor[curCoreStartCnt_ + offsetCnt_], alignMainCnt); } - if(alignTailCnt > 0) { + if (alignTailCnt > 0) { struct AscendC::DataCopyExtParams repeatParams = {1, (uint16_t)(alignTailCnt*sizeof(DataType)), 0, 0, 0}; AscendC::DataCopyPadExtParams padParams; AscendC::DataCopyPad(inLocal_i[alignMainCnt], @@ -169,7 +172,7 @@ private: // 所有 Tensor 的处理入口逻辑:递归完成对每个 Tensor 的处理 template - __aicore__ inline void CopyInAllTensors(AscendC::LocalTensor& inLocal, int32_t i , T& first, Ts&... rest) { + __aicore__ inline void CopyInAllTensors(AscendC::LocalTensor& inLocal, int32_t i, T& first, Ts&... rest) { CopyInAllTensors(inLocal, i, first); CopyInAllTensors(inLocal, ++i, rest...); } @@ -200,9 +203,9 @@ private: if (alignMainCnt > 0) { AscendC::DataCopy(tensorInfo.gmTensor[curCoreStartCnt_ + offsetCnt_], outLocal_i, alignMainCnt); } - if(alignTailCnt > 0) { + if (alignTailCnt > 0) { struct AscendC::DataCopyParams repeatParams = {1, (uint16_t)(alignTailCnt*sizeof(DataType)), 0, 0}; - AscendC::DataCopyPad( tensorInfo.gmTensor[curCoreStartCnt_ + offsetCnt_ + alignMainCnt], + AscendC::DataCopyPad(tensorInfo.gmTensor[curCoreStartCnt_ + offsetCnt_ + alignMainCnt], outLocal_i[alignMainCnt], repeatParams); } } @@ -212,10 +215,13 @@ private: } // 所有 Tensor 的处理入口逻辑:递归完成对每个 Tensor 的处理 - template - __aicore__ inline void CopyOutAllTensors(AscendC::LocalTensor& outLocal, int32_t i, T& first, Ts&... 
rest) { - CopyOutAllTensors(outLocal, i, first); - CopyOutAllTensors(outLocal, ++i, rest...); + template + __aicore__ inline void + CopyOutAllTensors(AscendC::LocalTensor &outLocal, int32_t i, + T &first, Ts &...rest) + { + CopyOutAllTensors(outLocal, i, first); + CopyOutAllTensors(outLocal, ++i, rest...); } // 将所有输出tensor拷贝到gm template @@ -236,9 +242,9 @@ __aicore__ inline void Compute(InTuple& inTensors, OutTuple& outTensors, TmpTupl if constexpr(TempCount > 0) { tempLocal = tempQueue.Get(); } - compute_(TupleElemGetLocalTensor(inLocal, inTensors, this->caclCnt_)..., - TupleElemGetLocalTensor(outLocal, outTensors, this->caclCnt_)..., - TupleElemGetLocalTensor(tempLocal, tempTensors, this->caclCnt_)..., + compute_(TupleElemGetLocalTensor(inLocal, inTensors, this->caclCnt_)..., + TupleElemGetLocalTensor(outLocal, outTensors, this->caclCnt_)..., + TupleElemGetLocalTensor(tempLocal, tempTensors, this->caclCnt_)..., ATVC::Forward(args)...); inQueue.FreeTensor(inLocal); outQueue.template EnQue(outLocal); @@ -286,7 +292,8 @@ private: private: // 填充 addr 到数组 - __aicore__ inline void FillAddrs(GM_ADDR argsArr[]) { + __aicore__ inline void FillAddrs(GM_ADDR argsArr[]) + { for (std::size_t i = 0; i < InputCount; ++i) { inGMAddrs_[i].SetGlobalBuffer(argsArr[i]); } @@ -304,7 +311,7 @@ private: template __aicore__ inline constexpr void FillOffsets(std::size_t* offsets) { constexpr std::size_t count = ATVC::TypeListSize::value; - FillOffsetsImpl(offsets, ATVC::MakeIndexSequence{}); + FillOffsetsImpl(offsets, ATVC::MakeIndexSequence{}); } private: diff --git a/atvc/include/reduce/common/patterns.h b/atvc/include/reduce/common/patterns.h index b88b115a172366e11098e84d55907415398260e7..364100b7aec41b35ced061e9754d407dd75d11eb 100644 --- a/atvc/include/reduce/common/patterns.h +++ b/atvc/include/reduce/common/patterns.h @@ -14,8 +14,7 @@ #include "common/const_def.h" namespace ATVC { -namespace ReducePattern -{ +namespace ReducePattern { constexpr int32_t MAX_LOOP_DIM = (MAX_DIM + 1) / 2; constexpr int32_t PATTERN_A = 10; constexpr int32_t PATTERN_RA = 0; diff --git a/atvc/include/reduce/common/reduce_common.h b/atvc/include/reduce/common/reduce_common.h index 38de9f526767db0e1d99505ca34d6335516c9d2e..fca80f3fe2d28dc5a2dbdf9fc8877f3610c19758 100644 --- a/atvc/include/reduce/common/reduce_common.h +++ b/atvc/include/reduce/common/reduce_common.h @@ -46,7 +46,7 @@ namespace BASIC_CNT { static constexpr uint32_t BASIC_4096 = 4096; }; -namespace AR_COUNT{ +namespace AR_COUNT { static constexpr uint32_t A0R1 = 1; static constexpr uint32_t A0R2 = 2; static constexpr uint32_t A1R0 = 10; @@ -77,11 +77,12 @@ namespace AR_COUNT{ }; struct ReducePolicy { - public: +public: int32_t patternID = -1; int32_t loopARCount = -1; int32_t loopInnerARCount = -1; - bool operator==(const ReducePolicy& rhs) const { + bool operator==(const ReducePolicy& rhs) const + { return this->patternID == rhs.patternID && this->loopARCount == rhs.loopARCount &&\ this->loopInnerARCount == rhs.loopInnerARCount; } diff --git a/atvc/include/reduce/reduce_host.h b/atvc/include/reduce/reduce_host.h index fe99aca7dfe5349036a650edcf9370cfa36c9a9c..479a98d9aaea6e8f461b323acbf8cc4edd59754e 100644 --- a/atvc/include/reduce/reduce_host.h +++ b/atvc/include/reduce/reduce_host.h @@ -45,9 +45,11 @@ void PrintParam(ReducePolicy* policy, ReduceParam* param) * @return bool 返回true表示计算成功,false表示失败。 */ template -bool CalcReduceTiling(std::vector inputShape, std::vector reduceDim, ReducePolicy* policy, ReduceParam* param) +bool CalcReduceTiling(std::vector 
inputShape, + std::vector reduceDim, ReducePolicy *policy, + ReduceParam *param) { - if(policy == nullptr || param == nullptr) { + if (policy == nullptr || param == nullptr) { printf("[ERROR] Invalid input: policy or param is null pointer!\n"); return false; } @@ -56,7 +58,8 @@ bool CalcReduceTiling(std::vector inputShape, std::vector redu int32_t nBufferNum = 2; // 每个Queue中的Tensor数量 }; using inputDTypeList = typename OpTraits::In::types; - static constexpr size_t inTensorSumBytes = ATVC::TypeListReduce, SumSizes>::Type::value; + static constexpr size_t inTensorSumBytes = + ATVC::TypeListReduce, SumSizes>::Type::value; if (inTensorSumBytes == 0) { printf("[ERROR] Tiling Error: OpTraits Input cannot be null!\n"); return false; @@ -76,7 +79,7 @@ bool CalcReduceTiling(std::vector inputShape, std::vector redu } ReduceTilingInputParam opInput = {reduceDim, inputShape, inputDtype, GetPromoteDataType(inputDtype)}; OpTiling::ReduceOpTiling tiling(opInput, policy, param); - if(tiling.Run() != 0) { + if (tiling.Run() != 0) { printf("[ERROR] Tiling Error\n"); return false; } diff --git a/atvc/include/reduce/reduce_utils/reduce_block_aux_util.h b/atvc/include/reduce/reduce_utils/reduce_block_aux_util.h index 9e723d1a33878e781ea568579aba94940f5c82ca..612b1d0e3a5eccb8a935f6232715eec6f843154f 100644 --- a/atvc/include/reduce/reduce_utils/reduce_block_aux_util.h +++ b/atvc/include/reduce/reduce_utils/reduce_block_aux_util.h @@ -27,7 +27,8 @@ namespace ATVC { namespace KernelUtils { namespace Reduce { constexpr uint16_t MAX_OFFSET = 16; -__aicore__ inline int16_t MainR(int64_t dimR, bool isAR) { +__aicore__ inline int16_t MainR(int64_t dimR, bool isAR) +{ if (isAR && dimR < UB_ALIGN_32) { return 0; } @@ -41,7 +42,8 @@ __aicore__ inline int16_t MainR(int64_t dimR, bool isAR) { return static_cast(mainR); } -__aicore__ inline int64_t GetCacheID(const int64_t idx) { +__aicore__ inline int64_t GetCacheID(const int64_t idx) +{ return bcnt1(idx ^ (idx + CONST1)) - CONST1; } @@ -273,7 +275,8 @@ __aicore__ inline constexpr ReduceSchLoopInfo GetSchLoopInfo5() { return schInfo; } -__aicore__ inline constexpr ReduceSchLoopInfo GetGroupSchLoopInfo() { +__aicore__ inline constexpr ReduceSchLoopInfo GetGroupSchLoopInfo() +{ constexpr ReduceSchLoopInfo schInfo = {.patternID = ReducePattern::PATTERN_RA, .reduceDichotomy = CONST1, .loopACount = CONST1, diff --git a/atvc/include/reduce/reduce_utils/reduce_buf_pool.h b/atvc/include/reduce/reduce_utils/reduce_buf_pool.h index f84f8741c029a228bc551fdd4ba48a8b1f4a5cb7..01c63adaaa0391efa278c47d1dfb3d10790cdb5f 100644 --- a/atvc/include/reduce/reduce_utils/reduce_buf_pool.h +++ b/atvc/include/reduce/reduce_utils/reduce_buf_pool.h @@ -33,15 +33,6 @@ struct PoolManagerUnit { class ReduceBufPool { constexpr static int32_t MAX_INPUT_SIZE = 10; -private: - bool memo_[MAX_INPUT_SIZE] = {0}; - PoolManagerUnit inputUnit_; - PoolManagerUnit computeUnit_; - event_t eventIdV2Mte2_[MAX_INPUT_SIZE]; - AscendC::TBuf<> qQue_; - AscendC::TPipe* pipe_; - int32_t basicNum_; - public: __aicore__ inline ReduceBufPool(){}; @@ -64,7 +55,8 @@ public: AscendC::Duplicate(inputUb, 0, basicNum_ * inputNum); } - __aicore__ inline void ResetEvent() { + __aicore__ inline void ResetEvent() + { inputUnit_.idx = -1; computeUnit_.idx = -1; } @@ -75,7 +67,8 @@ public: computeUnit_.offset = basicNum_ * sizeof(T) * inputNum; } - __aicore__ inline void ResetComputeSize(int32_t computeNum) { + __aicore__ inline void ResetComputeSize(int32_t computeNum) + { computeUnit_.bufNum = computeNum; } @@ -113,15 +106,25 @@ 
public: } private: - __aicore__ inline int32_t GetComputeTensorId() { + __aicore__ inline int32_t GetComputeTensorId() + { computeUnit_.idx = (computeUnit_.idx + 1) % computeUnit_.bufNum; return computeUnit_.idx; } - __aicore__ inline int32_t GetInputTensorId() { + __aicore__ inline int32_t GetInputTensorId() + { inputUnit_.idx = (inputUnit_.idx + 1) % inputUnit_.bufNum; return inputUnit_.idx; } + + bool memo_[MAX_INPUT_SIZE] = {0}; + PoolManagerUnit inputUnit_; + PoolManagerUnit computeUnit_; + event_t eventIdV2Mte2_[MAX_INPUT_SIZE]; + AscendC::TBuf<> qQue_; + AscendC::TPipe* pipe_; + int32_t basicNum_; }; // class ReduceBufPool } // namespace KernelUtils } // namespace ATVC diff --git a/atvc/include/reduce/tiling/reduce_tiling.h b/atvc/include/reduce/tiling/reduce_tiling.h index d35bf19855c01f020a09ce79bad76fc175fb90e7..f649485c8e7db518c5383d53222893b16a653bcf 100644 --- a/atvc/include/reduce/tiling/reduce_tiling.h +++ b/atvc/include/reduce/tiling/reduce_tiling.h @@ -69,12 +69,12 @@ static void MakeWrapDim(const std::vector& shape, std::vector& } } -class ReduceOpTiling -{ +class ReduceOpTiling { public: ReduceOpTiling(ReduceTilingInputParam& inputParam, ATVC::ReducePolicy* policy, ATVC::ReduceParam* param) - : param_(param), policy_(policy), opInput_(inputParam) { + : param_(param), policy_(policy), opInput_(inputParam) + { compileInfo_ = ATVC::GetOpCompileInfo(); }; @@ -438,7 +438,8 @@ void ComputeSplit(std::vector& shape) param_->tilingData.basicBlock = basicBlock_; param_->tilingData.coreNum = static_cast(compileInfo_.vectorCoreNum); ComputeStride(shape); - uint32_t realCore = OpsUtils::CeilDiv(unitA_.outer, factorACntPerCore) * OpsUtils::CeilDiv(unitR_.outer, factorRCntPerCore); + uint32_t realCore = OpsUtils::CeilDiv(unitA_.outer, factorACntPerCore) * + OpsUtils::CeilDiv(unitR_.outer, factorRCntPerCore); param_->tilingData.coreNum = realCore; } template @@ -481,23 +482,16 @@ bool CalcCacheLineStep(const std::vector& shape) cBlock_.axis = i; } } - for (int32_t i = Pattern::TailA ? Pattern::Dim - 1 : Pattern::Dim - ATVC::CONST2; i > cBlock_.axis; i -= ATVC::CONST2) { - if (i == Pattern::Dim - 1) { - aInCacheLine *= OpsUtils::CeilAlign(shape[i], ubBlockSize); - } else { - aInCacheLine *= shape[i]; - } + const int32_t startIndex = Pattern::TailA ? Pattern::Dim - 1 : Pattern::Dim - ATVC::CONST2; + for (int32_t i = startIndex; i > cBlock_.axis; i -= ATVC::CONST2) { + aInCacheLine *= (i == Pattern::Dim - 1) ? OpsUtils::CeilAlign(shape[i], ubBlockSize) : shape[i]; } bool basicSplitA = (cBlock_.axis + (Pattern::FirstA ? 1 : 0)) % ATVC::CONST2; if (basicSplitA) { aInCacheLine *= cacheLineStep; } for (int32_t i = Pattern::Dim - 1; i > cBlock_.axis; --i) { - if (i == Pattern::Dim - 1) { - cBlock_.size = cBlock_.size * OpsUtils::CeilAlign(shape[i], ubBlockSize); - } else { - cBlock_.size *= shape[i]; - } + cBlock_.size *= (i == Pattern::Dim - 1) ? OpsUtils::CeilAlign(shape[i], ubBlockSize) : shape[i]; } cBlock_.size *= cacheLineStep; cBlock_.cacheLineStep = cacheLineStep; @@ -513,8 +507,9 @@ void ComputeUnitA(const std::vector& shape) uint64_t outerA = unitA_.outer; uint64_t innerA = unitA_.inner; uint64_t maxCacheA = MAX_INNER_A * sizeof(float) / ge::GetSizeByDataType(opInput_.promoteDtpye); - uint64_t maxInnerA = - Pattern::ID == ATVC::ReducePattern::PATTERN_A ? 
basicBlock_ / ge::GetSizeByDataType(opInput_.inputDtype) : maxCacheA; + const bool isPatternA = (Pattern::ID == ATVC::ReducePattern::PATTERN_A); + const uint64_t patternABlockSize = basicBlock_ / ge::GetSizeByDataType(opInput_.inputDtype); + uint64_t maxInnerA = isPatternA ? patternABlockSize : maxCacheA; uint64_t stepLen = Pattern::ID == ATVC::ReducePattern::PATTERN_A ? A_STEP_LEN : 1; // 纯A的步长为4, 减少循环次数 bool basicSplitA = (axisInCacheLine + (Pattern::FirstA ? 1 : 0)) % ATVC::CONST2; uint64_t bBlockNum = basicBlock_ / ge::GetSizeByDataType(opInput_.inputDtype);
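The reduce and broadcast tiling changes above lean on small integer-splitting helpers: ops_utils.h defines FloorAlign (round x down to a multiple of align, returning 0 when align is 0), and reduce_tiling.h distributes work across cores via OpsUtils::CeilDiv, e.g. realCore = CeilDiv(unitA_.outer, factorACntPerCore) * CeilDiv(unitR_.outer, factorRCntPerCore). The sketch below is not part of the patch; it restates that arithmetic as plain host C++ so it can be checked in isolation. CeilDiv's body is an assumed conventional ceiling division (the diff only shows its call sites), and the shape numbers are hypothetical.

// Host-side sketch of the alignment/splitting arithmetic used by the tiling code.
// FloorAlign mirrors atvc/include/common/ops_utils.h; CeilDiv is an assumed
// conventional ceiling division; the sample values are made up for illustration.
#include <cstdint>
#include <cstdio>

template <typename T>
inline T FloorAlign(T x, T align)
{
    return align == 0 ? 0 : x / align * align;  // round down to a multiple of align
}

template <typename T>
inline T CeilDiv(T x, T y)
{
    return y == 0 ? 0 : (x + y - 1) / y;  // round up; zero guard mirrors FloorAlign's style
}

int main()
{
    // e.g. 100 outer A iterations at 7 per core -> 15 cores; 33 outer R at 4 per core -> 9 cores
    uint32_t outerA = 100, factorACntPerCore = 7;
    uint32_t outerR = 33, factorRCntPerCore = 4;
    uint32_t realCore = CeilDiv(outerA, factorACntPerCore) * CeilDiv(outerR, factorRCntPerCore);
    printf("realCore = %u, 100 floor-aligned to 32 = %u\n", realCore, FloorAlign(100u, 32u));
    return 0;
}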
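All four example files touched by this patch (add.cpp, add_with_scalar.cpp, reduce_sum.cpp, sinh_custom.cpp) share the same host-side accuracy check: IsClose() accepts a result when it is within the absolute tolerance ABS_TOL or within the relative tolerance REL_TOL of the larger magnitude, with a tiny eps guarding the relative term against a zero denominator, and the verification loop reports the first mismatching index. The standalone sketch below is not part of the patch; the tolerances and the IsClose body are copied from the examples, while the sample vectors (the last element deliberately violates both tolerances so the failure branch is exercised) and the size_t/%zu indexing are stand-ins for illustration.

// Standalone restatement of the accuracy-verification pattern shared by the examples.
#include <algorithm>
#include <cmath>
#include <cstdio>
#include <vector>

static constexpr float REL_TOL = 1e-3f;
static constexpr float ABS_TOL = 1e-5f;

// True when |a - b| is within the absolute tolerance, or within the relative
// tolerance of the larger magnitude; eps keeps the relative bound non-zero.
static bool IsClose(float a, float b)
{
    const float eps = 1e-40f;
    float diff = std::abs(a - b);
    return (diff <= ABS_TOL) || (diff <= REL_TOL * std::max(std::abs(a), std::abs(b) + eps));
}

int main()
{
    std::vector<float> golden = {1.0f, 2.0f, 3.0f};    // expected values
    std::vector<float> output = {1.0f, 2.0001f, 3.1f}; // values to verify; 3.1f is out of tolerance
    for (size_t i = 0; i < golden.size(); i++) {
        if (!IsClose(golden[i], output[i])) {
            printf("Accuracy verification failed! The expected value of element "
                   "in index [%zu] is %f, but actual value is %f.\n",
                   i, golden[i], output[i]);
            return -1;
        }
    }
    printf("Accuracy verification passed.\n");
    return 0;
}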