diff --git a/impl/matmul/matmul_impl.h b/impl/matmul/matmul_impl.h index 7602832b7c0ec5c50e84bee1a53008e2d364145d..8372805ac6f926422b2fe9a851c3ed0338d0233f 100644 --- a/impl/matmul/matmul_impl.h +++ b/impl/matmul/matmul_impl.h @@ -654,10 +654,6 @@ template ::CheckTiling() { #ifdef ASCENDC_CPU_DEBUG - ASCENDC_ASSERT((var.tiling_.GetUsedCoreNum() > 0), { - KERNEL_LOG(KERNEL_ERROR, "var.tiling_.GetUsedCoreNum() is %d , which should be larger than 0", - var.tiling_.GetUsedCoreNum()); - }); ASCENDC_ASSERT((M_ > 0), { KERNEL_LOG(KERNEL_ERROR, "M_ is %d , which should be larger than 0", M_); }); ASCENDC_ASSERT((N_ > 0), { KERNEL_LOG(KERNEL_ERROR, "N_ is %d , which should be larger than 0", N_); }); ASCENDC_ASSERT((Ka_ > 0), { KERNEL_LOG(KERNEL_ERROR, "Ka_ is %d , which should be larger than 0", Ka_); }); diff --git a/impl/matmul/matmul_server.h b/impl/matmul/matmul_server.h index 3432d06a528ac90d7a73cfb342715cd9739a2109..a31a2d0166238b4b4479f513a33ca5ab6577b428 100644 --- a/impl/matmul/matmul_server.h +++ b/impl/matmul/matmul_server.h @@ -1,3 +1,4 @@ +<<<<<<< HEAD /** * Copyright (c) 2024 Huawei Technologies Co., Ltd. * This file is a part of the CANN Open Software. @@ -1062,4 +1063,1051 @@ public: __aicore__ inline MatmulServiceAux() {} }; } // namespace matmul +======= +/** + * Copyright (c) 2024 Huawei Technologies Co., Ltd. + * This file is a part of the CANN Open Software. + * Licensed under CANN Open Software License Agreement Version 1.0 (the "License"). + * Please refer to the License for details. You may not use this file except in compliance with the License. + * THIS SOFTWARE IS PROVIDED ON AN "AS IS" BASIS, WITHOUT WARRANTIES OF ANY KIND, EITHER EXPRESS OR IMPLIED, + * INCLUDING BUT NOT LIMITED TO NON-INFRINGEMENT, MERCHANTABILITY, OR FITNESS FOR A PARTICULAR PURPOSE. + * See LICENSE in the root of the software repository for the full text of the License. + */ + +/*! 
+ * \file matmul_server.h + * \brief + */ +#ifndef LIB_MATMUL_MATMUL_SERVER_H +#define LIB_MATMUL_MATMUL_SERVER_H + +#include "../../lib/matmul/matmul.h" +#include "kernel_operator.h" + +namespace matmul { +constexpr uint16_t WORKSPACE_SYNC_ID = 15; +using namespace AscendC; +template struct IBShareCache { + __aicore__ inline IBShareCache() {}; +}; + +template <> +struct IBShareCache { + __aicore__ inline IBShareCache() {}; + using ShareCache = uint16_t; +}; + +template <> +struct IBShareCache { + __aicore__ inline IBShareCache() {}; + using ShareCache = GlobalCache; +}; +template __aicore__ constexpr bool IsIBShare() +{ + if (A_TYPE::ibShare == true) { + return true; + } + if (B_TYPE::ibShare == true) { + return true; + } + return false; +} + +struct MatmulMsg { + uint32_t setOrgShape : 1; + uint32_t orgM; + uint32_t orgN; + uint32_t orgKa; + uint32_t orgKb; + uint32_t orgKc; +}; + +struct ShareMatmulBase { + __aicore__ inline ShareMatmulBase() {}; +}; + +struct ShareMatmul : ShareMatmulBase { + __aicore__ inline ShareMatmul(){}; + MatmulMsg msg0; + MatmulMsg msg1; +}; + +template +struct ShareMatmulAux { + __aicore__ inline ShareMatmulAux(){}; +}; + +template <> +struct ShareMatmulAux { + __aicore__ inline ShareMatmulAux(){}; + using MSG = ShareMatmulBase; +}; + +template <> +struct ShareMatmulAux { + __aicore__ inline ShareMatmulAux(){}; + using MSG = ShareMatmul; +}; + +__aicore__ inline void clearWorkspace(__gm__ uint8_t* workspace) +{ + SetAtomicNone(); +#if __CCE_AICORE__ == 220 + if ASCEND_IS_AIC { + SetMaskNorm(); + SetLoadDataBoundary((uint64_t)0); + SetLoadDataPaddingValue((uint64_t)0); + } else { + AscendCUtils::SetMask((uint64_t)-1, (uint64_t)-1); + SetMaskNorm(); + } +#endif + +#ifdef __DAV_C220_CUBE__ + ClearWorkspaceImpl(workspace); + NotifyEvent(WORKSPACE_SYNC_ID); +#endif +} + +template , MATMUL_POLICY_DEFAULT_OF(MatmulPolicy)> +class MatmulService { + using SrcAT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using SrcT = typename A_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + +public: + __aicore__ inline MatmulService() {} + __aicore__ inline void InitKfc(TPipe* tpipe, void* tiling, KfcCommServer* kfc, int32_t instID, GM_ADDR workspace) + { + ASSERT(tpipe != nullptr && "tpipe cannot be nullptr when init kfc matmul server"); + ASSERT(kfc != nullptr && "kfc cannot be nullptr when init kfc matmul server"); + ASSERT(workspace != nullptr && "workspace cannot be nullptr when init kfc matmul server"); + ASSERT(instID >= 0 && "instID should be not less than 0 when init kfc matmul server"); + this->instID = instID; + this->kfcCommSrv = kfc; + this->tpipe = tpipe; + this->workspace = workspace; + mul.SetSubBlockIdx(kfcCommSrv->subBlockID); + if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { + msgAux.msg0.setOrgShape = false; + msgAux.msg1.setOrgShape = false; + } + this->devEvtID = instID; + if constexpr (A_TYPE::ibShare == true || B_TYPE::ibShare == true) { + if (kfcCommSrv->subBlockID == 0) { + gCache.Init(); + } + } + using TILING_TYPE = typename std::remove_cv::type>::type; + if constexpr (IsSameTypeV) { + tiling_.SetTiling((TCubeTiling *)tiling); + mul.Init(tiling_.GetTiling(), nullptr); + } else if (tiling) { + tiling_.SetTiling((TCubeTiling *)tiling); + mul.Init(tiling_.GetTiling(), nullptr); + } + } + + __aicore__ inline void Init(__gm__ KfcMsg* msg) + { + if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { + return; + } else { + ASSERT(msg != nullptr && "msg cannot be nullptr when init matmul server"); + 
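// After validating the message, the code below refreshes the GM cache lines that cover the tiling data and copies it word by word into the local tmpTiling_ before re-initializing mul. +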
ASSERT(msg->tilingInfo.tilingAddr != nullptr && "tiling cannot be nullptr when init matmul server"); + auto temp1 = ((__gm__ uint32_t*)(msg->tilingInfo.tilingAddr)); + tiling_.SetTiling(&tmpTiling_); + auto temp2 = (uint32_t*)(tiling_.GetTiling()); + + constexpr uint32_t tCubeTilingSize = ConstCeil(sizeof(TCubeTiling), CACHE_LINE_SIZE) * CACHE_LINE_SIZE; + GlobalTensor tilingGlobal; + for (int i = 0; i < tCubeTilingSize; i += CACHE_LINE_SIZE) { + Barrier(); + tilingGlobal.SetGlobalBuffer((__gm__ int64_t *)(msg->tilingInfo.tilingAddr + i)); + DataCacheCleanAndInvalid(tilingGlobal); + } + + for (int i = 0; i < sizeof(TCubeTiling) / sizeof(uint32_t); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.Init(this->tiling_.GetTiling(), nullptr); + } + } + + __aicore__ inline void SetSubBlockIdx(uint8_t idx) + { + mul.SetSubBlockIdx(idx); + } + + __aicore__ inline void SetOrgShape(__gm__ KfcMsg* msg) + { + if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { + if (mul.GetSubBlockIdx() == 0) { + msgAux.msg0.orgM = msg->orgShape.orgM; + msgAux.msg0.orgN = msg->orgShape.orgN; + msgAux.msg0.orgKa = msg->orgShape.orgKa; + msgAux.msg0.orgKb = msg->orgShape.orgKb; + msgAux.msg0.orgKc = msg->orgShape.orgKc; + msgAux.msg0.setOrgShape = true; + } else { + msgAux.msg1.orgM = msg->orgShape.orgM; + msgAux.msg1.orgN = msg->orgShape.orgN; + msgAux.msg1.orgKa = msg->orgShape.orgKa; + msgAux.msg1.orgKb = msg->orgShape.orgKb; + msgAux.msg1.orgKc = msg->orgShape.orgKc; + msgAux.msg1.setOrgShape = true; + } + } else { + mul.SetOrgShape(msg->orgShape.orgM, msg->orgShape.orgN, msg->orgShape.orgKa, msg->orgShape.orgKb, + msg->orgShape.orgKc); + } + } + + __aicore__ inline void SetSingleShape(__gm__ KfcMsg* msg) + { + if (msg->body.setTail) { + mul.SetSingleShape(msg->body.singleM, msg->body.singleN, msg->body.singleK); + } + } + + __aicore__ inline void SetTail(__gm__ KfcMsg* msg) + { + if (msg->body.setTail) { + mul.SetTail(msg->body.singleM, msg->body.singleN, msg->body.singleK); + } + } + + __aicore__ inline void SetHF32(__gm__ KfcMsg* msg) + { + mul.SetHF32(static_cast(msg->body.enHF32), static_cast(msg->body.hf32TransMode)); + } + + __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg) + { + if (!msg->body.setTensorA) + return; + if constexpr (A_TYPE::format == CubeFormat::SCALAR) { + SrcAT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr)); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcAT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorA(scalar); + return; + } + const uint64_t size = (uint64_t)(msg->body.sizeAmatrix); + if constexpr (PhyPosIsL1(A_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.aAddr, size); + mul.SetTensorA(scmLocal, msg->body.isTransA); + } else { + GlobalTensor aGlobal; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcAT*>(msg->body.aAddr), size); + mul.SetTensorA(aGlobal, msg->body.isTransA); + } + } + + __aicore__ inline void SetTensorA(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) + { + if (!msg->body.setTensorA) { + return; + } + if constexpr (A_TYPE::format == CubeFormat::SCALAR) { + SrcAT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.aAddr) + offset); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcAT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorA(scalar); + return; + } + if constexpr (PhyPosIsL1(A_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.aAddr + offset, size); + mul.SetTensorA(scmLocal, 
msg->body.isTransA); + } else { + GlobalTensor aGlobal; + aGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcAT*>(msg->body.aAddr + offset), size); + mul.SetTensorA(aGlobal, msg->body.isTransA); + } + } + + __aicore__ inline void SetQuantVector(__gm__ KfcMsg* msg) + { + if (!msg->body.setQuant) { + return; + } + int quantMode = msg->body.quantMode; + if (quantMode == 1) { + uint64_t quantScalar = msg->body.quantScalar; + mul.SetQuantScalar(quantScalar); + } else if (quantMode == 2) { + const uint64_t size = static_cast(msg->body.quantSize); + GlobalTensor quantGlobal; + quantGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint64_t*>(msg->body.quantAddr), size); + mul.SetQuantVector(quantGlobal); + } + } + + __aicore__ inline void SetBatchNum(__gm__ KfcMsg* msg) + { + if constexpr (A_TYPE::layout == LayoutMode::NONE) { + return; + } + if (!msg->body.setBatch) { + return; + } + mul.SetBatchNum(msg->body.batchA, msg->body.batchB); + } + + __aicore__ inline void SetSelfDefineData(__gm__ KfcMsg* msg) + { + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + mul.SetSelfDefineData(msg->body.dataPtr); + if constexpr (!ToMatmulConfig(MM_CFG).enableReuse) { + GlobalTensor dataGlobal; + dataGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ uint32_t*>(msg->body.dataPtr)); + DataCacheCleanAndInvalid(dataGlobal); + } + } + + __aicore__ inline void SetUserDefInfo(__gm__ KfcMsg* msg) + { + mul.SetUserDefInfo(msg->userDefInfo.tilingPtr); + } + + __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg) + { + if (!msg->body.setTensorB) + return; + if constexpr (B_TYPE::format == CubeFormat::SCALAR) { + SrcBT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr)); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcBT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorB(scalar); + return; + } + const uint64_t size = (uint64_t)(msg->body.sizeBmatrix); + if constexpr (PhyPosIsL1(B_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.bAddr, size); + mul.SetTensorB(scmLocal, msg->body.isTransB); + } else { + GlobalTensor bGlobal; + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcBT*>(msg->body.bAddr), size); + mul.SetTensorB(bGlobal, msg->body.isTransB); + } + } + + __aicore__ inline void SetTensorB(__gm__ KfcMsg* msg, const uint64_t size, const uint64_t offset) + { + if (!msg->body.setTensorB) { + return; + } + if constexpr (B_TYPE::format == CubeFormat::SCALAR) { + SrcBT scalar; + auto temp1 = reinterpret_cast<__gm__ uint8_t*>(&(msg->body.bAddr) + offset); + auto temp2 = (uint8_t*)&scalar; + + for (int i = 0; i < sizeof(SrcBT); i++, temp1++, temp2++) { + *temp2 = *temp1; + } + mul.SetTensorB(scalar); + return; + } + if constexpr (PhyPosIsL1(B_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.bAddr + offset, size); + mul.SetTensorB(scmLocal, msg->body.isTransB); + } else { + GlobalTensor bGlobal; + bGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ SrcBT*>(msg->body.bAddr + offset), size); + mul.SetTensorB(bGlobal, msg->body.isTransB); + } + } + + __aicore__ inline void SetBias(__gm__ KfcMsg* msg) + { + if (msg->body.setTensorBias) { + const uint64_t size = (uint64_t)tiling_.GetSingleCoreN(); + if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.biasAddr, size); + mul.SetBias(scmLocal); + } else { + GlobalTensor biasGlobal; + biasGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ typename 
BIAS_TYPE::T*>(msg->body.biasAddr), size); + mul.SetBias(biasGlobal); + } + } else if (msg->body.setClearBias) { + mul.DisableBias(); + } + } + + __aicore__ inline void SetBias(__gm__ KfcMsg* msg, const uint64_t offset) + { + if (msg->body.setTensorBias) { + const uint64_t size = (uint64_t)tiling_.GetSingleCoreN(); + if constexpr (PhyPosIsL1(BIAS_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.biasAddr + offset, size); + mul.SetBias(scmLocal); + } else { + GlobalTensor biasGlobal; + biasGlobal.SetGlobalBuffer( + reinterpret_cast<__gm__ typename BIAS_TYPE::T*>(msg->body.biasAddr + offset), size); + mul.SetBias(biasGlobal); + } + } else if (msg->body.setClearBias) { + mul.DisableBias(); + } + } + + __aicore__ inline bool GetTensorC(__gm__ KfcMsg* msg) + { + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + uint64_t size; + if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { + size = ToMatmulConfig(MM_CFG).baseMN; + } else { + size = tiling_.GetBaseM() * tiling_.GetBaseN(); + } + if constexpr (PhyPosIsL1(C_TYPE::pos)) { + const auto& scmLocal = GetTscmTensor(msg->body.cAddr, size); + mul.GetTensorC(scmLocal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); + } else { + GlobalTensor cGlobal; + + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + mul.GetTensorC(cGlobal, (uint8_t)(msg->body.enAtomic), msg->body.enSequentialWrite); + } + // Now release UB + if constexpr (PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + if (msg->body.sync == 1) { // Synchronize + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + return false; + } + + __aicore__ inline uint16_t GetInstID() + { + return instID; + } + __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg) + { + if constexpr (!ToMatmulConfig(MM_CFG).enableInit) { + if (mul.GetSubBlockIdx() == 0 && msgAux.msg0.setOrgShape) { + mul.SetOrgShape(msgAux.msg0.orgM, msgAux.msg0.orgN, msgAux.msg0.orgKa, + msgAux.msg0.orgKb, msgAux.msg0.orgKc); + } else if (mul.GetSubBlockIdx() == 1 && msgAux.msg1.setOrgShape) { + mul.SetOrgShape(msgAux.msg1.orgM, msgAux.msg1.orgN, msgAux.msg1.orgKa, + msgAux.msg1.orgKb, msgAux.msg1.orgKc); + } + } + if (msg->body.isFirstIter) { + SetTensorA(msg); + SetTensorB(msg); + if constexpr (ToMatmulConfig(MM_CFG).enableSetBias) { + SetBias(msg); + } + if constexpr (ToMatmulConfig(MM_CFG).enableSetTail) { + SetTail(msg); + } + if constexpr (ToMatmulConfig(MM_CFG).enableQuantVector) { + SetQuantVector(msg); + } + if constexpr (((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0) || + ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0)) { + SetBatchNum(msg); + } + if constexpr (ToMatmulConfig(MM_CFG).enableSetDefineData) { + SetSelfDefineData(msg); + } + } + } + + __aicore__ inline void IterateSetMessage(__gm__ KfcMsg* msg, const uint64_t batchASize, const uint64_t batchBSize, + const uint64_t offsetA = 0, const uint64_t offsetB = 0, const uint64_t offsetBias = 0) + { + if (msg->body.isFirstIter) { + SetTensorA(msg, batchASize, offsetA); + SetTensorB(msg, batchBSize, offsetB); + SetBias(msg, offsetBias); + SetTail(msg); + SetQuantVector(msg); + SetBatchNum(msg); + } + } + + __aicore__ inline bool IterateBatch(__gm__ KfcMsg* msg) + { + if constexpr (A_TYPE::layout == LayoutMode::NONE) { + return true; + } + // In the batch scenario, messages occupy 128 bytes. 
After the update, messages occupy 64 bytes. + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); +#if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1 + if (msg->body.setQuant == 1) { + ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in + // Batch MM + } +#endif + IterateSetMessage(msg); + uint64_t size = tiling_.GetSingleCoreM() * tiling_.GetSingleCoreN(); + + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), + msg->body.enSequentialWrite, msg->body.matrixStrideA, + msg->body.matrixStrideB, msg->body.matrixStrideC); + + // Now release UB + if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || + PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + if (msg->body.sync || msg->body.waitIterateBatch) { + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + return true; + } + + __aicore__ inline bool IterateNBatch(__gm__ KfcMsg* msg) + { + if constexpr (!ToMatmulConfig(MM_CFG).isNBatch) { + return true; + } + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); +#if defined(ASCENDC_CPU_DEBUG) && ASCENDC_CPU_DEBUG == 1 + if (msg->body.setQuant == 1) { + ASSERT(msg->body.quantMode != 1); // scalar mode is not supported for quantization parameters in + // Batch MM + } +#endif + const uint64_t size = tiling_.GetSingleCoreM() * tiling_.GetSingleCoreN(); + const uint64_t singleBatchASize = (uint64_t)(msg->body.sizeAmatrix) / msg->body.batchLoop; + uint64_t batchAOffset = tiling_.GetALayoutInfoD() * msg->body.batchA; + if constexpr (A_TYPE::layout != LayoutMode::SBNGD) { + batchAOffset = batchAOffset * tiling_.GetALayoutInfoS(); + } + const uint64_t singleBatchBSize = (uint64_t)(msg->body.sizeBmatrix) / msg->body.batchLoop; + uint64_t batchBOffset = tiling_.GetBLayoutInfoD() * msg->body.batchB; + if constexpr (B_TYPE::layout != LayoutMode::SBNGD) { + batchBOffset = batchBOffset * tiling_.GetBLayoutInfoS(); + } + const uint64_t batchCOffset = tiling_.GetCLayoutInfoS2(); + const uint32_t batchC = msg->body.batchA > msg->body.batchB ? msg->body.batchA : msg->body.batchB; + bool layoutGCondition = tiling_.GetCLayoutInfoG() == 1 && + (tiling_.GetBLayoutInfoG() != 1 || tiling_.GetALayoutInfoG() != 1); + int32_t layoutG = tiling_.GetBLayoutInfoG() > tiling_.GetALayoutInfoG() ? 
tiling_.GetBLayoutInfoG() : tiling_.GetALayoutInfoG(); + int32_t batchOffsetBias = tiling_.GetCLayoutInfoS2() * batchC; + if (layoutGCondition) { + batchOffsetBias = batchOffsetBias / layoutG; + } + int32_t batchOffsetC = batchOffsetBias * sizeof(typename C_TYPE::T); + if constexpr (C_TYPE::layout != LayoutMode::SBNGD) { + batchOffsetC = batchOffsetC * tiling_.GetCLayoutInfoS1(); + } + uint64_t offset = 0; + uint32_t cntIterator = 0; + for (uint32_t loopIdx = 0U; loopIdx < msg->body.batchLoop; loopIdx++) { + const uint64_t aOffset = batchAOffset * loopIdx * sizeof(typename A_TYPE::T); + const uint64_t bOffset = batchBOffset * loopIdx * sizeof(typename B_TYPE::T); + const uint64_t biasOffset = batchOffsetBias * loopIdx * sizeof(typename BIAS_TYPE::T); + IterateSetMessage(msg, singleBatchASize, singleBatchBSize, aOffset, bOffset, biasOffset); + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr + offset), size); + mul.IterateBatch(cGlobal, msg->body.enPartialSum, (uint8_t)(msg->body.enAtomic), + msg->body.enSequentialWrite, msg->body.matrixStrideA, + msg->body.matrixStrideB, msg->body.matrixStrideC); + cntIterator++; + if (cntIterator < INC_PROCESS_CHECK && (!msg->body.sync && !msg->body.waitIterateBatch)) { + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + offset += batchOffsetC; + } + // Now release UB + if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || + PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + if (msg->body.sync || msg->body.waitIterateBatch) { + NotifyEvent(eventID); + } else if (cntIterator >= INC_PROCESS_CHECK) { + NotifyEvent(eventID); + } + return true; + } + + __aicore__ inline bool Iterate(__gm__ KfcMsg* msg, KFC_Enum funID) + { + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + if constexpr ((A_TYPE::ibShare == true) || (B_TYPE::ibShare == true)) { + if (msg->body.iterateFakeMsg) { + if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { // fake msg + uint16_t eventID = static_cast(this->devEvtID * 2 + kfcCommSrv->subBlockID); + NotifyEvent(eventID); + return true; + } + } + } else { + ASSERT(!msg->body.iterateFakeMsg &&"Only Ib share mode support fake msg."); + } + SyncCubeWithVec(); + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || + ((IsSameType::value || IsSameType::value) && + IsSameType::value) || + (IsSameType::value && (IsSameType::value || + IsSameType::value))) { + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + } + IterateSetMessage(msg); + uint64_t size; + if constexpr (ToMatmulConfig(MM_CFG).singleCoreMN != 0) { + size = ToMatmulConfig(MM_CFG).singleCoreMN; + } else { + size = tiling_.GetSingleCoreM() * tiling_.GetSingleCoreN(); + } + + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + const auto& scmLocal = GetTscmTensor(msg->body.cAddr, size); + uint64_t offset = 0; + uint64_t offsetSize = 0; + auto enSequentialWrite = msg->body.enSequentialWrite; + auto enAtomic = msg->body.enAtomic; + auto sync = msg->body.sync; + auto enPartialSum = msg->body.enPartialSum; + if constexpr ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) == 0) { + 
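// ITERATE_MODE_NORMAL is compiled out in this branch, so the caller must supply a valid output address; with enSequentialWrite each iteration then advances the write offset by one baseM * baseN block. +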
ASSERT(msg->body.cAddr != 0); // The output address must be configured. + if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { + offsetSize = enSequentialWrite ? ToMatmulConfig(MM_CFG).baseMN : 0; + } else { + offsetSize = enSequentialWrite ? (tiling_.GetBaseM() * tiling_.GetBaseN()) : 0; + } + } else { + if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { + ASSERT(msg->body.cAddr != 0); // The output address must be configured. + if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { + offsetSize = enSequentialWrite ? ToMatmulConfig(MM_CFG).baseMN : 0; + } else { + offsetSize = enSequentialWrite ? (tiling_.GetBaseM() * tiling_.GetBaseN()) : 0; + } + } else if (sync == 0) { + // For asynchronous Iterate, the offset must be used for address calculation and + // the size is baseM x baseN. + if constexpr (ToMatmulConfig(MM_CFG).baseMN != 0) { + offsetSize = ToMatmulConfig(MM_CFG).baseMN; + } else { + offsetSize = tiling_.GetBaseM() * tiling_.GetBaseN(); + } + enSequentialWrite = 1; + } + } + uint32_t cntIterator = 0; + TRACE_START(TraceId::MatMul_CALC); + // Asynchronous and configure the workspace + while (mul.Iterate(enPartialSum)) { + if constexpr ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { + if (unlikely(cntIterator == 0)) { + if (unlikely(funID == KFC_Enum::MMFUN_ITERATE && sync == 1)) { + TRACE_STOP(TraceId::MatMul_CALC); + return false; // The queue is not switched, and no message needs to be returned. + } + } + } + if constexpr (PhyPosIsL1(C_TYPE::pos)) { + mul.GetTensorC(scmLocal[offset], (uint8_t)(enAtomic), enSequentialWrite); + } else { + mul.GetTensorC(cGlobal[offset], (uint8_t)(enAtomic), enSequentialWrite); + } + cntIterator++; + if constexpr ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0) { + if (cntIterator < INC_PROCESS_CHECK) { + if (funID == KFC_Enum::MMFUN_ITERATE) { + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + NotifyEvent(eventID); + } + } + } + offset += offsetSize; + } + // Now release UB + if constexpr (PhyPosIsUB(A_TYPE::pos) || PhyPosIsUB(B_TYPE::pos) || + PhyPosIsUB(BIAS_TYPE::pos) || PhyPosIsUB(C_TYPE::pos)) { + if (unlikely(msg->ubAddr >= 0)) { + kfcCommSrv->FreeUB(msg->ubAddr); + } + } + + uint16_t eventID = static_cast(this->devEvtID * 2 + mul.GetSubBlockIdx()); + if (sync || msg->body.waitIterateAll) { + ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL); + NotifyEvent(eventID); + } else if (cntIterator >= INC_PROCESS_CHECK && funID == KFC_Enum::MMFUN_ITERATE) { + NotifyEvent(eventID); + } + mul.End(); + TRACE_STOP(TraceId::MatMul_CALC); + return true; + } + + __aicore__ inline bool IterateIntraBlockPartSum(__gm__ KfcMsg* msg, KFC_Enum funID) + { + if constexpr (A_TYPE::layout != LayoutMode::NONE) { + return true; + } + if constexpr (((IsSameType::value || IsSameType::value) && + IsSameType::value) || + ((IsSameType::value || IsSameType::value) && + IsSameType::value) || + (IsSameType::value && (IsSameType::value || + IsSameType::value))) { + GlobalTensor msgGlobal; + msgGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ int64_t*>(msg) + sizeof(int64_t)); + DataCacheCleanAndInvalid(msgGlobal); + } + IterateSetMessage(msg); + if (mul.GetSubBlockIdx() == 0) { + return true; + } + uint64_t size; + if constexpr (ToMatmulConfig(MM_CFG).singleCoreMN != 0) { + size = ToMatmulConfig(MM_CFG).singleCoreMN; + } else { + size = tiling_.GetSingleCoreM() * tiling_.GetSingleCoreN(); + } + + GlobalTensor cGlobal; + cGlobal.SetGlobalBuffer(reinterpret_cast<__gm__ DstT*>(msg->body.cAddr), size); + 
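// Sub-block 0 already returned above, so only sub-block 1 issues the IterateAll; when the caller waits, both vector sub-blocks are then notified via eventID0/eventID1. +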
mul.IterateAll(cGlobal, msg->body.enAtomic, msg->body.enSequentialWrite, + msg->body.waitIterateAll, msg->body.iterateFakeMsg); + + uint16_t eventID0 = static_cast(this->devEvtID * 2 + 0); + uint16_t eventID1 = static_cast(this->devEvtID * 2 + 1); + if (msg->body.sync || msg->body.waitIterateAll) { + ASSERT(funID == KFC_Enum::MMFUN_ITERATE_ALL); + NotifyEvent(eventID0); + NotifyEvent(eventID1); + } + if (!msg->body.iterateFakeMsg) { + mul.End(); + } + TRACE_STOP(TraceId::MatMul_CALC); + return true; + } + + __aicore__ inline bool IsSharedObj() + { + if constexpr (ToMatmulConfig(MM_CFG).enableInit) { + return false; + } else { + return true; + } + } + + __aicore__ inline bool SkipMsg(KFC_Enum funID, bool& freeMsg, + int &lastMsgId, const int subBlockID) + { + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + return false; + } + if constexpr (A_TYPE::ibShare || B_TYPE::ibShare || ToMatmulConfig(MM_CFG).intraBlockPartSum) { + if (funID == KFC_Enum::MMFUN_ITERATE_ALL) { + if (lastMsgId == subBlockID) { + freeMsg = false; + return true; + } + lastMsgId = subBlockID; + return false; + } + return false; + } else { + return false; + } + } + + __aicore__ inline bool LockMsgQueue(KFC_Enum funID, bool& freeMsg, + int &lastMsgId, const int subBlockID) + { + if constexpr (A_TYPE::ibShare && B_TYPE::ibShare) { + return true; + } + return false; + } + + __aicore__ inline bool Process(__gm__ KfcMsg* msg, KFC_Enum funID) + { + if constexpr (((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_ALL) != 0) || + ((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_NORMAL) != 0)) { + if ((static_cast(funID) & static_cast(KFC_Enum::MMFUN_MASK)) == + static_cast(KFC_Enum::MMFUN_MASK)) { + if constexpr (ToMatmulConfig(MM_CFG).intraBlockPartSum) { + return IterateIntraBlockPartSum(msg, funID); + } else { + return Iterate(msg, funID); + } + } + } + if constexpr (((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_BATCH) != 0) && + (A_TYPE::layout != LayoutMode::NONE)) { + if (funID == KFC_Enum::MMFUN_ITERATE_BATCH_ALL) { + return IterateBatch(msg); + } + } + if constexpr (ToMatmulConfig(MM_CFG).enableEnd) { + if (funID == KFC_Enum::MMFUN_END) { + mul.End(); + } + } + if constexpr (ToMatmulConfig(MM_CFG).enableGetTensorC) { + if (funID == KFC_Enum::MMFUN_GET_TENSOR_C) { + return GetTensorC(msg); + } + } + if constexpr (ToMatmulConfig(MM_CFG).enableSetOrgShape) { + if (funID == KFC_Enum::MMFUN_SET_ORG_SHAPE) { + SetOrgShape(msg); + return true; + } + } + if constexpr (ToMatmulConfig(MM_CFG).enableInit) { + if (funID == KFC_Enum::MMFUN_INIT) { + Init(msg); + return true; + } + } + if constexpr (((ToMatmulConfig(MM_CFG).iterateMode & IterateMode::ITERATE_MODE_N_BATCH) != 0) && + (A_TYPE::layout != LayoutMode::NONE)) { + if (funID == KFC_Enum::MMFUN_ITERATE_N_BATCH_ALL) { + return IterateNBatch(msg); + } + } + if (funID == KFC_Enum::MMFUN_SET_USER_DEF_INFO) { + SetUserDefInfo(msg); + return true; + } + if (funID == KFC_Enum::MMFUN_SET_HF32) { + SetHF32(msg); + return true; + } + ASSERT("illegal function ID."); + return true; + } + + template __aicore__ LocalTensor GetTscmTensor(uint64_t addr, const uint64_t size) + { + LocalTensor scmLocal; + TBuffAddr scmTbuf; + scmTbuf.logicPos = (uint8_t)(TPosition::TSCM); + scmTbuf.dataLen = size * sizeof(DstT); + scmTbuf.bufferAddr = addr; +#if ASCENDC_CPU_DEBUG + scmTbuf.absAddr = GetTPipePtr()->GetBaseAddr((uint8_t)(TPosition::TSCM)) + addr; +#endif + scmLocal.SetAddr(scmTbuf); + return scmLocal; + } + +private: + MatmulImpl mul; + GM_ADDR 
workspace; + KfcCommServer* kfcCommSrv; + TPipe* tpipe; + MatmulTiling tiling_; + TCubeTiling tmpTiling_; // for compatible with init interface + typename IBShareCache()>::ShareCache gCache; + typename ShareMatmulAux::MSG msgAux; + uint16_t instID; + uint16_t devEvtID; +}; + +template +__aicore__ inline constexpr bool IsSharedMatmul() +{ + return !matmul::ToMatmulConfig(MM_CFG).enableInit; +} +template , + MATMUL_POLICY_DEFAULT_OF(matmul::MatmulPolicy)> +struct MatmulInstBase { + __aicore__ inline MatmulInstBase(){}; +}; +template +struct MatmulInstShared : MatmulInstBase { + __aicore__ inline MatmulInstShared(){}; + matmul::MatmulService cubeObj[1]; +}; +template +struct MatmulInst : MatmulInstBase { + __aicore__ inline MatmulInst(){}; + matmul::MatmulService cubeObj[MIX_NUM]; +}; + +template +struct MatmulInstAux { + __aicore__ inline MatmulInstAux(){}; +}; + +template +struct MatmulInstAux { + __aicore__ inline MatmulInstAux(){}; + using MATMUL = MatmulInstShared; +}; + +template +struct MatmulInstAux { + __aicore__ inline MatmulInstAux(){}; + using MATMUL = MatmulInst; +}; + +template , MATMUL_POLICY_DEFAULT_OF(matmul::MatmulPolicy)> +class MatmulServiceAux { + using SrcT = typename A_TYPE::T; + using SrcAT = typename A_TYPE::T; + using SrcBT = typename B_TYPE::T; + using DstT = typename C_TYPE::T; + using BiasT = typename BIAS_TYPE::T; + friend class KfcServer; + constexpr static bool aIbShare = A_TYPE::ibShare; + constexpr static bool bIbShare = B_TYPE::ibShare; +public: + __aicore__ inline MatmulServiceAux() {} + typename MatmulInstAux(), A_TYPE, B_TYPE, C_TYPE, BIAS_TYPE, MM_CFG, MM_CB, + MATMUL_POLICY>::MATMUL cubeObj; + + // stub functions for MatmulImpl + __aicore__ inline void Init(TCubeTiling* cubeTiling, TPipe* tpipe = nullptr){}; + + __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgK){}; + __aicore__ inline void SetOrgShape(int orgM, int orgN, int orgKa, int orgKb, int orgKc = 0){}; + __aicore__ inline void SetSingleShape(int singleM, int singleN, int singleK){}; + __aicore__ inline void SetTail(int tailM = -1, int tailN = -1, int tailK = -1){}; + + __aicore__ inline void SetTensorA(const GlobalTensor& gm, bool isTransposeA = false){}; + + __aicore__ inline void SetTensorAWithCopy(const GlobalTensor& gm, const LocalTensor& leftMatrix, + bool isTransposeA = false){}; + __aicore__ inline void SetTensorB(const GlobalTensor& gm, bool isTransposeB = false){}; + + __aicore__ inline void SetTensorBWithCopy(const GlobalTensor& gm, const LocalTensor& rightMatrix, + bool isTransposeB = false){}; + __aicore__ inline void SetBias(const GlobalTensor& biasGlobal){}; + __aicore__ inline void SetTensorA(const LocalTensor& leftMatrix, bool isTransposeA = false){}; + __aicore__ inline void SetTensorB(const LocalTensor& rightMatrix, bool isTransposeB = false){}; + __aicore__ inline void SetBias(const LocalTensor& inputBias){}; + __aicore__ inline void SetTensorA(SrcAT aScalar){}; + __aicore__ inline void SetTensorB(SrcBT bScalar){}; + __aicore__ inline void DisableBias(){}; + __aicore__ inline void ClearBias(){}; + __aicore__ inline void SetSelfDefineData(const uint64_t dataPtr) {} + __aicore__ inline void SetUserDefInfo(const uint64_t tilingPtr) {} + __aicore__ inline void SetQuantScalar(const uint64_t quantScalar) {} + __aicore__ inline void SetQuantVector(const GlobalTensor& quantTensor) {} + template __aicore__ inline void SetWorkspace(__gm__ T* addr, int size) {}; + template __aicore__ inline void SetWorkspace(GlobalTensor& addr){}; + __aicore__ inline void End(){}; + 
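// The remaining members are likewise empty stubs that mirror the MatmulImpl interface; the actual computation is carried out by the MatmulService instances held in cubeObj. +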
__aicore__ inline void SetHF32(bool enableHF32 = false, int32_t transMode = 0){}; + + template __aicore__ inline bool Iterate(bool enPartialSum = false) + { + return false; + }; + template + __aicore__ inline void IterateAll(const GlobalTensor& gm, uint8_t enAtomic = 0, + bool enSequentialWrite = false, bool waitIterateAll = false, bool fakeMsg = false){}; + template + __aicore__ inline void IterateAll(const LocalTensor& ubCmatrix, uint8_t enAtomic = 0){}; + __aicore__ inline void WaitIterateAll() {}; + template + __aicore__ inline void GetTensorC(const LocalTensor& c, uint8_t enAtomic = 0, + bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, + uint32_t dstGap = 0) {}; + template + __aicore__ inline void GetTensorC(const GlobalTensor& gm, uint8_t enAtomic = 0, + bool enSequentialWrite = false){}; + template + __aicore__ inline void GetTensorC(const GlobalTensor &gm, const LocalTensor &cLocal, + uint8_t enAtomic = 0, bool enSequentialWrite = false) {}; + template + __aicore__ inline GlobalTensor GetTensorC(uint8_t enAtomic = 0, bool enSequentialWrite = false) + { + GlobalTensor global; + return global; + }; + template + __aicore__ inline void IterateBatch(const GlobalTensor& gm, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0) {}; + template + __aicore__ inline void IterateBatch(const LocalTensor& ubCmatrix, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0) {}; + template + __aicore__ inline void IterateNBatch(const uint32_t batchLoop, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite, const uint32_t matrixStrideA = 0, const uint32_t matrixStrideB = 0, + const uint32_t matrixStrideC = 0) {}; + template + __aicore__ inline GlobalTensor GetBatchTensorC(uint32_t batchA, uint32_t batchB, + bool enSequentialWrite = false) {}; + template + __aicore__ inline GlobalTensor GetBatchC(uint32_t batchA, uint32_t batchB, bool enSequentialWrite = false) {}; + template + __aicore__ inline void GetBatchTensorC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, + uint32_t dstGap = 0) {}; + template + __aicore__ inline void GetBatchC(const LocalTensor& c, uint32_t batchA, uint32_t batchB, + bool enSequentialWrite = false, uint32_t height = 0, uint32_t width = 0, uint32_t srcGap = 0, + uint32_t dstGap = 0) {}; + __aicore__ inline void WaitIterateBatch() {}; + __aicore__ inline void SetLocalWorkspace(const LocalTensor& tmpBuffer) {}; + __aicore__ inline void AsyncGetTensorC(const LocalTensor& c){}; + __aicore__ inline void WaitGetTensorC(){}; + template + __aicore__ inline MatrixOffset GetOffsetC() + { + if constexpr (isTurnOnDebug) { + static_assert(!isTurnOnDebug, "unsupported!"); + } + } +}; +} // namespace matmul +>>>>>>> e30c307 (KFC常量化适配) #endif // __MATMUL_SERVER_H__ \ No newline at end of file diff --git a/impl/matmul/matmul_utils.h b/impl/matmul/matmul_utils.h index d53e35c9058d50f465f14b9aeeb51e91862dc7cf..5ad93984c162be4f6e07e487993de8b6b199daa2 100644 --- a/impl/matmul/matmul_utils.h +++ b/impl/matmul/matmul_utils.h @@ -458,19 +458,14 @@ template __aicore__ inline void InitKfcClient(T& matmulClient, U* tiling, TPipe* tpipe, KfcCommClient* client, int instIdx, GM_ADDR workspace) { - ASSERT(tpipe != nullptr && "tpipe cannot be 
nullptr when InitKFC"); ASSERT(client != nullptr && "client cannot be nullptr when InitKFC"); ASSERT(workspace != nullptr && "workspace cannot be nullptr when InitKFC"); ASSERT(instIdx >= 0); matmulClient.client = client; matmulClient.instIdx = instIdx; - matmulClient.tpipe = tpipe; + matmulClient.cubeTiling.SetTiling((TCubeTiling*)tiling); matmulClient.mmCntAddr_ = reinterpret_cast<__gm__ KfcMsg*>(GetMatmulIncAddr(workspace, GetBlockIdxImpl(), instIdx)); - if (tiling) { - matmulClient.InitStatic((const TCubeTiling*)tiling); - } else { - matmulClient.cubeTiling.SetTiling(nullptr); - } + matmulClient.InitStatic(); matmulClient.devEvtID = instIdx * 2 + GetSubBlockIdxImpl(); } #endif diff --git a/lib/matmul/matmul_client.h b/lib/matmul/matmul_client.h index 69b139c7ae3226b6bb620217f11f697c889520ae..8665bf049ae7bc32cf5891358194bc5eabcce209 100644 --- a/lib/matmul/matmul_client.h +++ b/lib/matmul/matmul_client.h @@ -93,7 +93,9 @@ public: template __aicore__ inline void SetWorkspace(__gm__ const T* addr, int size) { ASSERT(addr != nullptr); - ASSERT(!this->cubeTiling.IsNull()); + if constexpr (ToMatmulConfig(MM_CFG).singleCoreM == 0) { + ASSERT(!this->cubeTiling.IsNull()); + } uint64_t offset = mnIter_ * cubeTiling.GetBaseN() * cubeTiling.GetBaseM() * sizeof(DstT); cacheWorkspaceAddr = reinterpret_cast(const_cast<__gm__ T*>(addr)); @@ -808,13 +810,12 @@ private: GM_ADDR workspace); private: - __aicore__ inline void InitStatic(const TCubeTiling* cubeTiling) + __aicore__ inline void InitStatic() { + if (ToMatmulConfig(MM_CFG).singleCoreM == 0 && this->cubeTiling.IsNull()) { + return; + } ASSERT(sizeof(KfcMsg) % CACHE_LINE_SIZE == 0); - ASSERT(cubeTiling != nullptr && "cubeTiling cannot be nullptr when init matmul client"); - ASSERT(sizeof(TCubeTiling) % sizeof(uint64_t) == 0); - - this->cubeTiling.SetTiling(cubeTiling); *((uint64_t*)&kfcMsg_) = 0; *((uint64_t*)&(kfcMsg_.body)) = 0;