diff --git a/impl/matmul/tiling/matmul_tiling_algorithm.cpp b/impl/matmul/tiling/matmul_tiling_algorithm.cpp index 178053d8298c3057e9a8d271588797f02a038eed..fad57699c91237139ee336eeb72e97ac5dd8f55e 100644 --- a/impl/matmul/tiling/matmul_tiling_algorithm.cpp +++ b/impl/matmul/tiling/matmul_tiling_algorithm.cpp @@ -87,6 +87,8 @@ const std::map BAND_WIDTH_TAB = { // (numcols, band ratio) {320, 2}, {128, 2}, {192, 2.7} }; +constexpr int32_t N_BUFFER_33_FACTOR = 3; + namespace { bool IsOrgShapeAlign(int32_t orgShape, int32_t alignSize, bool isSmallShape = false) { @@ -2337,7 +2339,8 @@ ComputeIntensitySmallShape MatmulTilingAlgorithm::CalcComputeIntensitySmallShape MultiCoreScenario MatmulTilingAlgorithm::GetMultiCoreScenario(const MatmulRunParas& params) const { - if (tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B) { + if (tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B && + tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910_95) { return MultiCoreScenario::OTHERS; } if (tilingIns_->enableSplitK_ || tilingIns_->singleM != -1 || tilingIns_->singleN != -1) { @@ -2537,8 +2540,7 @@ void MatmulTilingAlgorithm::AdjustFloatL1Factor(const SingleCoreStatus& singleCo int64_t MatmulTilingAlgorithm::UpdateTiling(const MatmulRunParas& param, const CoreStatusPack &coreStatus, SingleCoreStatus& singleCoreStatus) const { - int32_t coreUse = singelBlockDim_ ? tilingIns_->blockDim : - coreStatus.batchDim * coreStatus.mDim * coreStatus.kDim * coreStatus.nDim; + int32_t coreUse = singelBlockDim_ ? 
tilingIns_->blockDim : coreStatus.batchDim * coreStatus.mDim * coreStatus.kDim * coreStatus.nDim; int32_t singleCoreM; int32_t singleCoreN; int32_t singleCoreK; @@ -2567,18 +2569,18 @@ int64_t MatmulTilingAlgorithm::UpdateTiling(const MatmulRunParas& param, const C int32_t mxTypePara = 0; GetMxScaleFactor(singleCoreStatus, reduceSize, mxTypePara); tilingIns_->tiling_.set_mxTypePara(mxTypePara); - tilingIns_->tiling_.set_depthA1( - MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0) * - singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l1Status.dbAL1); + if (!AdjustNBuffer33L1Factors(coreStatus, singleCoreStatus)) { + return -1L; + } + tilingIns_->tiling_.set_depthA1(MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, + singleCoreStatus.l0Status.kL0) * singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l1Status.dbAL1); tilingIns_->tiling_.set_depthB1(UpdateDepthB1(singleCoreStatus)); // if decrease depthB1, nBL1 must decrease to ensure nBL1 is less then depthB1 singleCoreStatus.l1Status.nBL1 = min(singleCoreStatus.l1Status.nBL1, tilingIns_->tiling_.get_depthB1()); tilingIns_->tiling_.set_stepM(singleCoreStatus.l1Status.mAL1); tilingIns_->tiling_.set_stepN(singleCoreStatus.l1Status.nBL1); - tilingIns_->tiling_.set_stepKa( - MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0)); - tilingIns_->tiling_.set_stepKb( - MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0)); + tilingIns_->tiling_.set_stepKa(MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0)); + tilingIns_->tiling_.set_stepKb(MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0)); AdjustFloatL1Factor(singleCoreStatus); tilingIns_->tiling_.set_isBias(tilingIns_->isBias ? 
1 : 0); tilingIns_->tiling_.set_dbL0A(singleCoreStatus.l0Status.dbL0A); @@ -2592,16 +2594,20 @@ bool MatmulTilingAlgorithm::DoMultiCoreSplitMNTiling(const MatmulRunParas& param BlockDimCalculator& blockDimRes) { auto multiCoreScenario = GetMultiCoreScenario(params); - if (multiCoreScenario != MultiCoreScenario::SPLIT_MN && - multiCoreScenario != MultiCoreScenario::SPLIT_SMALL_MN) { + if (multiCoreScenario != MultiCoreScenario::SPLIT_MN && multiCoreScenario != MultiCoreScenario::SPLIT_SMALL_MN && + tilingIns_->scheduleType != ScheduleType::N_BUFFER_33) { return false; } ComputeBaseBlock baseBlock = GetMultiCoreBasicBlock(params); // calc basic block - if (multiCoreScenario == MultiCoreScenario::SPLIT_MN) { - TILING_LOG_DEBUG("MultiCoreScenario is SPLIT_MN."); + if (tilingIns_->scheduleType == ScheduleType::N_BUFFER_33) { + if (!CalcNBuffer33BlockDims(params, baseBlock, coreStatus)) { + return false; + } + } else if (multiCoreScenario == MultiCoreScenario::SPLIT_MN) { + TILING_LOG_DEBUG("Multi-core scenario is SPLIT_MN."); CalcMultiCoreBlockDims(params, baseBlock, coreStatus, blockDimRes); } else { - TILING_LOG_DEBUG("MultiCoreScenario is SPLIT_SMALL_MN."); + TILING_LOG_DEBUG("Multi-core scenario is SPLIT_SMALL_MN."); CalcMultiCoreBlockDimsSmallShape(params, baseBlock, coreStatus, blockDimRes); } @@ -2615,7 +2621,9 @@ bool MatmulTilingAlgorithm::DoMultiCoreSplitMNTiling(const MatmulRunParas& param int32_t stepKb; CalcL1Tiling(baseBlock, depthA1, depthB1, stepKa, stepKb); singleCoreStatus.l1Status = GetL1CoreStatus(baseBlock, depthA1, depthB1, stepKa, stepKb); - (void)UpdateTiling(params, coreStatus, singleCoreStatus); + if (UpdateTiling(params, coreStatus, singleCoreStatus) == -1L) { + return false; + } return true; } @@ -2630,6 +2638,48 @@ bool MatmulTilingAlgorithm::NeedOutputAlign(int32_t m, int32_t n, int32_t k) con return needAlign; } +bool MatmulTilingAlgorithm::CalcNBuffer33BlockDims(const MatmulRunParas& params, const ComputeBaseBlock &baseBlock, + 
CoreStatusPack& coreStatus) const +{ + coreStatus.batchDim = 1; + coreStatus.mDim = MathUtil::CeilDivision(GetSingleM(), baseBlock.baseM * N_BUFFER_33_FACTOR); + if (tilingIns_->enableSplitK_) { + coreStatus.kDim = MathUtil::CeilDivision(GetSingleK(), baseBlock.baseK * N_BUFFER_33_FACTOR); + } else { + coreStatus.kDim = 1; + if (MathUtil::CeilDivision(GetSingleK(), baseBlock.baseK) > N_BUFFER_33_FACTOR) { + TILING_LOG_WARNING("SingleCoreK %d and baseK %d does not satisfy NBuffer33 requirements. " + "Suggest use EnableMultiCoreSplitK to turn on multi core K split.", GetSingleK(), baseBlock.baseK); + return false; + } + } + if (coreStatus.mDim * coreStatus.kDim > numOfBlock_) { + TILING_LOG_WARNING("M %d (baseM %d) or K %d (baseK %d) is too large to find a valid NBuffer33 single core " + "shape within %d cores. Remnind to slice M or K in test code.", GetSingleM(), baseBlock.baseM, + GetSingleK(), baseBlock.baseK, numOfBlock_); + } + + std::vector> dimPairs; + int32_t nDim = 1; + dimPairs.push_back({coreStatus.mDim, nDim}); + int32_t nDimMax = min(GetSingleN() / baseBlock.baseN, numOfBlock_ / (coreStatus.mDim * coreStatus.kDim)); + while (nDim <= nDimMax) { + nDim++; + dimPairs.push_back({coreStatus.mDim, nDim}); + } + std::vector results; + for (const auto &factor : dimPairs) { + results.push_back(CalcComputeIntensity(params, baseBlock, factor)); + } + std::sort(results.begin(), results.end()); + for (const auto &res : results) { + TILING_LOG_DEBUG("intent:%f, cycle: %f, band: %f, mDim: %d, nDim: %d\n", + res.avgIntensity, res.computeCycle, res.bandRatio, res.dimFactor.first, res.dimFactor.second); + } + coreStatus.nDim = results[0].dimFactor.second; + return true; +} + void MatmulTilingAlgorithm::CalcMultiCoreBlockDims(const MatmulRunParas& params, const ComputeBaseBlock &baseBlock, CoreStatusPack& coreStatus, BlockDimCalculator& blockDimRes) { @@ -2725,7 +2775,7 @@ void MatmulTilingAlgorithm::CalcMultiCoreBlockDimsPost(const MatmulRunParas& par blockDimRes.kBytes = 
blockDimRes.kNum * INPUTDTYPE_BYTES; // contain k * 16 * 2 coreStatus.batch = params.batch32; coreStatus.k = params.k32 / coreStatus.kDim; - TILING_LOG_DEBUG("CalcMultiCoreBlockDims, coreStatus m: %d n: %d k: %d", coreStatus.m, coreStatus.n, coreStatus.k); + TILING_LOG_DEBUG("CalcMultiCoreBlockDims, coreStatus m: %d n: %d k: %d.", coreStatus.m, coreStatus.n, coreStatus.k); // load size of A matrix is batch * m // load size of B matrix is n DimFactor blockDim(1, blockDimRes.mDimFactor, blockDimRes.kDimFactor, blockDimRes.nDimFactor); @@ -3365,11 +3415,315 @@ bool MatmulTilingAlgorithm::CheckSingleShape(int32_t singleCoreM, int32_t single // ub only can process with 32B aligned, if format is ND, and D non-aligned output can't pad if (tilingIns_->cType_.pos == TPosition::VECCALC && tilingIns_->cType_.type == CubeFormat::ND && (singleCoreN * DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType)) % C0_BYTE_SIZE != 0) { - TILING_LOG_INFO("for ascend310p/ascend910, when matrix c pos is VECCACL and singleCoreN is not 32B " - "aligned, matrix c not support ND format"); + TILING_LOG_INFO("For ascend310p/ascend910, when matrix c pos is VECCACL and singleCoreN is not 32B " + "aligned, matrix c not support ND format."); + return false; + } + } + return true; +} + +bool MatmulTilingAlgorithm::CheckFixSplitInputs(int32_t singleCoreM) const +{ + if (tilingIns_->baseM == -1) { + return true; + } + + if (tilingIns_->baseM == 0) { + TILING_LOG_WARNING("baseM should be larger than zero."); + return false; + } + int32_t blockM = MathUtil::CeilDivision(singleCoreM, tilingIns_->baseM); + if (blockM > N_BUFFER_33_FACTOR) { + TILING_LOG_WARNING("Ceil(singleCoreM / baseM) = %d, should be less than or equal to 3.", blockM); + return false; + } + return true; +} + +void MatmulTilingAlgorithm::CalcBaseShape(const SingleCoreStatus &singleCoreStatus, + int32_t &baseM, int32_t &baseN, int32_t &baseK, int32_t &reduceSize) const +{ + baseM = singleCoreStatus.l0Status.mL0 * C0_SIZE; + baseM = 
tilingIns_->baseM != -1 ? tilingIns_->baseM : baseM; + if (tilingIns_->aType_.type == CubeFormat::ND && tilingIns_->aType_.isTrans && + tilingIns_->aType_.scalePos == TPosition::TSCM) { + baseM = MathUtil::Align(singleCoreStatus.l0Status.mL0, L0_FACTOR_NUM_LIMIT) * C0_SIZE; + } + + baseN = singleCoreStatus.l0Status.nL0 * C0_SIZE; + baseN = tilingIns_->baseN != -1 ? tilingIns_->baseN : baseN; + if (tilingIns_->bType_.type == CubeFormat::ND && !tilingIns_->bType_.isTrans && + tilingIns_->bType_.scalePos == TPosition::TSCM) { + baseN = MathUtil::Align(singleCoreStatus.l0Status.nL0, L0_FACTOR_NUM_LIMIT) * C0_SIZE; + } + + reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + baseK = singleCoreStatus.l0Status.kL0 * reduceSize; +} + +bool MatmulTilingAlgorithm::CheckL0ASize(int32_t singleCoreM, int32_t singleCoreK, int32_t &baseM, int32_t &baseK) const +{ + int32_t l0aLoadSize = baseM * baseK * DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + if (l0aLoadSize <= tilingIns_->bufferPool_.l0ASize / DB_OFF) { + return true; + } + + if (tilingIns_->baseM == -1) { + int32_t blockM = MathUtil::CeilDivision(singleCoreM, baseM); + while (blockM < N_BUFFER_33_FACTOR) { + blockM++; + baseM = MathUtil::Align(MathUtil::CeilDivision(singleCoreM, blockM), C0_SIZE); + l0aLoadSize = baseM * baseK * DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + if (l0aLoadSize <= tilingIns_->bufferPool_.l0ASize / DB_OFF) { + return true; + } + } + } + + int32_t blockK = MathUtil::CeilDivision(singleCoreK, baseK); + int32_t reduceSize = + static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + while (blockK < N_BUFFER_33_FACTOR) { + blockK++; + baseK = MathUtil::Align(MathUtil::CeilDivision(singleCoreK, blockK), reduceSize); + l0aLoadSize = baseM * baseK * DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + if (l0aLoadSize <= tilingIns_->bufferPool_.l0ASize / DB_OFF) { + 
return true; + } + } + + TILING_LOG_WARNING("L0A load size %d with baseM %d and baseK %d as final attempt exceeds L0ASize %d. " + "Cannot find valid baseM & baseK under current singleCoreM %d and singleCoreK %d.", + l0aLoadSize, baseM, baseK, tilingIns_->bufferPool_.l0ASize, singleCoreM, singleCoreK); + return false; +} + +bool MatmulTilingAlgorithm::CheckL0BSize(int32_t singleCoreN, int32_t singleCoreK, int32_t &baseN, int32_t &baseK) const +{ + int32_t l0bLoadSize = baseN * baseK * DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + if (l0bLoadSize <= tilingIns_->bufferPool_.l0BSize / DB_OFF) { + return true; + } + + int32_t blockK = MathUtil::CeilDivision(singleCoreK, baseK); + int32_t reduceSize = + static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + while (blockK < N_BUFFER_33_FACTOR) { + blockK++; + baseK = MathUtil::Align(MathUtil::CeilDivision(singleCoreK, blockK), reduceSize); + l0bLoadSize = baseN * baseK * DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + if (l0bLoadSize <= tilingIns_->bufferPool_.l0BSize / DB_OFF) { + return true; + } + } + + if (tilingIns_->baseN == -1) { + auto kSize = baseK * static_cast(DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE); + if (kSize > 0) { + baseN = C0_SIZE * (tilingIns_->bufferPool_.l0BSize / kSize / C0_SIZE); + l0bLoadSize = baseN * baseK * DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + if (l0bLoadSize <= tilingIns_->bufferPool_.l0BSize / DB_OFF) { + return true; + } + } + } + + TILING_LOG_WARNING("L0B load size %d with baseN %d and baseK %d as final attempt exceeds L0BSize %d. 
" + "Cannot find valid baseN & baseK under current singleCoreN %d and singleCoreK %d.", + l0bLoadSize, baseN, baseK, tilingIns_->bufferPool_.l0BSize, singleCoreN, singleCoreK); + return false; +} + +bool MatmulTilingAlgorithm::CheckL0CSize(int32_t singleCoreM, int32_t singleCoreN, int32_t &baseM, int32_t &baseN) const +{ + int32_t l0cLoadSize = baseM * baseN * FP32_BYTES; + if (l0cLoadSize <= tilingIns_->bufferPool_.l0CSize / DB_OFF) { + return true; + } + + if (tilingIns_->baseM == -1) { + int32_t blockM = MathUtil::CeilDivision(singleCoreM, baseM); + while (blockM < N_BUFFER_33_FACTOR) { + blockM++; + baseM = MathUtil::Align(MathUtil::CeilDivision(singleCoreM, blockM), C0_SIZE); + l0cLoadSize = baseM * baseN * FP32_BYTES; + if (l0cLoadSize <= tilingIns_->bufferPool_.l0CSize / DB_OFF) { + return true; + } + } + } + + if (tilingIns_->baseN == -1) { + baseN = min(baseN, C0_SIZE * (tilingIns_->bufferPool_.l0CSize / (baseM * FP32_BYTES) / C0_SIZE)); + l0cLoadSize = baseM * baseN * FP32_BYTES; + if (l0cLoadSize <= tilingIns_->bufferPool_.l0CSize / DB_OFF) { + return true; + } + } + + TILING_LOG_WARNING("L0C load size %d with baseM %d and baseN %d as final attempt exceeds L0CSize %d. 
" + "Cannot find valid baseM & baseN under current singleCoreM %d and singleCoreN %d.", + l0cLoadSize, baseM, baseN, tilingIns_->bufferPool_.l0CSize, singleCoreM, singleCoreN); + return false; +} + +void MatmulTilingAlgorithm::CheckL0DB(int32_t baseM, int32_t baseN, int32_t baseK, + SingleCoreStatus &singleCoreStatus) const +{ + int32_t l0aLoadSize = baseM * baseK * DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + if (l0aLoadSize <= tilingIns_->bufferPool_.l0ASize / DB_ON) { + singleCoreStatus.l0Status.dbL0A = DB_ON; + } else { + singleCoreStatus.l0Status.dbL0A = DB_OFF; + } + int32_t l0bLoadSize = baseN * baseK * DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + if (l0bLoadSize <= tilingIns_->bufferPool_.l0BSize / DB_ON) { + singleCoreStatus.l0Status.dbL0B = DB_ON; + } else { + singleCoreStatus.l0Status.dbL0B = DB_OFF; + } + int32_t l0cLoadSize = baseM * baseN * FP32_BYTES; + if (l0cLoadSize <= tilingIns_->bufferPool_.l0CSize / DB_ON) { + singleCoreStatus.l0Status.dbL0C = DB_ON; + } else { + singleCoreStatus.l0Status.dbL0C = DB_OFF; + } +} + +bool MatmulTilingAlgorithm::AdjustNBuffer33L0Factors(const MatmulRunParas &param, const CoreStatusPack &coreStatus, + SingleCoreStatus &singleCoreStatus) const +{ + if (tilingIns_->scheduleType != ScheduleType::N_BUFFER_33) { + TILING_LOG_DEBUG("No need to adjust L0Factors for non-nbuffer33 scenario."); + return true; + } + int32_t singleCoreM = 0; + int32_t singleCoreN = 0; + int32_t singleCoreK = 0; + GetSingleShape(coreStatus, param, singleCoreM, singleCoreN, singleCoreK); + if (!CheckFixSplitInputs(singleCoreM)) { + return false; + } + + int32_t baseM = 0; + int32_t baseN = 0; + int32_t baseK = 0; + int32_t reduceSize = 0; + CalcBaseShape(singleCoreStatus, baseM, baseN, baseK, reduceSize); + + if (MathUtil::CeilDivision(singleCoreM, baseM) > N_BUFFER_33_FACTOR) { + baseM = MathUtil::Align(MathUtil::CeilDivision(singleCoreM, N_BUFFER_33_FACTOR), C0_SIZE); + } + if 
(MathUtil::CeilDivision(singleCoreK, baseK) > N_BUFFER_33_FACTOR) { + baseK = MathUtil::Align(MathUtil::CeilDivision(singleCoreK, N_BUFFER_33_FACTOR), reduceSize); + } + + if (!CheckL0ASize(singleCoreM, singleCoreK, baseM, baseK)) { + return false; + } + if (!CheckL0BSize(singleCoreN, singleCoreK, baseN, baseK)) { + return false; + } + if (!CheckL0CSize(singleCoreM, singleCoreN, baseM, baseN)) { + return false; + } + CheckL0DB(baseM, baseN, baseK, singleCoreStatus); + + if (tilingIns_->baseM == -1) { + singleCoreStatus.l0Status.mL0 = MathUtil::CeilDivision(baseM, C0_SIZE); + } + if (tilingIns_->baseN == -1) { + singleCoreStatus.l0Status.nL0 = MathUtil::CeilDivision(baseN, C0_SIZE); + } + singleCoreStatus.l0Status.kL0 = MathUtil::CeilDivision(baseK, reduceSize); + return true; +} + +int32_t MatmulTilingAlgorithm::GetNBuffer33L1Size(const SingleCoreStatus &singleCoreStatus) const +{ + int32_t curAL1Size = 0; + int32_t depthA1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0) * + singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l1Status.dbAL1; + if (!MathUtil::CheckMulOverflow(tilingIns_->tiling_.get_baseM(), tilingIns_->tiling_.get_baseK(), curAL1Size) || + !MathUtil::CheckMulOverflow(curAL1Size, depthA1, curAL1Size) || + !MathUtil::CheckMulOverflow(curAL1Size, DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE, + curAL1Size)) { + return 0; + } + + int32_t curBL1Size = 0; + int32_t depthB1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0) * + singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1; + if (!MathUtil::CheckMulOverflow(tilingIns_->tiling_.get_baseN(), tilingIns_->tiling_.get_baseK(), curBL1Size) || + !MathUtil::CheckMulOverflow(curBL1Size, depthB1, curBL1Size) || + !MathUtil::CheckMulOverflow(curBL1Size, DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE, + curBL1Size)) { + return 0; + } + + int32_t biasL1Size = !tilingIns_->isBias ? 
0 : + tilingIns_->tiling_.get_baseN() * DTYPE_BIT_TAB.at(tilingIns_->biasType_.dataType) / BITS_PER_BYTE; + int32_t dequantSize = 0; + if (tilingIns_->deqType == DequantType::TENSOR) { + dequantSize = singleCoreStatus.l1Status.nBL1 * tilingIns_->tiling_.get_baseN() * UINT64_TYPES; + } + return (curAL1Size + curBL1Size + biasL1Size + dequantSize); +} + +bool MatmulTilingAlgorithm::AdjustNBuffer33L1Factors(const CoreStatusPack &coreStatus, + SingleCoreStatus &singleCoreStatus) const +{ + if (tilingIns_->scheduleType != ScheduleType::N_BUFFER_33) { + TILING_LOG_DEBUG("No need to adjust L1Factors for non-nbuffer33 scanario."); + return true; + } + + tilingIns_->tiling_.set_usedCoreNum(min(tilingIns_->tiling_.get_usedCoreNum(), tilingIns_->blockDim)); + if (coreStatus.batchDim * coreStatus.mDim * coreStatus.kDim * coreStatus.nDim > tilingIns_->blockDim) { + tilingIns_->tiling_.set_singleCoreM(min(tilingIns_->tiling_.get_baseM() * N_BUFFER_33_FACTOR, GetSingleM())); + tilingIns_->tiling_.set_singleCoreK(min(tilingIns_->tiling_.get_baseK() * N_BUFFER_33_FACTOR, GetSingleK())); + } + + singleCoreStatus.l1Status.mAL1 = + MathUtil::CeilDivision(tilingIns_->tiling_.get_singleCoreM(), tilingIns_->tiling_.get_baseM()); + int32_t blockK = MathUtil::CeilDivision(tilingIns_->tiling_.get_singleCoreK(), tilingIns_->tiling_.get_baseK()); + if (singleCoreStatus.l1Status.kAL1 != singleCoreStatus.l1Status.kBL1 || + MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0) != blockK) { + singleCoreStatus.l1Status.kAL1 = blockK * singleCoreStatus.l0Status.kL0; + singleCoreStatus.l1Status.kBL1 = singleCoreStatus.l1Status.kAL1; + } + + singleCoreStatus.l1Status.dbAL1 = DB_OFF; + singleCoreStatus.l1Status.dbBL1 = DB_OFF; + int32_t curL1Size = GetNBuffer33L1Size(singleCoreStatus); + if (curL1Size > tilingIns_->bufferPool_.l1Size) { + bool succFlag = false; + while (singleCoreStatus.l1Status.nBL1 > 1) { + singleCoreStatus.l1Status.nBL1--; + curL1Size = 
GetNBuffer33L1Size(singleCoreStatus); + if (curL1Size <= tilingIns_->bufferPool_.l1Size) { + succFlag = true; + break; + } + } + if (!succFlag) { + TILING_LOG_WARNING("Current L1 size %d exceeds L1Size limit %d. Cannot find a valid L1 tiling factors.", + curL1Size, tilingIns_->bufferPool_.l1Size); return false; } } + + if (singleCoreStatus.l1Status.nBL1 <= 1) { + return true; + } + int32_t nBL1DbOff = singleCoreStatus.l1Status.nBL1; + singleCoreStatus.l1Status.dbBL1 = DB_ON; + singleCoreStatus.l1Status.nBL1 = singleCoreStatus.l1Status.nBL1 / DB_ON; + if (GetNBuffer33L1Size(singleCoreStatus) > tilingIns_->bufferPool_.l1Size) { + singleCoreStatus.l1Status.dbBL1 = DB_OFF; + singleCoreStatus.l1Status.nBL1 = nBL1DbOff; + } return true; } @@ -3394,6 +3748,9 @@ int64_t MatmulTilingAlgorithm::Process() if (DoMultiCoreSplitMNTiling(param, coreStatus, blockDimRes)) { return 0; } + if (tilingIns_->scheduleType == ScheduleType::N_BUFFER_33) { + return -1; + } GetBlockDim(opType, param, coreStatus, blockDimRes); } else { if (!g_tempCfg.factorSplit) { @@ -3421,9 +3778,12 @@ int64_t MatmulTilingAlgorithm::Process() GetL0Factors(opType, param, coreStatus, singleCoreStatus); AdjustSparseL0Factors(singleCoreStatus); AdjustMxL0Factors(singleCoreStatus); + if (!AdjustNBuffer33L0Factors(param, coreStatus, singleCoreStatus)) { + return -1; + } if (singleCoreStatus.l0Status.mL0 == 0 || singleCoreStatus.l0Status.nL0 == 0 || singleCoreStatus.l0Status.kL0 == 0) { - TILING_LOG_WARNING("ml0/nl0/kl0 is zero"); + TILING_LOG_WARNING("ml0/nl0/kl0 is zero."); return -1; } GetL1Factors(opType, param, coreStatus, singleCoreStatus.l0Status, singleCoreStatus.l1Status); diff --git a/impl/matmul/tiling/matmul_tiling_algorithm.h b/impl/matmul/tiling/matmul_tiling_algorithm.h index 67b0a291539a8e551789d2715fad996751e6c93b..200fb0d1dc27274aaf62508b7658019e41254414 100644 --- a/impl/matmul/tiling/matmul_tiling_algorithm.h +++ b/impl/matmul/tiling/matmul_tiling_algorithm.h @@ -403,8 +403,19 @@ private: void 
AdjustMxL1Factors(SingleCoreStatus& singleCoreStatus, const int32_t k0Size) const; void GetMxScaleFactor(const SingleCoreStatus& singleCoreStatus, const int32_t k0Size, int32_t& mxTypePara) const; void CheckL0DB(SingleCoreStatus& singleCoreStatus, const int32_t baseK) const; + void CheckL0DB(int32_t baseM, int32_t baseN, int32_t baseK, SingleCoreStatus &singleCoreStatus) const; void GetMxUsedL1Size(const SingleCoreStatus& singleCoreStatus, const int32_t k0Size, int32_t& dataUsedL1Size, int32_t& scaleUsedL1Size, int32_t& biasUsedL1Size) const; + bool AdjustNBuffer33L0Factors(const MatmulRunParas &param, const CoreStatusPack &coreStatus, + SingleCoreStatus &singleCoreStatus) const; + bool AdjustNBuffer33L1Factors(const CoreStatusPack &coreStatus, SingleCoreStatus &singleCoreStatus) const; + bool CheckFixSplitInputs(int32_t singleCoreM) const; + void CalcBaseShape(const SingleCoreStatus &singleCoreStatus, + int32_t &baseM, int32_t &baseN, int32_t &baseK, int32_t &reduceSize) const; + bool CheckL0ASize(int32_t singleCoreM, int32_t singleCoreK, int32_t &baseM, int32_t &baseK) const; + bool CheckL0BSize(int32_t singleCoreN, int32_t singleCoreK, int32_t &baseN, int32_t &baseK) const; + bool CheckL0CSize(int32_t singleCoreM, int32_t singleCoreN, int32_t &baseM, int32_t &baseN) const; + int32_t GetNBuffer33L1Size(const SingleCoreStatus &singleCoreStatus) const; bool IsNeedAlign(bool isA) const; int32_t GetL1Size(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const; int32_t CalL1MaxLen(int32_t resL1Size, L1StatusPack& l1Status, const L0StatusPack& l0Status, @@ -463,6 +474,8 @@ private: BlockDimCalculator& blockDimRes); void CalcMultiCoreBlockDimsSmallShape(const MatmulRunParas& params, ComputeBaseBlock &baseBlock, CoreStatusPack& coreStatus, BlockDimCalculator& blockDimRes); + bool CalcNBuffer33BlockDims(const MatmulRunParas& params, const ComputeBaseBlock &baseBlock, + CoreStatusPack& coreStatus) const; void UpdateBaseBlock(const MatmulRunParas& params, const 
int32_t sm, const int32_t sn, ComputeBaseBlock &baseBlock) const; std::vector CalcTotalCycleMemory(const std::pair& shapeM, const std::pair& shapeN, const ComputeBaseBlock &baseBlock, const float memoryRatio, diff --git a/lib/matmul/matmul_tiling_base.h b/lib/matmul/matmul_tiling_base.h index f899b433cb63532a8da205def6f05b0aa72e25fd..b0c6666a24b4c6b5e6bd1ee5a60cd0006a2b10df 100644 --- a/lib/matmul/matmul_tiling_base.h +++ b/lib/matmul/matmul_tiling_base.h @@ -151,6 +151,7 @@ enum class DequantType : int32_t { enum class ScheduleType : int32_t { INNER_PRODUCT = 0, OUTER_PRODUCT = 1, + N_BUFFER_33 = 2, }; struct SysTilingTempBufSize { diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index 66bd15bf9f1f242bae8c097e0995f4cb38f5c646..8e9782c0ae553deead26dc82c46b1607286b7967 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -6405,3 +6405,409 @@ TEST_F(TestTiling, testReduceAllTiling) EXPECT_EQ(maxSize, 0); EXPECT_EQ(minSize, 0); } + +TEST_F(TestTiling, MatmulApiTilingNBuffer33CheckL0ABUpdateBaseMN) +{ + matmul_tiling::MatmulApiTiling tiling; + tiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetOrgShape(160, 160, 320); + tiling.SetShape(160, 160, 320); + tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified + tiling.SetMatmulConfigParams(1, false, ScheduleType::N_BUFFER_33, MatrixTraverse::NOSET, false); + optiling::TCubeTiling tilingData; + tilingData.set_usedCoreNum(1); + auto ret = tiling.GetTiling(tilingData); + tiling.PrintTilingData(); + EXPECT_EQ(ret, 0); + 
EXPECT_LE(MathUtil::CeilDivision(tilingData.get_singleCoreM(), tilingData.get_baseM()), 3); + EXPECT_LE(MathUtil::CeilDivision(tilingData.get_singleCoreK(), tilingData.get_baseK()), 3); + EXPECT_EQ(tilingData.get_stepM(), MathUtil::CeilDivision(tilingData.get_singleCoreM(), tilingData.get_baseM())); + EXPECT_LE(tilingData.get_stepKa(), 3); + EXPECT_EQ(tilingData.get_stepKa(), tilingData.get_stepKb()); + EXPECT_EQ(tilingData.get_stepKa(), MathUtil::CeilDivision(tilingData.get_singleCoreK(), tilingData.get_baseK())); + EXPECT_EQ(tilingData.get_baseM(), 80); + EXPECT_EQ(tilingData.get_baseN(), 144); + EXPECT_EQ(tilingData.get_baseK(), 112); + EXPECT_EQ(tilingData.get_stepM(), 2); + EXPECT_EQ(tilingData.get_stepKa(), 3); + EXPECT_EQ(tilingData.get_stepKb(), 3); +} + +TEST_F(TestTiling, MatmulApiTilingNBuffer33CheckL0ACUpdateBaseKN) +{ + matmul_tiling::MatmulApiTiling tiling; + tiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetOrgShape(800, 2048, 64); + tiling.SetShape(800, 2048, 64); + tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified + tiling.SetMatmulConfigParams(1, false, ScheduleType::N_BUFFER_33, MatrixTraverse::NOSET, false); + optiling::TCubeTiling tilingData; + tilingData.set_usedCoreNum(1); + auto ret = tiling.GetTiling(tilingData); + tiling.PrintTilingData(); + EXPECT_EQ(ret, 0); + EXPECT_LE(MathUtil::CeilDivision(tilingData.get_singleCoreM(), tilingData.get_baseM()), 3); + EXPECT_LE(MathUtil::CeilDivision(tilingData.get_singleCoreK(), tilingData.get_baseK()), 3); + EXPECT_EQ(tilingData.get_stepM(), 
MathUtil::CeilDivision(tilingData.get_singleCoreM(), tilingData.get_baseM())); + EXPECT_LE(tilingData.get_stepKa(), 3); + EXPECT_EQ(tilingData.get_stepKa(), tilingData.get_stepKb()); + EXPECT_EQ(tilingData.get_stepKa(), MathUtil::CeilDivision(tilingData.get_singleCoreK(), tilingData.get_baseK())); + EXPECT_EQ(tilingData.get_baseM(), 272); + EXPECT_EQ(tilingData.get_baseN(), 112); + EXPECT_EQ(tilingData.get_baseK(), 32); + EXPECT_EQ(tilingData.get_stepM(), 3); + EXPECT_EQ(tilingData.get_stepKa(), 2); + EXPECT_EQ(tilingData.get_stepKb(), 2); +} + +TEST_F(TestTiling, MatmulApiTilingNBuffer33SetCorrectBase) +{ + matmul_tiling::MatmulApiTiling tiling; + tiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetOrgShape(384, 512, 192); + tiling.SetShape(384, 512, 192); + tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified + tiling.SetMatmulConfigParams(1, false, ScheduleType::N_BUFFER_33, MatrixTraverse::NOSET, false); + tiling.SetFixSplit(128, 256, 64); + optiling::TCubeTiling tilingData; + tilingData.set_usedCoreNum(1); + auto ret = tiling.GetTiling(tilingData); + tiling.PrintTilingData(); + EXPECT_EQ(ret, 0); + EXPECT_EQ(tilingData.get_baseM(), 128); + EXPECT_EQ(tilingData.get_baseN(), 256); + EXPECT_EQ(tilingData.get_baseK(), 64); + EXPECT_EQ(tilingData.get_stepM(), 3); + EXPECT_EQ(tilingData.get_stepKa(), 3); + EXPECT_EQ(tilingData.get_stepKb(), 3); +} + +TEST_F(TestTiling, MatmulApiTilingNBuffer33Set0BaseM) +{ + matmul_tiling::MultiCoreMatmulTiling tiling; + tiling.SetAType(matmul_tiling::TPosition::GM, 
matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetOrgShape(384, 512, 192); + tiling.SetShape(384, 512, 192); + tiling.SetSingleShape(384, 512, 192); + tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified + tiling.SetMatmulConfigParams(1, false, ScheduleType::N_BUFFER_33, MatrixTraverse::NOSET, false); + tiling.SetFixSplit(0); + optiling::TCubeTiling tilingData; + tilingData.set_usedCoreNum(1); + auto ret = tiling.GetTiling(tilingData); + tiling.PrintTilingData(); + EXPECT_EQ(ret, -1); +} + +TEST_F(TestTiling, MatmulApiTilingNBuffer33SetSmallBaseM) +{ + matmul_tiling::MultiCoreMatmulTiling tiling; + tiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT); + tiling.SetOrgShape(384, 512, 192); + tiling.SetShape(384, 512, 192); + tiling.SetSingleShape(384, 512, 192); + tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified + tiling.SetMatmulConfigParams(1, false, ScheduleType::N_BUFFER_33, MatrixTraverse::NOSET, false); + tiling.SetFixSplit(96); + optiling::TCubeTiling tilingData; + tilingData.set_usedCoreNum(1); + auto ret = tiling.GetTiling(tilingData); + tiling.PrintTilingData(); + EXPECT_EQ(ret, -1); +} + 
+// N_BUFFER_33 schedule on a (32, 1024, 32) float problem where A is placed in
+// TSCM (with an extra `true` argument to SetAType) and the dequant type is
+// TENSOR. Expects GetTiling to succeed with at most 3 base blocks along M and
+// along K, stepKa == stepKb, and the exact base/step/depthB1 values below.
+TEST_F(TestTiling, MatmulApiTilingNBuffer33MultiDepthB1)
+{
+    matmul_tiling::MatmulApiTiling tiling;
+    tiling.SetAType(matmul_tiling::TPosition::TSCM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT, true);
+    tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetOrgShape(32, 1024, 32);
+    tiling.SetShape(32, 1024, 32);
+    tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified
+    tiling.SetMatmulConfigParams(1, false, ScheduleType::N_BUFFER_33, MatrixTraverse::NOSET, false);
+    tiling.SetDequantType(DequantType::TENSOR);
+    optiling::TCubeTiling tilingData;
+    tilingData.set_usedCoreNum(1);
+    auto ret = tiling.GetTiling(tilingData);
+    tiling.PrintTilingData();
+    EXPECT_EQ(ret, 0);
+    // Relational checks: block counts along M and K are bounded by 3 and the
+    // step values equal those block counts.
+    EXPECT_LE(MathUtil::CeilDivision(tilingData.get_singleCoreM(), tilingData.get_baseM()), 3);
+    EXPECT_LE(MathUtil::CeilDivision(tilingData.get_singleCoreK(), tilingData.get_baseK()), 3);
+    EXPECT_EQ(tilingData.get_stepM(), MathUtil::CeilDivision(tilingData.get_singleCoreM(), tilingData.get_baseM()));
+    EXPECT_LE(tilingData.get_stepKa(), 3);
+    EXPECT_EQ(tilingData.get_stepKa(), tilingData.get_stepKb());
+    EXPECT_EQ(tilingData.get_stepKa(), MathUtil::CeilDivision(tilingData.get_singleCoreK(), tilingData.get_baseK()));
+    // Exact expected values for this shape.
+    EXPECT_EQ(tilingData.get_baseM(), 32);
+    EXPECT_EQ(tilingData.get_baseN(), 256);
+    EXPECT_EQ(tilingData.get_baseK(), 32);
+    EXPECT_EQ(tilingData.get_stepM(), 1);
+    EXPECT_EQ(tilingData.get_stepKa(), 1);
+    EXPECT_EQ(tilingData.get_stepKb(), 1);
+    EXPECT_EQ(tilingData.get_depthB1(), 4);
+    EXPECT_EQ(tilingData.get_stepN(), 2);
+}
+
+// N_BUFFER_33 schedule on a (1024, 512, 512) float problem with no fixed
+// split. Expects GetTiling to return -1. The test name attributes the failure
+// to the L0A check; that cannot be confirmed from this test alone.
+TEST_F(TestTiling, MatmulApiTilingNBuffer33CheckL0AFail)
+{
+    matmul_tiling::MatmulApiTiling tiling;
+    tiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetOrgShape(1024, 512, 512);
+    tiling.SetShape(1024, 512, 512);
+    tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified
+    tiling.SetMatmulConfigParams(1, false, ScheduleType::N_BUFFER_33, MatrixTraverse::NOSET, false);
+    optiling::TCubeTiling tilingData;
+    tilingData.set_usedCoreNum(1);
+    auto ret = tiling.GetTiling(tilingData);
+    tiling.PrintTilingData();
+    EXPECT_EQ(ret, -1);
+}
+
+// N_BUFFER_33 schedule with float16 A/B/C (bias stays float) on a
+// (480, 512, 384) problem and SetFixSplit(-1, 256). Expects GetTiling to
+// return -1. The test name attributes the failure to the L0C check; that
+// cannot be confirmed from this test alone.
+TEST_F(TestTiling, MatmulApiTilingNBuffer33Fp16CheckL0CFail)
+{
+    matmul_tiling::MatmulApiTiling tiling;
+    tiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT16);
+    tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT16);
+    tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT16);
+    tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetOrgShape(480, 512, 384);
+    tiling.SetShape(480, 512, 384);
+    tiling.SetFixSplit(-1, 256);
+    tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified
+    tiling.SetMatmulConfigParams(1, false, ScheduleType::N_BUFFER_33, MatrixTraverse::NOSET, false);
+    optiling::TCubeTiling tilingData;
+    tilingData.set_usedCoreNum(1);
+    auto ret = tiling.GetTiling(tilingData);
+    tiling.PrintTilingData();
+    EXPECT_EQ(ret, -1);
+}
+
+// Calls MatmulTilingAlgorithm::CheckL0BSize(1024, 256, baseN, baseK) directly
+// on a default-constructed tiling object and expects it to return true.
+// Only the boolean result is asserted; baseN and baseK are not inspected
+// after the call, so any update to them is not covered here.
+TEST_F(TestTiling, MatmulApiTilingNBuffer33CheckL0BUpdateK)
+{
+    matmul_tiling::MatmulApiTiling tiling;
+    matmul_tiling::MatmulTilingAlgorithm tilingAlgo(&tiling);
+    int32_t baseN = 128;
+    int32_t baseK = 256;
+    auto ret = tilingAlgo.CheckL0BSize(1024, 256, baseN, baseK);
+    EXPECT_EQ(ret, true);
+}
+
+// Calls CheckL0BSize(1024, 512, baseN, baseK) with baseN = baseK = 512 after
+// SetFixSplit(128, 256) and expects it to return false.
+TEST_F(TestTiling, MatmulApiTilingNBuffer33CheckL0BFail)
+{
+    matmul_tiling::MatmulApiTiling tiling;
+    tiling.SetFixSplit(128, 256);
+    matmul_tiling::MatmulTilingAlgorithm tilingAlgo(&tiling);
+    int32_t baseN = 512;
+    int32_t baseK = 512;
+    auto ret = tilingAlgo.CheckL0BSize(1024, 512, baseN, baseK);
+    EXPECT_EQ(ret, false);
+}
+
+// Calls MatmulTilingAlgorithm::CheckL0CSize(384, 512, baseM, baseN) with
+// baseM = baseN = 256 after SetFixSplit(-1, 256) and expects true.
+// Only the boolean result is asserted; baseM is not inspected afterwards.
+TEST_F(TestTiling, MatmulApiTilingNBuffer33CheckL0CUpdateM)
+{
+    matmul_tiling::MatmulApiTiling tiling;
+    tiling.SetFixSplit(-1, 256);
+    matmul_tiling::MatmulTilingAlgorithm tilingAlgo(&tiling);
+    int32_t baseM = 256;
+    int32_t baseN = 256;
+    auto ret = tilingAlgo.CheckL0CSize(384, 512, baseM, baseN);
+    EXPECT_EQ(ret, true);
+}
+
+// Calls CheckL0CSize(1024, 512, baseM, baseN) with baseM = 512, baseN = 256
+// after SetFixSplit(-1, 256) and expects it to return false.
+TEST_F(TestTiling, MatmulApiTilingNBuffer33CheckL0CFail)
+{
+    matmul_tiling::MatmulApiTiling tiling;
+    tiling.SetFixSplit(-1, 256);
+    matmul_tiling::MatmulTilingAlgorithm tilingAlgo(&tiling);
+    int32_t baseM = 512;
+    int32_t baseN = 256;
+    auto ret = tilingAlgo.CheckL0CSize(1024, 512, baseM, baseN);
+    EXPECT_EQ(ret, false);
+}
+
+// Drives MatmulTilingAlgorithm::AdjustNBuffer33L1Factors directly instead of
+// going through GetTiling: the tiling fields and L1/L0 status values are
+// written by hand, coreStatus is left default-constructed, and the call is
+// expected to return false.
+TEST_F(TestTiling, MatmulApiTilingNBuffer33UpdateStepNFail)
+{
+    matmul_tiling::MatmulApiTiling tiling;
+    tiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetOrgShape(384, 1024, 512);
+    tiling.SetShape(384, 1024, 512);
+    // Pre-populate the tiling result as if base/single-core sizes were already chosen.
+    tiling.tiling_.set_singleCoreM(384);
+    tiling.tiling_.set_singleCoreN(1024);
+    tiling.tiling_.set_singleCoreK(512);
+    tiling.tiling_.set_baseM(128);
+    tiling.tiling_.set_baseN(256);
+    tiling.tiling_.set_baseK(512);
+    tiling.scheduleType = ScheduleType::N_BUFFER_33;
+    matmul_tiling::MatmulTilingAlgorithm tilingAlgo(&tiling);
+    CoreStatusPack coreStatus;
+    SingleCoreStatus singleCoreStatus;
+    singleCoreStatus.l1Status.mAL1 = 3;
+    singleCoreStatus.l1Status.nBL1 = 4;
+    singleCoreStatus.l1Status.kAL1 = 512;
+    singleCoreStatus.l1Status.kBL1 = 512;
+    singleCoreStatus.l0Status.kL0 = 512;
+    auto ret = tilingAlgo.AdjustNBuffer33L1Factors(coreStatus, singleCoreStatus);
+    EXPECT_EQ(ret, false);
+}
+
+// Multi-core N_BUFFER_33 request (SetDim(24)) on a (384, 512, 192) float
+// problem WITHOUT calling EnableMultiCoreSplitK. Expects GetTiling to
+// return -1.
+TEST_F(TestTiling, MatmulApiTilingNBuffer33MultiCoreDisableSplitK)
+{
+    matmul_tiling::MultiCoreMatmulTiling tiling;
+    tiling.SetDim(24);
+    tiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetOrgShape(384, 512, 192);
+    tiling.SetShape(384, 512, 192);
+    tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified
+    tiling.SetMatmulConfigParams(1, false, ScheduleType::N_BUFFER_33, MatrixTraverse::NOSET, false);
+    optiling::TCubeTiling tilingData;
+    auto ret = tiling.GetTiling(tilingData);
+    tiling.PrintTilingData();
+    EXPECT_EQ(ret, -1);
+}
+
+// Same problem as MatmulApiTilingNBuffer33MultiCoreDisableSplitK but with
+// EnableMultiCoreSplitK(true). Expects GetTiling to succeed and pins the
+// resulting single-core shape (384, 256, 96), base sizes (128, 256, 32) and
+// step values.
+TEST_F(TestTiling, MatmulApiTilingNBuffer33MultiCoreNormal)
+{
+    matmul_tiling::MultiCoreMatmulTiling tiling;
+    tiling.SetDim(24);
+    tiling.EnableMultiCoreSplitK(true);
+    tiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetOrgShape(384, 512, 192);
+    tiling.SetShape(384, 512, 192);
+    tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified
+    tiling.SetMatmulConfigParams(1, false, ScheduleType::N_BUFFER_33, MatrixTraverse::NOSET, false);
+    optiling::TCubeTiling tilingData;
+    auto ret = tiling.GetTiling(tilingData);
+    tiling.PrintTilingData();
+    EXPECT_EQ(ret, 0);
+    EXPECT_EQ(tilingData.get_singleCoreM(), 384);
+    EXPECT_EQ(tilingData.get_singleCoreN(), 256);
+    EXPECT_EQ(tilingData.get_singleCoreK(), 96);
+    EXPECT_EQ(tilingData.get_baseM(), 128);
+    EXPECT_EQ(tilingData.get_baseN(), 256);
+    EXPECT_EQ(tilingData.get_baseK(), 32);
+    EXPECT_EQ(tilingData.get_stepM(), 3);
+    EXPECT_EQ(tilingData.get_stepN(), 1);
+    EXPECT_EQ(tilingData.get_stepKa(), 3);
+    EXPECT_EQ(tilingData.get_stepKb(), 3);
+}
+
+// Multi-core N_BUFFER_33 with split-K enabled, only 4 cores (SetDim(4)) and a
+// (2048, 512, 32) float problem. Expects success, usedCoreNum == 4, and a
+// single-core shape of (384, 512, 32) with base sizes (128, 256, 32).
+TEST_F(TestTiling, MatmulApiTilingNBuffer33MultiCoreMExceedsCoreNum)
+{
+    matmul_tiling::MultiCoreMatmulTiling tiling;
+    tiling.SetDim(4);
+    tiling.EnableMultiCoreSplitK(true);
+    tiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetOrgShape(2048, 512, 32);
+    tiling.SetShape(2048, 512, 32);
+    tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified
+    tiling.SetMatmulConfigParams(1, false, ScheduleType::N_BUFFER_33, MatrixTraverse::NOSET, false);
+    optiling::TCubeTiling tilingData;
+    auto ret = tiling.GetTiling(tilingData);
+    tiling.PrintTilingData();
+    EXPECT_EQ(ret, 0);
+    EXPECT_EQ(tilingData.get_usedCoreNum(), 4);
+    EXPECT_EQ(tilingData.get_singleCoreM(), 384);
+    EXPECT_EQ(tilingData.get_singleCoreN(), 512);
+    EXPECT_EQ(tilingData.get_singleCoreK(), 32);
+    EXPECT_EQ(tilingData.get_baseM(), 128);
+    EXPECT_EQ(tilingData.get_baseN(), 256);
+    EXPECT_EQ(tilingData.get_baseK(), 32);
+}
+
+// Multi-core N_BUFFER_33 with split-K enabled, 4 cores, and a large third
+// shape argument (1024). Expects success, usedCoreNum == 4, and a single-core
+// shape of (384, 512, 96) with base sizes (128, 256, 32).
+TEST_F(TestTiling, MatmulApiTilingNBuffer33MultiCoreKExceedsCoreNum)
+{
+    matmul_tiling::MultiCoreMatmulTiling tiling;
+    tiling.SetDim(4);
+    tiling.EnableMultiCoreSplitK(true);
+    tiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetOrgShape(384, 512, 1024);
+    // NOTE(review): the first argument here is 394 while SetOrgShape uses 384
+    // and the test expects singleCoreM == 384. Every other test in this group
+    // passes identical values to SetOrgShape and SetShape, so this looks like
+    // a typo for 384 - confirm before changing, since the expectations below
+    // were presumably recorded with 394.
+    tiling.SetShape(394, 512, 1024);
+    tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified
+    tiling.SetMatmulConfigParams(1, false, ScheduleType::N_BUFFER_33, MatrixTraverse::NOSET, false);
+    optiling::TCubeTiling tilingData;
+    auto ret = tiling.GetTiling(tilingData);
+    tiling.PrintTilingData();
+    EXPECT_EQ(ret, 0);
+    EXPECT_EQ(tilingData.get_usedCoreNum(), 4);
+    EXPECT_EQ(tilingData.get_singleCoreM(), 384);
+    EXPECT_EQ(tilingData.get_singleCoreN(), 512);
+    EXPECT_EQ(tilingData.get_singleCoreK(), 96);
+    EXPECT_EQ(tilingData.get_baseM(), 128);
+    EXPECT_EQ(tilingData.get_baseN(), 256);
+    EXPECT_EQ(tilingData.get_baseK(), 32);
+}
+
+// Multi-core N_BUFFER_33 with split-K enabled, 4 cores, a (1024, 64, 64)
+// problem and an explicit single-core shape of (128, 64, 64). Expects success
+// and usedCoreNum == 4.
+TEST_F(TestTiling, MatmulApiTilingNBuffer33MultiCoreCheckUsedCoreNum)
+{
+    matmul_tiling::MultiCoreMatmulTiling tiling;
+    tiling.SetDim(4);
+    tiling.EnableMultiCoreSplitK(true);
+    tiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetOrgShape(1024, 64, 64);
+    tiling.SetShape(1024, 64, 64);
+    tiling.SetSingleShape(128, 64, 64);
+    tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified
+    tiling.SetMatmulConfigParams(1, false, ScheduleType::N_BUFFER_33, MatrixTraverse::NOSET, false);
+    optiling::TCubeTiling tilingData;
+    auto ret = tiling.GetTiling(tilingData);
+    tiling.PrintTilingData();
+    EXPECT_EQ(ret, 0);
+    EXPECT_EQ(tilingData.get_usedCoreNum(), 4);
+}
+
+// Multi-core tiling with split-K enabled, 24 cores, a (2048, 256, 128)
+// problem and an explicit single-core shape of (128, 256, 128). Unlike the
+// NBuffer33 tests, this one does not call SetMatmulConfigParams, so no
+// schedule type is selected here. Expects success and usedCoreNum == 24.
+TEST_F(TestTiling, MatmulApiTilingCheckUsedCoreNum)
+{
+    matmul_tiling::MultiCoreMatmulTiling tiling;
+    tiling.SetDim(24);
+    tiling.EnableMultiCoreSplitK(true);
+    tiling.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT);
+    tiling.SetOrgShape(2048, 256, 128);
+    tiling.SetShape(2048, 256, 128);
+    tiling.SetSingleShape(128, 256, 128);
+    tiling.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified
+    optiling::TCubeTiling tilingData;
+    auto ret = tiling.GetTiling(tilingData);
+    tiling.PrintTilingData();
+    EXPECT_EQ(ret, 0);
+    EXPECT_EQ(tilingData.get_usedCoreNum(), 24);
+}