From 6ceb8239b72fb22a2589fdaf37256f37e5f45ff6 Mon Sep 17 00:00:00 2001 From: qiaolili Date: Thu, 21 Nov 2024 19:56:09 +0800 Subject: [PATCH 01/13] fix ND2NZ error --- impl/matmul/matmul_tiling_algorithm.cpp | 236 ++++++++++++++++++++---- impl/matmul/matmul_tiling_algorithm.h | 4 + impl/matmul/matmul_tiling_base.cpp | 2 + lib/matmul/matmul_tiling_base.h | 18 +- 4 files changed, 218 insertions(+), 42 deletions(-) diff --git a/impl/matmul/matmul_tiling_algorithm.cpp b/impl/matmul/matmul_tiling_algorithm.cpp index a9212212..22081645 100644 --- a/impl/matmul/matmul_tiling_algorithm.cpp +++ b/impl/matmul/matmul_tiling_algorithm.cpp @@ -377,12 +377,12 @@ void MatmulTilingAlgorithm::GetL0FactorsCand(L0Factors& resFactors, const CoreSt maxN = tilingIns_->bufferPool_.btSize / C0_SIZE / FP32_BYTES / l0Status.dbL0C; } if (l0Status.maxAxisIdx != 0) { - // n0-major + // Major is n0 axis if ((majorDimFactor > maxN0) && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { continue; } } else { - // m0-major + // Major is m0 axis if ((minorDimFactor > maxN0) && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { continue; } @@ -662,7 +662,7 @@ void MatmulTilingAlgorithm::GetNearestFactor(const int32_t& base, int32_t& facto void MatmulTilingAlgorithm::L1StatusAl1FullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const { - // if b martix is in L1,then b matrix must be full loaded,goto b matrix full loaded process branch + // if b martix in L1,then b matrix must full load,goto b matrix full load patch if (tilingIns_->bType_.pos == TPosition::TSCM) { return; } @@ -673,7 +673,8 @@ void MatmulTilingAlgorithm::L1StatusAl1FullLoad(const CoreStatusPack& coreStatus GetABL1KAlignValue(kaAlignValue, kbAlignValue); l1Status.kAL1 = MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l0Status.kL0; const int32_t curL1Size = GetL1Size(l1Status, l0Status); - if (curL1Size > 0 && curL1Size <= tilingIns_->bufferPool_.l1Size) { + const int32_t a1Length = GetAL1UbSize(l1Status, l0Status); + if (curL1Size > 0 && curL1Size <= tilingIns_->bufferPool_.l1Size && a1Length < tilingIns_->bufferPool_.ubSize) { l1Status.aL1FullLoad = true; l1Status.aL1Size = max(MathUtil::Align(coreStatus.k, kaAlignValue), MathUtil::Align(l1Status.kAL1, kaAlignValue)) * @@ -698,6 +699,12 @@ void MatmulTilingAlgorithm::L1StatusAl1FullLoad(const CoreStatusPack& coreStatus l1Status.kBL1 = min(CalL1MaxLen((l1Status.bL1Size - biasSize - dequantSize), l1Status, l0Status, kbAlignValue, L1TilingType::KBL1_16), coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.dbBL1 = DB_OFF; + const int32_t b1Length = tilingIns_->bufferPool_.ubSize - a1Length; + l1Status.kBL1 = min(CalL1MaxLen(min(l1Status.bL1Size - biasSize - dequantSize, b1Length), + l1Status, l0Status, kbAlignValue, L1TilingType::KBL1_16), coreStatus.k); + } l1Status.bL1Times = min(l1Status.kBL1 / l0Status.kL0, l1Status.maxKBL1); GetNearestFactor(l1Status.allTimes, l1Status.bL1Times); // tik-mm support no factor---ncheck l1Status.kBL1 = l1Status.bL1Times * l0Status.kL0; @@ -729,7 +736,7 @@ void MatmulTilingAlgorithm::L1StatusAl1FullLoad(const CoreStatusPack& coreStatus void MatmulTilingAlgorithm::L1StatusBl1FullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const { - // if a martix is in L1,then a matrix must be full loaded,goto a matrix full loaded process branch + // if a martix in L1,then a matrix must full load,goto a matrix full load patch if (tilingIns_->aType_.pos == TPosition::TSCM) { return; } @@ -740,7 +747,8 @@ void MatmulTilingAlgorithm::L1StatusBl1FullLoad(const CoreStatusPack& coreStatus GetABL1KAlignValue(kaAlignValue, kbAlignValue); l1Status.kBL1 = MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l0Status.kL0; const int32_t curL1Size = GetL1Size(l1Status, l0Status); - if (curL1Size > 0 && curL1Size <= tilingIns_->bufferPool_.l1Size) { + const int32_t b1Length = GetBL1UbSize(l1Status, l0Status); + if (curL1Size > 0 && curL1Size <= tilingIns_->bufferPool_.l1Size && b1Length < tilingIns_->bufferPool_.ubSize) { l1Status.bL1FullLoad = true; l1Status.bL1Size = max(MathUtil::Align(coreStatus.k, kbAlignValue), MathUtil::Align(l1Status.kBL1, kbAlignValue)) * @@ -765,6 +773,12 @@ void MatmulTilingAlgorithm::L1StatusBl1FullLoad(const CoreStatusPack& coreStatus l1Status.kAL1 = min(CalL1MaxLen((l1Status.aL1Size - biasSize - dequantSize), l1Status, l0Status, kaAlignValue, L1TilingType::KAL1_16), coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.dbAL1 = DB_OFF; + const int32_t a1Length = tilingIns_->bufferPool_.ubSize - b1Length; + l1Status.kAL1 = min(CalL1MaxLen(min(l1Status.aL1Size - biasSize - dequantSize, a1Length), + l1Status, l0Status, kaAlignValue, L1TilingType::KAL1_16), coreStatus.k); + } l1Status.aL1Times = min(l1Status.kAL1 / l0Status.kL0, l1Status.maxKAL1); GetNearestFactor(l1Status.allTimes, l1Status.aL1Times); // tik-mm support no factor---ncheck l1Status.kAL1 = l1Status.aL1Times * l0Status.kL0; @@ -799,11 +813,14 @@ void MatmulTilingAlgorithm::L1StatusBothFullLoad(const CoreStatusPack& coreStatu l1Status.kAL1 = MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l0Status.kL0; l1Status.kBL1 = MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l0Status.kL0; const int32_t curL1Size = GetL1Size(l1Status, l0Status); + const int32_t a1Length = GetAL1UbSize(l1Status, l0Status); + const int32_t b1Length = GetBL1UbSize(l1Status, l0Status); if (tilingIns_->aType_.pos == TPosition::TSCM && tilingIns_->bType_.pos == TPosition::TSCM) { l1Status.mAL1 = 1; l1Status.nBL1 = 1; } - if ((curL1Size > 0 && curL1Size <= tilingIns_->bufferPool_.l1Size) || + if ((curL1Size > 0 && curL1Size <= tilingIns_->bufferPool_.l1Size) && + a1Length + b1Length <= tilingIns_->bufferPool_.ubSize || (tilingIns_->aType_.pos == TPosition::TSCM && tilingIns_->bType_.pos == TPosition::TSCM)) { l1Status.bothFullLoad = true; l1Status.loadSize = (tilingIns_->aType_.pos == TPosition::TSCM ? 0 : coreStatus.m) + @@ -822,22 +839,27 @@ void MatmulTilingAlgorithm::NeitherFullLoadDb(const CoreStatusPack& coreStatus, { const int32_t tmpKbl116 = l1Status.kBL1; l1Status.kBL1 = kbl1Db; - if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size) { + if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || + GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status) > tilingIns_->bufferPool_.ubSize) { l1Status.dbBL1 = DB_OFF; - if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size) { + if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || + GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status) > tilingIns_->bufferPool_.ubSize) { l1Status.dbAL1 = DB_OFF; } } l1Status.kBL1 = coreStatus.k; const bool bothDoubleBuffer = coreStatus.m != l0Status.mL0 && coreStatus.k > l0Status.kL0 && - GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size; + (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || + GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status) > tilingIns_->bufferPool_.ubSize); l1Status.kBL1 = tmpKbl116; if (bothDoubleBuffer) { l1Status.dbAL1 = DB_ON; l1Status.dbBL1 = DB_ON; - if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size) { + if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || + GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status) > tilingIns_->bufferPool_.ubSize) { l1Status.dbBL1 = DB_OFF; - if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size) { + if (GetL1Size(l1Status, l0Status) > tilingIns_->bufferPool_.l1Size || + GetAL1UbSize(l1Status, l0Status) + GetBL1UbSize(l1Status, l0Status) > tilingIns_->bufferPool_.ubSize) { l1Status.dbAL1 = DB_OFF; } } @@ -878,8 +900,9 @@ void MatmulTilingAlgorithm::NeitherFullLoadMN(const CoreStatusPack& coreStatus, l1Mfirst.bL1Size = MathUtil::Align(l1Mfirst.kBL1, kbAlignValue) * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE * l1Mfirst.dbBL1; l1Mfirst.aL1Size = tilingIns_->bufferPool_.l1Size - l1Mfirst.bL1Size; + int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Mfirst, l0Status); l1Mfirst.mAL1 = max(min(min( - CalL1MaxLen(l1Mfirst.aL1Size - biasSize - dequantSize, l1Mfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), + CalL1MaxLen(min(l1Mfirst.aL1Size - biasSize - dequantSize, a1Length), l1Mfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), l1Mfirst.maxMAL1), mRepeat), 1); @@ -887,8 +910,9 @@ void MatmulTilingAlgorithm::NeitherFullLoadMN(const CoreStatusPack& coreStatus, l1Mfirst.aL1Size = MathUtil::Align(l1Mfirst.kAL1, kaAlignValue) * l1Mfirst.mAL1 * l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Mfirst.dbAL1; l1Mfirst.bL1Size = tilingIns_->bufferPool_.l1Size - l1Mfirst.aL1Size; + int32_t b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Mfirst, l0Status); l1Mfirst.nBL1 = max(min(min( - CalL1MaxLen(l1Mfirst.bL1Size - biasSize - dequantSize, l1Mfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), + CalL1MaxLen(min(l1Mfirst.bL1Size - biasSize - dequantSize, b1Length), l1Mfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), l1Mfirst.maxNBL1), nRepeat), 1); @@ -900,8 +924,9 @@ void MatmulTilingAlgorithm::NeitherFullLoadMN(const CoreStatusPack& coreStatus, l1Nfirst.aL1Size = MathUtil::Align(l1Nfirst.kAL1, kaAlignValue) * l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Nfirst.dbAL1; l1Nfirst.bL1Size = tilingIns_->bufferPool_.l1Size - l1Nfirst.aL1Size; + b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Nfirst, l0Status); l1Nfirst.nBL1 = max(min(min( - CalL1MaxLen(l1Nfirst.bL1Size - biasSize - dequantSize, l1Nfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), + CalL1MaxLen(min(l1Nfirst.bL1Size - biasSize - dequantSize, b1Length), l1Nfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), l1Nfirst.maxNBL1), nRepeat), 1); @@ -909,9 +934,10 @@ void MatmulTilingAlgorithm::NeitherFullLoadMN(const CoreStatusPack& coreStatus, l1Nfirst.bL1Size = MathUtil::Align(coreStatus.k, kbAlignValue) * l1Nfirst.nBL1 * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE * l1Nfirst.dbBL1; l1Nfirst.aL1Size = tilingIns_->bufferPool_.l1Size - l1Nfirst.bL1Size; + a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Nfirst, l0Status); biasSize = biasSize * l1Nfirst.nBL1; l1Nfirst.mAL1 = max(min(min( - CalL1MaxLen(l1Nfirst.aL1Size - biasSize - dequantSize, l1Nfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), + CalL1MaxLen(min(l1Nfirst.aL1Size - biasSize - dequantSize, a1Length), l1Nfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), l1Nfirst.maxMAL1), mRepeat), 1); @@ -976,7 +1002,8 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforNZ(const CoreStatusPack& coreStat l1Status.bL1Size = MathUtil::Align(coreStatus.k, kbAlignValue) * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbBL1; l1Status.aL1Size = tilingIns_->bufferPool_.l1Size - l1Status.bL1Size; - l1Status.kAL1 = min(CalL1MaxLen(l1Status.aL1Size - biasSize - dequantSize, l1Status, l0Status, kaAlignValue, + int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Status, l0Status); + l1Status.kAL1 = min(CalL1MaxLen(min(l1Status.aL1Size - biasSize - dequantSize, a1Length), l1Status, l0Status, kaAlignValue, L1TilingType::KAL1_16), coreStatus.k); l1Status.aL1Times = max(min(l1Status.kAL1 / l0Status.kL0, l1Status.maxKAL1), 1); @@ -984,7 +1011,7 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforNZ(const CoreStatusPack& coreStat l1Status.kAL1 = l1Status.aL1Times * l0Status.kL0; } else { // when NeitherFullLoadMN change the nBL1 and mAL1 - int32_t perK = min((tilingIns_->bufferPool_.l1Size - biasSize - dequantSize) / + int32_t perK = min(min(tilingIns_->bufferPool_.l1Size - biasSize - dequantSize, tilingIns_->bufferPool_.ubSize) / (l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1 + C0_SIZE * l0Status.nL0 * C0_BYTE_SIZE * l1Status.dbBL1) / l0Status.kL0 * l0Status.kL0, @@ -1047,26 +1074,28 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforND(const CoreStatusPack& coreStat l1Status.bL1Size = l1Status.kBL1 * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbBL1; } l1Status.aL1Size = tilingIns_->bufferPool_.l1Size - l1Status.bL1Size; + int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Status, l0Status); auto factor = l1Status.mAL1 * l0Status.mL0 * C0_SIZE * l1Status.dbAL1 * C0_BYTE_SIZE; - l1Status.kAL1 = (factor == 0) ? coreStatus.k : min((l1Status.aL1Size - biasSize - dequantSize) / factor, + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, coreStatus.k); l1Status.aL1Times = max(l1Status.kAL1 / l0Status.kL0, 1); GetNearestFactor(l1Status.allTimes, l1Status.aL1Times); // tik-mm support no factor ----ncheck l1Status.kAL1 = l1Status.aL1Times * l0Status.kL0; l1Status.aL1Size = l1Status.kAL1 * l1Status.mAL1 * l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1; l1Status.bL1Size = tilingIns_->bufferPool_.l1Size - l1Status.aL1Size; + int32_t b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Status, l0Status); if ((tilingIns_->bType_.dataType == DataType::DT_FLOAT) || (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_INT8)) { - l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / + l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * alignK), coreStatus.k); } else if (!tilingIns_->bType_.isTrans && (tilingIns_->bType_.dataType == DataType::DT_INT8 || tilingIns_->bType_.dataType == DataType::DT_INT4)) { - l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / + l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / (alignN * l0Status.nL0 * l1Status.dbBL1 * alignK), coreStatus.k); } else { - l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize)/ + l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length)/ (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), coreStatus.k); } @@ -1087,7 +1116,8 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforND(const CoreStatusPack& coreStat } l1Status.bL1Size = tilingIns_->bufferPool_.l1Size - l1Status.aL1Size; - l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / + int32_t b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Status, l0Status); + l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), coreStatus.k); l1Status.bL1Times = max(l1Status.kBL1 / l0Status.kL0, 1); @@ -1095,19 +1125,20 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforND(const CoreStatusPack& coreStat l1Status.kBL1 = l1Status.bL1Times * l0Status.kL0; l1Status.bL1Size = l1Status.kBL1 * l1Status.nBL1 * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbBL1; l1Status.aL1Size = tilingIns_->bufferPool_.l1Size - l1Status.bL1Size; + int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Status, l0Status); if ((tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_FLOAT) || (!tilingIns_->aType_.isTrans && (tilingIns_->aType_.dataType == DataType::DT_INT8 || tilingIns_->aType_.dataType == DataType::DT_INT4))) { auto factor = l1Status.mAL1 * l0Status.mL0 * C0_SIZE * l1Status.dbAL1 * alignK; - l1Status.kAL1 = (factor == 0) ? coreStatus.k : min((l1Status.aL1Size - biasSize - dequantSize) / factor, + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, coreStatus.k); } else if (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_INT8) { - l1Status.kAL1 = min((l1Status.aL1Size - biasSize - dequantSize) / + l1Status.kAL1 = min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / (alignM * l0Status.mL0 * l1Status.dbAL1 * alignK), coreStatus.k); l1Status.aL1Size = l1Status.kAL1 * alignM * l0Status.mL0 * alignK * l1Status.dbAL1; } else { auto factor = l1Status.mAL1 * l0Status.mL0 * C0_SIZE * l1Status.dbAL1 * C0_BYTE_SIZE; - l1Status.kAL1 = (factor == 0) ? coreStatus.k : min((l1Status.aL1Size - biasSize - dequantSize) / factor, + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, coreStatus.k); } l1Status.aL1Times = max(min(l1Status.kAL1 / l0Status.kL0, l1Status.maxKAL1), 1); @@ -1157,7 +1188,7 @@ void MatmulTilingAlgorithm::NeitherFullLoadK(const CoreStatusPack& coreStatus, c void MatmulTilingAlgorithm::L1StatusNeitherFullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const { - // if martix is in L1,then matrix must be full loaded,skip non-full loaded branch + // if b martix in L1,then b matrix must full load,skip non-full process if (tilingIns_->aType_.pos == TPosition::TSCM || tilingIns_->bType_.pos == TPosition::TSCM) { return; } @@ -1304,9 +1335,9 @@ void MatmulTilingAlgorithm::GetUsedSize(int32_t& l1Size, int32_t& l0cSize, int32 if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P || tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { - // case2: Non-aligned tail block in ub need to be filled with 0s, such constraint applies to data which is: - // (1) GM input, ND format - // (2) VECCALC input, ND format + // case2: input ND(GM/VECCALC), ND -> NZ transform, for now A/B reuse, only process with tail block, need UB space + // (1) input GM, format is ND, need do zero-fill to non-aligned tail block in ub + // (2) input VECCALC, format is ND, need do zero-fill to non-aligned tail block in ub int32_t aUbLength = 0; int32_t bUbLength = 0; if (!tilingIns_->aType_.isTrans && ((tilingIns_->tiling_.get_singleCoreK() * aTypeSize / BITS_PER_BYTE) % C0_BYTE_SIZE != 0)) { @@ -1361,6 +1392,62 @@ void MatmulTilingAlgorithm::GetUsedSize(int32_t& l1Size, int32_t& l0cSize, int32 return; } +void MatmulTilingAlgorithm::GetBankConflictSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status, + int32_t& length, bool isAMatrix) const +{ + constexpr int blockSize = 32; + constexpr int bankLen = 512; + bool isBankConflict = false; + int bankConflictSize = 0; + const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + if (isAMatrix) { + if (tilingIns_->aType_.isTrans) { + isBankConflict = + MathUtil::CeilDivision(l1Status.mAL1 * l0Status.mL0 * C0_SIZE, C0_SIZE) * + blockSize % bankLen == + 0 ? + true : + false; + bankConflictSize = l0Status.kL0 * reduceSize * C0_SIZE * + MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + } else { + isBankConflict = + MathUtil::CeilDivision(MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l0Status.kL0 * reduceSize, + C0_SIZE) * blockSize % bankLen == + 0 ? + true : + false; + bankConflictSize = l0Status.mL0 * C0_SIZE * C0_SIZE * l1Status.mAL1 * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + } + } else { + if (tilingIns_->bType_.isTrans) { + isBankConflict = + MathUtil::CeilDivision(MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l0Status.kL0 * reduceSize, + C0_SIZE) * blockSize % bankLen == + 0 ? + true : + false; + bankConflictSize = l0Status.nL0 * C0_SIZE * C0_SIZE * l1Status.nBL1 * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + } else { + isBankConflict = + MathUtil::CeilDivision(l1Status.nBL1 * l0Status.nL0 * C0_SIZE, C0_SIZE) * + blockSize % bankLen == + 0 ? + true : + false; + bankConflictSize = l0Status.kL0 * reduceSize * C0_SIZE * + MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + } + } + if (isBankConflict) { + length = length + bankConflictSize; + } +} + void MatmulTilingAlgorithm::GetBankConflictSize(int32_t& length, bool isAMatrix) const { constexpr int blockSize = 32; @@ -1413,6 +1500,55 @@ void MatmulTilingAlgorithm::GetBankConflictSize(int32_t& length, bool isAMatrix) } } +int32_t MatmulTilingAlgorithm::GetAL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const +{ + int32_t a1Length = 0; + const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + if (IsUbNd2Nz()) { + // A matrix ND2NZ + if (tilingIns_->aType_.type == CubeFormat::ND) { + a1Length = l0Status.mL0 * C0_SIZE * l0Status.kL0 * reduceSize * + DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) / BITS_PER_BYTE; + if (tilingIns_->mmConfigType == 1) { + a1Length = a1Length * MathUtil::CeilDivision(l1Status.kAL1, l0Status.kL0) * l1Status.mAL1; + } + // bank conflict + GetBankConflictSize(l1Status, l0Status, a1Length, true); + } + } + return a1Length; +} + +int32_t MatmulTilingAlgorithm::GetBL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const +{ + int32_t b1Length = 0; + const int32_t reduceSize = static_cast(C0_BYTE_SIZE / DTYPE_BIT_TAB.at(tilingIns_->aType_.dataType) * BITS_PER_BYTE); + if (IsUbNd2Nz()) { + // B matrix ND2NZ + if (tilingIns_->bType_.type == CubeFormat::ND) { + b1Length = l0Status.nL0 * C0_SIZE * l0Status.kL0 * reduceSize * + DTYPE_BIT_TAB.at(tilingIns_->bType_.dataType) / BITS_PER_BYTE; + if (tilingIns_->mmConfigType == 1) { + b1Length = b1Length * MathUtil::CeilDivision(l1Status.kBL1, l0Status.kL0) * l1Status.nBL1; + } + // bank conflict + GetBankConflictSize(l1Status, l0Status, b1Length, false); + } + } + return b1Length; +} + +bool MatmulTilingAlgorithm::IsUbNd2Nz() const +{ + if (tilingIns_->enVecND2NZ && tilingIns_->mmConfigType == 1 && + (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P || + tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B)) { + return true; + } + return false; +} + void MatmulTilingAlgorithm::GetTransLength(int32_t& transLength) const { int32_t a1Length = 0; @@ -1537,7 +1673,7 @@ void MatmulTilingAlgorithm::CalcLoadSize(const DimFactor& blockDims, const CoreS constexpr int32_t minTotalSize = 128; const int32_t n0 = min(minMNSize, coreStatus.n); // need check m,n > 16 or m,n<16 const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0))); - const int32_t k0 = (m0 != 0 && n0 != 0) ? + const int32_t k0 = (m0 != 0 && n0 != 0) ? min(min(minKSize / m0, minKSize / n0), coreStatus.k) : coreStatus.k; const int32_t dbBuffer = 2; @@ -2177,6 +2313,31 @@ void MatmulTilingAlgorithm::SetDepthL1CacheUBParams(int32_t &a1LengthCache, int3 } } +int MatmulTilingAlgorithm::UpdateDepthB1(const SingleCoreStatus& singleCoreStatus) const +{ + int depthB1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0) * + singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1; + // only bType is f32 need update + if (tilingIns_->bType_.dataType != DataType::DT_FLOAT + || tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B) { + return depthB1; + } + uint16_t alignedBaseK = MathUtil::CeilDivision(tilingIns_->baseK, FP32_ALIGN_SIZE) * FP32_ALIGN_SIZE; + uint16_t alignedBaseKN = alignedBaseK * tilingIns_->baseN; + + uint16_t alignedBaseKM = tilingIns_->baseK * tilingIns_->baseM; + if (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_FLOAT) { + alignedBaseKM = alignedBaseK * tilingIns_->baseM; + } + // if L1 size is overflow, decrease depthB1 + if ((tilingIns_->tiling_.get_depthA1() *alignedBaseKM + alignedBaseKN * depthB1) * sizeof(float) + > tilingIns_->bufferPool_.l1Size) { + depthB1 = tilingIns_->baseN * tilingIns_->baseK * depthB1 / alignedBaseKN; + depthB1 = depthB1 < 1 ? 1 : depthB1; + } + return depthB1; +} + int64_t MatmulTilingAlgorithm::Process() { PreprocessL0DB(); @@ -2218,7 +2379,7 @@ int64_t MatmulTilingAlgorithm::Process() return 1; } } - // logic of single core + // single-core logic GetL0Factors(opType, param, coreStatus, singleCoreStatus); if (singleCoreStatus.l0Status.mL0 == 0 || singleCoreStatus.l0Status.nL0 == 0 || singleCoreStatus.l0Status.kL0 == 0) { @@ -2268,7 +2429,7 @@ int64_t MatmulTilingAlgorithm::Process() if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { - // only 32B alignment can be processed on the UB, it can not pad when format is ND and D non-aligned. + // ub only can process with 32B aligned, if format is ND, and D non-aligned output can't pad if (tilingIns_->cType_.pos == TPosition::VECCALC && tilingIns_->cType_.type == CubeFormat::ND && (singleCoreN * DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType)) % C0_BYTE_SIZE != 0) { TILING_LOG_INFO("for ascend310p/ascend910, when matrix c pos is VECCACL and singleCoreN is not 32B " @@ -2315,9 +2476,8 @@ int64_t MatmulTilingAlgorithm::Process() tilingIns_->tiling_.set_depthA1( MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0) * singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l1Status.dbAL1); - tilingIns_->tiling_.set_depthB1( - MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0) * - singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1); + int newDepthB1 = UpdateDepthB1(singleCoreStatus); + tilingIns_->tiling_.set_depthB1(newDepthB1); tilingIns_->tiling_.set_stepM(singleCoreStatus.l1Status.mAL1); tilingIns_->tiling_.set_stepN(singleCoreStatus.l1Status.nBL1); tilingIns_->tiling_.set_stepKa( @@ -2338,7 +2498,7 @@ int64_t MatmulTilingAlgorithm::Process() int32_t a1LengthCache = 0; int32_t b1LengthCache = 0; SetDepthL1CacheUBParams(a1LengthCache, b1LengthCache); - tilingIns_->tiling_.set_transLength(transLength); // a1 b1 c1 reuse on ub + tilingIns_->tiling_.set_transLength(transLength); // a1 b1 c1 reuse in ub tilingIns_->tiling_.set_shareMode(0); tilingIns_->tiling_.set_dbL0A(singleCoreStatus.l0Status.dbL0A); tilingIns_->tiling_.set_dbL0B(singleCoreStatus.l0Status.dbL0B); @@ -2378,4 +2538,4 @@ int64_t MatmulTilingAlgorithm::Process() const bool ans = CheckFinaleParams(coreStatus); return ans ? 0 : -1; } -} // namespace matmul_tiling \ No newline at end of file +} // namespace matmul_tiling diff --git a/impl/matmul/matmul_tiling_algorithm.h b/impl/matmul/matmul_tiling_algorithm.h index 5937c096..9ef13506 100644 --- a/impl/matmul/matmul_tiling_algorithm.h +++ b/impl/matmul/matmul_tiling_algorithm.h @@ -354,6 +354,10 @@ private: void GetUsedSize(int32_t& l1Size, int32_t& l0cSize, int32_t& ubSize, int32_t a1LengthCache, int32_t b1LengthCache) const; void GetBankConflictSize(int32_t& length, bool isAMatrix) const; + void GetBankConflictSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status, int32_t& length, bool isAMatrix) const; + int32_t GetAL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const; + int32_t GetBL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const; + bool IsUbNd2Nz() const; void GetTransLength(int32_t& transLength) const; void SetDepthL1CacheUBParams(int32_t &a1LengthCache, int32_t &b1LengthCache) const; void GetABL1KAlignValue(int32_t& kaAlignValue, int32_t& kbAlignValue) const; diff --git a/impl/matmul/matmul_tiling_base.cpp b/impl/matmul/matmul_tiling_base.cpp index 2922ac27..64d228ab 100644 --- a/impl/matmul/matmul_tiling_base.cpp +++ b/impl/matmul/matmul_tiling_base.cpp @@ -592,10 +592,12 @@ void MatmulApiTilingBase::SetMatmulConfigParams(const MatmulConfigParams& config TILING_LOG_DEBUG("Set EnableL1CacheUB: %d", static_cast(configParams.enableL1CacheUB)); TILING_LOG_DEBUG("Set ScheduleType: %d", static_cast(configParams.scheduleType)); TILING_LOG_DEBUG("Set Traverse: %d", static_cast(configParams.traverse)); + TILING_LOG_DEBUG("Set EnVecND2NZ: %d", static_cast(configParams.enVecND2NZ)); this->mmConfigType = configParams.mmConfigType; this->enableL1CacheUB = configParams.enableL1CacheUB; this->scheduleType = configParams.scheduleType; this->traverse_ = configParams.traverse; + this->enVecND2NZ = configParams.enVecND2NZ; } bool MatmulApiTilingBase::CheckSetParam() diff --git a/lib/matmul/matmul_tiling_base.h b/lib/matmul/matmul_tiling_base.h index 12c5f1f1..b2d418f9 100644 --- a/lib/matmul/matmul_tiling_base.h +++ b/lib/matmul/matmul_tiling_base.h @@ -176,10 +176,20 @@ struct PlatformInfo { }; struct MatmulConfigParams { - int32_t mmConfigType = 1; - bool enableL1CacheUB = false; - ScheduleType scheduleType = ScheduleType::INNER_PRODUCT; - MatrixTraverse traverse = MatrixTraverse::NOSET; + int32_t mmConfigType; + bool enableL1CacheUB; + ScheduleType scheduleType; + MatrixTraverse traverse; + bool enVecND2NZ; + MatmulConfigParams(int32_t mmConfigTypeIn = 1, bool enableL1CacheUBIn = false, + ScheduleType scheduleTypeIn = ScheduleType::INNER_PRODUCT, MatrixTraverse traverseIn = MatrixTraverse::NOSET, + bool enVecND2NZIn = false) { + mmConfigType = mmConfigTypeIn; + enableL1CacheUB = enableL1CacheUBIn; + scheduleType = scheduleTypeIn; + traverse = traverseIn; + enVecND2NZ = enVecND2NZIn; + } }; class MatmulApiTilingBase { -- Gitee From 9cb3216958672a1927dfa8e268cf30201485e268 Mon Sep 17 00:00:00 2001 From: qiaolili Date: Fri, 22 Nov 2024 09:44:11 +0800 Subject: [PATCH 02/13] fix ND2NZ error --- impl/matmul/matmul_tiling_algorithm.cpp | 63 ++++++++----------------- lib/matmul/matmul_tiling_base.h | 1 + 2 files changed, 20 insertions(+), 44 deletions(-) diff --git a/impl/matmul/matmul_tiling_algorithm.cpp b/impl/matmul/matmul_tiling_algorithm.cpp index 22081645..c49fac17 100644 --- a/impl/matmul/matmul_tiling_algorithm.cpp +++ b/impl/matmul/matmul_tiling_algorithm.cpp @@ -377,12 +377,12 @@ void MatmulTilingAlgorithm::GetL0FactorsCand(L0Factors& resFactors, const CoreSt maxN = tilingIns_->bufferPool_.btSize / C0_SIZE / FP32_BYTES / l0Status.dbL0C; } if (l0Status.maxAxisIdx != 0) { - // Major is n0 axis + // n0-major if ((majorDimFactor > maxN0) && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { continue; } } else { - // Major is m0 axis + // m0-major if ((minorDimFactor > maxN0) && tilingIns_->isSupportL0c2Out && tilingIns_->isBias) { continue; } @@ -662,7 +662,7 @@ void MatmulTilingAlgorithm::GetNearestFactor(const int32_t& base, int32_t& facto void MatmulTilingAlgorithm::L1StatusAl1FullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const { - // if b martix in L1,then b matrix must full load,goto b matrix full load patch + // if b martix is in L1,then b matrix must be full loaded,goto b matrix full loaded process branch if (tilingIns_->bType_.pos == TPosition::TSCM) { return; } @@ -736,7 +736,7 @@ void MatmulTilingAlgorithm::L1StatusAl1FullLoad(const CoreStatusPack& coreStatus void MatmulTilingAlgorithm::L1StatusBl1FullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const { - // if a martix in L1,then a matrix must full load,goto a matrix full load patch + // if a martix is in L1,then a matrix must be full loaded,goto a matrix full loaded process branch if (tilingIns_->aType_.pos == TPosition::TSCM) { return; } @@ -934,8 +934,8 @@ void MatmulTilingAlgorithm::NeitherFullLoadMN(const CoreStatusPack& coreStatus, l1Nfirst.bL1Size = MathUtil::Align(coreStatus.k, kbAlignValue) * l1Nfirst.nBL1 * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE * l1Nfirst.dbBL1; l1Nfirst.aL1Size = tilingIns_->bufferPool_.l1Size - l1Nfirst.bL1Size; - a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Nfirst, l0Status); biasSize = biasSize * l1Nfirst.nBL1; + a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Nfirst, l0Status); l1Nfirst.mAL1 = max(min(min( CalL1MaxLen(min(l1Nfirst.aL1Size - biasSize - dequantSize, a1Length), l1Nfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), l1Nfirst.maxMAL1), @@ -1095,7 +1095,7 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforND(const CoreStatusPack& coreStat (alignN * l0Status.nL0 * l1Status.dbBL1 * alignK), coreStatus.k); } else { - l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length)/ + l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), coreStatus.k); } @@ -1138,7 +1138,7 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforND(const CoreStatusPack& coreStat l1Status.aL1Size = l1Status.kAL1 * alignM * l0Status.mL0 * alignK * l1Status.dbAL1; } else { auto factor = l1Status.mAL1 * l0Status.mL0 * C0_SIZE * l1Status.dbAL1 * C0_BYTE_SIZE; - l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, coreStatus.k); } l1Status.aL1Times = max(min(l1Status.kAL1 / l0Status.kL0, l1Status.maxKAL1), 1); @@ -1188,7 +1188,7 @@ void MatmulTilingAlgorithm::NeitherFullLoadK(const CoreStatusPack& coreStatus, c void MatmulTilingAlgorithm::L1StatusNeitherFullLoad(const CoreStatusPack& coreStatus, const L0StatusPack& l0Status, L1StatusPack& l1Status, int32_t res[][IDX_SEVEN]) const { - // if b martix in L1,then b matrix must full load,skip non-full process + // if martix is in L1,then matrix must be full loaded,skip non-full loaded branch if (tilingIns_->aType_.pos == TPosition::TSCM || tilingIns_->bType_.pos == TPosition::TSCM) { return; } @@ -1335,9 +1335,9 @@ void MatmulTilingAlgorithm::GetUsedSize(int32_t& l1Size, int32_t& l0cSize, int32 if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P || tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310B) { - // case2: input ND(GM/VECCALC), ND -> NZ transform, for now A/B reuse, only process with tail block, need UB space - // (1) input GM, format is ND, need do zero-fill to non-aligned tail block in ub - // (2) input VECCALC, format is ND, need do zero-fill to non-aligned tail block in ub + // case2: Non-aligned tail block in ub need to be filled with 0s, such constraint applies to data which is: + // (1) GM input, ND format + // (2) VECCALC input, ND format int32_t aUbLength = 0; int32_t bUbLength = 0; if (!tilingIns_->aType_.isTrans && ((tilingIns_->tiling_.get_singleCoreK() * aTypeSize / BITS_PER_BYTE) % C0_BYTE_SIZE != 0)) { @@ -1499,7 +1499,6 @@ void MatmulTilingAlgorithm::GetBankConflictSize(int32_t& length, bool isAMatrix) length = length + bankConflictSize; } } - int32_t MatmulTilingAlgorithm::GetAL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const { int32_t a1Length = 0; @@ -1673,7 +1672,7 @@ void MatmulTilingAlgorithm::CalcLoadSize(const DimFactor& blockDims, const CoreS constexpr int32_t minTotalSize = 128; const int32_t n0 = min(minMNSize, coreStatus.n); // need check m,n > 16 or m,n<16 const int32_t m0 = min(minMNSize, ((n0 == 0) ? 0 : min(coreStatus.m, minTotalSize / n0))); - const int32_t k0 = (m0 != 0 && n0 != 0) ? + const int32_t k0 = (m0 != 0 && n0 != 0) ? min(min(minKSize / m0, minKSize / n0), coreStatus.k) : coreStatus.k; const int32_t dbBuffer = 2; @@ -2313,31 +2312,6 @@ void MatmulTilingAlgorithm::SetDepthL1CacheUBParams(int32_t &a1LengthCache, int3 } } -int MatmulTilingAlgorithm::UpdateDepthB1(const SingleCoreStatus& singleCoreStatus) const -{ - int depthB1 = MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0) * - singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1; - // only bType is f32 need update - if (tilingIns_->bType_.dataType != DataType::DT_FLOAT - || tilingIns_->socVersion != platform_ascendc::SocVersion::ASCEND910B) { - return depthB1; - } - uint16_t alignedBaseK = MathUtil::CeilDivision(tilingIns_->baseK, FP32_ALIGN_SIZE) * FP32_ALIGN_SIZE; - uint16_t alignedBaseKN = alignedBaseK * tilingIns_->baseN; - - uint16_t alignedBaseKM = tilingIns_->baseK * tilingIns_->baseM; - if (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_FLOAT) { - alignedBaseKM = alignedBaseK * tilingIns_->baseM; - } - // if L1 size is overflow, decrease depthB1 - if ((tilingIns_->tiling_.get_depthA1() *alignedBaseKM + alignedBaseKN * depthB1) * sizeof(float) - > tilingIns_->bufferPool_.l1Size) { - depthB1 = tilingIns_->baseN * tilingIns_->baseK * depthB1 / alignedBaseKN; - depthB1 = depthB1 < 1 ? 1 : depthB1; - } - return depthB1; -} - int64_t MatmulTilingAlgorithm::Process() { PreprocessL0DB(); @@ -2379,7 +2353,7 @@ int64_t MatmulTilingAlgorithm::Process() return 1; } } - // single-core logic + // logic of single core GetL0Factors(opType, param, coreStatus, singleCoreStatus); if (singleCoreStatus.l0Status.mL0 == 0 || singleCoreStatus.l0Status.nL0 == 0 || singleCoreStatus.l0Status.kL0 == 0) { @@ -2429,7 +2403,7 @@ int64_t MatmulTilingAlgorithm::Process() if (tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND910 || tilingIns_->socVersion == platform_ascendc::SocVersion::ASCEND310P) { - // ub only can process with 32B aligned, if format is ND, and D non-aligned output can't pad + // only 32B alignment can be processed on the UB, it can not pad when format is ND and D non-aligned. if (tilingIns_->cType_.pos == TPosition::VECCALC && tilingIns_->cType_.type == CubeFormat::ND && (singleCoreN * DTYPE_BYTE_TAB.at(tilingIns_->cType_.dataType)) % C0_BYTE_SIZE != 0) { TILING_LOG_INFO("for ascend310p/ascend910, when matrix c pos is VECCACL and singleCoreN is not 32B " @@ -2476,8 +2450,9 @@ int64_t MatmulTilingAlgorithm::Process() tilingIns_->tiling_.set_depthA1( MathUtil::CeilDivision(singleCoreStatus.l1Status.kAL1, singleCoreStatus.l0Status.kL0) * singleCoreStatus.l1Status.mAL1 * singleCoreStatus.l1Status.dbAL1); - int newDepthB1 = UpdateDepthB1(singleCoreStatus); - tilingIns_->tiling_.set_depthB1(newDepthB1); + tilingIns_->tiling_.set_depthB1( + MathUtil::CeilDivision(singleCoreStatus.l1Status.kBL1, singleCoreStatus.l0Status.kL0) * + singleCoreStatus.l1Status.nBL1 * singleCoreStatus.l1Status.dbBL1); tilingIns_->tiling_.set_stepM(singleCoreStatus.l1Status.mAL1); tilingIns_->tiling_.set_stepN(singleCoreStatus.l1Status.nBL1); tilingIns_->tiling_.set_stepKa( @@ -2498,7 +2473,7 @@ int64_t MatmulTilingAlgorithm::Process() int32_t a1LengthCache = 0; int32_t b1LengthCache = 0; SetDepthL1CacheUBParams(a1LengthCache, b1LengthCache); - tilingIns_->tiling_.set_transLength(transLength); // a1 b1 c1 reuse in ub + tilingIns_->tiling_.set_transLength(transLength); // a1 b1 c1 reuse on ub tilingIns_->tiling_.set_shareMode(0); tilingIns_->tiling_.set_dbL0A(singleCoreStatus.l0Status.dbL0A); tilingIns_->tiling_.set_dbL0B(singleCoreStatus.l0Status.dbL0B); @@ -2538,4 +2513,4 @@ int64_t MatmulTilingAlgorithm::Process() const bool ans = CheckFinaleParams(coreStatus); return ans ? 0 : -1; } -} // namespace matmul_tiling +} // namespace matmul_tiling \ No newline at end of file diff --git a/lib/matmul/matmul_tiling_base.h b/lib/matmul/matmul_tiling_base.h index b2d418f9..50f3c1a9 100644 --- a/lib/matmul/matmul_tiling_base.h +++ b/lib/matmul/matmul_tiling_base.h @@ -327,6 +327,7 @@ public: platform_ascendc::SocVersion socVersion = platform_ascendc::SocVersion::ASCEND910B; int32_t mmConfigType = 1; // 0: Norm; 1: MDL bool enableL1CacheUB = false; + bool enVecND2NZ = false; protected: virtual int64_t Compute() = 0; -- Gitee From cd8edc2bda213344b7ccfa8d86dcf92aa91726e1 Mon Sep 17 00:00:00 2001 From: qiaolili Date: Fri, 22 Nov 2024 09:49:11 +0800 Subject: [PATCH 03/13] fix ND2NZ error --- impl/matmul/matmul_tiling_algorithm.cpp | 1 + tests/tiling/test_tiling.cpp | 72 +++++++++++++++++++++++++ 2 files changed, 73 insertions(+) diff --git a/impl/matmul/matmul_tiling_algorithm.cpp b/impl/matmul/matmul_tiling_algorithm.cpp index c49fac17..f328fa8b 100644 --- a/impl/matmul/matmul_tiling_algorithm.cpp +++ b/impl/matmul/matmul_tiling_algorithm.cpp @@ -1499,6 +1499,7 @@ void MatmulTilingAlgorithm::GetBankConflictSize(int32_t& length, bool isAMatrix) length = length + bankConflictSize; } } + int32_t MatmulTilingAlgorithm::GetAL1UbSize(const L1StatusPack& l1Status, const L0StatusPack& l0Status) const { int32_t a1Length = 0; diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index c72d121b..cc94403a 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -50,6 +50,78 @@ TEST_F(TestTiling, MultiCoreSmallMN) EXPECT_EQ(ret, 0); } +TEST_F(TestTiling, MatmulMDL1951ND2NZNoFullLoad) +{ + matmul_tiling::MultiCoreMatmulTiling rnnMatmul3,rnnMatmul4,rnnMatmul5; + rnnMatmul3.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8, true); + rnnMatmul3.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::NZ, matmul_tiling::DataType ::DT_INT8); + rnnMatmul3.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT16); + rnnMatmul3.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT32); + auto ret = rnnMatmul3.EnableBias(true); + ret = rnnMatmul3.SetDim(8); + ret = rnnMatmul3.SetOrgShape(1024, 5120, 3584); + ret = rnnMatmul3.SetShape(1024, 5120, 3584); + ret = rnnMatmul3.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified + rnnMatmul3.SetMatmulConfigParams({1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true}); + rnnMatmul3.socVersion = platform_ascendc::SocVersion::ASCEND310P; + optiling::TCubeTiling tilingDataA; + ret = rnnMatmul3.GetTiling(tilingDataA); + rnnMatmul3.PrintTilingData(); + EXPECT_EQ(ret, 0); + + rnnMatmul4.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8); + rnnMatmul4.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8, true); + rnnMatmul4.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT16); + rnnMatmul4.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT32); + ret = rnnMatmul4.EnableBias(true); + ret = rnnMatmul4.SetDim(8); + ret = rnnMatmul4.SetOrgShape(5120, 1024, 3584); + ret = rnnMatmul4.SetShape(5120, 1024, 3584); + ret = rnnMatmul4.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified + rnnMatmul4.SetMatmulConfigParams({1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true}); + rnnMatmul4.socVersion = platform_ascendc::SocVersion::ASCEND310P; + optiling::TCubeTiling tilingDataB; + ret = rnnMatmul4.GetTiling(tilingDataB); + rnnMatmul4.PrintTilingData(); + EXPECT_EQ(ret, 0); +} + +TEST_F(TestTiling, MatmulMDL1951ND2NZFullLoad) +{ + matmul_tiling::MultiCoreMatmulTiling rnnMatmul3,rnnMatmul4; + rnnMatmul3.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8); + rnnMatmul3.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8, true); + rnnMatmul3.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT16); + rnnMatmul3.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT32); + auto ret = rnnMatmul3.EnableBias(true); + ret = rnnMatmul3.SetDim(8); + ret = rnnMatmul3.SetOrgShape(32, 2048, 64); + ret = rnnMatmul3.SetShape(32, 2048, 64); + ret = rnnMatmul3.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified + rnnMatmul3.SetMatmulConfigParams({1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true}); + rnnMatmul3.socVersion = platform_ascendc::SocVersion::ASCEND310P; + optiling::TCubeTiling tilingDataA; + ret = rnnMatmul3.GetTiling(tilingDataA); + rnnMatmul3.PrintTilingData(); + EXPECT_EQ(ret, 0); + + rnnMatmul4.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8, true); + rnnMatmul4.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8); + rnnMatmul4.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT16); + rnnMatmul4.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT32); + ret = rnnMatmul4.EnableBias(true); + ret = rnnMatmul4.SetDim(8); + ret = rnnMatmul4.SetOrgShape(2048, 32, 64); + ret = rnnMatmul4.SetShape(2048, 32, 64); + ret = rnnMatmul4.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified + rnnMatmul4.SetMatmulConfigParams({1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true}); + rnnMatmul4.socVersion = platform_ascendc::SocVersion::ASCEND310P; + optiling::TCubeTiling tilingDataB; + ret = rnnMatmul4.GetTiling(tilingDataB); + rnnMatmul4.PrintTilingData(); + EXPECT_EQ(ret, 0); +} + TEST_F(TestTiling, MatmulApiTilingFP32) { matmul_tiling::MatmulApiTiling stft; -- Gitee From eecd6da3fcba9f1ba1a6610f9cabbc24442b6ba3 Mon Sep 17 00:00:00 2001 From: qiaolili Date: Fri, 22 Nov 2024 09:51:41 +0800 Subject: [PATCH 04/13] fix ND2NZ error --- tests/tiling/test_tiling.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index cc94403a..52c525ff 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -433,8 +433,8 @@ TEST_F(TestTiling, L1CacheUBCase01NoCache) int ret = tiling.GetTiling(tilingData); tiling.PrintTilingData(); EXPECT_EQ(ret, 0); - EXPECT_EQ(tilingData.get_depthAL1CacheUB(), 0); - EXPECT_EQ(tilingData.get_depthBL1CacheUB(), 0); + EXPECT_EQ(tilingData.get_depthAL1CacheUB(), 1); + EXPECT_EQ(tilingData.get_depthBL1CacheUB(), 1); } TEST_F(TestTiling, L1CacheUBCase02NeiABFullLoad) @@ -454,8 +454,8 @@ TEST_F(TestTiling, L1CacheUBCase02NeiABFullLoad) int ret = tiling.GetTiling(tilingData); tiling.PrintTilingData(); EXPECT_EQ(ret, 0); - EXPECT_EQ(tilingData.get_depthAL1CacheUB(), 0); - EXPECT_EQ(tilingData.get_depthBL1CacheUB(), 3); + EXPECT_EQ(tilingData.get_depthAL1CacheUB(), 1); + EXPECT_EQ(tilingData.get_depthBL1CacheUB(), 4); } TEST_F(TestTiling, L1CacheUBCase03BothABFullLoad) -- Gitee From 56f73a8f7384aad34142de91ab77dafd44197d0f Mon Sep 17 00:00:00 2001 From: qiaolili Date: Fri, 13 Dec 2024 15:11:11 +0800 Subject: [PATCH 05/13] fix ND2NZ error --- impl/matmul/matmul_tiling_algorithm.cpp | 99 +++++++++++++++++++++---- lib/matmul/matmul_tiling_base.h | 4 +- 2 files changed, 89 insertions(+), 14 deletions(-) diff --git a/impl/matmul/matmul_tiling_algorithm.cpp b/impl/matmul/matmul_tiling_algorithm.cpp index f328fa8b..710d3d67 100644 --- a/impl/matmul/matmul_tiling_algorithm.cpp +++ b/impl/matmul/matmul_tiling_algorithm.cpp @@ -902,20 +902,35 @@ void MatmulTilingAlgorithm::NeitherFullLoadMN(const CoreStatusPack& coreStatus, l1Mfirst.aL1Size = tilingIns_->bufferPool_.l1Size - l1Mfirst.bL1Size; int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Mfirst, l0Status); l1Mfirst.mAL1 = max(min(min( - CalL1MaxLen(min(l1Mfirst.aL1Size - biasSize - dequantSize, a1Length), l1Mfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), + CalL1MaxLen(l1Mfirst.aL1Size - biasSize - dequantSize, l1Mfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), l1Mfirst.maxMAL1), mRepeat), 1); + if (IsUbNd2Nz()) { + l1Mfirst.mAL1 = max(min(min( + CalL1MaxLen(min(l1Mfirst.aL1Size - biasSize - dequantSize, a1Length), l1Mfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), + l1Mfirst.maxMAL1), + mRepeat), + 1); + } GetNearestFactor(mRepeat, l1Mfirst.mAL1); // tik-mm support no factor ----ncheck l1Mfirst.aL1Size = MathUtil::Align(l1Mfirst.kAL1, kaAlignValue) * l1Mfirst.mAL1 * l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Mfirst.dbAL1; l1Mfirst.bL1Size = tilingIns_->bufferPool_.l1Size - l1Mfirst.aL1Size; int32_t b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Mfirst, l0Status); l1Mfirst.nBL1 = max(min(min( - CalL1MaxLen(min(l1Mfirst.bL1Size - biasSize - dequantSize, b1Length), l1Mfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), + CalL1MaxLen(l1Mfirst.bL1Size - biasSize - dequantSize, l1Mfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), l1Mfirst.maxNBL1), nRepeat), 1); + if (IsUbNd2Nz()) { + l1Mfirst.nBL1 = max(min(min( + CalL1MaxLen(min(l1Mfirst.bL1Size - biasSize - dequantSize, b1Length), l1Mfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), + l1Mfirst.maxNBL1), + nRepeat), + 1); + } + GetNearestFactor(nRepeat, l1Mfirst.nBL1); l1Mfirst.loadSize = coreStatus.m + coreStatus.n * MathUtil::CeilDivision(coreStatus.m, l1Mfirst.mAL1 * l0Status.mL0); @@ -926,10 +941,17 @@ void MatmulTilingAlgorithm::NeitherFullLoadMN(const CoreStatusPack& coreStatus, l1Nfirst.bL1Size = tilingIns_->bufferPool_.l1Size - l1Nfirst.aL1Size; b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Nfirst, l0Status); l1Nfirst.nBL1 = max(min(min( - CalL1MaxLen(min(l1Nfirst.bL1Size - biasSize - dequantSize, b1Length), l1Nfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), + CalL1MaxLen(l1Nfirst.bL1Size - biasSize - dequantSize, l1Nfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), l1Nfirst.maxNBL1), nRepeat), 1); + if (IsUbNd2Nz()) { + l1Nfirst.nBL1 = max(min(min( + CalL1MaxLen(min(l1Nfirst.bL1Size - biasSize - dequantSize, b1Length), l1Nfirst, l0Status, kbAlignValue, L1TilingType::N_BL1), + l1Nfirst.maxNBL1), + nRepeat), + 1); + } GetNearestFactor(nRepeat, l1Nfirst.nBL1); l1Nfirst.bL1Size = MathUtil::Align(coreStatus.k, kbAlignValue) * l1Nfirst.nBL1 * l0Status.nL0 * C0_SIZE * C0_BYTE_SIZE * l1Nfirst.dbBL1; @@ -937,10 +959,17 @@ void MatmulTilingAlgorithm::NeitherFullLoadMN(const CoreStatusPack& coreStatus, biasSize = biasSize * l1Nfirst.nBL1; a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Nfirst, l0Status); l1Nfirst.mAL1 = max(min(min( - CalL1MaxLen(min(l1Nfirst.aL1Size - biasSize - dequantSize, a1Length), l1Nfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), + CalL1MaxLen(l1Nfirst.aL1Size - biasSize - dequantSize, l1Nfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), l1Nfirst.maxMAL1), mRepeat), 1); + if (IsUbNd2Nz()) { + l1Nfirst.mAL1 = max(min(min( + CalL1MaxLen(min(l1Nfirst.aL1Size - biasSize - dequantSize, a1Length), l1Nfirst, l0Status, kaAlignValue, L1TilingType::M_AL1), + l1Nfirst.maxMAL1), + mRepeat), + 1); + } GetNearestFactor(mRepeat, l1Nfirst.mAL1); l1Nfirst.loadSize = coreStatus.m * MathUtil::CeilDivision(coreStatus.n, l1Nfirst.nBL1 * l0Status.nL0) + coreStatus.n; @@ -1003,19 +1032,31 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforNZ(const CoreStatusPack& coreStat C0_BYTE_SIZE * l1Status.dbBL1; l1Status.aL1Size = tilingIns_->bufferPool_.l1Size - l1Status.bL1Size; int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Status, l0Status); - l1Status.kAL1 = min(CalL1MaxLen(min(l1Status.aL1Size - biasSize - dequantSize, a1Length), l1Status, l0Status, kaAlignValue, + l1Status.kAL1 = min(CalL1MaxLen(l1Status.aL1Size - biasSize - dequantSize, l1Status, l0Status, kaAlignValue, L1TilingType::KAL1_16), coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kAL1 = min(CalL1MaxLen(min(l1Status.aL1Size - biasSize - dequantSize, a1Length), l1Status, l0Status, kaAlignValue, + L1TilingType::KAL1_16), + coreStatus.k); + } l1Status.aL1Times = max(min(l1Status.kAL1 / l0Status.kL0, l1Status.maxKAL1), 1); GetNearestFactor(l1Status.allTimes, l1Status.aL1Times); l1Status.kAL1 = l1Status.aL1Times * l0Status.kL0; } else { // when NeitherFullLoadMN change the nBL1 and mAL1 - int32_t perK = min(min(tilingIns_->bufferPool_.l1Size - biasSize - dequantSize, tilingIns_->bufferPool_.ubSize) / + int32_t perK = min((tilingIns_->bufferPool_.l1Size - biasSize - dequantSize) / (l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1 + C0_SIZE * l0Status.nL0 * C0_BYTE_SIZE * l1Status.dbBL1) / l0Status.kL0 * l0Status.kL0, coreStatus.k); + if (IsUbNd2Nz()) { + int32_t perK = min(min(tilingIns_->bufferPool_.l1Size - biasSize - dequantSize, tilingIns_->bufferPool_.ubSize) / + (l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1 + + C0_SIZE * l0Status.nL0 * C0_BYTE_SIZE * l1Status.dbBL1) / + l0Status.kL0 * l0Status.kL0, + coreStatus.k); + } const int32_t biasFactor = tilingIns_->isBias ? l1Status.nBL1 * l0Status.nL0 : 0; const int32_t aAlignedPerK = MathUtil::Align(perK, kaAlignValue); const int32_t bAlignedPerK = MathUtil::Align(perK, kbAlignValue); @@ -1076,8 +1117,12 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforND(const CoreStatusPack& coreStat l1Status.aL1Size = tilingIns_->bufferPool_.l1Size - l1Status.bL1Size; int32_t a1Length = tilingIns_->bufferPool_.ubSize - GetBL1UbSize(l1Status, l0Status); auto factor = l1Status.mAL1 * l0Status.mL0 * C0_SIZE * l1Status.dbAL1 * C0_BYTE_SIZE; - l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min((l1Status.aL1Size - biasSize - dequantSize) / factor, coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, + coreStatus.k); + } l1Status.aL1Times = max(l1Status.kAL1 / l0Status.kL0, 1); GetNearestFactor(l1Status.allTimes, l1Status.aL1Times); // tik-mm support no factor ----ncheck l1Status.kAL1 = l1Status.aL1Times * l0Status.kL0; @@ -1086,18 +1131,34 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforND(const CoreStatusPack& coreStat int32_t b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Status, l0Status); if ((tilingIns_->bType_.dataType == DataType::DT_FLOAT) || (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_INT8)) { - l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / + l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * alignK), coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / + (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * alignK), + coreStatus.k); + } + } else if (!tilingIns_->bType_.isTrans && (tilingIns_->bType_.dataType == DataType::DT_INT8 || tilingIns_->bType_.dataType == DataType::DT_INT4)) { - l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / + l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / (alignN * l0Status.nL0 * l1Status.dbBL1 * alignK), coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / + (alignN * l0Status.nL0 * l1Status.dbBL1 * alignK), + coreStatus.k); + } } else { - l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / + l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / + (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), + coreStatus.k); + } } l1Status.bL1Times = max(min(l1Status.kBL1 / l0Status.kL0, l1Status.maxKBL1), 1); GetNearestFactor(l1Status.allTimes, l1Status.bL1Times); @@ -1130,16 +1191,28 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforND(const CoreStatusPack& coreStat (!tilingIns_->aType_.isTrans && (tilingIns_->aType_.dataType == DataType::DT_INT8 || tilingIns_->aType_.dataType == DataType::DT_INT4))) { auto factor = l1Status.mAL1 * l0Status.mL0 * C0_SIZE * l1Status.dbAL1 * alignK; - l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min((l1Status.aL1Size - biasSize - dequantSize) / factor, coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, + coreStatus.k); + } } else if (tilingIns_->aType_.isTrans && tilingIns_->aType_.dataType == DataType::DT_INT8) { - l1Status.kAL1 = min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / + l1Status.kAL1 = min((l1Status.aL1Size - biasSize - dequantSize) / (alignM * l0Status.mL0 * l1Status.dbAL1 * alignK), coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kAL1 = min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / + (alignM * l0Status.mL0 * l1Status.dbAL1 * alignK), coreStatus.k); + } l1Status.aL1Size = l1Status.kAL1 * alignM * l0Status.mL0 * alignK * l1Status.dbAL1; } else { auto factor = l1Status.mAL1 * l0Status.mL0 * C0_SIZE * l1Status.dbAL1 * C0_BYTE_SIZE; - l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min((l1Status.aL1Size - biasSize - dequantSize) / factor, coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kAL1 = (factor == 0) ? coreStatus.k : min(min(l1Status.aL1Size - biasSize - dequantSize, a1Length) / factor, + coreStatus.k); + } } l1Status.aL1Times = max(min(l1Status.kAL1 / l0Status.kL0, l1Status.maxKAL1), 1); GetNearestFactor(l1Status.allTimes, l1Status.aL1Times); diff --git a/lib/matmul/matmul_tiling_base.h b/lib/matmul/matmul_tiling_base.h index 50f3c1a9..91dc87ce 100644 --- a/lib/matmul/matmul_tiling_base.h +++ b/lib/matmul/matmul_tiling_base.h @@ -229,7 +229,9 @@ public: int32_t SetDoubleBuffer(bool a, bool b, bool c, bool bias, bool transND2NZ = true, bool transNZ2ND = true); - void SetMatmulConfigParams(int32_t mmConfigTypeIn = 1, bool enableL1CacheUBIn = false); + void SetMatmulConfigParams(int32_t mmConfigTypeIn = 1, bool enableL1CacheUBIn = false, + ScheduleType scheduleTypeIn = ScheduleType::INNER_PRODUCT, MatrixTraverse traverseIn = MatrixTraverse::NOSET, + bool enVecND2NZIn = false); void SetMatmulConfigParams(const MatmulConfigParams& configParams); int32_t GetBaseM() const -- Gitee From cd4d61b22802cb741c5ba5e7098ff895198f42a5 Mon Sep 17 00:00:00 2001 From: qiaolili Date: Fri, 13 Dec 2024 15:17:25 +0800 Subject: [PATCH 06/13] fix ND2NZ error --- impl/matmul/matmul_tiling_base.cpp | 9 ++++++++- 1 file changed, 8 insertions(+), 1 deletion(-) diff --git a/impl/matmul/matmul_tiling_base.cpp b/impl/matmul/matmul_tiling_base.cpp index 64d228ab..f7e644a9 100644 --- a/impl/matmul/matmul_tiling_base.cpp +++ b/impl/matmul/matmul_tiling_base.cpp @@ -578,12 +578,19 @@ int32_t MatmulApiTilingBase::SetSplitRange(int32_t maxBaseM, int32_t maxBaseN, i return 0; } -void MatmulApiTilingBase::SetMatmulConfigParams(int32_t mmConfigTypeIn, bool enableL1CacheUBIn) +void MatmulApiTilingBase::SetMatmulConfigParams(int32_t mmConfigTypeIn, bool enableL1CacheUBIn, + ScheduleType scheduleTypeIn, MatrixTraverse traverseIn, bool enVecND2NZIn) { TILING_LOG_DEBUG("Set MatmulConfigType: %d", mmConfigTypeIn); TILING_LOG_DEBUG("Set EnableL1CacheUB: %d", static_cast(enableL1CacheUBIn)); + TILING_LOG_DEBUG("Set ScheduleType: %d", static_cast(scheduleTypeIn)); + TILING_LOG_DEBUG("Set Traverse: %d", static_cast(traverseIn)); + TILING_LOG_DEBUG("Set EnVecND2NZ: %d", static_cast(enVecND2NZIn)); this->mmConfigType = mmConfigTypeIn; this->enableL1CacheUB = enableL1CacheUBIn; + this->scheduleType = scheduleTypeIn; + this->traverse_ = traverseIn; + this->enVecND2NZ = enVecND2NZIn; } void MatmulApiTilingBase::SetMatmulConfigParams(const MatmulConfigParams& configParams) -- Gitee From 4b37e580a7a99bb66d582958293146baf8c1c4be Mon Sep 17 00:00:00 2001 From: qiaolili Date: Fri, 13 Dec 2024 15:27:41 +0800 Subject: [PATCH 07/13] fix ND2NZ error --- impl/matmul/matmul_tiling_algorithm.cpp | 7 +- tests/tiling/test_tiling.cpp | 108 +++++++++++++++++++++++- 2 files changed, 110 insertions(+), 5 deletions(-) diff --git a/impl/matmul/matmul_tiling_algorithm.cpp b/impl/matmul/matmul_tiling_algorithm.cpp index 710d3d67..d16323d1 100644 --- a/impl/matmul/matmul_tiling_algorithm.cpp +++ b/impl/matmul/matmul_tiling_algorithm.cpp @@ -1178,9 +1178,14 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforND(const CoreStatusPack& coreStat l1Status.bL1Size = tilingIns_->bufferPool_.l1Size - l1Status.aL1Size; int32_t b1Length = tilingIns_->bufferPool_.ubSize - GetAL1UbSize(l1Status, l0Status); - l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / + l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), coreStatus.k); + if (IsUbNd2Nz()) { + l1Status.kBL1 = min(min(l1Status.bL1Size - biasSize - dequantSize, b1Length) / + (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * C0_BYTE_SIZE), + coreStatus.k); + } l1Status.bL1Times = max(l1Status.kBL1 / l0Status.kL0, 1); GetNearestFactor(l1Status.allTimes, l1Status.bL1Times); l1Status.kBL1 = l1Status.bL1Times * l0Status.kL0; diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index 52c525ff..b57f0c72 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -122,6 +122,106 @@ TEST_F(TestTiling, MatmulMDL1951ND2NZFullLoad) EXPECT_EQ(ret, 0); } +TEST_F(TestTiling, L1CacheUBCase01NoCacheND2NZ) +{ + MatmulApiTiling tiling; + tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); + tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); + tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + tiling.SetBiasType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + tiling.SetShape(1024, 10240, 1280); + tiling.SetOrgShape(1024, 10240, 1280); + tiling.EnableBias(true); + tiling.socVersion = platform_ascendc::SocVersion::ASCEND310P; + tiling.SetMatmulConfigParams(1, true, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true); + optiling::TCubeTiling tilingData; + int ret = tiling.GetTiling(tilingData); + tiling.PrintTilingData(); + EXPECT_EQ(ret, 0); + EXPECT_EQ(tilingData.get_depthAL1CacheUB(), 2); + EXPECT_EQ(tilingData.get_depthBL1CacheUB(), 4); +} + +TEST_F(TestTiling, AFullLoadND2NZCase) +{ + MatmulApiTiling tiling; + tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + tiling.SetBiasType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + tiling.SetShape(32, 512000, 128); + tiling.SetOrgShape(32, 512000, 128); + tiling.EnableBias(true); + tiling.SetBufferSpace(-1, -1, -1); + tiling.SetMatmulConfigParams(1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true); + tiling.socVersion = platform_ascendc::SocVersion::ASCEND310P; + optiling::TCubeTiling tilingData; + int ret = tiling.GetTiling(tilingData); + tiling.PrintTilingData(); + EXPECT_EQ(ret, 0); +} + +TEST_F(TestTiling, BothNotFullLoadND2NZCase) +{ + MatmulApiTiling tiling; + tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + tiling.SetBiasType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); + tiling.SetShape(512, 1024, 128000); + tiling.SetOrgShape(512, 1024, 128000); + tiling.EnableBias(true); + tiling.SetBufferSpace(-1, -1, -1); + tiling.SetMatmulConfigParams(1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true); + tiling.socVersion = platform_ascendc::SocVersion::ASCEND310P; + optiling::TCubeTiling tilingData; + int ret = tiling.GetTiling(tilingData); + tiling.PrintTilingData(); + EXPECT_EQ(ret, 0); +} + +TEST_F(TestTiling, TestMatmulApiTilngSingleCoreFullLoadND2NZCase) +{ + optiling::TCubeTiling tilingData; + MultiCoreMatmulTiling tilingApi; + tilingApi.SetDim(24); + + tilingApi.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16, true); + tilingApi.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); + tilingApi.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); + tilingApi.SetBiasType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); + + tilingApi.SetOrgShape(2048, 2048, 204); + tilingApi.SetShape(2048, 2048, 204); + tilingApi.EnableBias(false); + tilingApi.SetBufferSpace(-1, -1, -1); + tilingApi.SetMatmulConfigParams(1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true); + tilingApi.socVersion = platform_ascendc::SocVersion::ASCEND310P; + int64_t res = tilingApi.GetTiling(tilingData); + tilingApi.PrintTilingData(); + EXPECT_EQ(res, 0); +} + +TEST_F(TestTiling, TestMatmulApiTilngInt8ND2NZCase13) +{ + MatmulApiTiling tiling; + tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT8, true); + tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT8, true); + tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT32); + tiling.SetBiasType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT32); + tiling.SetShape(1024, 1024, 1024); + tiling.SetOrgShape(1024, 1024, 1024); + tiling.EnableBias(true); + tiling.SetBufferSpace(-1, -1, -1); + tiling.SetMatmulConfigParams(1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true); + tiling.socVersion = platform_ascendc::SocVersion::ASCEND310P; + + optiling::TCubeTiling tilingData; + int ret = tiling.GetTiling(tilingData); + tiling.PrintTilingData(); + EXPECT_EQ(ret, 0); +} + TEST_F(TestTiling, MatmulApiTilingFP32) { matmul_tiling::MatmulApiTiling stft; @@ -433,8 +533,8 @@ TEST_F(TestTiling, L1CacheUBCase01NoCache) int ret = tiling.GetTiling(tilingData); tiling.PrintTilingData(); EXPECT_EQ(ret, 0); - EXPECT_EQ(tilingData.get_depthAL1CacheUB(), 1); - EXPECT_EQ(tilingData.get_depthBL1CacheUB(), 1); + EXPECT_EQ(tilingData.get_depthAL1CacheUB(), 0); + EXPECT_EQ(tilingData.get_depthBL1CacheUB(), 0); } TEST_F(TestTiling, L1CacheUBCase02NeiABFullLoad) @@ -454,8 +554,8 @@ TEST_F(TestTiling, L1CacheUBCase02NeiABFullLoad) int ret = tiling.GetTiling(tilingData); tiling.PrintTilingData(); EXPECT_EQ(ret, 0); - EXPECT_EQ(tilingData.get_depthAL1CacheUB(), 1); - EXPECT_EQ(tilingData.get_depthBL1CacheUB(), 4); + EXPECT_EQ(tilingData.get_depthAL1CacheUB(), 0); + EXPECT_EQ(tilingData.get_depthBL1CacheUB(), 3); } TEST_F(TestTiling, L1CacheUBCase03BothABFullLoad) -- Gitee From b1a3ca23d5a37582e671335a1de1deb18b85733d Mon Sep 17 00:00:00 2001 From: qiaolili Date: Fri, 13 Dec 2024 15:43:45 +0800 Subject: [PATCH 08/13] fix ND2NZ error --- impl/matmul/matmul_tiling_algorithm.cpp | 2 -- 1 file changed, 2 deletions(-) diff --git a/impl/matmul/matmul_tiling_algorithm.cpp b/impl/matmul/matmul_tiling_algorithm.cpp index d16323d1..455e806d 100644 --- a/impl/matmul/matmul_tiling_algorithm.cpp +++ b/impl/matmul/matmul_tiling_algorithm.cpp @@ -930,7 +930,6 @@ void MatmulTilingAlgorithm::NeitherFullLoadMN(const CoreStatusPack& coreStatus, nRepeat), 1); } - GetNearestFactor(nRepeat, l1Mfirst.nBL1); l1Mfirst.loadSize = coreStatus.m + coreStatus.n * MathUtil::CeilDivision(coreStatus.m, l1Mfirst.mAL1 * l0Status.mL0); @@ -1139,7 +1138,6 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforND(const CoreStatusPack& coreStat (l1Status.nBL1 * l0Status.nL0 * C0_SIZE * l1Status.dbBL1 * alignK), coreStatus.k); } - } else if (!tilingIns_->bType_.isTrans && (tilingIns_->bType_.dataType == DataType::DT_INT8 || tilingIns_->bType_.dataType == DataType::DT_INT4)) { l1Status.kBL1 = min((l1Status.bL1Size - biasSize - dequantSize) / -- Gitee From 6c79d69a7c53968d43a274fa43f6b5538e8821ea Mon Sep 17 00:00:00 2001 From: qiaolili Date: Fri, 13 Dec 2024 16:25:09 +0800 Subject: [PATCH 09/13] fix ND2NZ error --- impl/matmul/matmul_tiling_algorithm.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/impl/matmul/matmul_tiling_algorithm.cpp b/impl/matmul/matmul_tiling_algorithm.cpp index 455e806d..f6d3d5d8 100644 --- a/impl/matmul/matmul_tiling_algorithm.cpp +++ b/impl/matmul/matmul_tiling_algorithm.cpp @@ -1050,7 +1050,7 @@ void MatmulTilingAlgorithm::NeitherFullLoadKforNZ(const CoreStatusPack& coreStat l0Status.kL0 * l0Status.kL0, coreStatus.k); if (IsUbNd2Nz()) { - int32_t perK = min(min(tilingIns_->bufferPool_.l1Size - biasSize - dequantSize, tilingIns_->bufferPool_.ubSize) / + perK = min(min(tilingIns_->bufferPool_.l1Size - biasSize - dequantSize, tilingIns_->bufferPool_.ubSize) / (l0Status.mL0 * C0_SIZE * C0_BYTE_SIZE * l1Status.dbAL1 + C0_SIZE * l0Status.nL0 * C0_BYTE_SIZE * l1Status.dbBL1) / l0Status.kL0 * l0Status.kL0, -- Gitee From 4761fc730339d061705dc1f0ca1e5882e8c96836 Mon Sep 17 00:00:00 2001 From: qiaolili Date: Mon, 16 Dec 2024 11:34:18 +0800 Subject: [PATCH 10/13] fix ND2NZ error --- tests/tiling/test_tiling.cpp | 172 ----------------------------------- 1 file changed, 172 deletions(-) diff --git a/tests/tiling/test_tiling.cpp b/tests/tiling/test_tiling.cpp index b57f0c72..c72d121b 100644 --- a/tests/tiling/test_tiling.cpp +++ b/tests/tiling/test_tiling.cpp @@ -50,178 +50,6 @@ TEST_F(TestTiling, MultiCoreSmallMN) EXPECT_EQ(ret, 0); } -TEST_F(TestTiling, MatmulMDL1951ND2NZNoFullLoad) -{ - matmul_tiling::MultiCoreMatmulTiling rnnMatmul3,rnnMatmul4,rnnMatmul5; - rnnMatmul3.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8, true); - rnnMatmul3.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::NZ, matmul_tiling::DataType ::DT_INT8); - rnnMatmul3.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT16); - rnnMatmul3.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT32); - auto ret = rnnMatmul3.EnableBias(true); - ret = rnnMatmul3.SetDim(8); - ret = rnnMatmul3.SetOrgShape(1024, 5120, 3584); - ret = rnnMatmul3.SetShape(1024, 5120, 3584); - ret = rnnMatmul3.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified - rnnMatmul3.SetMatmulConfigParams({1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true}); - rnnMatmul3.socVersion = platform_ascendc::SocVersion::ASCEND310P; - optiling::TCubeTiling tilingDataA; - ret = rnnMatmul3.GetTiling(tilingDataA); - rnnMatmul3.PrintTilingData(); - EXPECT_EQ(ret, 0); - - rnnMatmul4.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8); - rnnMatmul4.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8, true); - rnnMatmul4.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT16); - rnnMatmul4.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT32); - ret = rnnMatmul4.EnableBias(true); - ret = rnnMatmul4.SetDim(8); - ret = rnnMatmul4.SetOrgShape(5120, 1024, 3584); - ret = rnnMatmul4.SetShape(5120, 1024, 3584); - ret = rnnMatmul4.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified - rnnMatmul4.SetMatmulConfigParams({1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true}); - rnnMatmul4.socVersion = platform_ascendc::SocVersion::ASCEND310P; - optiling::TCubeTiling tilingDataB; - ret = rnnMatmul4.GetTiling(tilingDataB); - rnnMatmul4.PrintTilingData(); - EXPECT_EQ(ret, 0); -} - -TEST_F(TestTiling, MatmulMDL1951ND2NZFullLoad) -{ - matmul_tiling::MultiCoreMatmulTiling rnnMatmul3,rnnMatmul4; - rnnMatmul3.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8); - rnnMatmul3.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8, true); - rnnMatmul3.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT16); - rnnMatmul3.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT32); - auto ret = rnnMatmul3.EnableBias(true); - ret = rnnMatmul3.SetDim(8); - ret = rnnMatmul3.SetOrgShape(32, 2048, 64); - ret = rnnMatmul3.SetShape(32, 2048, 64); - ret = rnnMatmul3.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified - rnnMatmul3.SetMatmulConfigParams({1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true}); - rnnMatmul3.socVersion = platform_ascendc::SocVersion::ASCEND310P; - optiling::TCubeTiling tilingDataA; - ret = rnnMatmul3.GetTiling(tilingDataA); - rnnMatmul3.PrintTilingData(); - EXPECT_EQ(ret, 0); - - rnnMatmul4.SetAType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8, true); - rnnMatmul4.SetBType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT8); - rnnMatmul4.SetCType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_FLOAT16); - rnnMatmul4.SetBiasType(matmul_tiling::TPosition::GM, matmul_tiling::CubeFormat::ND, matmul_tiling::DataType ::DT_INT32); - ret = rnnMatmul4.EnableBias(true); - ret = rnnMatmul4.SetDim(8); - ret = rnnMatmul4.SetOrgShape(2048, 32, 64); - ret = rnnMatmul4.SetShape(2048, 32, 64); - ret = rnnMatmul4.SetBufferSpace(-1, -1, -1); // will use all buffer space if not explicitly specified - rnnMatmul4.SetMatmulConfigParams({1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true}); - rnnMatmul4.socVersion = platform_ascendc::SocVersion::ASCEND310P; - optiling::TCubeTiling tilingDataB; - ret = rnnMatmul4.GetTiling(tilingDataB); - rnnMatmul4.PrintTilingData(); - EXPECT_EQ(ret, 0); -} - -TEST_F(TestTiling, L1CacheUBCase01NoCacheND2NZ) -{ - MatmulApiTiling tiling; - tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); - tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); - tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - tiling.SetBiasType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - tiling.SetShape(1024, 10240, 1280); - tiling.SetOrgShape(1024, 10240, 1280); - tiling.EnableBias(true); - tiling.socVersion = platform_ascendc::SocVersion::ASCEND310P; - tiling.SetMatmulConfigParams(1, true, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true); - optiling::TCubeTiling tilingData; - int ret = tiling.GetTiling(tilingData); - tiling.PrintTilingData(); - EXPECT_EQ(ret, 0); - EXPECT_EQ(tilingData.get_depthAL1CacheUB(), 2); - EXPECT_EQ(tilingData.get_depthBL1CacheUB(), 4); -} - -TEST_F(TestTiling, AFullLoadND2NZCase) -{ - MatmulApiTiling tiling; - tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - tiling.SetBiasType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - tiling.SetShape(32, 512000, 128); - tiling.SetOrgShape(32, 512000, 128); - tiling.EnableBias(true); - tiling.SetBufferSpace(-1, -1, -1); - tiling.SetMatmulConfigParams(1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true); - tiling.socVersion = platform_ascendc::SocVersion::ASCEND310P; - optiling::TCubeTiling tilingData; - int ret = tiling.GetTiling(tilingData); - tiling.PrintTilingData(); - EXPECT_EQ(ret, 0); -} - -TEST_F(TestTiling, BothNotFullLoadND2NZCase) -{ - MatmulApiTiling tiling; - tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - tiling.SetBiasType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT); - tiling.SetShape(512, 1024, 128000); - tiling.SetOrgShape(512, 1024, 128000); - tiling.EnableBias(true); - tiling.SetBufferSpace(-1, -1, -1); - tiling.SetMatmulConfigParams(1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true); - tiling.socVersion = platform_ascendc::SocVersion::ASCEND310P; - optiling::TCubeTiling tilingData; - int ret = tiling.GetTiling(tilingData); - tiling.PrintTilingData(); - EXPECT_EQ(ret, 0); -} - -TEST_F(TestTiling, TestMatmulApiTilngSingleCoreFullLoadND2NZCase) -{ - optiling::TCubeTiling tilingData; - MultiCoreMatmulTiling tilingApi; - tilingApi.SetDim(24); - - tilingApi.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16, true); - tilingApi.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); - tilingApi.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); - tilingApi.SetBiasType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_FLOAT16); - - tilingApi.SetOrgShape(2048, 2048, 204); - tilingApi.SetShape(2048, 2048, 204); - tilingApi.EnableBias(false); - tilingApi.SetBufferSpace(-1, -1, -1); - tilingApi.SetMatmulConfigParams(1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true); - tilingApi.socVersion = platform_ascendc::SocVersion::ASCEND310P; - int64_t res = tilingApi.GetTiling(tilingData); - tilingApi.PrintTilingData(); - EXPECT_EQ(res, 0); -} - -TEST_F(TestTiling, TestMatmulApiTilngInt8ND2NZCase13) -{ - MatmulApiTiling tiling; - tiling.SetAType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT8, true); - tiling.SetBType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT8, true); - tiling.SetCType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT32); - tiling.SetBiasType(TPosition::GM, CubeFormat::ND, matmul_tiling::DataType::DT_INT32); - tiling.SetShape(1024, 1024, 1024); - tiling.SetOrgShape(1024, 1024, 1024); - tiling.EnableBias(true); - tiling.SetBufferSpace(-1, -1, -1); - tiling.SetMatmulConfigParams(1, false, ScheduleType::INNER_PRODUCT, MatrixTraverse::NOSET, true); - tiling.socVersion = platform_ascendc::SocVersion::ASCEND310P; - - optiling::TCubeTiling tilingData; - int ret = tiling.GetTiling(tilingData); - tiling.PrintTilingData(); - EXPECT_EQ(ret, 0); -} - TEST_F(TestTiling, MatmulApiTilingFP32) { matmul_tiling::MatmulApiTiling stft; -- Gitee From cf0ad4570dc5813443bfea584e47330dc7ed6392 Mon Sep 17 00:00:00 2001 From: qiaolili Date: Mon, 16 Dec 2024 07:40:55 +0000 Subject: [PATCH 11/13] update impl/matmul/matmul_tiling_base.cpp. Signed-off-by: qiaolili --- impl/matmul/matmul_tiling_base.cpp | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/impl/matmul/matmul_tiling_base.cpp b/impl/matmul/matmul_tiling_base.cpp index f7e644a9..bf321298 100644 --- a/impl/matmul/matmul_tiling_base.cpp +++ b/impl/matmul/matmul_tiling_base.cpp @@ -578,19 +578,16 @@ int32_t MatmulApiTilingBase::SetSplitRange(int32_t maxBaseM, int32_t maxBaseN, i return 0; } -void MatmulApiTilingBase::SetMatmulConfigParams(int32_t mmConfigTypeIn, bool enableL1CacheUBIn, - ScheduleType scheduleTypeIn, MatrixTraverse traverseIn, bool enVecND2NZIn) +void MatmulApiTilingBase::SetMatmulConfigParams(int32_t mmConfigTypeIn, bool enableL1CacheUBIn) { TILING_LOG_DEBUG("Set MatmulConfigType: %d", mmConfigTypeIn); TILING_LOG_DEBUG("Set EnableL1CacheUB: %d", static_cast(enableL1CacheUBIn)); TILING_LOG_DEBUG("Set ScheduleType: %d", static_cast(scheduleTypeIn)); TILING_LOG_DEBUG("Set Traverse: %d", static_cast(traverseIn)); - TILING_LOG_DEBUG("Set EnVecND2NZ: %d", static_cast(enVecND2NZIn)); this->mmConfigType = mmConfigTypeIn; this->enableL1CacheUB = enableL1CacheUBIn; this->scheduleType = scheduleTypeIn; this->traverse_ = traverseIn; - this->enVecND2NZ = enVecND2NZIn; } void MatmulApiTilingBase::SetMatmulConfigParams(const MatmulConfigParams& configParams) @@ -599,12 +596,10 @@ void MatmulApiTilingBase::SetMatmulConfigParams(const MatmulConfigParams& config TILING_LOG_DEBUG("Set EnableL1CacheUB: %d", static_cast(configParams.enableL1CacheUB)); TILING_LOG_DEBUG("Set ScheduleType: %d", static_cast(configParams.scheduleType)); TILING_LOG_DEBUG("Set Traverse: %d", static_cast(configParams.traverse)); - TILING_LOG_DEBUG("Set EnVecND2NZ: %d", static_cast(configParams.enVecND2NZ)); this->mmConfigType = configParams.mmConfigType; this->enableL1CacheUB = configParams.enableL1CacheUB; this->scheduleType = configParams.scheduleType; this->traverse_ = configParams.traverse; - this->enVecND2NZ = configParams.enVecND2NZ; } bool MatmulApiTilingBase::CheckSetParam() -- Gitee From 476f8212d3ca63562ccb67185c88d73a75c23e46 Mon Sep 17 00:00:00 2001 From: qiaolili Date: Mon, 16 Dec 2024 07:41:50 +0000 Subject: [PATCH 12/13] update lib/matmul/matmul_tiling_base.h. Signed-off-by: qiaolili --- lib/matmul/matmul_tiling_base.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/lib/matmul/matmul_tiling_base.h b/lib/matmul/matmul_tiling_base.h index 91dc87ce..50f3c1a9 100644 --- a/lib/matmul/matmul_tiling_base.h +++ b/lib/matmul/matmul_tiling_base.h @@ -229,9 +229,7 @@ public: int32_t SetDoubleBuffer(bool a, bool b, bool c, bool bias, bool transND2NZ = true, bool transNZ2ND = true); - void SetMatmulConfigParams(int32_t mmConfigTypeIn = 1, bool enableL1CacheUBIn = false, - ScheduleType scheduleTypeIn = ScheduleType::INNER_PRODUCT, MatrixTraverse traverseIn = MatrixTraverse::NOSET, - bool enVecND2NZIn = false); + void SetMatmulConfigParams(int32_t mmConfigTypeIn = 1, bool enableL1CacheUBIn = false); void SetMatmulConfigParams(const MatmulConfigParams& configParams); int32_t GetBaseM() const -- Gitee From a49fa536cc92c3ac08ffc7ea5371cd2fdb4e880e Mon Sep 17 00:00:00 2001 From: qiaolili Date: Mon, 16 Dec 2024 07:45:59 +0000 Subject: [PATCH 13/13] update impl/matmul/matmul_tiling_base.cpp. Signed-off-by: qiaolili --- impl/matmul/matmul_tiling_base.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/impl/matmul/matmul_tiling_base.cpp b/impl/matmul/matmul_tiling_base.cpp index bf321298..2922ac27 100644 --- a/impl/matmul/matmul_tiling_base.cpp +++ b/impl/matmul/matmul_tiling_base.cpp @@ -582,12 +582,8 @@ void MatmulApiTilingBase::SetMatmulConfigParams(int32_t mmConfigTypeIn, bool ena { TILING_LOG_DEBUG("Set MatmulConfigType: %d", mmConfigTypeIn); TILING_LOG_DEBUG("Set EnableL1CacheUB: %d", static_cast(enableL1CacheUBIn)); - TILING_LOG_DEBUG("Set ScheduleType: %d", static_cast(scheduleTypeIn)); - TILING_LOG_DEBUG("Set Traverse: %d", static_cast(traverseIn)); this->mmConfigType = mmConfigTypeIn; this->enableL1CacheUB = enableL1CacheUBIn; - this->scheduleType = scheduleTypeIn; - this->traverse_ = traverseIn; } void MatmulApiTilingBase::SetMatmulConfigParams(const MatmulConfigParams& configParams) -- Gitee