From 9f24d7730a86025ce53a09d6c73aaeab5ff4e638 Mon Sep 17 00:00:00 2001
From: jianghaibo
Date: Wed, 19 Nov 2025 16:24:26 +0800
Subject: [PATCH] [LICM] Hoist conditional loads with SVE FFR

When an illegal address is accessed, the SVE ldnf1 (non-faulting load)
instruction does not trap; it records the failure in the first-fault
register (FFR) instead. This makes it possible for LICM to hoist a load
that only executes conditionally inside a loop out to the preheader.
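As a simplified sketch (names illustrative, distilled from the new test):
a load that is only reached under a condition inside the loop,

    for.body:
      br i1 %cond, label %if.then, label %for.inc
    if.then:
      %v = load i32, ptr %p        ; %p is loop-invariant
      ...

can be speculatively hoisted into the preheader as a non-faulting load:

    entry:
      %pg = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
      %nf = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1> %pg, ptr %p)
      %v  = extractelement <vscale x 4 x i32> %nf, i64 0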
---
 llvm/include/llvm/Analysis/LoopInfo.h         |   6 +
 llvm/include/llvm/Transforms/Utils/Cloning.h  |  12 +
 .../include/llvm/Transforms/Utils/LoopUtils.h |  11 +-
 llvm/lib/Analysis/LoopInfo.cpp                |  16 +
 llvm/lib/Transforms/Scalar/LICM.cpp           | 498 ++++++++++++++----
 llvm/lib/Transforms/Utils/CloneFunction.cpp   |  68 ++-
 llvm/lib/Transforms/Utils/LoopUtils.cpp       |   4 +
 .../LICM/AArch64/hoist-cond-load.ll           | 245 +++++++++
 8 files changed, 737 insertions(+), 123 deletions(-)
 create mode 100644 llvm/test/Transforms/LICM/AArch64/hoist-cond-load.ll

diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h
index ea4cb7f7c684..8d29f9ffc08d 100644
--- a/llvm/include/llvm/Analysis/LoopInfo.h
+++ b/llvm/include/llvm/Analysis/LoopInfo.h
@@ -380,6 +380,12 @@ public:
   /// unrolling pass is run more than once (which it generally is).
   void setLoopAlreadyUnrolled();
 
+  /// Add llvm.loop.hoist.cond_load.disable to this loop's loop id metadata.
+  ///
+  /// This disable metadata indicates that the loop is a versioned copy and
+  /// that the original loop has already had its conditional loads hoisted.
+  void setLoopHoistedVersion();
+
   /// Add llvm.loop.mustprogress to this loop's loop id metadata.
   void setLoopMustProgress();

diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h
index 1c342b871a4a..186e4bea16f0 100644
--- a/llvm/include/llvm/Transforms/Utils/Cloning.h
+++ b/llvm/include/llvm/Transforms/Utils/Cloning.h
@@ -281,6 +281,12 @@ Loop *cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
                              DominatorTree *DT,
                              SmallVectorImpl<BasicBlock *> &Blocks);
 
+/// Clones the body of loop \p OrigLoop, without its preheader. Returns the
+/// new loop and the cloned blocks in \p Blocks.
+///
+/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
+/// \p LoopDomBB. Inserts the new blocks before the block specified in
+/// \p Before.
+/// Note: Only innermost loops are supported.
+Loop *cloneLoopBody(BasicBlock *Before, BasicBlock *LoopDomBB, Loop *OrigLoop,
+                    ValueToValueMapTy &VMap, const Twine &NameSuffix,
+                    LoopInfo *LI, DominatorTree *DT,
+                    SmallVectorImpl<BasicBlock *> &Blocks);
+
 /// Remaps instructions in \p Blocks using the mapping in \p VMap.
 void remapInstructionsInBlocks(ArrayRef<BasicBlock *> Blocks,
                                ValueToValueMapTy &VMap);

diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
index cc31fc79c2de..4074d0f85b34 100644
--- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h
+++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h
@@ -173,10 +173,10 @@ bool sinkRegionForLoopNest(DomTreeNode *, AAResults *, LoopInfo *,
 /// \p AllowSpeculation is whether values should be hoisted even if they are not
 /// guaranteed to execute in the loop, but are safe to speculatively execute.
 bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *,
-                 AssumptionCache *, TargetLibraryInfo *, Loop *,
-                 MemorySSAUpdater &, ScalarEvolution *, ICFLoopSafetyInfo *,
-                 SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, bool,
-                 bool AllowSpeculation);
+                 AssumptionCache *, TargetLibraryInfo *, TargetTransformInfo *,
+                 BlockFrequencyInfo *, Loop *, MemorySSAUpdater &,
+                 ScalarEvolution *, ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &,
+                 OptimizationRemarkEmitter *, bool, bool AllowSpeculation);
 
 /// Return true if the induction variable \p IV in a Loop whose latch is
 /// \p LatchBlock would become dead if the exit test \p Cond were removed.
@@ -271,6 +271,9 @@ bool hasDisableAllTransformsHint(const Loop *L);
 /// Look for the loop attribute that disables the LICM transformation heuristics.
 bool hasDisableLICMTransformsHint(const Loop *L);
 
+/// Look for the loop attribute that disables LICM's hoisting of conditional
+/// loads.
+bool hasDisableHoistCondLoad(const Loop *L);
+
 /// The mode sets how eager a transformation should be applied.
 enum TransformationMode {
   /// The pass can use heuristics to determine whether a transformation should

diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp
index 36aca73ee675..3531ce923668 100644
--- a/llvm/lib/Analysis/LoopInfo.cpp
+++ b/llvm/lib/Analysis/LoopInfo.cpp
@@ -549,6 +549,22 @@ void Loop::setLoopAlreadyUnrolled() {
   setLoopID(NewLoopID);
 }
 
+void Loop::setLoopHoistedVersion() {
+  LLVMContext &Context = getHeader()->getContext();
+
+  MDNode *DisableHoist =
+      findOptionMDForLoop(this, "llvm.loop.hoist.cond_load.disable");
+  if (DisableHoist)
+    return;
+
+  MDNode *DisableHoistMD = MDNode::get(
+      Context, MDString::get(Context, "llvm.loop.hoist.cond_load.disable"));
+  MDNode *LoopID = getLoopID();
+  MDNode *NewLoopID = makePostTransformationMetadata(
+      Context, LoopID, {"llvm.loop.hoist.cond_load."}, {DisableHoistMD});
+  setLoopID(NewLoopID);
+}
+
 void Loop::setLoopMustProgress() {
   LLVMContext &Context = getHeader()->getContext();

diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp
index 71b567bc7c96..bcf78808ce58 100644
--- a/llvm/lib/Transforms/Scalar/LICM.cpp
+++ b/llvm/lib/Transforms/Scalar/LICM.cpp
@@ -56,10 +56,12 @@
 #include "llvm/Analysis/MemorySSAUpdater.h"
 #include "llvm/Analysis/MustExecute.h"
 #include "llvm/Analysis/OptimizationRemarkEmitter.h"
+#include "llvm/Analysis/ProfileSummaryInfo.h"
 #include "llvm/Analysis/ScalarEvolution.h"
 #include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Analysis/TargetTransformInfo.h"
 #include "llvm/Analysis/ValueTracking.h"
+#include "llvm/Analysis/VectorUtils.h"
 #include "llvm/IR/CFG.h"
 #include "llvm/IR/Constants.h"
 #include "llvm/IR/DataLayout.h"
@@ -68,6 +70,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
+#include "llvm/IR/IntrinsicsAArch64.h"
 #include "llvm/IR/IRBuilder.h"
 #include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
@@ -81,8 +84,10 @@
 #include "llvm/Transforms/Scalar.h"
 #include "llvm/Transforms/Utils/AssumeBundleBuilder.h"
 #include "llvm/Transforms/Utils/BasicBlockUtils.h"
+#include "llvm/Transforms/Utils/Cloning.h"
 #include "llvm/Transforms/Utils/Local.h"
 #include "llvm/Transforms/Utils/LoopUtils.h"
+#include "llvm/Transforms/Utils/LoopSimplify.h"
 #include "llvm/Transforms/Utils/SSAUpdater.h"
 #include <algorithm>
 #include <utility>
@@ -133,6 +138,23 @@
 static cl::opt<uint32_t> MaxNumUsesTraversed(
     "licm-max-num-uses-traversed", cl::Hidden, cl::init(8),
     cl::desc("Max num uses visited for identifying load "
             "invariance in loop using invariant start (default = 8)"));
+enum HoistCondLoadMode {
+  Hoist_Off,
+  Generic_Prof,
+  Aggressive_Prof,
+  Generic_NoProf,
+  Aggressive_NoProf,
+};
+
+static cl::opt<HoistCondLoadMode> HoistConditionalLoad(
+    "licm-hoist-conditional-load", cl::Hidden,
+    cl::values(
+        clEnumValN(Hoist_Off, "off", "Disable hoisting of conditional loads"),
+        clEnumValN(Generic_Prof, "generic",
+                   "Generic mode with PGO: hoist and guard with a versioned "
+                   "loop (default mode)"),
+        clEnumValN(Aggressive_Prof, "aggressive",
+                   "Aggressive mode with PGO: allow accesses to illegal "
+                   "addresses without faulting"),
+        clEnumValN(Generic_NoProf, "generic_noprof",
+                   "Generic mode without a profile: hoist and guard with a "
+                   "versioned loop"),
+        clEnumValN(Aggressive_NoProf, "aggressive_noprof",
+                   "Aggressive mode without a profile: allow accesses to "
+                   "illegal addresses without faulting")),
+    cl::init(Generic_Prof),
+    cl::desc("Hoist conditional loads with the SVE non-faulting load "
+             "instruction"));
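+// Usage sketch (mirrors the RUN lines of the new test); the mode is selected
+// on the opt command line, e.g.:
+//   opt -S -passes=licm -licm-hoist-conditional-load=generic_noprof \
+//       -verify-memoryssa input.ll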
 // Experimental option to allow imprecision in LICM in pathological cases, in
 // exchange for faster compile. This is to be removed if MemorySSA starts to
 // address the same issue. LICM calls MemorySSAWalker's
@@ -202,11 +224,29 @@ using PointersAndHasReadsOutsideSet =
 static SmallVector<PointersAndHasReadsOutsideSet, 0>
 collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L);
 
+static void updateExitsPhis(Loop *CurLoop, Loop *ClonedLoop,
+                            ScalarEvolution *SE,
+                            SmallVectorImpl<Instruction *> &DefsUsedOutside,
+                            ValueToValueMapTy &VMap,
+                            SmallVectorImpl<BasicBlock *> &Exits);
+
+static ScalableVectorType *getSVEContainerType(Type *EltTy);
+
+static Instruction *replaceLoadWithLdnf(Instruction *I);
+
+static bool
+findConditionalLoad(LoopInfo *LI, Loop *CurLoop, TargetTransformInfo *TTI,
+                    ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+                    Instruction *I);
+
+static void createRuntimeCheckForCondLoad(
+    Loop *CurLoop, Loop *ClonedLoop, BasicBlock *Preheader,
+    MemorySSAUpdater &MSSAU, ICFLoopSafetyInfo *SafetyInfo,
+    SmallVectorImpl<CFGUpdate> &Updates);
+
 namespace {
 struct LoopInvariantCodeMotion {
   bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT,
                  AssumptionCache *AC, TargetLibraryInfo *TLI,
-                 TargetTransformInfo *TTI, ScalarEvolution *SE, MemorySSA *MSSA,
+                 TargetTransformInfo *TTI, BlockFrequencyInfo *BFI,
+                 ScalarEvolution *SE, MemorySSA *MSSA,
                  OptimizationRemarkEmitter *ORE, bool LoopNestMode = false);
 
   LoopInvariantCodeMotion(unsigned LicmMssaOptCap,
@@ -255,6 +295,7 @@ struct LegacyLICMPass : public LoopPass {
         &getAnalysis<AssumptionCacheTracker>().getAssumptionCache(*F),
         &getAnalysis<TargetLibraryInfoWrapperPass>().getTLI(*F),
         &getAnalysis<TargetTransformInfoWrapperPass>().getTTI(*F),
+        &getAnalysis<BlockFrequencyInfoWrapperPass>().getBFI(),
         SE ? &SE->getSE() : nullptr, MSSA, &ORE);
   }
@@ -294,7 +335,7 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM,
   LoopInvariantCodeMotion LICM(Opts.MssaOptCap, Opts.MssaNoAccForPromotionCap,
                                Opts.AllowSpeculation);
   if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.AC, &AR.TLI, &AR.TTI,
-                      &AR.SE, AR.MSSA, &ORE))
+                      AR.BFI, &AR.SE, AR.MSSA, &ORE))
     return PreservedAnalyses::all();
 
   auto PA = getLoopPassPreservedAnalyses();
@@ -330,7 +371,8 @@ PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM,
   Loop &OutermostLoop = LN.getOutermostLoop();
   bool Changed = LICM.runOnLoop(&OutermostLoop, &AR.AA, &AR.LI, &AR.DT, &AR.AC,
-                                &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA, &ORE, true);
+                                &AR.TLI, &AR.TTI, AR.BFI, &AR.SE, AR.MSSA,
+                                &ORE, true);
 
   if (!Changed)
     return PreservedAnalyses::all();
@@ -398,6 +440,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI,
                                         DominatorTree *DT, AssumptionCache *AC,
                                         TargetLibraryInfo *TLI,
                                         TargetTransformInfo *TTI,
+                                        BlockFrequencyInfo *BFI,
                                         ScalarEvolution *SE, MemorySSA *MSSA,
                                         OptimizationRemarkEmitter *ORE,
                                         bool LoopNestMode) {
@@ -456,9 +499,9 @@
                            MSSAU, &SafetyInfo, Flags, ORE);
   Flags.setIsSink(false);
   if (Preheader)
-    Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L,
-                           MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode,
-                           LicmAllowSpeculation);
+    Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI,
+                           TTI, BFI, L, MSSAU, SE, &SafetyInfo, Flags, ORE,
+                           LoopNestMode, LicmAllowSpeculation);
 
   // Now that all loop invariants have been removed from the loop, promote any
   // memory references to scalars that we can.
@@ -855,6 +898,154 @@ public:
 };
 } // namespace
 
+// The loop is guaranteed to be in LCSSA form, so any definition inside the
+// loop that is used outside of it is accessed through a PHI node in an exit
+// block.
+static void updateExitsPhis(Loop *CurLoop, Loop *ClonedLoop,
+                            ScalarEvolution *SE,
+                            SmallVectorImpl<Instruction *> &DefsUsedOutside,
+                            ValueToValueMapTy &VMap,
+                            SmallVectorImpl<BasicBlock *> &Exits) {
+  PHINode *PN = nullptr;
+  SmallPtrSet<BasicBlock *, 8> IsVisited;
+  for (auto *BB : Exits) {
+    if (IsVisited.contains(BB))
+      continue;
+    IsVisited.insert(BB);
+    // For each PHI, add the operand for the edge from the cloned loop.
+    for (auto I = BB->begin(); (PN = dyn_cast<PHINode>(I)); ++I) {
+      // If the incoming value or block was cloned, use the clone; otherwise
+      // reuse the original value.
+      size_t IncomingSize = PN->getNumIncomingValues();
+      for (size_t Idx = 0; Idx < IncomingSize; ++Idx) {
+        Value *ClonedValue = PN->getIncomingValue(Idx);
+        BasicBlock *ClonedExiting = PN->getIncomingBlock(Idx);
+        auto MappedValue = VMap.find(ClonedValue);
+        if (MappedValue != VMap.end())
+          ClonedValue = MappedValue->second;
+
+        auto MappedBB = VMap.find(ClonedExiting);
+        if (MappedBB != VMap.end())
+          ClonedExiting = dyn_cast<BasicBlock>(MappedBB->second);
+
+        PN->addIncoming(ClonedValue, ClonedExiting);
+      }
+    }
+  }
+}
+
+// For now, the element count is chosen for the minimum 128-bit SVE register
+// width.
+static ScalableVectorType *getSVEContainerType(Type *EltTy) {
+  if (EltTy == Type::getDoubleTy(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 2);
+
+  if (EltTy == Type::getFloatTy(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 4);
+
+  if (EltTy == Type::getBFloatTy(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 8);
+
+  if (EltTy == Type::getHalfTy(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 8);
+
+  if (EltTy == Type::getInt64Ty(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 2);
+
+  if (EltTy == Type::getInt32Ty(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 4);
+
+  if (EltTy == Type::getInt16Ty(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 8);
+
+  if (EltTy == Type::getInt8Ty(EltTy->getContext()))
+    return ScalableVectorType::get(EltTy, 16);
+
+  return nullptr;
+}
+
+static Instruction *replaceLoadWithLdnf(Instruction *I) {
+  auto *LoadI = dyn_cast<LoadInst>(I);
+  auto *PointerOp = LoadI->getPointerOperand();
+  const DataLayout &DL = LoadI->getModule()->getDataLayout();
+  auto *LITy = LoadI->getType();
+  IRBuilder<> B(LoadI);
+  auto *EltTy =
+      LITy->isPointerTy() ? B.getIntNTy(DL.getPointerSizeInBits()) : LITy;
+  auto *SVTy = getSVEContainerType(EltTy);
+  assert(SVTy && "Unsupported type of load instruction");
+
+  auto *PredTy =
+      ScalableVectorType::get(B.getInt1Ty(), SVTy->getMinNumElements());
+  Value *Imm = B.getInt32(1);
+  CallInst *Pred =
+      B.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy}, {Imm});
+
+  Type *PtrTy =
+      LITy->getPointerTo(PointerOp->getType()->getPointerAddressSpace());
+  if (PointerOp->getType() != PtrTy)
+    PointerOp = B.CreateBitCast(PointerOp, PtrTy);
+
+  CallInst *Ldnf = B.CreateIntrinsic(Intrinsic::aarch64_sve_ldnf1, {SVTy},
+                                     {Pred, PointerOp});
+  // FIXME: 1. Do we need to copy the alignment info? 2. Is this the right
+  // way to copy metadata?
+  propagateMetadata(Ldnf, LoadI);
+
+  Value *Scalar = B.CreateExtractElement(Ldnf, B.getInt64(0), "extract");
+  if (LITy->isPointerTy()) {
+    Value *PtrValue = B.CreateIntToPtr(
+        Scalar, PointerType::getUnqual(Scalar->getContext()));
+    return dyn_cast<Instruction>(PtrValue);
+  }
+  return dyn_cast<Instruction>(Scalar);
+}
+
+static bool
+findConditionalLoad(LoopInfo *LI, Loop *CurLoop, TargetTransformInfo *TTI,
+                    ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI,
+                    Instruction *I) {
+  auto *LoadI = dyn_cast<LoadInst>(I);
+  if (!LoadI)
+    return false;
+
+  if (LoadI->isAtomic() || LoadI->isVolatile())
+    return false;
+
+  Module *M = LoadI->getModule();
+  Triple TargetTriple(M->getTargetTriple());
+  if (!TargetTriple.isAArch64() || !TTI->supportsScalableVectors())
+    return false;
+
+  // TODO: Add support for vector types.
+  if (LoadI->getType()->isVectorTy())
+    return false;
+
+  BasicBlock *BB = LoadI->getParent();
+  if (HoistConditionalLoad == Generic_Prof ||
+      HoistConditionalLoad == Aggressive_Prof) {
+    if (!PSI->hasProfileSummary() || !PSI->isHotBlock(BB, BFI))
+      return false;
+  }
+
+  auto *PointerOp = LoadI->getPointerOperand();
+  if (isa<Argument>(PointerOp))
+    return true;
+
+  auto *PI = dyn_cast<Instruction>(PointerOp);
+  if (!PI || CurLoop->contains(PI))
+    return false;
+
+  return true;
+}
+
+static void createRuntimeCheckForCondLoad(
+    Loop *CurLoop, Loop *ClonedLoop, BasicBlock *Preheader,
+    MemorySSAUpdater &MSSAU, ICFLoopSafetyInfo *SafetyInfo,
+    SmallVectorImpl<CFGUpdate> &Updates) {
+  auto *OriTerm = Preheader->getTerminator();
+  IRBuilder<> B(OriTerm);
+  CallInst *Rdffr = B.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr, {}, {});
+  Type *RdffrTy = Rdffr->getType();
+  CallInst *PTrue = B.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {RdffrTy},
+                                      {B.getInt32(1)});
+  CallInst *IsValid = B.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any,
+                                        {RdffrTy}, {PTrue, Rdffr});
+  B.CreateCondBr(IsValid, CurLoop->getHeader(), ClonedLoop->getHeader());
+  eraseInstruction(*OriTerm, *SafetyInfo, MSSAU);
+  Updates.push_back(
+      {cfg::UpdateKind::Insert, Preheader, ClonedLoop->getHeader()});
+}
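+
+// In the guarded (generic) modes the preheader ends up looking roughly like
+// this (a sketch distilled from the test's CHECK-GEN-NOP lines; label names
+// are illustrative):
+//   call void @llvm.aarch64.sve.setffr()
+//   ... hoisted @llvm.aarch64.sve.ldnf1.* loads ...
+//   %ffr = call <vscale x 16 x i1> @llvm.aarch64.sve.rdffr()
+//   %pg  = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+//   %ok  = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> %pg,
+//                                                      <vscale x 16 x i1> %ffr)
+//   br i1 %ok, label %loop.hoisted, label %loop.lver.orig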
 
 /// Walk the specified region of the CFG (defined by all blocks dominated by
 /// the specified block, and that are in the current loop) in depth first
 /// order w.r.t the DominatorTree. This allows us to visit definitions before
 /// uses, allowing us to hoist a loop body in one pass without iteration.
 ///
 bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
                        DominatorTree *DT, AssumptionCache *AC,
-                       TargetLibraryInfo *TLI, Loop *CurLoop,
+                       TargetLibraryInfo *TLI, TargetTransformInfo *TTI,
+                       BlockFrequencyInfo *BFI, Loop *CurLoop,
                        MemorySSAUpdater &MSSAU, ScalarEvolution *SE,
                        ICFLoopSafetyInfo *SafetyInfo,
                        SinkAndHoistLICMFlags &Flags,
@@ -879,6 +1071,9 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
   // re-hoisted if they end up not dominating all of their uses.
   SmallVector<Instruction *, 16> HoistedInstructions;
 
+  Module *M = CurLoop->getHeader()->getModule();
+  ProfileSummaryInfo PSI(*M);
+
   // For PHI hoisting to work we need to hoist blocks before their successors.
   // We can do this by iterating through the blocks in the loop in reverse
   // post-order.
@@ -886,103 +1081,222 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI,
   Worklist.perform(LI);
   bool Changed = false;
   BasicBlock *Preheader = CurLoop->getLoopPreheader();
-  for (BasicBlock *BB : Worklist) {
-    // Only need to process the contents of this block if it is not part of a
-    // subloop (which would already have been processed).
-    if (!LoopNestMode && inSubLoop(BB, CurLoop, LI))
-      continue;
-
-    for (Instruction &I : llvm::make_early_inc_range(*BB)) {
-      // Try hoisting the instruction out to the preheader.  We can only do
-      // this if all of the operands of the instruction are loop invariant and
-      // if it is safe to hoist the instruction. We also check block frequency
-      // to make sure instruction only gets hoisted into colder blocks.
-      // TODO: It may be safe to hoist if we are hoisting to a conditional block
-      // and we have accurately duplicated the control flow from the loop header
-      // to that block.
-      if (CurLoop->hasLoopInvariantOperands(&I) &&
-          canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) &&
-          isSafeToExecuteUnconditionally(
-              I, DT, TLI, CurLoop, SafetyInfo, ORE,
-              Preheader->getTerminator(), AC, AllowSpeculation)) {
-        hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
-              MSSAU, SE, ORE);
-        HoistedInstructions.push_back(&I);
-        Changed = true;
-        continue;
-      }
-
-      // Attempt to remove floating point division out of the loop by
-      // converting it to a reciprocal multiplication.
-      if (I.getOpcode() == Instruction::FDiv && I.hasAllowReciprocal() &&
-          CurLoop->isLoopInvariant(I.getOperand(1))) {
-        auto Divisor = I.getOperand(1);
-        auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
-        auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
-        ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
-        SafetyInfo->insertInstructionTo(ReciprocalDivisor, I.getParent());
-        ReciprocalDivisor->insertBefore(&I);
-
-        auto Product =
-            BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
-        Product->setFastMathFlags(I.getFastMathFlags());
-        SafetyInfo->insertInstructionTo(Product, I.getParent());
-        Product->insertAfter(&I);
-        I.replaceAllUsesWith(Product);
-        eraseInstruction(I, *SafetyInfo, MSSAU);
-
-        hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB),
-              SafetyInfo, MSSAU, SE, ORE);
-        HoistedInstructions.push_back(ReciprocalDivisor);
-        Changed = true;
-        continue;
-      }
-
-      auto IsInvariantStart = [&](Instruction &I) {
-        using namespace PatternMatch;
-        return I.use_empty() &&
-               match(&I, m_Intrinsic<Intrinsic::invariant_start>());
-      };
-      auto MustExecuteWithoutWritesBefore = [&](Instruction &I) {
-        return SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) &&
-               SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop);
-      };
-      if ((IsInvariantStart(I) || isGuard(&I)) &&
-          CurLoop->hasLoopInvariantOperands(&I) &&
-          MustExecuteWithoutWritesBefore(I)) {
-        hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
-              MSSAU, SE, ORE);
-        HoistedInstructions.push_back(&I);
-        Changed = true;
-        continue;
-      }
-
-      if (PHINode *PN = dyn_cast<PHINode>(&I)) {
-        if (CFH.canHoistPHI(PN)) {
-          // Redirect incoming blocks first to ensure that we create hoisted
-          // versions of those blocks before we hoist the phi.
-          for (unsigned int i = 0; i < PN->getNumIncomingValues(); ++i)
-            PN->setIncomingBlock(
-                i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i)));
-          hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
-                MSSAU, SE, ORE);
-          assert(DT->dominates(PN, BB) && "Conditional PHIs not expected");
-          Changed = true;
-          continue;
-        }
-      }
-
-      // Try to reassociate instructions so that part of computations can be
-      // done out of loop.
-      if (hoistArithmetics(I, *CurLoop, *SafetyInfo, MSSAU, AC, DT)) {
-        Changed = true;
-        continue;
-      }
-
-      // Remember possibly hoistable branches so we can actually hoist them
-      // later if needed.
-      if (BranchInst *BI = dyn_cast<BranchInst>(&I))
-        CFH.registerPossiblyHoistableBranch(BI);
-    }
-  }
+  bool hasCondLoadCand = false;
+  auto traverseAndHoist = [&](BasicBlock *Preheader,
+                              bool hoistCondLoad) -> void {
+    for (BasicBlock *BB : Worklist) {
+      // Only need to process the contents of this block if it is not part of a
+      // subloop (which would already have been processed).
+      if (!LoopNestMode && inSubLoop(BB, CurLoop, LI))
+        continue;
+
+      for (Instruction &I : llvm::make_early_inc_range(*BB)) {
+        bool SafeHoist = isSafeToExecuteUnconditionally(
+            I, DT, TLI, CurLoop, SafetyInfo, ORE, Preheader->getTerminator(),
+            AC, AllowSpeculation);
+        // Try hoisting the instruction out to the preheader.  We can only do
+        // this if all of the operands of the instruction are loop invariant
+        // and if it is safe to hoist the instruction. We also check block
+        // frequency to make sure the instruction only gets hoisted into
+        // colder blocks.
+        // TODO: It may be safe to hoist if we are hoisting to a conditional
+        // block and we have accurately duplicated the control flow from the
+        // loop header to that block.
+        if (CurLoop->hasLoopInvariantOperands(&I) &&
+            canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) &&
+            SafeHoist) {
+          hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
+                MSSAU, SE, ORE);
+          HoistedInstructions.push_back(&I);
+          Changed = true;
+          continue;
+        }
+
+        if (!SafeHoist && HoistConditionalLoad != Hoist_Off &&
+            !hasDisableHoistCondLoad(CurLoop)) {
+          if (findConditionalLoad(LI, CurLoop, TTI, &PSI, BFI, &I)) {
+            LLVM_DEBUG(dbgs() << "LICM: found conditional load: " << I
+                              << "\n");
+            if (CurLoop->hasLoopInvariantOperands(&I) &&
+                canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags,
+                                   ORE) &&
+                (getSVEContainerType(I.getType()) ||
+                 I.getType()->isPointerTy())) {
+              hasCondLoadCand = true;
+              if (hoistCondLoad) {
+                hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB),
+                      SafetyInfo, MSSAU, SE, ORE);
+                // Replace the hoisted load with @llvm.aarch64.sve.ldnf1.*.
+                Instruction *ExtractI = replaceLoadWithLdnf(&I);
+                assert(ExtractI && "Failed to create ldnf1 to replace load");
+                LLVM_DEBUG(dbgs() << "LICM: replaced with ldnf1: "
+                                  << *ExtractI << "\n");
+                I.replaceAllUsesWith(ExtractI);
+                eraseInstruction(I, *SafetyInfo, MSSAU);
+                // TODO: Does this need to go into HoistedInstructions?
+                Changed = true;
+                continue;
+              }
+            }
+          }
+        }
+
+        // Attempt to remove floating point division out of the loop by
+        // converting it to a reciprocal multiplication.
+        if (I.getOpcode() == Instruction::FDiv && I.hasAllowReciprocal() &&
+            CurLoop->isLoopInvariant(I.getOperand(1))) {
+          auto Divisor = I.getOperand(1);
+          auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0);
+          auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor);
+          ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags());
+          SafetyInfo->insertInstructionTo(ReciprocalDivisor, I.getParent());
+          ReciprocalDivisor->insertBefore(&I);
+
+          auto Product =
+              BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor);
+          Product->setFastMathFlags(I.getFastMathFlags());
+          SafetyInfo->insertInstructionTo(Product, I.getParent());
+          Product->insertAfter(&I);
+          I.replaceAllUsesWith(Product);
+          eraseInstruction(I, *SafetyInfo, MSSAU);
+
+          hoist(*ReciprocalDivisor, DT, CurLoop,
+                CFH.getOrCreateHoistedBlock(BB), SafetyInfo, MSSAU, SE, ORE);
+          HoistedInstructions.push_back(ReciprocalDivisor);
+          Changed = true;
+          continue;
+        }
+
+        auto IsInvariantStart = [&](Instruction &I) {
+          using namespace PatternMatch;
+          return I.use_empty() &&
+                 match(&I, m_Intrinsic<Intrinsic::invariant_start>());
+        };
+        auto MustExecuteWithoutWritesBefore = [&](Instruction &I) {
+          return SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) &&
+                 SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop);
+        };
+        if ((IsInvariantStart(I) || isGuard(&I)) &&
+            CurLoop->hasLoopInvariantOperands(&I) &&
+            MustExecuteWithoutWritesBefore(I)) {
+          hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo,
+                MSSAU, SE, ORE);
+          HoistedInstructions.push_back(&I);
+          Changed = true;
+          continue;
+        }
+
+        if (PHINode *PN = dyn_cast<PHINode>(&I)) {
+          if (CFH.canHoistPHI(PN)) {
+            // Redirect incoming blocks first to ensure that we create hoisted
+            // versions of those blocks before we hoist the phi.
+            for (unsigned int i = 0; i < PN->getNumIncomingValues(); ++i)
+              PN->setIncomingBlock(
+                  i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i)));
+            hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB),
+                  SafetyInfo, MSSAU, SE, ORE);
+            assert(DT->dominates(PN, BB) && "Conditional PHIs not expected");
+            Changed = true;
+            continue;
+          }
+        }
+
+        // Try to reassociate instructions so that part of computations can be
+        // done out of loop.
+        if (hoistArithmetics(I, *CurLoop, *SafetyInfo, MSSAU, AC, DT)) {
+          Changed = true;
+          continue;
+        }
+
+        // Remember possibly hoistable branches so we can actually hoist them
+        // later if needed.
+        if (BranchInst *BI = dyn_cast<BranchInst>(&I))
+          CFH.registerPossiblyHoistableBranch(BI);
+      }
+    }
+  };
+
+  if (HoistConditionalLoad != Generic_Prof &&
+      HoistConditionalLoad != Generic_NoProf) {
+    traverseAndHoist(Preheader, true);
+  } else {
+    traverseAndHoist(Preheader, false);
+
+    if (hasCondLoadCand) {
+      SmallVector<BasicBlock *, 8> NonVersionedLoopBlocks;
+      ValueToValueMapTy VMap;
+
+      BasicBlock *Header = CurLoop->getHeader();
+      Preheader = CurLoop->getLoopPreheader();
+
+      SmallVector<Instruction *, 8> DefsUsedOutside =
+          findDefsUsedOutsideOfLoop(CurLoop);
+      // FIXME: Can creating a new loop in the middle of LICM cause
+      // unexpected problems?
+      VMap[Preheader] = Preheader;
+      Loop *ClonedLoop =
+          cloneLoopBody(Header, Preheader, CurLoop, VMap, ".licm.lver.orig",
+                        LI, DT, NonVersionedLoopBlocks);
+      remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap);
+
+      SmallVector<BasicBlock *, 8> Exits;
+      CurLoop->getExitBlocks(Exits);
+      for (auto *ExitBB : Exits) {
+        DT->changeImmediateDominator(ExitBB, Preheader);
+      }
+      Instruction *Term = Preheader->getTerminator();
+      IRBuilder<> B(Term);
+
+      Value *Cond = ConstantInt::getTrue(B.getContext());
+      B.CreateCondBr(Cond, CurLoop->getHeader(), ClonedLoop->getHeader());
+      eraseInstruction(*Term, *SafetyInfo, MSSAU);
+
+      SmallVector<CFGUpdate, 8> Updates;
+      Updates.push_back(
+          {cfg::UpdateKind::Insert, Preheader, ClonedLoop->getHeader()});
+      updateExitsPhis(CurLoop, ClonedLoop, SE, DefsUsedOutside, VMap, Exits);
+
+      // Update MemorySSA.
+      LoopBlocksRPO LBRPO(CurLoop);
+      LBRPO.perform(LI);
+      MSSAU.updateForClonedLoop(LBRPO, Exits, VMap,
+                                /*IgnoreIncomingWithNoClones=*/true);
+      MSSAU.updateExitBlocksForClonedLoop(Exits, VMap, *DT);
+
+      SmallVector<BasicBlock *, 8> ClonedExitings;
+      ClonedLoop->getExitingBlocks(ClonedExitings);
+      for (auto *Exiting : ClonedExitings) {
+        for (auto *Succ : successors(Exiting)) {
+          if (!ClonedLoop->contains(Succ))
+            Updates.push_back({cfg::UpdateKind::Insert, Exiting, Succ});
+        }
+      }
+      MSSAU.applyInsertUpdates(Updates, *DT);
+      InsertPreheaderForLoop(CurLoop, DT, LI, &MSSAU, true);
+
+      BasicBlock *OldPreheader = Preheader;
+
+      Preheader = CurLoop->getLoopPreheader();
+      B.SetInsertPoint(Preheader->getTerminator());
+      B.CreateIntrinsic(Intrinsic::aarch64_sve_setffr, {}, {});
+
+      // Hoist the conditional loads for CurLoop.
+      traverseAndHoist(Preheader, true);
+
+      Instruction *NewTerm = Preheader->getTerminator();
+      B.SetInsertPoint(NewTerm);
+      B.CreateBr(Preheader);
+      eraseInstruction(*NewTerm, *SafetyInfo, MSSAU);
+
+      SmallVector<CFGUpdate, 8> Updates1;
+      Updates1.push_back(
+          {cfg::UpdateKind::Delete, OldPreheader, ClonedLoop->getHeader()});
+
+      createRuntimeCheckForCondLoad(CurLoop, ClonedLoop, Preheader, MSSAU,
+                                    SafetyInfo, Updates);
+      ClonedLoop->getHeader()->replacePhiUsesWith(OldPreheader, Preheader);
+
+      // Update the DominatorTree.
+      DT->recalculate(*Preheader->getParent());
+      Updates1.push_back(
+          {cfg::UpdateKind::Insert, Preheader, ClonedLoop->getHeader()});
+      MSSAU.applyInsertUpdates(Updates1, *DT);
+
+      InsertPreheaderForLoop(ClonedLoop, DT, LI, &MSSAU, true);
+      InsertPreheaderForLoop(CurLoop, DT, LI, &MSSAU, true);
+      formDedicatedExitBlocks(ClonedLoop, DT, LI, &MSSAU, true);
+      formDedicatedExitBlocks(CurLoop, DT, LI, &MSSAU, true);
+      assert(CurLoop->isLoopSimplifyForm() &&
+             ClonedLoop->isLoopSimplifyForm() &&
+             "The versioned loops should be in simplified form.");
+      // Set the "llvm.loop.hoist.cond_load.disable" metadata.
+      ClonedLoop->setLoopHoistedVersion();
+    }
+  }
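The resulting CFG in the guarded modes has roughly the following shape (a
sketch based on the new test; block names are illustrative, and the
`br i1 true` in the old preheader is the placeholder branch created above):

    entry:                         ; old preheader
      br i1 true, label %ph, label %lver.ph
    ph:                            ; new preheader: setffr, hoisted ldnf1
                                   ; loads, then the rdffr/ptest check
      br i1 %ok, label %loop.header, label %lver.ph
    lver.ph:                       ; dedicated preheader of the fallback loop
      br label %loop.header.licm.lver.orig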
diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp
index d55208602b71..59ee7ab77b57 100644
--- a/llvm/lib/Transforms/Utils/CloneFunction.cpp
+++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp
@@ -945,40 +945,21 @@ void llvm::remapInstructionsInBlocks(ArrayRef<BasicBlock *> Blocks,
                    RF_NoModuleLevelChanges | RF_IgnoreMissingLocals);
 }
 
-/// Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
-/// Blocks.
-///
-/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
-/// \p LoopDomBB. Inserts the new blocks before the block specified in
-/// \p Before.
-Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
-                                   Loop *OrigLoop, ValueToValueMapTy &VMap,
-                                   const Twine &NameSuffix, LoopInfo *LI,
-                                   DominatorTree *DT,
-                                   SmallVectorImpl<BasicBlock *> &Blocks) {
+Loop *llvm::cloneLoopBody(BasicBlock *Before, BasicBlock *LoopDomBB,
+                          Loop *OrigLoop, ValueToValueMapTy &VMap,
+                          const Twine &NameSuffix, LoopInfo *LI,
+                          DominatorTree *DT,
+                          SmallVectorImpl<BasicBlock *> &Blocks) {
   Function *F = OrigLoop->getHeader()->getParent();
   Loop *ParentLoop = OrigLoop->getParentLoop();
   DenseMap<Loop *, Loop *> LMap;
 
   Loop *NewLoop = LI->AllocateLoop();
-  LMap[OrigLoop] = NewLoop;
   if (ParentLoop)
     ParentLoop->addChildLoop(NewLoop);
   else
     LI->addTopLevelLoop(NewLoop);
-
-  BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
-  assert(OrigPH && "No preheader");
-  BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F);
-  // To rename the loop PHIs.
-  VMap[OrigPH] = NewPH;
-  Blocks.push_back(NewPH);
-
-  // Update LoopInfo.
-  if (ParentLoop)
-    ParentLoop->addBasicBlockToLoop(NewPH, *LI);
-
-  // Update DominatorTree.
-  DT->addNewBlock(NewPH, LoopDomBB);
+  LMap[OrigLoop] = NewLoop;
 
   for (Loop *CurLoop : OrigLoop->getLoopsInPreorder()) {
     Loop *&NewLoop = LMap[CurLoop];
@@ -1008,7 +989,7 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
 
     // Add DominatorTree node. After seeing all blocks, update to correct
     // IDom.
-    DT->addNewBlock(NewBB, NewPH);
+    DT->addNewBlock(NewBB, LoopDomBB);
 
     Blocks.push_back(NewBB);
   }
@@ -1026,13 +1007,46 @@
   }
 
   // Move them physically from the end of the block list.
-  F->splice(Before->getIterator(), F, NewPH->getIterator());
   F->splice(Before->getIterator(), F, NewLoop->getHeader()->getIterator(),
             F->end());
 
   return NewLoop;
 }
 
+/// Clones a loop \p OrigLoop. Returns the loop and the blocks in \p
+/// Blocks.
+///
+/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block
+/// \p LoopDomBB. Inserts the new blocks before the block specified in
+/// \p Before.
+Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB,
+                                   Loop *OrigLoop, ValueToValueMapTy &VMap,
+                                   const Twine &NameSuffix, LoopInfo *LI,
+                                   DominatorTree *DT,
+                                   SmallVectorImpl<BasicBlock *> &Blocks) {
+  Function *F = OrigLoop->getHeader()->getParent();
+  Loop *ParentLoop = OrigLoop->getParentLoop();
+
+  BasicBlock *OrigPH = OrigLoop->getLoopPreheader();
+  assert(OrigPH && "No preheader");
+  BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F);
+  // To rename the loop PHIs.
+  VMap[OrigPH] = NewPH;
+  Blocks.push_back(NewPH);
+
+  // Update LoopInfo.
+  if (ParentLoop)
+    ParentLoop->addBasicBlockToLoop(NewPH, *LI);
+
+  // Update DominatorTree.
+  DT->addNewBlock(NewPH, LoopDomBB);
+
+  // Move it physically from the end of the block list.
+  F->splice(Before->getIterator(), F, NewPH->getIterator());
+
+  return cloneLoopBody(Before, NewPH, OrigLoop, VMap, NameSuffix, LI, DT,
+                       Blocks);
+}
+
 /// Duplicate non-Phi instructions from the beginning of block up to
 /// StopAt instruction into a split block between BB and its predecessor.
 BasicBlock *llvm::DuplicateInstructionsInSplitBetween(

diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index eaaf910e6df1..69cc465000dd 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -349,6 +349,10 @@ bool llvm::hasDisableLICMTransformsHint(const Loop *L) {
   return getBooleanLoopAttribute(L, LLVMLoopDisableLICM);
 }
 
+bool llvm::hasDisableHoistCondLoad(const Loop *L) {
+  return getBooleanLoopAttribute(L, "llvm.loop.hoist.cond_load.disable");
+}
+
 TransformationMode llvm::hasUnrollTransformation(const Loop *L) {
   if (getBooleanLoopAttribute(L, "llvm.loop.unroll.disable"))
     return TM_SuppressedByUser;

diff --git a/llvm/test/Transforms/LICM/AArch64/hoist-cond-load.ll b/llvm/test/Transforms/LICM/AArch64/hoist-cond-load.ll
new file mode 100644
index 000000000000..26d57830af7f
--- /dev/null
+++ b/llvm/test/Transforms/LICM/AArch64/hoist-cond-load.ll
@@ -0,0 +1,245 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -passes=licm -licm-hoist-conditional-load=aggressive_noprof -verify-memoryssa < %s | FileCheck %s -check-prefixes=CHECK-AGG-NOP
+; RUN: opt -S -passes=licm -licm-hoist-conditional-load=generic_noprof -verify-memoryssa < %s | FileCheck %s -check-prefixes=CHECK-GEN-NOP
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+%struct.Data = type { i32, double, ptr }
+
+define i32 @func(ptr %this, ptr %attr, ptr %indexs, i64 %sizes) #0 {
+; CHECK-AGG-NOP-LABEL: define i32 @func
+; CHECK-AGG-NOP-SAME: (ptr [[THIS:%.*]], ptr [[ATTR:%.*]], ptr [[INDEXS:%.*]], i64 [[SIZES:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-AGG-NOP-NEXT:  entry:
+; CHECK-AGG-NOP-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 1)
+; CHECK-AGG-NOP-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnf1.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[INDEXS]])
+; CHECK-AGG-NOP-NEXT:    [[EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[TMP1]], i64 0
+; CHECK-AGG-NOP-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[EXTRACT]] to ptr
+; CHECK-AGG-NOP-NEXT:    [[CMP3:%.*]] = icmp eq ptr [[ATTR]], null
+; CHECK-AGG-NOP-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
+; CHECK-AGG-NOP-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1> [[TMP3]], ptr [[ATTR]])
+; CHECK-AGG-NOP-NEXT:    [[EXTRACT1:%.*]] = extractelement <vscale x 4 x i32> [[TMP4]], i64 0
+; CHECK-AGG-NOP-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[EXTRACT1]], 0
+; CHECK-AGG-NOP-NEXT:    [[WEIGHT:%.*]] = getelementptr inbounds [[STRUCT_DATA:%.*]], ptr [[ATTR]], i64 0, i32 1
+; CHECK-AGG-NOP-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 1)
+; CHECK-AGG-NOP-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.ldnf1.nxv2f64(<vscale x 2 x i1> [[TMP5]], ptr [[WEIGHT]])
+; CHECK-AGG-NOP-NEXT:    [[EXTRACT2:%.*]] = extractelement <vscale x 2 x double> [[TMP6]], i64 0
+; CHECK-AGG-NOP-NEXT:    [[CMP6:%.*]] = fcmp ogt double [[EXTRACT2]], 1.000000e+01
+; CHECK-AGG-NOP-NEXT:    [[VPTR:%.*]] = getelementptr inbounds [[STRUCT_DATA]], ptr [[ATTR]], i64 0, i32 2
+; CHECK-AGG-NOP-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 1)
+; CHECK-AGG-NOP-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnf1.nxv2i64(<vscale x 2 x i1> [[TMP7]], ptr [[VPTR]])
+; CHECK-AGG-NOP-NEXT:    [[EXTRACT3:%.*]] = extractelement <vscale x 2 x i64> [[TMP8]], i64 0
+; CHECK-AGG-NOP-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[EXTRACT3]] to ptr
+; CHECK-AGG-NOP-NEXT:    [[TOBOOL8_NOT:%.*]] = icmp eq ptr [[TMP9]], null
+; CHECK-AGG-NOP-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
+; CHECK-AGG-NOP-NEXT:    [[TMP11:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1> [[TMP10]], ptr [[TMP9]])
+; CHECK-AGG-NOP-NEXT:    [[EXTRACT4:%.*]] = extractelement <vscale x 4 x i32> [[TMP11]], i64 0
+; CHECK-AGG-NOP-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK-AGG-NOP:       for.cond:
+; CHECK-AGG-NOP-NEXT:    [[RES_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RES_2:%.*]], [[FOR_INC:%.*]] ]
+; CHECK-AGG-NOP-NEXT:    [[I_0:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_INC]] ]
+; CHECK-AGG-NOP-NEXT:    [[CMP:%.*]] = icmp ult i64 [[I_0]], [[SIZES]]
+; CHECK-AGG-NOP-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-AGG-NOP:       for.cond.cleanup:
+; CHECK-AGG-NOP-NEXT:    [[RES_0_LCSSA:%.*]] = phi i32 [ [[RES_0]], [[FOR_COND]] ]
+; CHECK-AGG-NOP-NEXT:    ret i32 [[RES_0_LCSSA]]
+; CHECK-AGG-NOP:       for.body:
+; CHECK-AGG-NOP-NEXT:    [[ADD_PTR_I:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[I_0]]
+; CHECK-AGG-NOP-NEXT:    [[TMP12:%.*]] = load i32, ptr [[ADD_PTR_I]], align 4
+; CHECK-AGG-NOP-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[TMP12]], 0
+; CHECK-AGG-NOP-NEXT:    [[OR_COND:%.*]] = or i1 [[CMP3]], [[CMP2]]
+; CHECK-AGG-NOP-NEXT:    br i1 [[OR_COND]], label [[FOR_INC]], label [[IF_END5:%.*]]
+; CHECK-AGG-NOP:       if.end5:
+; CHECK-AGG-NOP-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
+; CHECK-AGG-NOP:       land.lhs.true:
+; CHECK-AGG-NOP-NEXT:    br i1 [[CMP6]], label [[IF_THEN7:%.*]], label [[FOR_INC]]
+; CHECK-AGG-NOP:       if.then7:
+; CHECK-AGG-NOP-NEXT:    br i1 [[TOBOOL8_NOT]], label [[IF_END13:%.*]], label [[IF_THEN9:%.*]]
+; CHECK-AGG-NOP:       if.then9:
+; CHECK-AGG-NOP-NEXT:    [[ADD:%.*]] = add i32 [[TMP12]], [[RES_0]]
+; CHECK-AGG-NOP-NEXT:    [[ADD12:%.*]] = add i32 [[ADD]], [[EXTRACT4]]
+; CHECK-AGG-NOP-NEXT:    br label [[IF_END13]]
+; CHECK-AGG-NOP:       if.end13:
+; CHECK-AGG-NOP-NEXT:    [[RES_1:%.*]] = phi i32 [ [[ADD12]], [[IF_THEN9]] ], [ [[RES_0]], [[IF_THEN7]] ]
+; CHECK-AGG-NOP-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP12]], 2
+; CHECK-AGG-NOP-NEXT:    [[ADD15:%.*]] = add nsw i32 [[RES_1]], [[MUL]]
+; CHECK-AGG-NOP-NEXT:    br label [[FOR_INC]]
+; CHECK-AGG-NOP:       for.inc:
+; CHECK-AGG-NOP-NEXT:    [[RES_2]] = phi i32 [ [[RES_0]], [[FOR_BODY]] ], [ [[ADD15]], [[IF_END13]] ], [ [[RES_0]], [[LAND_LHS_TRUE]] ], [ [[RES_0]], [[IF_END5]] ]
+; CHECK-AGG-NOP-NEXT:    [[INC]] = add nuw i64 [[I_0]], 1
+; CHECK-AGG-NOP-NEXT:    br label [[FOR_COND]]
+;
+; CHECK-GEN-NOP-LABEL: define i32 @func
+; CHECK-GEN-NOP-SAME: (ptr [[THIS:%.*]], ptr [[ATTR:%.*]], ptr [[INDEXS:%.*]], i64 [[SIZES:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-GEN-NOP-NEXT:  entry:
+; CHECK-GEN-NOP-NEXT:    [[CMP3:%.*]] = icmp eq ptr [[ATTR]], null
+; CHECK-GEN-NOP-NEXT:    [[WEIGHT:%.*]] = getelementptr inbounds [[STRUCT_DATA:%.*]], ptr [[ATTR]], i64 0, i32 1
+; CHECK-GEN-NOP-NEXT:    [[VPTR:%.*]] = getelementptr inbounds [[STRUCT_DATA]], ptr [[ATTR]], i64 0, i32 2
+; CHECK-GEN-NOP-NEXT:    br i1 true, label [[FOR_COND_PREHEADER:%.*]], label [[FOR_COND_LICM_LVER_ORIG_PREHEADER:%.*]]
+; CHECK-GEN-NOP:       for.cond.preheader:
+; CHECK-GEN-NOP-NEXT:    call void @llvm.aarch64.sve.setffr()
+; CHECK-GEN-NOP-NEXT:    [[TMP0:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 1)
+; CHECK-GEN-NOP-NEXT:    [[TMP1:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnf1.nxv2i64(<vscale x 2 x i1> [[TMP0]], ptr [[INDEXS]])
+; CHECK-GEN-NOP-NEXT:    [[EXTRACT:%.*]] = extractelement <vscale x 2 x i64> [[TMP1]], i64 0
+; CHECK-GEN-NOP-NEXT:    [[TMP2:%.*]] = inttoptr i64 [[EXTRACT]] to ptr
+; CHECK-GEN-NOP-NEXT:    [[TMP3:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
+; CHECK-GEN-NOP-NEXT:    [[TMP4:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1> [[TMP3]], ptr [[ATTR]])
+; CHECK-GEN-NOP-NEXT:    [[EXTRACT1:%.*]] = extractelement <vscale x 4 x i32> [[TMP4]], i64 0
+; CHECK-GEN-NOP-NEXT:    [[TOBOOL_NOT:%.*]] = icmp eq i32 [[EXTRACT1]], 0
+; CHECK-GEN-NOP-NEXT:    [[TMP5:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 1)
+; CHECK-GEN-NOP-NEXT:    [[TMP6:%.*]] = call <vscale x 2 x double> @llvm.aarch64.sve.ldnf1.nxv2f64(<vscale x 2 x i1> [[TMP5]], ptr [[WEIGHT]])
+; CHECK-GEN-NOP-NEXT:    [[EXTRACT2:%.*]] = extractelement <vscale x 2 x double> [[TMP6]], i64 0
+; CHECK-GEN-NOP-NEXT:    [[CMP6:%.*]] = fcmp ogt double [[EXTRACT2]], 1.000000e+01
+; CHECK-GEN-NOP-NEXT:    [[TMP7:%.*]] = call <vscale x 2 x i1> @llvm.aarch64.sve.ptrue.nxv2i1(i32 1)
+; CHECK-GEN-NOP-NEXT:    [[TMP8:%.*]] = call <vscale x 2 x i64> @llvm.aarch64.sve.ldnf1.nxv2i64(<vscale x 2 x i1> [[TMP7]], ptr [[VPTR]])
+; CHECK-GEN-NOP-NEXT:    [[EXTRACT3:%.*]] = extractelement <vscale x 2 x i64> [[TMP8]], i64 0
+; CHECK-GEN-NOP-NEXT:    [[TMP9:%.*]] = inttoptr i64 [[EXTRACT3]] to ptr
+; CHECK-GEN-NOP-NEXT:    [[TOBOOL8_NOT:%.*]] = icmp eq ptr [[TMP9]], null
+; CHECK-GEN-NOP-NEXT:    [[TMP10:%.*]] = call <vscale x 4 x i1> @llvm.aarch64.sve.ptrue.nxv4i1(i32 1)
+; CHECK-GEN-NOP-NEXT:    [[TMP11:%.*]] = call <vscale x 4 x i32> @llvm.aarch64.sve.ldnf1.nxv4i32(<vscale x 4 x i1> [[TMP10]], ptr [[TMP9]])
+; CHECK-GEN-NOP-NEXT:    [[EXTRACT4:%.*]] = extractelement <vscale x 4 x i32> [[TMP11]], i64 0
+; CHECK-GEN-NOP-NEXT:    [[TMP12:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.rdffr()
+; CHECK-GEN-NOP-NEXT:    [[TMP13:%.*]] = call <vscale x 16 x i1> @llvm.aarch64.sve.ptrue.nxv16i1(i32 1)
+; CHECK-GEN-NOP-NEXT:    [[TMP14:%.*]] = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1(<vscale x 16 x i1> [[TMP13]], <vscale x 16 x i1> [[TMP12]])
+; CHECK-GEN-NOP-NEXT:    br i1 [[TMP14]], label [[FOR_COND_PREHEADER5:%.*]], label [[FOR_COND_LICM_LVER_ORIG_PREHEADER]]
+; CHECK-GEN-NOP:       for.cond.preheader5:
+; CHECK-GEN-NOP-NEXT:    br label [[FOR_COND:%.*]]
+; CHECK-GEN-NOP:       for.cond.licm.lver.orig.preheader:
+; CHECK-GEN-NOP-NEXT:    br label [[FOR_COND_LICM_LVER_ORIG:%.*]]
+; CHECK-GEN-NOP:       for.cond.licm.lver.orig:
+; CHECK-GEN-NOP-NEXT:    [[RES_0_LICM_LVER_ORIG:%.*]] = phi i32 [ [[RES_2_LICM_LVER_ORIG:%.*]], [[FOR_INC_LICM_LVER_ORIG:%.*]] ], [ 0, [[FOR_COND_LICM_LVER_ORIG_PREHEADER]] ]
+; CHECK-GEN-NOP-NEXT:    [[I_0_LICM_LVER_ORIG:%.*]] = phi i64 [ [[INC_LICM_LVER_ORIG:%.*]], [[FOR_INC_LICM_LVER_ORIG]] ], [ 0, [[FOR_COND_LICM_LVER_ORIG_PREHEADER]] ]
+; CHECK-GEN-NOP-NEXT:    [[CMP_LICM_LVER_ORIG:%.*]] = icmp ult i64 [[I_0_LICM_LVER_ORIG]], [[SIZES]]
+; CHECK-GEN-NOP-NEXT:    br i1 [[CMP_LICM_LVER_ORIG]], label [[FOR_BODY_LICM_LVER_ORIG:%.*]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]]
+; CHECK-GEN-NOP:       for.body.licm.lver.orig:
+; CHECK-GEN-NOP-NEXT:    [[TMP15:%.*]] = load ptr, ptr [[INDEXS]], align 8
+; CHECK-GEN-NOP-NEXT:    [[ADD_PTR_I_LICM_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[I_0_LICM_LVER_ORIG]]
+; CHECK-GEN-NOP-NEXT:    [[TMP16:%.*]] = load i32, ptr [[ADD_PTR_I_LICM_LVER_ORIG]], align 4
+; CHECK-GEN-NOP-NEXT:    [[CMP2_LICM_LVER_ORIG:%.*]] = icmp eq i32 [[TMP16]], 0
+; CHECK-GEN-NOP-NEXT:    [[OR_COND_LICM_LVER_ORIG:%.*]] = or i1 [[CMP3]], [[CMP2_LICM_LVER_ORIG]]
+; CHECK-GEN-NOP-NEXT:    br i1 [[OR_COND_LICM_LVER_ORIG]], label [[FOR_INC_LICM_LVER_ORIG]], label [[IF_END5_LICM_LVER_ORIG:%.*]]
+; CHECK-GEN-NOP:       if.end5.licm.lver.orig:
+; CHECK-GEN-NOP-NEXT:    [[TMP17:%.*]] = load i32, ptr [[ATTR]], align 8
+; CHECK-GEN-NOP-NEXT:    [[TOBOOL_NOT_LICM_LVER_ORIG:%.*]] = icmp eq i32 [[TMP17]], 0
+; CHECK-GEN-NOP-NEXT:    br i1 [[TOBOOL_NOT_LICM_LVER_ORIG]], label [[FOR_INC_LICM_LVER_ORIG]], label [[LAND_LHS_TRUE_LICM_LVER_ORIG:%.*]]
+; CHECK-GEN-NOP:       land.lhs.true.licm.lver.orig:
+; CHECK-GEN-NOP-NEXT:    [[TMP18:%.*]] = load double, ptr [[WEIGHT]], align 8
+; CHECK-GEN-NOP-NEXT:    [[CMP6_LICM_LVER_ORIG:%.*]] = fcmp ogt double [[TMP18]], 1.000000e+01
+; CHECK-GEN-NOP-NEXT:    br i1 [[CMP6_LICM_LVER_ORIG]], label [[IF_THEN7_LICM_LVER_ORIG:%.*]], label [[FOR_INC_LICM_LVER_ORIG]]
+; CHECK-GEN-NOP:       if.then7.licm.lver.orig:
+; CHECK-GEN-NOP-NEXT:    [[TMP19:%.*]] = load ptr, ptr [[VPTR]], align 8
+; CHECK-GEN-NOP-NEXT:    [[TOBOOL8_NOT_LICM_LVER_ORIG:%.*]] = icmp eq ptr [[TMP19]], null
+; CHECK-GEN-NOP-NEXT:    br i1 [[TOBOOL8_NOT_LICM_LVER_ORIG]], label [[IF_END13_LICM_LVER_ORIG:%.*]], label [[IF_THEN9_LICM_LVER_ORIG:%.*]]
+; CHECK-GEN-NOP:       if.then9.licm.lver.orig:
+; CHECK-GEN-NOP-NEXT:    [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4
+; CHECK-GEN-NOP-NEXT:    [[ADD_LICM_LVER_ORIG:%.*]] = add i32 [[TMP16]], [[RES_0_LICM_LVER_ORIG]]
+; CHECK-GEN-NOP-NEXT:    [[ADD12_LICM_LVER_ORIG:%.*]] = add i32 [[ADD_LICM_LVER_ORIG]], [[TMP20]]
+; CHECK-GEN-NOP-NEXT:    br label [[IF_END13_LICM_LVER_ORIG]]
+; CHECK-GEN-NOP:       if.end13.licm.lver.orig:
+; CHECK-GEN-NOP-NEXT:    [[RES_1_LICM_LVER_ORIG:%.*]] = phi i32 [ [[ADD12_LICM_LVER_ORIG]], [[IF_THEN9_LICM_LVER_ORIG]] ], [ [[RES_0_LICM_LVER_ORIG]], [[IF_THEN7_LICM_LVER_ORIG]] ]
+; CHECK-GEN-NOP-NEXT:    [[MUL_LICM_LVER_ORIG:%.*]] = mul nsw i32 [[TMP16]], 2
+; CHECK-GEN-NOP-NEXT:    [[ADD15_LICM_LVER_ORIG:%.*]] = add nsw i32 [[RES_1_LICM_LVER_ORIG]], [[MUL_LICM_LVER_ORIG]]
+; CHECK-GEN-NOP-NEXT:    br label [[FOR_INC_LICM_LVER_ORIG]]
+; CHECK-GEN-NOP:       for.inc.licm.lver.orig:
+; CHECK-GEN-NOP-NEXT:    [[RES_2_LICM_LVER_ORIG]] = phi i32 [ [[RES_0_LICM_LVER_ORIG]], [[FOR_BODY_LICM_LVER_ORIG]] ], [ [[ADD15_LICM_LVER_ORIG]], [[IF_END13_LICM_LVER_ORIG]] ], [ [[RES_0_LICM_LVER_ORIG]], [[LAND_LHS_TRUE_LICM_LVER_ORIG]] ], [ [[RES_0_LICM_LVER_ORIG]], [[IF_END5_LICM_LVER_ORIG]] ]
+; CHECK-GEN-NOP-NEXT:    [[INC_LICM_LVER_ORIG]] = add nuw i64 [[I_0_LICM_LVER_ORIG]], 1
+; CHECK-GEN-NOP-NEXT:    br label [[FOR_COND_LICM_LVER_ORIG]], !llvm.loop [[LOOP0:![0-9]+]]
+; CHECK-GEN-NOP:       for.cond:
+; CHECK-GEN-NOP-NEXT:    [[RES_0:%.*]] = phi i32 [ [[RES_2:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_COND_PREHEADER5]] ]
+; CHECK-GEN-NOP-NEXT:    [[I_0:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC]] ], [ 0, [[FOR_COND_PREHEADER5]] ]
+; CHECK-GEN-NOP-NEXT:    [[CMP:%.*]] = icmp ult i64 [[I_0]], [[SIZES]]
+; CHECK-GEN-NOP-NEXT:    br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP_LOOPEXIT6:%.*]]
+; CHECK-GEN-NOP:       for.cond.cleanup.loopexit:
+; CHECK-GEN-NOP-NEXT:    [[RES_0_LCSSA_PH:%.*]] = phi i32 [ [[RES_0_LICM_LVER_ORIG]], [[FOR_COND_LICM_LVER_ORIG]] ]
+; CHECK-GEN-NOP-NEXT:    br label [[FOR_COND_CLEANUP:%.*]]
+; CHECK-GEN-NOP:       for.cond.cleanup.loopexit6:
+; CHECK-GEN-NOP-NEXT:    [[RES_0_LCSSA_PH7:%.*]] = phi i32 [ [[RES_0]], [[FOR_COND]] ]
+; CHECK-GEN-NOP-NEXT:    br label [[FOR_COND_CLEANUP]]
+; CHECK-GEN-NOP:       for.cond.cleanup:
+; CHECK-GEN-NOP-NEXT:    [[RES_0_LCSSA:%.*]] = phi i32 [ [[RES_0_LCSSA_PH]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ [[RES_0_LCSSA_PH7]], [[FOR_COND_CLEANUP_LOOPEXIT6]] ]
+; CHECK-GEN-NOP-NEXT:    ret i32 [[RES_0_LCSSA]]
+; CHECK-GEN-NOP:       for.body:
+; CHECK-GEN-NOP-NEXT:    [[ADD_PTR_I:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[I_0]]
+; CHECK-GEN-NOP-NEXT:    [[TMP21:%.*]] = load i32, ptr [[ADD_PTR_I]], align 4
+; CHECK-GEN-NOP-NEXT:    [[CMP2:%.*]] = icmp eq i32 [[TMP21]], 0
+; CHECK-GEN-NOP-NEXT:    [[OR_COND:%.*]] = or i1 [[CMP3]], [[CMP2]]
+; CHECK-GEN-NOP-NEXT:    br i1 [[OR_COND]], label [[FOR_INC]], label [[IF_END5:%.*]]
+; CHECK-GEN-NOP:       if.end5:
+; CHECK-GEN-NOP-NEXT:    br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]]
+; CHECK-GEN-NOP:       land.lhs.true:
+; CHECK-GEN-NOP-NEXT:    br i1 [[CMP6]], label [[IF_THEN7:%.*]], label [[FOR_INC]]
+; CHECK-GEN-NOP:       if.then7:
+; CHECK-GEN-NOP-NEXT:    br i1 [[TOBOOL8_NOT]], label [[IF_END13:%.*]], label [[IF_THEN9:%.*]]
+; CHECK-GEN-NOP:       if.then9:
+; CHECK-GEN-NOP-NEXT:    [[ADD:%.*]] = add i32 [[TMP21]], [[RES_0]]
+; CHECK-GEN-NOP-NEXT:    [[ADD12:%.*]] = add i32 [[ADD]], [[EXTRACT4]]
+; CHECK-GEN-NOP-NEXT:    br label [[IF_END13]]
+; CHECK-GEN-NOP:       if.end13:
+; CHECK-GEN-NOP-NEXT:    [[RES_1:%.*]] = phi i32 [ [[ADD12]], [[IF_THEN9]] ], [ [[RES_0]], [[IF_THEN7]] ]
+; CHECK-GEN-NOP-NEXT:    [[MUL:%.*]] = mul nsw i32 [[TMP21]], 2
+; CHECK-GEN-NOP-NEXT:    [[ADD15:%.*]] = add nsw i32 [[RES_1]], [[MUL]]
+; CHECK-GEN-NOP-NEXT:    br label [[FOR_INC]]
+; CHECK-GEN-NOP:       for.inc:
+; CHECK-GEN-NOP-NEXT:    [[RES_2]] = phi i32 [ [[RES_0]], [[FOR_BODY]] ], [ [[ADD15]], [[IF_END13]] ], [ [[RES_0]], [[LAND_LHS_TRUE]] ], [ [[RES_0]], [[IF_END5]] ]
+; CHECK-GEN-NOP-NEXT:    [[INC]] = add nuw i64 [[I_0]], 1
+; CHECK-GEN-NOP-NEXT:    br label [[FOR_COND]]
+;
+entry:
+  br label %for.cond
+
+for.cond:                                         ; preds = %for.inc, %entry
+  %res.0 = phi i32 [ 0, %entry ], [ %res.2, %for.inc ]
+  %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ]
+  %cmp = icmp ult i64 %i.0, %sizes
+  br i1 %cmp, label %for.body, label %for.cond.cleanup
+
+for.cond.cleanup:                                 ; preds = %for.cond
+  %res.0.lcssa = phi i32 [ %res.0, %for.cond ]
+  ret i32 %res.0.lcssa
+
+for.body:                                         ; preds = %for.cond
+  %0 = load ptr, ptr %indexs, align 8
+  %add.ptr.i = getelementptr inbounds i32, ptr %0, i64 %i.0
+  %1 = load i32, ptr %add.ptr.i, align 4
+  %cmp2 = icmp eq i32 %1, 0
+  %cmp3 = icmp eq ptr %attr, null
+  %or.cond = or i1 %cmp3, %cmp2
+  br i1 %or.cond, label %for.inc, label %if.end5
+
+if.end5:                                          ; preds = %for.body
+  %2 = load i32, ptr %attr, align 8
+  %tobool.not = icmp eq i32 %2, 0
+  br i1 %tobool.not, label %for.inc, label %land.lhs.true
+
+land.lhs.true:                                    ; preds = %if.end5
+  %weight = getelementptr inbounds %struct.Data, ptr %attr, i64 0, i32 1
+  %3 = load double, ptr %weight, align 8
+  %cmp6 = fcmp ogt double %3, 1.000000e+01
+  br i1 %cmp6, label %if.then7, label %for.inc
+
+if.then7:                                         ; preds = %land.lhs.true
+  %vptr = getelementptr inbounds %struct.Data, ptr %attr, i64 0, i32 2
+  %4 = load ptr, ptr %vptr, align 8
+  %tobool8.not = icmp eq ptr %4, null
+  br i1 %tobool8.not, label %if.end13, label %if.then9
+
+if.then9:                                         ; preds = %if.then7
+  %5 = load i32, ptr %4, align 4
+  %add = add i32 %1, %res.0
+  %add12 = add i32 %add, %5
+  br label %if.end13
+
+if.end13:                                         ; preds = %if.then9, %if.then7
+  %res.1 = phi i32 [ %add12, %if.then9 ], [ %res.0, %if.then7 ]
+  %mul = mul nsw i32 %1, 2
+  %add15 = add nsw i32 %res.1, %mul
+  br label %for.inc
+
+for.inc:                                          ; preds = %if.end13, %land.lhs.true, %if.end5, %for.body
+  %res.2 = phi i32 [ %res.0, %for.body ], [ %add15, %if.end13 ], [ %res.0, %land.lhs.true ], [ %res.0, %if.end5 ]
+  %inc = add nuw i64 %i.0, 1
+  br label %for.cond
+}
+
+attributes #0 = { mustprogress noinline nounwind uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+crc,+dotprod,+fp-armv8,+fullfp16,+neon,+ras,+rcpc,+rdm,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" }
--
Gitee
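Note: the fallback (cloned) loop is tagged so that a later LICM invocation
will not version it again. In IR this is plain loop metadata (a sketch;
metadata numbering is illustrative):

    br label %for.cond.licm.lver.orig, !llvm.loop !0
    ...
    !0 = distinct !{!0, !1}
    !1 = !{!"llvm.loop.hoist.cond_load.disable"}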