diff --git a/llvm/include/llvm/Analysis/LoopInfo.h b/llvm/include/llvm/Analysis/LoopInfo.h index ea4cb7f7c68415bd1c663483aea443cc795df516..8d29f9ffc08d2ed8f65574a0fada99dffd602054 100644 --- a/llvm/include/llvm/Analysis/LoopInfo.h +++ b/llvm/include/llvm/Analysis/LoopInfo.h @@ -380,6 +380,12 @@ public: /// unrolling pass is run more than once (which it generally is). void setLoopAlreadyUnrolled(); + /// Add llvm.loop.hoist.cond_load.disable to this loop's id metadata. + /// + /// Add hoist.cond_load disable metadata to indicate the loop is a loop + /// version, and the original loop has already hoist the conditional load. + void setLoopHoistedVersion(); + /// Add llvm.loop.mustprogress to this loop's loop id metadata. void setLoopMustProgress(); diff --git a/llvm/include/llvm/Transforms/Utils/Cloning.h b/llvm/include/llvm/Transforms/Utils/Cloning.h index 1c342b871a4a45e314b50e5404488f3a8527c8f8..186e4bea16f0a9b73c0ca9b7fcf8d40b041001e1 100644 --- a/llvm/include/llvm/Transforms/Utils/Cloning.h +++ b/llvm/include/llvm/Transforms/Utils/Cloning.h @@ -281,6 +281,18 @@ Loop *cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, DominatorTree *DT, SmallVectorImpl &Blocks); +/// Clones a loop \p OrigLoop without preheader. Returns the loop and the +/// blocks in \p Blocks. +/// +/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block +/// \p LoopDomBB. Insert the new blocks before block specified in \p Before. +/// Note: Only innermost loops are supported. +Loop *cloneLoopBody(BasicBlock *Before, BasicBlock *LoopDomBB, + Loop *OrigLoop, ValueToValueMapTy &VMap, + const Twine &NameSuffix, LoopInfo *LI, + DominatorTree *DT, + SmallVectorImpl &Blocks); + /// Remaps instructions in \p Blocks using the mapping in \p VMap. 
void remapInstructionsInBlocks(ArrayRef Blocks, ValueToValueMapTy &VMap); diff --git a/llvm/include/llvm/Transforms/Utils/LoopUtils.h b/llvm/include/llvm/Transforms/Utils/LoopUtils.h index cc31fc79c2de1818f1aa58759edd7117fcf347dd..4074d0f85b347b5db49e7e59e6a1ac2e570d77ce 100644 --- a/llvm/include/llvm/Transforms/Utils/LoopUtils.h +++ b/llvm/include/llvm/Transforms/Utils/LoopUtils.h @@ -173,10 +173,10 @@ bool sinkRegionForLoopNest(DomTreeNode *, AAResults *, LoopInfo *, /// \p AllowSpeculation is whether values should be hoisted even if they are not /// guaranteed to execute in the loop, but are safe to speculatively execute. bool hoistRegion(DomTreeNode *, AAResults *, LoopInfo *, DominatorTree *, - AssumptionCache *, TargetLibraryInfo *, Loop *, - MemorySSAUpdater &, ScalarEvolution *, ICFLoopSafetyInfo *, - SinkAndHoistLICMFlags &, OptimizationRemarkEmitter *, bool, - bool AllowSpeculation); + AssumptionCache *, TargetLibraryInfo *, TargetTransformInfo *, + BlockFrequencyInfo *, Loop *, MemorySSAUpdater &, + ScalarEvolution *, ICFLoopSafetyInfo *, SinkAndHoistLICMFlags &, + OptimizationRemarkEmitter *, bool, bool AllowSpeculation); /// Return true if the induction variable \p IV in a Loop whose latch is /// \p LatchBlock would become dead if the exit test \p Cond were removed. @@ -271,6 +271,9 @@ bool hasDisableAllTransformsHint(const Loop *L); /// Look for the loop attribute that disables the LICM transformation heuristics. bool hasDisableLICMTransformsHint(const Loop *L); +/// Look for the loop attribute that disables the LICM to hoist conditional load. +bool hasDisableHoistCondLoad(const Loop *L); + /// The mode sets how eager a transformation should be applied. 
enum TransformationMode { /// The pass can use heuristics to determine whether a transformation should diff --git a/llvm/lib/Analysis/LoopInfo.cpp b/llvm/lib/Analysis/LoopInfo.cpp index 36aca73ee675ec791482ba46b73deb4991a79af0..3531ce9236684a616837dcf942e214b78f04a338 100644 --- a/llvm/lib/Analysis/LoopInfo.cpp +++ b/llvm/lib/Analysis/LoopInfo.cpp @@ -549,6 +549,22 @@ void Loop::setLoopAlreadyUnrolled() { setLoopID(NewLoopID); } +void Loop::setLoopHoistedVersion() { + LLVMContext &Context = getHeader()->getContext(); + + MDNode *DisableHoist = findOptionMDForLoop(this, "llvm.loop.hoist.cond_load.disable"); + + if (DisableHoist) + return; + + MDNode *DisableHoistMD = + MDNode::get(Context, MDString::get(Context, "llvm.loop.hoist.cond_load.disable")); + MDNode *LoopID = getLoopID(); + MDNode *NewLoopID = makePostTransformationMetadata( + Context, LoopID, {"llvm.loop.hoist.cond_load."}, {DisableHoistMD}); + setLoopID(NewLoopID); +} + void Loop::setLoopMustProgress() { LLVMContext &Context = getHeader()->getContext(); diff --git a/llvm/lib/Transforms/Scalar/LICM.cpp b/llvm/lib/Transforms/Scalar/LICM.cpp index 71b567bc7c966055d6121e61449c8cacac423ebe..bcf78808ce586997dc39fd2489b83e1960b1ef15 100644 --- a/llvm/lib/Transforms/Scalar/LICM.cpp +++ b/llvm/lib/Transforms/Scalar/LICM.cpp @@ -56,10 +56,12 @@ #include "llvm/Analysis/MemorySSAUpdater.h" #include "llvm/Analysis/MustExecute.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" +#include "llvm/Analysis/ProfileSummaryInfo.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/TargetLibraryInfo.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/Analysis/ValueTracking.h" +#include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" @@ -68,6 +70,7 @@ #include "llvm/IR/Dominators.h" #include "llvm/IR/Instructions.h" #include "llvm/IR/IntrinsicInst.h" +#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/IRBuilder.h" 
#include "llvm/IR/LLVMContext.h" #include "llvm/IR/Metadata.h" @@ -81,8 +84,10 @@ #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils/AssumeBundleBuilder.h" #include "llvm/Transforms/Utils/BasicBlockUtils.h" +#include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/Local.h" #include "llvm/Transforms/Utils/LoopUtils.h" +#include "llvm/Transforms/Utils/LoopSimplify.h" #include "llvm/Transforms/Utils/SSAUpdater.h" #include #include @@ -133,6 +138,23 @@ static cl::opt MaxNumUsesTraversed( cl::desc("Max num uses visited for identifying load " "invariance in loop using invariant start (default = 8)")); +enum HoistCondLoadMode { + Hoist_Off, + Generic_Prof, + Aggressive_Prof, + Generic_NoProf, + Aggressive_NoProf, +}; + +static cl::opt HoistConditionalLoad( + "licm-hoist-conditional-load", cl::Hidden, + cl::values(clEnumValN(Hoist_Off, "off", "Disable hoist conditional load"), + clEnumValN(Generic_Prof, "generic", "Generic mode with pgo, which does hoist with loop version(default mode)"), + clEnumValN(Aggressive_Prof, "aggressive", "Aggressive mode with pgo, which allow illegal address accesses without fault"), + clEnumValN(Generic_NoProf, "generic_noprof", "Generic mode without profile, which does hosit with loop version(default mode)"), + clEnumValN(Aggressive_NoProf, "aggressive_noprof", "Aggressive mode without profile, which allow illegal address accesses without fault")), + cl::init(Generic_Prof), + cl::desc("Hoist conditional load with sve no-fault instruction")); // Experimental option to allow imprecision in LICM in pathological cases, in // exchange for faster compile. This is to be removed if MemorySSA starts to // address the same issue. 
LICM calls MemorySSAWalker's @@ -202,11 +224,29 @@ using PointersAndHasReadsOutsideSet = static SmallVector collectPromotionCandidates(MemorySSA *MSSA, AliasAnalysis *AA, Loop *L); +static void updateExitsPhis(Loop *CurLoop, Loop *ClonedLoop, ScalarEvolution *SE, + SmallVectorImpl &DefsUsedOutside, + ValueToValueMapTy &VMap, SmallVectorImpl &Exits); + +static ScalableVectorType *getSVEContainerType(Type *EltTy); + +static Instruction *replaceLoadWithLdnf(Instruction *I); + +static bool +findConditionalLoad(LoopInfo *LI, Loop *CurLoop, TargetTransformInfo *TTI, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, + Instruction *I); + +static void createRuntimeCheckForCondLoad( + Loop *CurLoop, Loop *ClonedLoop, BasicBlock *Preheader, MemorySSAUpdater &MSSAU, + ICFLoopSafetyInfo *SafetyInfo, SmallVectorImpl &Updates); + namespace { struct LoopInvariantCodeMotion { bool runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *TLI, - TargetTransformInfo *TTI, ScalarEvolution *SE, MemorySSA *MSSA, + TargetTransformInfo *TTI, BlockFrequencyInfo *BFI, + ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool LoopNestMode = false); LoopInvariantCodeMotion(unsigned LicmMssaOptCap, @@ -255,6 +295,7 @@ struct LegacyLICMPass : public LoopPass { &getAnalysis().getAssumptionCache(*F), &getAnalysis().getTLI(*F), &getAnalysis().getTTI(*F), + &getAnalysis().getBFI(), SE ? 
&SE->getSE() : nullptr, MSSA, &ORE); } @@ -294,7 +335,7 @@ PreservedAnalyses LICMPass::run(Loop &L, LoopAnalysisManager &AM, LoopInvariantCodeMotion LICM(Opts.MssaOptCap, Opts.MssaNoAccForPromotionCap, Opts.AllowSpeculation); if (!LICM.runOnLoop(&L, &AR.AA, &AR.LI, &AR.DT, &AR.AC, &AR.TLI, &AR.TTI, - &AR.SE, AR.MSSA, &ORE)) + AR.BFI, &AR.SE, AR.MSSA, &ORE)) return PreservedAnalyses::all(); auto PA = getLoopPassPreservedAnalyses(); @@ -330,7 +371,8 @@ PreservedAnalyses LNICMPass::run(LoopNest &LN, LoopAnalysisManager &AM, Loop &OutermostLoop = LN.getOutermostLoop(); bool Changed = LICM.runOnLoop(&OutermostLoop, &AR.AA, &AR.LI, &AR.DT, &AR.AC, - &AR.TLI, &AR.TTI, &AR.SE, AR.MSSA, &ORE, true); + &AR.TLI, &AR.TTI, AR.BFI, &AR.SE, AR.MSSA, + &ORE, true); if (!Changed) return PreservedAnalyses::all(); @@ -398,6 +440,7 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + BlockFrequencyInfo *BFI, ScalarEvolution *SE, MemorySSA *MSSA, OptimizationRemarkEmitter *ORE, bool LoopNestMode) { @@ -456,9 +499,9 @@ bool LoopInvariantCodeMotion::runOnLoop(Loop *L, AAResults *AA, LoopInfo *LI, MSSAU, &SafetyInfo, Flags, ORE); Flags.setIsSink(false); if (Preheader) - Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, L, - MSSAU, SE, &SafetyInfo, Flags, ORE, LoopNestMode, - LicmAllowSpeculation); + Changed |= hoistRegion(DT->getNode(L->getHeader()), AA, LI, DT, AC, TLI, TTI, + BFI, L, MSSAU, SE, &SafetyInfo, Flags, ORE, + LoopNestMode, LicmAllowSpeculation); // Now that all loop invariants have been removed from the loop, promote any // memory references to scalars that we can. @@ -855,6 +898,154 @@ public: }; } // namespace +// It can be ensured that the loop is in LCSSA form, so many definitions within the +// loop that are used outside the loop must be accessed through PHINodes. 
+static void updateExitsPhis(Loop *CurLoop, Loop *ClonedLoop, ScalarEvolution *SE, + SmallVectorImpl &DefsUsedOutside, + ValueToValueMapTy &VMap, SmallVectorImpl &Exits) { + PHINode *PN = nullptr; + SmallPtrSet IsVisited; + for (auto *BB : Exits) { + if (IsVisited.contains(BB)) + continue; + IsVisited.insert(BB); + // Then for each PHI add the operand for the edge from the cloned loop. + for (auto I = BB->begin(); (PN = dyn_cast(I)); ++I) { + + // If the definition was cloned used that otherwise use the same value. + size_t IncomingSize = PN->getNumIncomingValues(); + for (size_t Idx = 0; Idx < IncomingSize; ++Idx) { + Value *ClonedValue = PN->getIncomingValue(Idx); + BasicBlock *ClonedExiting = PN->getIncomingBlock(Idx); + auto MappedValue = VMap.find(ClonedValue); + if (MappedValue != VMap.end()) + ClonedValue = MappedValue->second; + + auto MappedBB = VMap.find(ClonedExiting); + if (MappedBB != VMap.end()) + ClonedExiting = dyn_cast(MappedBB->second); + + PN->addIncoming(ClonedValue, ClonedExiting); + } + } + } +} + +// Now, processing is done according to the 128-bit SVE register width. 
+static ScalableVectorType *getSVEContainerType(Type *EltTy) { + if (EltTy == Type::getDoubleTy(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 2); + + if(EltTy == Type::getFloatTy(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 4); + + if(EltTy == Type::getBFloatTy(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 8); + + if(EltTy == Type::getHalfTy(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 8); + + if(EltTy == Type::getInt64Ty(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 2); + + if(EltTy == Type::getInt32Ty(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 4); + + if(EltTy == Type::getInt16Ty(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 8); + + if(EltTy == Type::getInt8Ty(EltTy->getContext())) + return ScalableVectorType::get(EltTy, 16); + + return nullptr; +} + +static Instruction *replaceLoadWithLdnf(Instruction *I) { + auto *LoadI = dyn_cast(I); + auto *PointerOp = LoadI->getPointerOperand(); + const DataLayout &DL = LoadI->getModule()->getDataLayout(); + auto *LITy = LoadI->getType(); + IRBuilder<> B(LoadI); + auto *EltTy = LITy->isPointerTy() ? B.getIntNTy(DL.getPointerSizeInBits()) + : LITy; + auto *SVTy = getSVEContainerType(EltTy); + assert(SVTy && "Unsupport type of load instruction"); + + auto *PredTy = ScalableVectorType::get(B.getInt1Ty(), SVTy->getMinNumElements()); + Value *Imm = B.getInt32(1); + CallInst *Pred = B.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {PredTy}, {Imm}); + + Type *PtrTy = LITy->getPointerTo(PointerOp->getType()->getPointerAddressSpace()); + if (PointerOp->getType() != PtrTy) + PointerOp = B.CreateBitCast(PointerOp, PtrTy); + + CallInst *Ldnf = + B.CreateIntrinsic(Intrinsic::aarch64_sve_ldnf1, {SVTy}, {Pred, PointerOp}); + // FIXME: 1. need copy align info? 2. 
Is right method to copy metadata + propagateMetadata(Ldnf, LoadI); + + Value *Scalar = B.CreateExtractElement(Ldnf, B.getInt64(0), "extract"); + if (LITy->isPointerTy()) { + Value *PtrValue = B.CreateIntToPtr(Scalar, PointerType::getUnqual(Scalar->getContext())); + return dyn_cast(PtrValue); + } + return dyn_cast(Scalar); +} + +static bool +findConditionalLoad(LoopInfo *LI, Loop *CurLoop, TargetTransformInfo *TTI, + ProfileSummaryInfo *PSI, BlockFrequencyInfo *BFI, + Instruction *I) { + auto LoadI = dyn_cast(I); + if (!LoadI) + return false; + + if (LoadI->isAtomic() || LoadI->isVolatile()) + return false; + + Module *M = LoadI->getModule(); + Triple TargetTriple(M->getTargetTriple()); + if (!TargetTriple.isAArch64() || !TTI->supportsScalableVectors()) + return false; + + // TODO: add support for vector type + if (LoadI->getType()->isVectorTy()) + return false; + + BasicBlock *BB = LoadI->getParent(); + if (HoistConditionalLoad == Generic_Prof || HoistConditionalLoad == Aggressive_Prof) { + if (!PSI->hasProfileSummary() || !PSI->isHotBlock(BB, BFI)) + return false; + } + + auto PointerOp = LoadI->getPointerOperand(); + if (isa(PointerOp)) + return true; + + auto *PI = dyn_cast(PointerOp); + if (!PI || CurLoop->contains(PI)) + return false; + + return true; +} + +static void createRuntimeCheckForCondLoad( + Loop *CurLoop, Loop *ClonedLoop, BasicBlock *Preheader, MemorySSAUpdater &MSSAU, + ICFLoopSafetyInfo *SafetyInfo, SmallVectorImpl &Updates) { + auto *OriTerm = Preheader->getTerminator(); + IRBuilder<> B(OriTerm); + CallInst *Rdffr = + B.CreateIntrinsic(Intrinsic::aarch64_sve_rdffr, {}, {}); + Type *RdffrTy = Rdffr->getType(); + CallInst *PTrue = + B.CreateIntrinsic(Intrinsic::aarch64_sve_ptrue, {RdffrTy}, {B.getInt32(1)}); + CallInst *IsValid = + B.CreateIntrinsic(Intrinsic::aarch64_sve_ptest_any, {RdffrTy}, {PTrue, Rdffr}); + B.CreateCondBr(IsValid, CurLoop->getHeader(), ClonedLoop->getHeader()); + eraseInstruction(*OriTerm, *SafetyInfo, MSSAU); + 
Updates.push_back({cfg::UpdateKind::Insert, Preheader, ClonedLoop->getHeader()}); +} + /// Walk the specified region of the CFG (defined by all blocks dominated by /// the specified block, and that are in the current loop) in depth first /// order w.r.t the DominatorTree. This allows us to visit definitions before @@ -862,7 +1053,8 @@ public: /// bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, DominatorTree *DT, AssumptionCache *AC, - TargetLibraryInfo *TLI, Loop *CurLoop, + TargetLibraryInfo *TLI, TargetTransformInfo *TTI, + BlockFrequencyInfo *BFI, Loop *CurLoop, MemorySSAUpdater &MSSAU, ScalarEvolution *SE, ICFLoopSafetyInfo *SafetyInfo, SinkAndHoistLICMFlags &Flags, @@ -879,6 +1071,9 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, // re-hoisted if they end up not dominating all of their uses. SmallVector HoistedInstructions; + Module *M = CurLoop->getHeader()->getModule(); + ProfileSummaryInfo PSI(*M); + // For PHI hoisting to work we need to hoist blocks before their successors. // We can do this by iterating through the blocks in the loop in reverse // post-order. @@ -886,103 +1081,222 @@ bool llvm::hoistRegion(DomTreeNode *N, AAResults *AA, LoopInfo *LI, Worklist.perform(LI); bool Changed = false; BasicBlock *Preheader = CurLoop->getLoopPreheader(); - for (BasicBlock *BB : Worklist) { - // Only need to process the contents of this block if it is not part of a - // subloop (which would already have been processed). - if (!LoopNestMode && inSubLoop(BB, CurLoop, LI)) - continue; - - for (Instruction &I : llvm::make_early_inc_range(*BB)) { - // Try hoisting the instruction out to the preheader. We can only do - // this if all of the operands of the instruction are loop invariant and - // if it is safe to hoist the instruction. We also check block frequency - // to make sure instruction only gets hoisted into colder blocks. 
- // TODO: It may be safe to hoist if we are hoisting to a conditional block - // and we have accurately duplicated the control flow from the loop header - // to that block. - if (CurLoop->hasLoopInvariantOperands(&I) && - canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) && - isSafeToExecuteUnconditionally( - I, DT, TLI, CurLoop, SafetyInfo, ORE, - Preheader->getTerminator(), AC, AllowSpeculation)) { - hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, SE, ORE); - HoistedInstructions.push_back(&I); - Changed = true; + bool hasCondLoadCand = false; + auto traverseAndHoist = [&](BasicBlock *Preheader, bool hoistCondLoad) -> void { + for (BasicBlock *BB : Worklist) { + // Only need to process the contents of this block if it is not part of a + // subloop (which would already have been processed). + if (!LoopNestMode && inSubLoop(BB, CurLoop, LI)) continue; - } - // Attempt to remove floating point division out of the loop by - // converting it to a reciprocal multiplication. 
- if (I.getOpcode() == Instruction::FDiv && I.hasAllowReciprocal() && - CurLoop->isLoopInvariant(I.getOperand(1))) { - auto Divisor = I.getOperand(1); - auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0); - auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor); - ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags()); - SafetyInfo->insertInstructionTo(ReciprocalDivisor, I.getParent()); - ReciprocalDivisor->insertBefore(&I); - - auto Product = - BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor); - Product->setFastMathFlags(I.getFastMathFlags()); - SafetyInfo->insertInstructionTo(Product, I.getParent()); - Product->insertAfter(&I); - I.replaceAllUsesWith(Product); - eraseInstruction(I, *SafetyInfo, MSSAU); + for (Instruction &I : llvm::make_early_inc_range(*BB)) { + bool SafeHoist = + isSafeToExecuteUnconditionally(I, DT, TLI, CurLoop, SafetyInfo, ORE, + Preheader->getTerminator(), AC, + AllowSpeculation); + // Try hoisting the instruction out to the preheader. We can only do + // this if all of the operands of the instruction are loop invariant and + // if it is safe to hoist the instruction. We also check block frequency + // to make sure instruction only gets hoisted into colder blocks. + // TODO: It may be safe to hoist if we are hoisting to a conditional block + // and we have accurately duplicated the control flow from the loop header + // to that block. 
+ if (CurLoop->hasLoopInvariantOperands(&I) && + canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) && + SafeHoist) { + hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, + MSSAU, SE, ORE); + HoistedInstructions.push_back(&I); + Changed = true; + continue; + } - hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), - SafetyInfo, MSSAU, SE, ORE); - HoistedInstructions.push_back(ReciprocalDivisor); - Changed = true; - continue; - } + if (!SafeHoist && HoistConditionalLoad != Hoist_Off && + !hasDisableHoistCondLoad(CurLoop)) { + if (findConditionalLoad(LI, CurLoop, TTI, &PSI, BFI, &I)) { + LLVM_DEBUG(dbgs() << "LICM: find the conditional load: " << I << "\n"); + if (CurLoop->hasLoopInvariantOperands(&I) && + canSinkOrHoistInst(I, AA, DT, CurLoop, MSSAU, true, Flags, ORE) && + (getSVEContainerType(I.getType()) || I.getType()->isPointerTy())) { + hasCondLoadCand = true; + if (hoistCondLoad) { + hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, + MSSAU, SE, ORE); + // Replace hoisted load with @llvm.aarch64.sve.ldnf1.* + Instruction *ExtractI = replaceLoadWithLdnf(&I); + assert(ExtractI && "Failed to create ldnf1 to replace load"); + LLVM_DEBUG(dbgs() << "LICM: repalced with ldnf1: " << *ExtractI << "\n"); + I.replaceAllUsesWith(ExtractI); + eraseInstruction(I, *SafetyInfo, MSSAU); + // TODO: need HoistedInstructions? 
+ Changed = true; + continue; + } + } + } + } - auto IsInvariantStart = [&](Instruction &I) { - using namespace PatternMatch; - return I.use_empty() && - match(&I, m_Intrinsic()); - }; - auto MustExecuteWithoutWritesBefore = [&](Instruction &I) { - return SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) && - SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop); - }; - if ((IsInvariantStart(I) || isGuard(&I)) && - CurLoop->hasLoopInvariantOperands(&I) && - MustExecuteWithoutWritesBefore(I)) { - hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, - MSSAU, SE, ORE); - HoistedInstructions.push_back(&I); - Changed = true; - continue; - } + // Attempt to remove floating point division out of the loop by + // converting it to a reciprocal multiplication. + if (I.getOpcode() == Instruction::FDiv && I.hasAllowReciprocal() && + CurLoop->isLoopInvariant(I.getOperand(1))) { + auto Divisor = I.getOperand(1); + auto One = llvm::ConstantFP::get(Divisor->getType(), 1.0); + auto ReciprocalDivisor = BinaryOperator::CreateFDiv(One, Divisor); + ReciprocalDivisor->setFastMathFlags(I.getFastMathFlags()); + SafetyInfo->insertInstructionTo(ReciprocalDivisor, I.getParent()); + ReciprocalDivisor->insertBefore(&I); + + auto Product = + BinaryOperator::CreateFMul(I.getOperand(0), ReciprocalDivisor); + Product->setFastMathFlags(I.getFastMathFlags()); + SafetyInfo->insertInstructionTo(Product, I.getParent()); + Product->insertAfter(&I); + I.replaceAllUsesWith(Product); + eraseInstruction(I, *SafetyInfo, MSSAU); + + hoist(*ReciprocalDivisor, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), + SafetyInfo, MSSAU, SE, ORE); + HoistedInstructions.push_back(ReciprocalDivisor); + Changed = true; + continue; + } - if (PHINode *PN = dyn_cast(&I)) { - if (CFH.canHoistPHI(PN)) { - // Redirect incoming blocks first to ensure that we create hoisted - // versions of those blocks before we hoist the phi. 
- for (unsigned int i = 0; i < PN->getNumIncomingValues(); ++i) - PN->setIncomingBlock( - i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i))); - hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, + auto IsInvariantStart = [&](Instruction &I) { + using namespace PatternMatch; + return I.use_empty() && + match(&I, m_Intrinsic()); + }; + auto MustExecuteWithoutWritesBefore = [&](Instruction &I) { + return SafetyInfo->isGuaranteedToExecute(I, DT, CurLoop) && + SafetyInfo->doesNotWriteMemoryBefore(I, CurLoop); + }; + if ((IsInvariantStart(I) || isGuard(&I)) && + CurLoop->hasLoopInvariantOperands(&I) && + MustExecuteWithoutWritesBefore(I)) { + hoist(I, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, MSSAU, SE, ORE); - assert(DT->dominates(PN, BB) && "Conditional PHIs not expected"); + HoistedInstructions.push_back(&I); Changed = true; continue; } - } - // Try to reassociate instructions so that part of computations can be - // done out of loop. - if (hoistArithmetics(I, *CurLoop, *SafetyInfo, MSSAU, AC, DT)) { - Changed = true; - continue; + if (PHINode *PN = dyn_cast(&I)) { + if (CFH.canHoistPHI(PN)) { + // Redirect incoming blocks first to ensure that we create hoisted + // versions of those blocks before we hoist the phi. + for (unsigned int i = 0; i < PN->getNumIncomingValues(); ++i) + PN->setIncomingBlock( + i, CFH.getOrCreateHoistedBlock(PN->getIncomingBlock(i))); + hoist(*PN, DT, CurLoop, CFH.getOrCreateHoistedBlock(BB), SafetyInfo, + MSSAU, SE, ORE); + assert(DT->dominates(PN, BB) && "Conditional PHIs not expected"); + Changed = true; + continue; + } + } + + // Try to reassociate instructions so that part of computations can be + // done out of loop. + if (hoistArithmetics(I, *CurLoop, *SafetyInfo, MSSAU, AC, DT)) { + Changed = true; + continue; + } + + // Remember possibly hoistable branches so we can actually hoist them + // later if needed. 
+ if (BranchInst *BI = dyn_cast(&I)) + CFH.registerPossiblyHoistableBranch(BI); } + } + }; - // Remember possibly hoistable branches so we can actually hoist them - // later if needed. - if (BranchInst *BI = dyn_cast(&I)) - CFH.registerPossiblyHoistableBranch(BI); + if (HoistConditionalLoad != Generic_Prof && HoistConditionalLoad != Generic_NoProf) { + traverseAndHoist(Preheader, true); + } else { + traverseAndHoist(Preheader, false); + + if (hasCondLoadCand) { + SmallVector NonVersionedLoopBlocks; + ValueToValueMapTy VMap; + + BasicBlock *Header = CurLoop->getHeader(); + Preheader = CurLoop->getLoopPreheader(); + + SmallVector DefsUsedOutside = findDefsUsedOutsideOfLoop(CurLoop); + // FIXME: Will creating a new loop during the LICM process produce unexpected errors? + VMap[Preheader] = Preheader; + Loop *ClonedLoop = cloneLoopBody(Header, Preheader, CurLoop, VMap, ".licm.lver.orig", LI, DT, + NonVersionedLoopBlocks); + remapInstructionsInBlocks(NonVersionedLoopBlocks, VMap); + + SmallVector Exits; + CurLoop->getExitBlocks(Exits); + for (auto *ExitBB : Exits) { + DT->changeImmediateDominator(ExitBB, Preheader); + } + Instruction *Term = Preheader->getTerminator(); + IRBuilder<> B(Term); + + Value *Cond = ConstantInt::getTrue(B.getContext()); + B.CreateCondBr(Cond, CurLoop->getHeader(), ClonedLoop->getHeader()); + eraseInstruction(*Term, *SafetyInfo, MSSAU); + + SmallVector Updates; + Updates.push_back({cfg::UpdateKind::Insert, Preheader, ClonedLoop->getHeader()}); + updateExitsPhis(CurLoop, ClonedLoop, SE, DefsUsedOutside, VMap, Exits); + + // Update MemorySSA + LoopBlocksRPO LBRPO(CurLoop); + LBRPO.perform(LI); + MSSAU.updateForClonedLoop(LBRPO, Exits, VMap, /*IngoreIncomingWithNoCloned*/true); + MSSAU.updateExitBlocksForClonedLoop(Exits, VMap, *DT); + + SmallVector ClonedExitings; + ClonedLoop->getExitingBlocks(ClonedExitings); + for (auto Exiting: ClonedExitings) { + for (auto Succ : successors(Exiting)) { + if (!ClonedLoop->contains(Succ)) + 
Updates.push_back({cfg::UpdateKind::Insert, Exiting, Succ}); + } + } + MSSAU.applyInsertUpdates(Updates, *DT); + InsertPreheaderForLoop(CurLoop, DT, LI, &MSSAU, true); + + BasicBlock *OldPreheader = Preheader; + + Preheader = CurLoop->getLoopPreheader(); + B.SetInsertPoint(Preheader->getTerminator()); + B.CreateIntrinsic(Intrinsic::aarch64_sve_setffr, {}, {}); + + // Hoist conditional loads for curloop. + traverseAndHoist(Preheader, true); + + Instruction *NewTerm = Preheader->getTerminator(); + B.SetInsertPoint(NewTerm); + B.CreateBr(Preheader); + eraseInstruction(*NewTerm, *SafetyInfo, MSSAU); + + SmallVector Updates1; + Updates1.push_back({cfg::UpdateKind::Delete, OldPreheader, ClonedLoop->getHeader()}); + + createRuntimeCheckForCondLoad(CurLoop, ClonedLoop, Preheader, MSSAU, SafetyInfo, Updates); + ClonedLoop->getHeader()->replacePhiUsesWith(OldPreheader, Preheader); + + // Update DominatorTree + DT->recalculate(*Preheader->getParent()); + Updates1.push_back({cfg::UpdateKind::Insert, Preheader, ClonedLoop->getHeader()}); + MSSAU.applyInsertUpdates(Updates1, *DT); + + InsertPreheaderForLoop(ClonedLoop, DT, LI, &MSSAU, true); + InsertPreheaderForLoop(CurLoop, DT, LI, &MSSAU, true); + formDedicatedExitBlocks(ClonedLoop, DT, LI, &MSSAU, true); + formDedicatedExitBlocks(CurLoop, DT, LI, &MSSAU, true); + assert(CurLoop->isLoopSimplifyForm() && + ClonedLoop->isLoopSimplifyForm() && + "The versioned loops should be in simplified form."); + // Set "llvm.loop.hoist.cond_load.disable" metadata + ClonedLoop->setLoopHoistedVersion(); } } diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index d55208602b715f861a62d0753ba68e2aadff0816..59ee7ab77b57c03256dfdc5fe39fd72d8788e1d9 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -945,40 +945,21 @@ void llvm::remapInstructionsInBlocks(ArrayRef Blocks, RF_NoModuleLevelChanges | RF_IgnoreMissingLocals); } -/// Clones a 
loop \p OrigLoop. Returns the loop and the blocks in \p -/// Blocks. -/// -/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block -/// \p LoopDomBB. Insert the new blocks before block specified in \p Before. -Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, - Loop *OrigLoop, ValueToValueMapTy &VMap, - const Twine &NameSuffix, LoopInfo *LI, - DominatorTree *DT, - SmallVectorImpl &Blocks) { +Loop *llvm::cloneLoopBody(BasicBlock *Before, BasicBlock *LoopDomBB, + Loop *OrigLoop, ValueToValueMapTy &VMap, + const Twine &NameSuffix, LoopInfo *LI, + DominatorTree *DT, + SmallVectorImpl &Blocks) { Function *F = OrigLoop->getHeader()->getParent(); Loop *ParentLoop = OrigLoop->getParentLoop(); DenseMap LMap; Loop *NewLoop = LI->AllocateLoop(); - LMap[OrigLoop] = NewLoop; if (ParentLoop) ParentLoop->addChildLoop(NewLoop); else LI->addTopLevelLoop(NewLoop); - - BasicBlock *OrigPH = OrigLoop->getLoopPreheader(); - assert(OrigPH && "No preheader"); - BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F); - // To rename the loop PHIs. - VMap[OrigPH] = NewPH; - Blocks.push_back(NewPH); - - // Update LoopInfo. - if (ParentLoop) - ParentLoop->addBasicBlockToLoop(NewPH, *LI); - - // Update DominatorTree. - DT->addNewBlock(NewPH, LoopDomBB); + LMap[OrigLoop] = NewLoop; for (Loop *CurLoop : OrigLoop->getLoopsInPreorder()) { Loop *&NewLoop = LMap[CurLoop]; @@ -1008,7 +989,7 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, // Add DominatorTree node. After seeing all blocks, update to correct // IDom. - DT->addNewBlock(NewBB, NewPH); + DT->addNewBlock(NewBB, LoopDomBB); Blocks.push_back(NewBB); } @@ -1026,13 +1007,46 @@ Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, } // Move them physically from the end of the block list. 
- F->splice(Before->getIterator(), F, NewPH->getIterator()); F->splice(Before->getIterator(), F, NewLoop->getHeader()->getIterator(), F->end()); return NewLoop; } +/// Clones a loop \p OrigLoop. Returns the loop and the blocks in \p +/// Blocks. +/// +/// Updates LoopInfo and DominatorTree assuming the loop is dominated by block +/// \p LoopDomBB. Insert the new blocks before block specified in \p Before. +Loop *llvm::cloneLoopWithPreheader(BasicBlock *Before, BasicBlock *LoopDomBB, + Loop *OrigLoop, ValueToValueMapTy &VMap, + const Twine &NameSuffix, LoopInfo *LI, + DominatorTree *DT, + SmallVectorImpl &Blocks) { + Function *F = OrigLoop->getHeader()->getParent(); + Loop *ParentLoop = OrigLoop->getParentLoop(); + + BasicBlock *OrigPH = OrigLoop->getLoopPreheader(); + assert(OrigPH && "No preheader"); + BasicBlock *NewPH = CloneBasicBlock(OrigPH, VMap, NameSuffix, F); + // To rename the loop PHIs. + VMap[OrigPH] = NewPH; + Blocks.push_back(NewPH); + + // Update LoopInfo. + if (ParentLoop) + ParentLoop->addBasicBlockToLoop(NewPH, *LI); + + // Update DominatorTree. + DT->addNewBlock(NewPH, LoopDomBB); + + // Move them physically from the end of the block list. + F->splice(Before->getIterator(), F, NewPH->getIterator()); + + return cloneLoopBody(Before, NewPH, OrigLoop, VMap, NameSuffix, LI, DT, + Blocks); +} + /// Duplicate non-Phi instructions from the beginning of block up to /// StopAt instruction into a split block between BB and its predecessor. 
BasicBlock *llvm::DuplicateInstructionsInSplitBetween(
diff --git a/llvm/lib/Transforms/Utils/LoopUtils.cpp b/llvm/lib/Transforms/Utils/LoopUtils.cpp
index eaaf910e6df1ce5166e011267e778672bd94ea6d..69cc465000ddba4fd1ece303952c9f8f57da8157 100644
--- a/llvm/lib/Transforms/Utils/LoopUtils.cpp
+++ b/llvm/lib/Transforms/Utils/LoopUtils.cpp
@@ -349,6 +349,10 @@ bool llvm::hasDisableLICMTransformsHint(const Loop *L) {
   return getBooleanLoopAttribute(L, LLVMLoopDisableLICM);
 }
 
+/// Returns the result of getBooleanLoopAttribute() for the
+/// "llvm.loop.hoist.cond_load.disable" attribute of loop \p L.
+bool llvm::hasDisableHoistCondLoad(const Loop *L) {
+  return getBooleanLoopAttribute(L, "llvm.loop.hoist.cond_load.disable");
+}
+
 TransformationMode llvm::hasUnrollTransformation(const Loop *L) {
   if (getBooleanLoopAttribute(L, "llvm.loop.unroll.disable"))
     return TM_SuppressedByUser;
diff --git a/llvm/test/Transforms/LICM/AArch64/hoist-cond-load.ll b/llvm/test/Transforms/LICM/AArch64/hoist-cond-load.ll
new file mode 100644
index 0000000000000000000000000000000000000000..26d57830af7f6e8959456a2cf4ff14058b5df733
--- /dev/null
+++ b/llvm/test/Transforms/LICM/AArch64/hoist-cond-load.ll
@@ -0,0 +1,245 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 2
+; RUN: opt -S -passes=licm -licm-hoist-conditional-load=aggressive_noprof -verify-memoryssa < %s | FileCheck %s -check-prefixes=CHECK-AGG-NOP
+; RUN: opt -S -passes=licm -licm-hoist-conditional-load=generic_noprof -verify-memoryssa < %s | FileCheck %s -check-prefixes=CHECK-GEN-NOP
+
+target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128"
+target triple = "aarch64-unknown-linux-gnu"
+
+%struct.Data = type { i32, double, ptr }
+
+define i32 @func(ptr %this, ptr %attr, ptr %indexs, i64 %sizes) #0 {
+; CHECK-AGG-NOP-LABEL: define i32 @func
+; CHECK-AGG-NOP-SAME: (ptr [[THIS:%.*]], ptr [[ATTR:%.*]], ptr [[INDEXS:%.*]], i64 [[SIZES:%.*]]) #[[ATTR0:[0-9]+]] {
+; CHECK-AGG-NOP-NEXT: entry:
+; CHECK-AGG-NOP-NEXT: [[TMP0:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 1)
+; CHECK-AGG-NOP-NEXT:
[[TMP1:%.*]] = call @llvm.aarch64.sve.ldnf1.nxv2i64( [[TMP0]], ptr [[INDEXS]]) +; CHECK-AGG-NOP-NEXT: [[EXTRACT:%.*]] = extractelement [[TMP1]], i64 0 +; CHECK-AGG-NOP-NEXT: [[TMP2:%.*]] = inttoptr i64 [[EXTRACT]] to ptr +; CHECK-AGG-NOP-NEXT: [[CMP3:%.*]] = icmp eq ptr [[ATTR]], null +; CHECK-AGG-NOP-NEXT: [[TMP3:%.*]] = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 1) +; CHECK-AGG-NOP-NEXT: [[TMP4:%.*]] = call @llvm.aarch64.sve.ldnf1.nxv4i32( [[TMP3]], ptr [[ATTR]]) +; CHECK-AGG-NOP-NEXT: [[EXTRACT1:%.*]] = extractelement [[TMP4]], i64 0 +; CHECK-AGG-NOP-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[EXTRACT1]], 0 +; CHECK-AGG-NOP-NEXT: [[WEIGHT:%.*]] = getelementptr inbounds [[STRUCT_DATA:%.*]], ptr [[ATTR]], i64 0, i32 1 +; CHECK-AGG-NOP-NEXT: [[TMP5:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 1) +; CHECK-AGG-NOP-NEXT: [[TMP6:%.*]] = call @llvm.aarch64.sve.ldnf1.nxv2f64( [[TMP5]], ptr [[WEIGHT]]) +; CHECK-AGG-NOP-NEXT: [[EXTRACT2:%.*]] = extractelement [[TMP6]], i64 0 +; CHECK-AGG-NOP-NEXT: [[CMP6:%.*]] = fcmp ogt double [[EXTRACT2]], 1.000000e+01 +; CHECK-AGG-NOP-NEXT: [[VPTR:%.*]] = getelementptr inbounds [[STRUCT_DATA]], ptr [[ATTR]], i64 0, i32 2 +; CHECK-AGG-NOP-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 1) +; CHECK-AGG-NOP-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.ldnf1.nxv2i64( [[TMP7]], ptr [[VPTR]]) +; CHECK-AGG-NOP-NEXT: [[EXTRACT3:%.*]] = extractelement [[TMP8]], i64 0 +; CHECK-AGG-NOP-NEXT: [[TMP9:%.*]] = inttoptr i64 [[EXTRACT3]] to ptr +; CHECK-AGG-NOP-NEXT: [[TOBOOL8_NOT:%.*]] = icmp eq ptr [[TMP9]], null +; CHECK-AGG-NOP-NEXT: [[TMP10:%.*]] = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 1) +; CHECK-AGG-NOP-NEXT: [[TMP11:%.*]] = call @llvm.aarch64.sve.ldnf1.nxv4i32( [[TMP10]], ptr [[TMP9]]) +; CHECK-AGG-NOP-NEXT: [[EXTRACT4:%.*]] = extractelement [[TMP11]], i64 0 +; CHECK-AGG-NOP-NEXT: br label [[FOR_COND:%.*]] +; CHECK-AGG-NOP: for.cond: +; CHECK-AGG-NOP-NEXT: [[RES_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[RES_2:%.*]], [[FOR_INC:%.*]] 
] +; CHECK-AGG-NOP-NEXT: [[I_0:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[INC:%.*]], [[FOR_INC]] ] +; CHECK-AGG-NOP-NEXT: [[CMP:%.*]] = icmp ult i64 [[I_0]], [[SIZES]] +; CHECK-AGG-NOP-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP:%.*]] +; CHECK-AGG-NOP: for.cond.cleanup: +; CHECK-AGG-NOP-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[RES_0]], [[FOR_COND]] ] +; CHECK-AGG-NOP-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-AGG-NOP: for.body: +; CHECK-AGG-NOP-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[I_0]] +; CHECK-AGG-NOP-NEXT: [[TMP12:%.*]] = load i32, ptr [[ADD_PTR_I]], align 4 +; CHECK-AGG-NOP-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP12]], 0 +; CHECK-AGG-NOP-NEXT: [[OR_COND:%.*]] = or i1 [[CMP3]], [[CMP2]] +; CHECK-AGG-NOP-NEXT: br i1 [[OR_COND]], label [[FOR_INC]], label [[IF_END5:%.*]] +; CHECK-AGG-NOP: if.end5: +; CHECK-AGG-NOP-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] +; CHECK-AGG-NOP: land.lhs.true: +; CHECK-AGG-NOP-NEXT: br i1 [[CMP6]], label [[IF_THEN7:%.*]], label [[FOR_INC]] +; CHECK-AGG-NOP: if.then7: +; CHECK-AGG-NOP-NEXT: br i1 [[TOBOOL8_NOT]], label [[IF_END13:%.*]], label [[IF_THEN9:%.*]] +; CHECK-AGG-NOP: if.then9: +; CHECK-AGG-NOP-NEXT: [[ADD:%.*]] = add i32 [[TMP12]], [[RES_0]] +; CHECK-AGG-NOP-NEXT: [[ADD12:%.*]] = add i32 [[ADD]], [[EXTRACT4]] +; CHECK-AGG-NOP-NEXT: br label [[IF_END13]] +; CHECK-AGG-NOP: if.end13: +; CHECK-AGG-NOP-NEXT: [[RES_1:%.*]] = phi i32 [ [[ADD12]], [[IF_THEN9]] ], [ [[RES_0]], [[IF_THEN7]] ] +; CHECK-AGG-NOP-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP12]], 2 +; CHECK-AGG-NOP-NEXT: [[ADD15:%.*]] = add nsw i32 [[RES_1]], [[MUL]] +; CHECK-AGG-NOP-NEXT: br label [[FOR_INC]] +; CHECK-AGG-NOP: for.inc: +; CHECK-AGG-NOP-NEXT: [[RES_2]] = phi i32 [ [[RES_0]], [[FOR_BODY]] ], [ [[ADD15]], [[IF_END13]] ], [ [[RES_0]], [[LAND_LHS_TRUE]] ], [ [[RES_0]], [[IF_END5]] ] +; CHECK-AGG-NOP-NEXT: [[INC]] = add nuw i64 [[I_0]], 1 +; CHECK-AGG-NOP-NEXT: br label [[FOR_COND]] +; +; 
CHECK-GEN-NOP-LABEL: define i32 @func +; CHECK-GEN-NOP-SAME: (ptr [[THIS:%.*]], ptr [[ATTR:%.*]], ptr [[INDEXS:%.*]], i64 [[SIZES:%.*]]) #[[ATTR0:[0-9]+]] { +; CHECK-GEN-NOP-NEXT: entry: +; CHECK-GEN-NOP-NEXT: [[CMP3:%.*]] = icmp eq ptr [[ATTR]], null +; CHECK-GEN-NOP-NEXT: [[WEIGHT:%.*]] = getelementptr inbounds [[STRUCT_DATA:%.*]], ptr [[ATTR]], i64 0, i32 1 +; CHECK-GEN-NOP-NEXT: [[VPTR:%.*]] = getelementptr inbounds [[STRUCT_DATA]], ptr [[ATTR]], i64 0, i32 2 +; CHECK-GEN-NOP-NEXT: br i1 true, label [[FOR_COND_PREHEADER:%.*]], label [[FOR_COND_LICM_LVER_ORIG_PREHEADER:%.*]] +; CHECK-GEN-NOP: for.cond.preheader: +; CHECK-GEN-NOP-NEXT: call void @llvm.aarch64.sve.setffr() +; CHECK-GEN-NOP-NEXT: [[TMP0:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 1) +; CHECK-GEN-NOP-NEXT: [[TMP1:%.*]] = call @llvm.aarch64.sve.ldnf1.nxv2i64( [[TMP0]], ptr [[INDEXS]]) +; CHECK-GEN-NOP-NEXT: [[EXTRACT:%.*]] = extractelement [[TMP1]], i64 0 +; CHECK-GEN-NOP-NEXT: [[TMP2:%.*]] = inttoptr i64 [[EXTRACT]] to ptr +; CHECK-GEN-NOP-NEXT: [[TMP3:%.*]] = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 1) +; CHECK-GEN-NOP-NEXT: [[TMP4:%.*]] = call @llvm.aarch64.sve.ldnf1.nxv4i32( [[TMP3]], ptr [[ATTR]]) +; CHECK-GEN-NOP-NEXT: [[EXTRACT1:%.*]] = extractelement [[TMP4]], i64 0 +; CHECK-GEN-NOP-NEXT: [[TOBOOL_NOT:%.*]] = icmp eq i32 [[EXTRACT1]], 0 +; CHECK-GEN-NOP-NEXT: [[TMP5:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 1) +; CHECK-GEN-NOP-NEXT: [[TMP6:%.*]] = call @llvm.aarch64.sve.ldnf1.nxv2f64( [[TMP5]], ptr [[WEIGHT]]) +; CHECK-GEN-NOP-NEXT: [[EXTRACT2:%.*]] = extractelement [[TMP6]], i64 0 +; CHECK-GEN-NOP-NEXT: [[CMP6:%.*]] = fcmp ogt double [[EXTRACT2]], 1.000000e+01 +; CHECK-GEN-NOP-NEXT: [[TMP7:%.*]] = call @llvm.aarch64.sve.ptrue.nxv2i1(i32 1) +; CHECK-GEN-NOP-NEXT: [[TMP8:%.*]] = call @llvm.aarch64.sve.ldnf1.nxv2i64( [[TMP7]], ptr [[VPTR]]) +; CHECK-GEN-NOP-NEXT: [[EXTRACT3:%.*]] = extractelement [[TMP8]], i64 0 +; CHECK-GEN-NOP-NEXT: [[TMP9:%.*]] = inttoptr i64 [[EXTRACT3]] to ptr +; 
CHECK-GEN-NOP-NEXT: [[TOBOOL8_NOT:%.*]] = icmp eq ptr [[TMP9]], null +; CHECK-GEN-NOP-NEXT: [[TMP10:%.*]] = call @llvm.aarch64.sve.ptrue.nxv4i1(i32 1) +; CHECK-GEN-NOP-NEXT: [[TMP11:%.*]] = call @llvm.aarch64.sve.ldnf1.nxv4i32( [[TMP10]], ptr [[TMP9]]) +; CHECK-GEN-NOP-NEXT: [[EXTRACT4:%.*]] = extractelement [[TMP11]], i64 0 +; CHECK-GEN-NOP-NEXT: [[TMP12:%.*]] = call @llvm.aarch64.sve.rdffr() +; CHECK-GEN-NOP-NEXT: [[TMP13:%.*]] = call @llvm.aarch64.sve.ptrue.nxv16i1(i32 1) +; CHECK-GEN-NOP-NEXT: [[TMP14:%.*]] = call i1 @llvm.aarch64.sve.ptest.any.nxv16i1( [[TMP13]], [[TMP12]]) +; CHECK-GEN-NOP-NEXT: br i1 [[TMP14]], label [[FOR_COND_PREHEADER5:%.*]], label [[FOR_COND_LICM_LVER_ORIG_PREHEADER]] +; CHECK-GEN-NOP: for.cond.preheader5: +; CHECK-GEN-NOP-NEXT: br label [[FOR_COND:%.*]] +; CHECK-GEN-NOP: for.cond.licm.lver.orig.preheader: +; CHECK-GEN-NOP-NEXT: br label [[FOR_COND_LICM_LVER_ORIG:%.*]] +; CHECK-GEN-NOP: for.cond.licm.lver.orig: +; CHECK-GEN-NOP-NEXT: [[RES_0_LICM_LVER_ORIG:%.*]] = phi i32 [ [[RES_2_LICM_LVER_ORIG:%.*]], [[FOR_INC_LICM_LVER_ORIG:%.*]] ], [ 0, [[FOR_COND_LICM_LVER_ORIG_PREHEADER]] ] +; CHECK-GEN-NOP-NEXT: [[I_0_LICM_LVER_ORIG:%.*]] = phi i64 [ [[INC_LICM_LVER_ORIG:%.*]], [[FOR_INC_LICM_LVER_ORIG]] ], [ 0, [[FOR_COND_LICM_LVER_ORIG_PREHEADER]] ] +; CHECK-GEN-NOP-NEXT: [[CMP_LICM_LVER_ORIG:%.*]] = icmp ult i64 [[I_0_LICM_LVER_ORIG]], [[SIZES]] +; CHECK-GEN-NOP-NEXT: br i1 [[CMP_LICM_LVER_ORIG]], label [[FOR_BODY_LICM_LVER_ORIG:%.*]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]] +; CHECK-GEN-NOP: for.body.licm.lver.orig: +; CHECK-GEN-NOP-NEXT: [[TMP15:%.*]] = load ptr, ptr [[INDEXS]], align 8 +; CHECK-GEN-NOP-NEXT: [[ADD_PTR_I_LICM_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr [[TMP15]], i64 [[I_0_LICM_LVER_ORIG]] +; CHECK-GEN-NOP-NEXT: [[TMP16:%.*]] = load i32, ptr [[ADD_PTR_I_LICM_LVER_ORIG]], align 4 +; CHECK-GEN-NOP-NEXT: [[CMP2_LICM_LVER_ORIG:%.*]] = icmp eq i32 [[TMP16]], 0 +; CHECK-GEN-NOP-NEXT: [[OR_COND_LICM_LVER_ORIG:%.*]] = or i1 
[[CMP3]], [[CMP2_LICM_LVER_ORIG]] +; CHECK-GEN-NOP-NEXT: br i1 [[OR_COND_LICM_LVER_ORIG]], label [[FOR_INC_LICM_LVER_ORIG]], label [[IF_END5_LICM_LVER_ORIG:%.*]] +; CHECK-GEN-NOP: if.end5.licm.lver.orig: +; CHECK-GEN-NOP-NEXT: [[TMP17:%.*]] = load i32, ptr [[ATTR]], align 8 +; CHECK-GEN-NOP-NEXT: [[TOBOOL_NOT_LICM_LVER_ORIG:%.*]] = icmp eq i32 [[TMP17]], 0 +; CHECK-GEN-NOP-NEXT: br i1 [[TOBOOL_NOT_LICM_LVER_ORIG]], label [[FOR_INC_LICM_LVER_ORIG]], label [[LAND_LHS_TRUE_LICM_LVER_ORIG:%.*]] +; CHECK-GEN-NOP: land.lhs.true.licm.lver.orig: +; CHECK-GEN-NOP-NEXT: [[TMP18:%.*]] = load double, ptr [[WEIGHT]], align 8 +; CHECK-GEN-NOP-NEXT: [[CMP6_LICM_LVER_ORIG:%.*]] = fcmp ogt double [[TMP18]], 1.000000e+01 +; CHECK-GEN-NOP-NEXT: br i1 [[CMP6_LICM_LVER_ORIG]], label [[IF_THEN7_LICM_LVER_ORIG:%.*]], label [[FOR_INC_LICM_LVER_ORIG]] +; CHECK-GEN-NOP: if.then7.licm.lver.orig: +; CHECK-GEN-NOP-NEXT: [[TMP19:%.*]] = load ptr, ptr [[VPTR]], align 8 +; CHECK-GEN-NOP-NEXT: [[TOBOOL8_NOT_LICM_LVER_ORIG:%.*]] = icmp eq ptr [[TMP19]], null +; CHECK-GEN-NOP-NEXT: br i1 [[TOBOOL8_NOT_LICM_LVER_ORIG]], label [[IF_END13_LICM_LVER_ORIG:%.*]], label [[IF_THEN9_LICM_LVER_ORIG:%.*]] +; CHECK-GEN-NOP: if.then9.licm.lver.orig: +; CHECK-GEN-NOP-NEXT: [[TMP20:%.*]] = load i32, ptr [[TMP19]], align 4 +; CHECK-GEN-NOP-NEXT: [[ADD_LICM_LVER_ORIG:%.*]] = add i32 [[TMP16]], [[RES_0_LICM_LVER_ORIG]] +; CHECK-GEN-NOP-NEXT: [[ADD12_LICM_LVER_ORIG:%.*]] = add i32 [[ADD_LICM_LVER_ORIG]], [[TMP20]] +; CHECK-GEN-NOP-NEXT: br label [[IF_END13_LICM_LVER_ORIG]] +; CHECK-GEN-NOP: if.end13.licm.lver.orig: +; CHECK-GEN-NOP-NEXT: [[RES_1_LICM_LVER_ORIG:%.*]] = phi i32 [ [[ADD12_LICM_LVER_ORIG]], [[IF_THEN9_LICM_LVER_ORIG]] ], [ [[RES_0_LICM_LVER_ORIG]], [[IF_THEN7_LICM_LVER_ORIG]] ] +; CHECK-GEN-NOP-NEXT: [[MUL_LICM_LVER_ORIG:%.*]] = mul nsw i32 [[TMP16]], 2 +; CHECK-GEN-NOP-NEXT: [[ADD15_LICM_LVER_ORIG:%.*]] = add nsw i32 [[RES_1_LICM_LVER_ORIG]], [[MUL_LICM_LVER_ORIG]] +; CHECK-GEN-NOP-NEXT: br label 
[[FOR_INC_LICM_LVER_ORIG]] +; CHECK-GEN-NOP: for.inc.licm.lver.orig: +; CHECK-GEN-NOP-NEXT: [[RES_2_LICM_LVER_ORIG]] = phi i32 [ [[RES_0_LICM_LVER_ORIG]], [[FOR_BODY_LICM_LVER_ORIG]] ], [ [[ADD15_LICM_LVER_ORIG]], [[IF_END13_LICM_LVER_ORIG]] ], [ [[RES_0_LICM_LVER_ORIG]], [[LAND_LHS_TRUE_LICM_LVER_ORIG]] ], [ [[RES_0_LICM_LVER_ORIG]], [[IF_END5_LICM_LVER_ORIG]] ] +; CHECK-GEN-NOP-NEXT: [[INC_LICM_LVER_ORIG]] = add nuw i64 [[I_0_LICM_LVER_ORIG]], 1 +; CHECK-GEN-NOP-NEXT: br label [[FOR_COND_LICM_LVER_ORIG]], !llvm.loop [[LOOP0:![0-9]+]] +; CHECK-GEN-NOP: for.cond: +; CHECK-GEN-NOP-NEXT: [[RES_0:%.*]] = phi i32 [ [[RES_2:%.*]], [[FOR_INC:%.*]] ], [ 0, [[FOR_COND_PREHEADER5]] ] +; CHECK-GEN-NOP-NEXT: [[I_0:%.*]] = phi i64 [ [[INC:%.*]], [[FOR_INC]] ], [ 0, [[FOR_COND_PREHEADER5]] ] +; CHECK-GEN-NOP-NEXT: [[CMP:%.*]] = icmp ult i64 [[I_0]], [[SIZES]] +; CHECK-GEN-NOP-NEXT: br i1 [[CMP]], label [[FOR_BODY:%.*]], label [[FOR_COND_CLEANUP_LOOPEXIT6:%.*]] +; CHECK-GEN-NOP: for.cond.cleanup.loopexit: +; CHECK-GEN-NOP-NEXT: [[RES_0_LCSSA_PH:%.*]] = phi i32 [ [[RES_0_LICM_LVER_ORIG]], [[FOR_COND_LICM_LVER_ORIG]] ] +; CHECK-GEN-NOP-NEXT: br label [[FOR_COND_CLEANUP:%.*]] +; CHECK-GEN-NOP: for.cond.cleanup.loopexit6: +; CHECK-GEN-NOP-NEXT: [[RES_0_LCSSA_PH7:%.*]] = phi i32 [ [[RES_0]], [[FOR_COND]] ] +; CHECK-GEN-NOP-NEXT: br label [[FOR_COND_CLEANUP]] +; CHECK-GEN-NOP: for.cond.cleanup: +; CHECK-GEN-NOP-NEXT: [[RES_0_LCSSA:%.*]] = phi i32 [ [[RES_0_LCSSA_PH]], [[FOR_COND_CLEANUP_LOOPEXIT]] ], [ [[RES_0_LCSSA_PH7]], [[FOR_COND_CLEANUP_LOOPEXIT6]] ] +; CHECK-GEN-NOP-NEXT: ret i32 [[RES_0_LCSSA]] +; CHECK-GEN-NOP: for.body: +; CHECK-GEN-NOP-NEXT: [[ADD_PTR_I:%.*]] = getelementptr inbounds i32, ptr [[TMP2]], i64 [[I_0]] +; CHECK-GEN-NOP-NEXT: [[TMP21:%.*]] = load i32, ptr [[ADD_PTR_I]], align 4 +; CHECK-GEN-NOP-NEXT: [[CMP2:%.*]] = icmp eq i32 [[TMP21]], 0 +; CHECK-GEN-NOP-NEXT: [[OR_COND:%.*]] = or i1 [[CMP3]], [[CMP2]] +; CHECK-GEN-NOP-NEXT: br i1 [[OR_COND]], label [[FOR_INC]], 
label [[IF_END5:%.*]] +; CHECK-GEN-NOP: if.end5: +; CHECK-GEN-NOP-NEXT: br i1 [[TOBOOL_NOT]], label [[FOR_INC]], label [[LAND_LHS_TRUE:%.*]] +; CHECK-GEN-NOP: land.lhs.true: +; CHECK-GEN-NOP-NEXT: br i1 [[CMP6]], label [[IF_THEN7:%.*]], label [[FOR_INC]] +; CHECK-GEN-NOP: if.then7: +; CHECK-GEN-NOP-NEXT: br i1 [[TOBOOL8_NOT]], label [[IF_END13:%.*]], label [[IF_THEN9:%.*]] +; CHECK-GEN-NOP: if.then9: +; CHECK-GEN-NOP-NEXT: [[ADD:%.*]] = add i32 [[TMP21]], [[RES_0]] +; CHECK-GEN-NOP-NEXT: [[ADD12:%.*]] = add i32 [[ADD]], [[EXTRACT4]] +; CHECK-GEN-NOP-NEXT: br label [[IF_END13]] +; CHECK-GEN-NOP: if.end13: +; CHECK-GEN-NOP-NEXT: [[RES_1:%.*]] = phi i32 [ [[ADD12]], [[IF_THEN9]] ], [ [[RES_0]], [[IF_THEN7]] ] +; CHECK-GEN-NOP-NEXT: [[MUL:%.*]] = mul nsw i32 [[TMP21]], 2 +; CHECK-GEN-NOP-NEXT: [[ADD15:%.*]] = add nsw i32 [[RES_1]], [[MUL]] +; CHECK-GEN-NOP-NEXT: br label [[FOR_INC]] +; CHECK-GEN-NOP: for.inc: +; CHECK-GEN-NOP-NEXT: [[RES_2]] = phi i32 [ [[RES_0]], [[FOR_BODY]] ], [ [[ADD15]], [[IF_END13]] ], [ [[RES_0]], [[LAND_LHS_TRUE]] ], [ [[RES_0]], [[IF_END5]] ] +; CHECK-GEN-NOP-NEXT: [[INC]] = add nuw i64 [[I_0]], 1 +; CHECK-GEN-NOP-NEXT: br label [[FOR_COND]] +; +entry: + br label %for.cond + +for.cond: ; preds = %for.inc, %entry + %res.0 = phi i32 [ 0, %entry ], [ %res.2, %for.inc ] + %i.0 = phi i64 [ 0, %entry ], [ %inc, %for.inc ] + %cmp = icmp ult i64 %i.0, %sizes + br i1 %cmp, label %for.body, label %for.cond.cleanup + +for.cond.cleanup: ; preds = %for.cond + %res.0.lcssa = phi i32 [ %res.0, %for.cond ] + ret i32 %res.0.lcssa + +for.body: ; preds = %for.cond + %0 = load ptr, ptr %indexs, align 8 + %add.ptr.i = getelementptr inbounds i32, ptr %0, i64 %i.0 + %1 = load i32, ptr %add.ptr.i, align 4 + %cmp2 = icmp eq i32 %1, 0 + %cmp3 = icmp eq ptr %attr, null + %or.cond = or i1 %cmp3, %cmp2 + br i1 %or.cond, label %for.inc, label %if.end5 + +if.end5: ; preds = %for.body + %2 = load i32, ptr %attr, align 8 + %tobool.not = icmp eq i32 %2, 0 + br i1 %tobool.not, 
label %for.inc, label %land.lhs.true + +land.lhs.true: ; preds = %if.end5 + %weight = getelementptr inbounds %struct.Data, ptr %attr, i64 0, i32 1 + %3 = load double, ptr %weight, align 8 + %cmp6 = fcmp ogt double %3, 1.000000e+01 + br i1 %cmp6, label %if.then7, label %for.inc + +if.then7: ; preds = %land.lhs.true + %vptr = getelementptr inbounds %struct.Data, ptr %attr, i64 0, i32 2 + %4 = load ptr, ptr %vptr, align 8 + %tobool8.not = icmp eq ptr %4, null + br i1 %tobool8.not, label %if.end13, label %if.then9 + +if.then9: ; preds = %if.then7 + %5 = load i32, ptr %4, align 4 + %add = add i32 %1, %res.0 + %add12 = add i32 %add, %5 + br label %if.end13 + +if.end13: ; preds = %if.then9, %if.then7 + %res.1 = phi i32 [ %add12, %if.then9 ], [ %res.0, %if.then7 ] + %mul = mul nsw i32 %1, 2 + %add15 = add nsw i32 %res.1, %mul + br label %for.inc + +for.inc: ; preds = %if.end13, %land.lhs.true, %if.end5, %for.body + %res.2 = phi i32 [ %res.0, %for.body ], [ %add15, %if.end13 ], [ %res.0, %land.lhs.true ], [ %res.0, %if.end5 ] + %inc = add nuw i64 %i.0, 1 + br label %for.cond +} + +attributes #0 = { mustprogress noinline nounwind uwtable vscale_range(1,16) "frame-pointer"="non-leaf" "no-trapping-math"="true" "stack-protector-buffer-size"="8" "target-cpu"="generic" "target-features"="+crc,+dotprod,+fp-armv8,+fullfp16,+neon,+ras,+rcpc,+rdm,+sve,+v8.1a,+v8.2a,+v8.3a,+v8.4a,+v8.5a,+v8a,-fmv" } +