diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp index dde7de406c585e47699a02f179cd7ff2d21d1763..7c2770979a900fa7e953a712b192144c27bee3ce 100644 --- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp +++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp @@ -14,32 +14,21 @@ #include "llvm/InitializePasses.h" #include "llvm/ADT/DepthFirstIterator.h" -#include "llvm/ADT/SmallSet.h" #include "llvm/ADT/Statistic.h" -#include "llvm/Analysis/AliasAnalysis.h" #include "llvm/Analysis/AssumptionCache.h" #include "llvm/Analysis/CodeMetrics.h" -#include "llvm/Analysis/DomTreeUpdater.h" -#include "llvm/Analysis/IVDescriptors.h" #include "llvm/Analysis/LoopInfo.h" -#include "llvm/Analysis/LoopIterator.h" #include "llvm/Analysis/OptimizationRemarkEmitter.h" #include "llvm/Analysis/ScalarEvolution.h" #include "llvm/Analysis/ScalarEvolutionExpressions.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/Function.h" -#include "llvm/IR/Intrinsics.h" -#include "llvm/IR/IntrinsicsAArch64.h" #include "llvm/IR/Module.h" -#include "llvm/IR/ReplaceConstant.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/Debug.h" #include "llvm/Transforms/Scalar.h" #include "llvm/Transforms/Utils.h" -#include "llvm/Transforms/Utils/Cloning.h" -#include "llvm/Transforms/Utils/LoopSimplify.h" -#include "llvm/Transforms/Utils/LoopUtils.h" #include "llvm/Transforms/Utils/ScalarEvolutionExpander.h" #define DEBUG_TYPE "loop-data-prefetch" @@ -65,131 +54,23 @@ static cl::opt MaxPrefetchIterationsAhead( "max-prefetch-iters-ahead", cl::desc("Max number of iterations to prefetch ahead"), cl::Hidden); -static cl::opt - IndirectLoadPrefetch("indirect-load-prefetch", cl::Hidden, cl::init(false), - cl::desc("Enable indirect laod prefetch")); - -static cl::opt PrefetchIterationsAhead( - "indirect-prefetch-iters-ahead", - cl::desc("Number of iterations for indirect-load prefetch"), cl::Hidden, cl::init(0)); - -static cl::opt SkipIntermediate( - "indirect-prefetch-skip-intermediate", cl::Hidden, cl::init(false), - cl::desc( - "Skip prefetching intermediate loads while doing indirect prefetch")); - -static cl::opt IndirectionLevel( - "indirect-level", - cl::desc("Indirection level considered for indirect load prefetch"), - cl::Hidden, cl::init(2)); - -static cl::opt RandomAccessPrefetchOnly( - "random-access-prefetch-only", cl::Hidden, cl::init(false), - cl::desc("Enable only outer loop indirect load prefetch")); - -static cl::opt CachelineSize("prefetch-cache-line-size", - cl::desc("Specify cache line size"), - cl::Hidden, cl::init(64)); - -static cl::opt - OuterLoopPrefetch("outer-loop-prefetch", cl::Hidden, cl::init(false), - cl::desc("Enable prefetch in outer loops")); - -static cl::opt - DisableDirectLoadPrefetch("disable-direct-prefetch", cl::Hidden, - cl::init(false), - cl::desc("Disable direct load prefetch")); - -static cl::opt - PrefetchLoopDepth("prefetch-loop-depth", - cl::desc("Least loop depth to insert prefetch"), - cl::Hidden, cl::init(1)); - STATISTIC(NumPrefetches, "Number of prefetches inserted"); -STATISTIC(NumIndPrefetches, "Number of indirect prefetches inserted"); -STATISTIC(NumOuterLoopPrefetches, "Number of outer loop prefetches inserted"); namespace { -// Helper function to return a type with the same size as -// given step size -static Type *getPtrTypefromPHI(PHINode *PHI, int64_t StepSize) { - Type *Int8Ty = Type::getInt8Ty(PHI->getParent()->getContext()); - return ArrayType::get(Int8Ty, StepSize); -} - /// Loop prefetch implementation class. class LoopDataPrefetch { public: - LoopDataPrefetch(AliasAnalysis *AA, AssumptionCache *AC, DominatorTree *DT, - LoopInfo *LI, ScalarEvolution *SE, - const TargetTransformInfo *TTI, + LoopDataPrefetch(AssumptionCache *AC, DominatorTree *DT, LoopInfo *LI, + ScalarEvolution *SE, const TargetTransformInfo *TTI, OptimizationRemarkEmitter *ORE) - : AA(AA), AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {} + : AC(AC), DT(DT), LI(LI), SE(SE), TTI(TTI), ORE(ORE) {} bool run(); private: bool runOnLoop(Loop *L); - Value *getCanonicalishSizeVariable(Loop *L, PHINode *PHI) const; - Value * - getLoopIterationNumber(Loop *L, - SmallPtrSet &LoopAuxIndPHINodes, - ValueMap &AuxIndBounds); - /// If prefetch instruction is not inserted, need to clean iteration - /// instructions in the preheader. - void cleanLoopIterationNumber(Value *NumIterations); - /// Returns whether the auxiliary induction variable can generate bound. - /// If it can, add PHI to LoopAuxIndPHINodes - bool canGetAuxIndVarBound(Loop *L, PHINode *PHI, - SmallPtrSet &LoopAuxIndPHINodes); - - /// Generate bound for the auxiliary induction variable at the - /// preheader and add it to AuxIndBounds. - /// Returns whether the bound was successfully generated. - bool getAuxIndVarBound(Loop *L, PHINode *PHI, Value *NumIterations, - ValueMap &AuxIndBounds); - - bool insertPrefetcherInOuterloopForIndirectLoad( - Loop *L, unsigned Idx, Value *NumIterations, - SmallVector &CandidateMemoryLoads, - SmallSetVector &DependentInsts, - ValueMap &AuxIndBounds, - SmallVectorImpl> &Transforms, - unsigned ItersAhead); - - bool insertPrefetcherForIndirectLoad( - Loop *L, unsigned Idx, Value *NumIterations, - SmallVector &CandidateMemoryLoads, - SmallSetVector &DependentInsts, - ValueMap &AuxIndBounds, - SmallVectorImpl> &Transforms, - unsigned ItersAhead); - - bool findCandidateMemoryLoads( - Instruction *I, SmallSetVector &InstList, - SmallPtrSet &InstSet, - SmallVector &CandidateMemoryLoads, - std::vector> &DependentInstList, - SmallPtrSet LoopAuxIndPHINodes, - bool PrefetchInOuterLoop, Loop *L); - - /// Helper function to determine whether the given load is in - /// CandidateMemoryLoads. If yes, add the candidate's depending inst to the - /// list - bool isLoadInCandidateMemoryLoads( - LoadInst *LoadI, SmallSetVector &InstList, - SmallPtrSet &InstSet, - SmallVector &CandidateMemoryLoads, - std::vector> &DependentInstList); - - /// Returns whether the given loop can do indirect prefetch and should be - /// processed to insert prefetches for indirect loads. - bool canDoIndirectPrefetch(Loop *L); - - bool isCrcHashDataAccess(Instruction *I, Instruction *PrefetchingLoad); - /// Check if the stride of the accesses is large enough to /// warrant a prefetch. bool isStrideLargeEnough(const SCEVAddRecExpr *AR, unsigned TargetMinStride); @@ -222,7 +103,6 @@ private: return TTI->enableWritePrefetching(); } - AliasAnalysis *AA; AssumptionCache *AC; DominatorTree *DT; LoopInfo *LI; @@ -240,8 +120,6 @@ public: } void getAnalysisUsage(AnalysisUsage &AU) const override { - AU.addRequired(); - AU.addPreserved(); AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -262,7 +140,6 @@ public: char LoopDataPrefetchLegacyPass::ID = 0; INITIALIZE_PASS_BEGIN(LoopDataPrefetchLegacyPass, "loop-data-prefetch", "Loop Data Prefetch", false, false) -INITIALIZE_PASS_DEPENDENCY(AAResultsWrapperPass) INITIALIZE_PASS_DEPENDENCY(AssumptionCacheTracker) INITIALIZE_PASS_DEPENDENCY(TargetTransformInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(LoopInfoWrapperPass) @@ -292,1133 +169,8 @@ bool LoopDataPrefetch::isStrideLargeEnough(const SCEVAddRecExpr *AR, return TargetMinStride <= AbsStride; } -/// Use the induction variable to generate value represeting the total num of -/// iterations for the loop in the preheader. -Value *LoopDataPrefetch::getLoopIterationNumber( - Loop *L, SmallPtrSet &LoopAuxIndPHINodes, - ValueMap &AuxIndBounds) { - Value *LoopBoundValue; - Value *LoopStepValue; - Value *LoopStartValue; - Value *NumIterations; - - // Use induction variable to derive number of iterations for the loop which - // will be used to calculate the upper bound for other auxiliary induction - // variables. - PHINode *PHI = L->getInductionVariable(*SE); - if (PHI == nullptr) - return nullptr; - - auto LoopLB = L->getBounds(*SE); - if (!LoopLB) - return nullptr; - - LoopStartValue = &(LoopLB->getInitialIVValue()); - LoopStepValue = LoopLB->getStepValue(); - LoopBoundValue = &(LoopLB->getFinalIVValue()); - - if (LoopStartValue == nullptr || LoopStepValue == nullptr || - LoopBoundValue == nullptr) - return nullptr; - - // Step should be constant. - if (!isa(SE->getSCEV(LoopStepValue))) - return nullptr; - - // Make sure each of them is invariant so we can use them in the preheader. - if (!L->isLoopInvariant(LoopBoundValue) || - !L->isLoopInvariant(LoopStepValue) || !L->isLoopInvariant(LoopStartValue)) - return nullptr; - - // Generate instruction that calculated the total number of iterations of the - // loop in the preheader. - IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); - Value *Range = Builder.CreateSub(LoopBoundValue, LoopStartValue); - NumIterations = Builder.CreateSDiv(Range, LoopStepValue); - - LoopAuxIndPHINodes.insert(PHI); - Value *Bound = nullptr; - // If the step is positive, the upper bound isn't included, i.e. accessing - // [bound] is not legal, so subtract the bound by LoopStepValue to prevent out - // of bounds memory access. - if (SE->isKnownNegative(SE->getSCEV(LoopStepValue))) - Bound = LoopBoundValue; - else - Bound = Builder.CreateSub(LoopBoundValue, LoopStepValue); - AuxIndBounds.insert(std::pair(PHI, Bound)); - return NumIterations; -} - -/// If prefetch instruction is not inserted. Need to clean iteration instruction -/// in the preheader. -void LoopDataPrefetch::cleanLoopIterationNumber(Value *NumIterations) { - auto *IDiv = dyn_cast(NumIterations); - if (IDiv != nullptr && IDiv->getOpcode() == Instruction::SDiv && - IDiv->use_empty()) { - auto *IRange = dyn_cast(IDiv->getOperand(0)); - IDiv->eraseFromParent(); - if (IRange != nullptr && IRange->getOpcode() == Instruction::Sub && - IRange->use_empty()) { - IRange->eraseFromParent(); - } - } -} - -/// Returns whether the auxiliary induction variable can generate bound. -/// If it can genearte a bound, add PHI to LoopAuxIndPHINodes -bool LoopDataPrefetch::canGetAuxIndVarBound( - Loop *L, PHINode *PHI, SmallPtrSet &LoopAuxIndPHINodes) { - Value *AuxIndVarStartValue = - PHI->getIncomingValueForBlock(L->getLoopPreheader()); - if (AuxIndVarStartValue == nullptr) - return false; - - const SCEV *LSCEV = SE->getSCEV(PHI); - const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); - - if (LSCEVAddRec == nullptr) - return false; - - // Currently, we only support constant steps. - if (dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { - InductionDescriptor IndDesc; - if (!InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc)) - return false; - - if (IndDesc.getInductionOpcode() != Instruction::Add && - IndDesc.getInductionOpcode() != Instruction::Sub && - IndDesc.getKind() != InductionDescriptor::IK_PtrInduction) - return false; - - LoopAuxIndPHINodes.insert(PHI); - - return true; - } - return false; -} - -/// Generate bound for the auxiliary induction variable at the preheader and add -/// it to AuxIndBounds. Returns whether the bound was successfully generated. -bool LoopDataPrefetch::getAuxIndVarBound( - Loop *L, PHINode *PHI, Value *NumIterations, - ValueMap &AuxIndBounds) { - IRBuilder<> Builder(L->getLoopPreheader()->getTerminator()); - Value *AuxIndVarStartValue = - PHI->getIncomingValueForBlock(L->getLoopPreheader()); - if (AuxIndVarStartValue == nullptr) - return false; - - const SCEV *LSCEV = SE->getSCEV(PHI); - const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); - - // Currently, we only support constant steps. - if (const SCEVConstant *ConstPtrDiff = - dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { - Value *AuxIndVarBound; - InductionDescriptor IndDesc; - if (!InductionDescriptor::isInductionPHI(PHI, L, SE, IndDesc)) - return false; - - // Calculate the upper bound for the auxiliary induction variable. - Value *CastedNumIterations = - Builder.CreateSExtOrTrunc(NumIterations, ConstPtrDiff->getType()); - - // Subtract one from CastedNumIterations as we want the bound to be in - // bounds. If there are N iterations, the first iteration will access the - // array at offset 0. On the N-th iteration, it will access the array at - // offset N-1, not N. - CastedNumIterations = Builder.CreateSub( - CastedNumIterations, ConstantInt::get(ConstPtrDiff->getType(), 1)); - // Teh induction operator is add / sub - if (IndDesc.getInductionOpcode() == Instruction::Add || - IndDesc.getInductionOpcode() == Instruction::Sub) { - Value *Range = - Builder.CreateMul(ConstPtrDiff->getValue(), CastedNumIterations); - AuxIndVarBound = Builder.CreateAdd(Range, AuxIndVarStartValue); - } else if (IndDesc.getKind() == InductionDescriptor::IK_PtrInduction) { - // The induction variable is a pointer - int64_t StepSize = ConstPtrDiff->getAPInt().getSExtValue(); - if (SE->isKnownNegative(ConstPtrDiff)) { - StepSize = -StepSize; - CastedNumIterations = Builder.CreateMul( - ConstantInt::getSigned(ConstPtrDiff->getType(), -1), - CastedNumIterations); - } - Type *GEPType = getPtrTypefromPHI(PHI, StepSize); - AuxIndVarBound = Builder.CreateInBoundsGEP(GEPType, AuxIndVarStartValue, - CastedNumIterations); - } else - return false; - - LLVM_DEBUG(dbgs() << "Added " - << (isa(SE->getSCEV(AuxIndVarBound)) - ? "Constant " - : "") - << "AuxIndVarBound " << *AuxIndVarBound - << " for AuxIndVar:" << *PHI << "\n"); - AuxIndBounds.insert(std::pair(PHI, AuxIndVarBound)); - - return true; - } - return false; -} - -// Helper function to calculate the step for a given loop -static uint64_t getStep(PHINode *PN, ScalarEvolution *SE) { - // Get the constant step for the induction phi so we can use it to calculate - // how much we should increase the induction for prefetching. - uint64_t Step = 0; - const SCEV *LSCEV = SE->getSCEV(PN); - const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); - - if (LSCEVAddRec == nullptr) - return Step; - - if (const SCEVConstant *ConstPtrDiff = - dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { - Step = ConstPtrDiff->getAPInt().getZExtValue(); - } - return Step; -} - -// Helper function to determine if the loop step is positive -static bool isPositiveStep(PHINode *PN, ScalarEvolution *SE) { - bool PositiveStep = true; - const SCEV *LSCEV = SE->getSCEV(PN); - const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); - if (const SCEVConstant *ConstPtrDiff = - dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) { - if (SE->isKnownNegative(ConstPtrDiff)) { - PositiveStep = false; - } - } - return PositiveStep; -} - -// Helper function to calculate the step type of a PHI node. If the PHI node is -// not a pointer type, get the type PHI Node itself. Otherwise, get the integer -// type of the PHI's step/offset value. -static Type *getStepTypeFromPHINode(PHINode *PN, ScalarEvolution *SE) { - // Get the constant step for the induction phi so we can use it to calculate - // how much we should increase the induction for prefetching. - Type *T = PN->getType(); - if (!T->isPointerTy()) - return T; - - const SCEV *LSCEV = SE->getSCEV(PN); - const SCEVAddRecExpr *LSCEVAddRec = dyn_cast(LSCEV); - if (const SCEVConstant *ConstPtrDiff = - dyn_cast(LSCEVAddRec->getStepRecurrence(*SE))) - return ConstPtrDiff->getType(); - - return T; -} - -/// This function will take an instr list that contains indirect loads and -/// transform them into prefetchers. E.g. Transform following indirect load -/// A[B[i]]: -/// phi indvar [0] [bound] -/// idxB = gep *B, indvar -/// offsetA = load * idxB -/// idxA = gep *A, offsetA -/// valueA = load *idxA -/// To indirect load with prefetchers N iteration ahead: -/// phi indvar [0] [bound] -/// offsetN = add indvar, N -/// offset2N = add indvar, 2N -/// compare = icmp offsetN, bound -/// offsetN = select compare, offsetN, bound -/// preIdxN = gep *B, offsetN -/// preIdx2N = get *B, offset2N -/// call prefetch(preIdx2N) -/// preOffsetA = load preIdxN -/// preIdxA = gep *A, preOffsetA -/// call prefetch(preIdxA) -/// idxB = gep *B, indvar -/// offsetA = load *idxB -/// idxA = gep *A, offsetA -/// valueA = load *idxA -bool LoopDataPrefetch::insertPrefetcherForIndirectLoad( - Loop *L, unsigned Idx, Value *NumIterations, - SmallVector &CandidateMemoryLoads, - SmallSetVector &DependentInsts, - ValueMap &AuxIndBounds, - SmallVectorImpl> &Transforms, - unsigned ItersAhead) { - bool PositiveStep = true; - Instruction *TargetIndirectLoad = CandidateMemoryLoads[Idx]; - IRBuilder<> Builder(TargetIndirectLoad); - Module *M = TargetIndirectLoad->getModule(); - Type *I32Ty = Type::getInt32Ty(TargetIndirectLoad->getParent()->getContext()); - - if (RandomAccessPrefetchOnly) { - bool isRandomAccess = false; - for (auto *I : DependentInsts) { - if (isCrcHashDataAccess(I, TargetIndirectLoad)) { - isRandomAccess = true; - break; - } - } - if (!isRandomAccess) - return false; - } - - LLVM_DEBUG(dbgs() << "Inserting indirect prefetchers for\t" - << *TargetIndirectLoad << "\twith " << DependentInsts.size() - << " dependent instructions\n"); - - // Keep track of the number of prefetches left to process among the - // DependentInst List. We assume that for given indirectLevel N, we will have - // N prefetches to do, unless we are skipping intermediate loads, then we are - // only doing 1 prefetch. - size_t NumPrefetchesLeft = SkipIntermediate ? 1 : IndirectionLevel; - int64_t Step; - while (!DependentInsts.empty()) { - Instruction *DependentInst = DependentInsts.pop_back_val(); - Instruction *Inst = dyn_cast(DependentInst); - - switch (Inst->getOpcode()) { - case Instruction::PHI: { - // Get the constant step for the induction phi so we can use it to - // calculate how much we should increase the induction for prefetching. - PHINode *PN = dyn_cast(Inst); - Step = getStep(PN, SE); - PositiveStep = isPositiveStep(PN, SE); - Type *InstType = getStepTypeFromPHINode(PN, SE); - if (!PositiveStep) - Step = -Step; - - // Make sure phi node is i64 or i32. - if (!InstType->isIntegerTy(64) && !InstType->isIntegerTy(32)) - return false; - - // Create the bound for this PHI if needed: - if (!AuxIndBounds.count(PN)) - getAuxIndVarBound(L, PN, NumIterations, AuxIndBounds); - - // We create values based on the induction variable so we can use it to - // generate prefetcher later on. The first value (indvar + IterationAhead - // * step) will be used for the load of prefetched address and it must - // not exceeding the bound. The second value (indvar + 2 * IterationAhead - // * step) will be used to generate prefether for the load of address. - // The subsequent values are generated in a similar fashion to generate - // prefetchers for offset of all dependent loads. - - // Insert the new instruction after all PHI node. - auto InsertionPoint = Inst; - if (auto FirstNonPHI = Inst->getParent()->getFirstNonPHI()) - InsertionPoint = FirstNonPHI->getPrevNode(); - - for (size_t i = 0; i < NumPrefetchesLeft; i++) { - if (i > 0 && SkipIntermediate) - break; - - if (Transforms.size() < i + 1) { - Transforms.push_back(DenseMap()); - } else if (Transforms[i].count(Inst)) - continue; - - // Create the new operation for the target load - Value *NewOp = nullptr; - if (Inst->getType()->isPointerTy()) { - Type *GEPType = getPtrTypefromPHI(PN, Step); - int64_t Offset = - PrefetchIterationsAhead ? PrefetchIterationsAhead : ItersAhead; - if (!PositiveStep) - Offset = -Offset; - // Do not need to calculate Offset * Step as it is calculated - // implicitly within the GEP instruction - NewOp = Builder.CreateInBoundsGEP( - GEPType, Inst, - ConstantInt::getSigned(InstType, (i + 1) * Offset)); - } else { - // FullStep is the initial offset for the new value, taking into - // account, both Step and the number of iterations ahead to prefetch. - // If indirect prefetch iterations ahead is enabled, we directly use - // the supplied indirect-prefetch-iters-ahead value. - int64_t FullStep = PrefetchIterationsAhead - ? PrefetchIterationsAhead * Step - : ItersAhead * Step; - - Instruction::BinaryOps BiOp = - PositiveStep ? Instruction::Add : Instruction::Sub; - NewOp = Builder.CreateBinOp( - BiOp, Inst, - ConstantInt::get(Inst->getType(), (i + 1) * FullStep)); - } - - if (auto NewOpInstr = dyn_cast(NewOp)) { - NewOpInstr->moveAfter(InsertionPoint); - InsertionPoint = NewOpInstr; - } - - // Create the new operations for the offset loads - if (i > 0 && i == NumPrefetchesLeft - 1) { - Transforms[i].insert(std::pair(Inst, NewOp)); - } else { - Value *NewCmp = Builder.CreateICmp( - PositiveStep ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, NewOp, - AuxIndBounds[cast(Inst)]); - Value *NewSelect = - Builder.CreateSelect(NewCmp, NewOp, AuxIndBounds[PN]); - Transforms[i].insert(std::pair(Inst, NewSelect)); - - if (auto NewCmpInstr = dyn_cast(NewCmp)) { - NewCmpInstr->moveAfter(InsertionPoint); - InsertionPoint = NewCmpInstr; - } - - if (auto NewSelectInstr = dyn_cast(NewSelect)) { - NewSelectInstr->moveAfter(InsertionPoint); - InsertionPoint = NewSelectInstr; - } - } - } - break; - } - case Instruction::Load: { - LoadInst *LoadI = dyn_cast(Inst); - Value *LoadPtr = LoadI->getPointerOperand(); - if (!SkipIntermediate) - NumPrefetchesLeft--; - - auto GeneratePrefetcher = [&](llvm::Value *PrefetchPtr) { - Function *PrefetchFunc = Intrinsic::getDeclaration( - M, Intrinsic::prefetch, LoadPtr->getType()); - Value *PrefetchArg[] = {PrefetchPtr, ConstantInt::get(I32Ty, 0), - ConstantInt::get(I32Ty, 3), - ConstantInt::get(I32Ty, 1)}; - CallInst *PrefetchCall = CallInst::Create(PrefetchFunc, PrefetchArg); - return PrefetchCall; - }; - - if (!DependentInsts.empty()) { - // For any intermediate (not last) load, we generate a load for all the - // offset at min(indvar+N*IterationsAhead*step, bound)] for each N up to - // NumPrefetchesLeft - 1, and generate a prefetcher at - // (indvar+(N+1)*IterationAhead*step) for the offset load. - Instruction *PrefetchOffsetLoad = nullptr; - for (size_t i = 0; i < NumPrefetchesLeft; i++) { - if (Transforms[i].count(LoadI)) - continue; - PrefetchOffsetLoad = LoadI->clone(); - Builder.Insert(PrefetchOffsetLoad); - for (size_t i = 0; i < NumPrefetchesLeft; i++) { - if (Transforms[i].count(LoadI)) - continue; - PrefetchOffsetLoad = LoadI->clone(); - Builder.Insert(PrefetchOffsetLoad); - PrefetchOffsetLoad->moveAfter(LoadI); - PrefetchOffsetLoad->replaceUsesOfWith(LoadPtr, - Transforms[i][LoadPtr]); - - Transforms[i].insert( - std::pair(LoadI, PrefetchOffsetLoad)); - } - } - - if (SkipIntermediate) - break; - - // Create a prefetcher for the offset laod. - if (PrefetchOffsetLoad) { - CallInst *PrefetchCall = - GeneratePrefetcher(Transforms[NumPrefetchesLeft][LoadPtr]); - PrefetchCall->insertAfter(PrefetchOffsetLoad); - NumIndPrefetches++; - } - } else { - CallInst *PrefetchCall = GeneratePrefetcher(Transforms[0][LoadPtr]); - PrefetchCall->insertAfter(LoadI); - NumIndPrefetches++; - } - break; - } - default: { - // For other types of instructions, we make a clone of the instruction and - // repalce operands that we already transformed before. - for (size_t j = 0; j < NumPrefetchesLeft; j++) { - if (j >= Transforms.size() || Transforms[j].count(Inst)) - continue; - Instruction *TransformedInst = Inst->clone(); - Builder.Insert(TransformedInst); - TransformedInst->moveAfter(Inst); - for (unsigned i = 0; i < TransformedInst->getNumOperands(); i++) { - Value *Operand = TransformedInst->getOperand(i); - if (Transforms[j].count(Operand)) - TransformedInst->replaceUsesOfWith(Operand, Transforms[j][Operand]); - } - - Transforms[j].insert( - std::pair(Inst, TransformedInst)); - } - break; - } - } - } - return true; -} - -/// Find the indirect load that depends on the auxiliary induction variable and -/// construct an instr list that contains loop variant instruction from the -/// target load to the candidate phi instr. -bool LoopDataPrefetch::findCandidateMemoryLoads( - Instruction *I, SmallSetVector &InstList, - SmallPtrSet &InstSet, - SmallVector &CandidateMemoryLoads, - std::vector> &DependentInstList, - SmallPtrSet LoopAuxIndPHINodes, bool PrefetchInOuterLoop, - Loop *L) { - bool ret = false; - - for (Use &U : I->operands()) { - // If value is loop invariant, just continue - if (PrefetchInOuterLoop) { - if (L->getParentLoop()->isLoopInvariant(U.get())) - continue; - } else if (LI->getLoopFor(I->getParent())->isLoopInvariant(U.get())) - continue; - - Instruction *OperandInst = dyn_cast(U.get()); - if (OperandInst != nullptr) { - switch (OperandInst->getOpcode()) { - case Instruction::Load: { - // Check if the load instruction that it depends on is already in the - // candidate. If yes, add the canddiate's depending instr to the list. - // If not, the load instruction it depends on is invalid. - LoadInst *LoadI = dyn_cast(OperandInst); - if (isLoadInCandidateMemoryLoads(LoadI, InstList, InstSet, - CandidateMemoryLoads, - DependentInstList)) { - // We do not return early in case there are other auxiliary induction - // variables to check. - ret = true; - } - break; - } - case Instruction::PHI: { - // Check if PHI is the loop auxiliary induction PHI. If yes, found a - // valid load dependent on loop auxiliary induction variable. If not, - // invalid candidate. - PHINode *PhiInst = dyn_cast(OperandInst); - if (LoopAuxIndPHINodes.contains(PhiInst)) { - // In order to prevent the size of SmallVector from going out of - // bounds for large cases, only the last access of the element is - // retained. Update the position of OperandInst in the InstList. - if (InstList.count(OperandInst)) - InstList.remove(OperandInst); - InstList.insert(OperandInst); - return true; - } - break; - } - case Instruction::Call: { - if (PrefetchInOuterLoop || RandomAccessPrefetchOnly) { - if (OperandInst->mayReadOrWriteMemory()) - return false; - CallInst *Call = dyn_cast(OperandInst); - if (!Call->doesNotThrow()) - return false; - - // Use DFS to search though the operands. - InstList.insert(OperandInst); - if (findCandidateMemoryLoads(OperandInst, InstList, InstSet, - CandidateMemoryLoads, DependentInstList, - LoopAuxIndPHINodes, PrefetchInOuterLoop, - L)) { - // We do not return early in case there are other auxiliary - // induction variable to check - ret = true; - } else { - // If the Operand isn't dependent on an auxiliary induction - // variable, remove any instructions added to DependentInstList from - // this operand - if (InstList.count(OperandInst)) - InstList.remove(OperandInst); - InstList.insert(OperandInst); - return false; - } - break; - } else { - // We currently can not handle case where indirect load depends on - // other functions yet. - return false; - } - } - case Instruction::Invoke: { - // We currently can not handle case where indirect load depends on other - // functions yet. - return false; - } - default: { - // Use DFS to search though the operands. - if (InstList.count(OperandInst)) - InstList.remove(OperandInst); - InstList.insert(OperandInst); - if (findCandidateMemoryLoads(OperandInst, InstList, InstSet, - CandidateMemoryLoads, DependentInstList, - LoopAuxIndPHINodes, PrefetchInOuterLoop, - L)) { - // We do not return early in case there are other auxiliary induction - // variables to check - ret = true; - } else { - // If the operand isn't dependent on an auxiliary induction variable, - // remove any instructions added to DependentInstList from this - // operand - InstList.remove(OperandInst); - } - } - } - } - } - return ret; -} - -/// Helper function to determine whether the given load is in -/// CandidateMemoryLoads. If Yes, add the candidate's depending instr to the -/// list. -bool LoopDataPrefetch::isLoadInCandidateMemoryLoads( - LoadInst *LoadI, SmallSetVector &InstList, - SmallPtrSet &InstSet, - SmallVector &CandidateMemoryLoads, - std::vector> &DependentInstList) { - size_t CandidateLoadIndex = 0; - for (auto CandidateMemoryLoad : CandidateMemoryLoads) { - if (LoadI == CandidateMemoryLoad) - break; - CandidateLoadIndex++; - } - - if (CandidateLoadIndex >= CandidateMemoryLoads.size() || InstSet.count(LoadI)) - return false; - - for (auto CandidateInst : DependentInstList[CandidateLoadIndex]) { - if (InstList.count(CandidateInst)) - InstList.remove(CandidateInst); - InstList.insert(CandidateInst); - InstSet.insert(CandidateInst); - } - return true; -} - -/// Returns whether the given loop should be processed to insert prefetches for -/// indirect loads. -bool LoopDataPrefetch::canDoIndirectPrefetch(Loop *L) { - // Support inner most loops in a simple form. However, the parent of inner - // loop will be processed as well in the case of nested loops. If - // indirectLevel is low, only allow one block loop, otherwise, allow up to 5 - // under certain conditions. - if (!L->isInnermost() || !L->getLoopPreheader() || - (IndirectionLevel <= 3 && L->getNumBlocks() != 1) || - (IndirectionLevel > 3 && L->getNumBlocks() == 1) || L->getNumBlocks() > 5) - return false; - return true; -} - -/// Check if the load depends on Crc Hash functions. -bool LoopDataPrefetch::isCrcHashDataAccess(Instruction *I, - Instruction *PrefetchingLoad) { - if (llvm::IntrinsicInst *II = dyn_cast(I)) - // If CRC functions are used for offset calculation then offset will be - // random. To avoid cache misses, data prefetch is needed. - switch (II->getIntrinsicID()) { - case Intrinsic::aarch64_crc32b: - case Intrinsic::aarch64_crc32cb: - case Intrinsic::aarch64_crc32h: - case Intrinsic::aarch64_crc32ch: - case Intrinsic::aarch64_crc32w: - case Intrinsic::aarch64_crc32cw: - case Intrinsic::aarch64_crc32x: - case Intrinsic::aarch64_crc32cx: { - // Checking Candidate load is incremented by 1. - if (auto *LI = dyn_cast(PrefetchingLoad)) { - if (auto *GEPI = dyn_cast(LI->getPointerOperand())) { - // The data access will be consecutive, if the gep has one indices. - if (GEPI->getNumOperands() > 2) - return false; - auto *PtrIndices = dyn_cast(GEPI->getOperand(1)); - if (!PtrIndices || isa(PtrIndices)) - return true; - for (auto &U : PtrIndices->uses()) - if (auto *PN = dyn_cast(U.getUser())) - if (getStep(PN, SE) <= 1) - return true; - } - } - break; - } - } - return false; -} - -/// Checks the indirect loads inside the inner loop and -/// it is derived from induction variable of outer loop then, -/// insert the prefetch instruction in outer loop. -/// It maintains the same CFG structure of inner loop and -/// clone it in the outerloop. Insert the prefetch for -/// the last indirect load, not for the intermediate loads. -bool LoopDataPrefetch::insertPrefetcherInOuterloopForIndirectLoad( - Loop *L, unsigned Idx, Value *NumIterations, - SmallVector &CandidateMemoryLoads, - SmallSetVector &DependentInsts, - ValueMap &AuxIndBounds, - SmallVectorImpl> &Transforms, - unsigned ItersAhead) { - Instruction *TargetIndirectLoad = CandidateMemoryLoads[Idx]; - IRBuilder<> Builder(TargetIndirectLoad); - Module *M = TargetIndirectLoad->getModule(); - auto *ParentLoop = L->getParentLoop(); - - if (!ParentLoop) - return false; - - SmallVector ExitBlocks; - L->getUniqueExitBlocks(ExitBlocks); - bool HasCatchSwitch = llvm::any_of(ExitBlocks, [](BasicBlock *Exit) { - return isa(Exit->getTerminator()); - }); - if (HasCatchSwitch) - return false; - - SmallVector NewBBlocks; - SmallVector AllDependentInsts; - SmallPtrSet Visited; - SmallPtrSet IndirectLoadDependents; - SmallPtrSet BranchInsts; - SmallPtrSet InsertedPrefetchCalls; - DenseMap BBTransforms; - DenseMap BBPostNumbers; - BasicBlock *NewRootBB = nullptr; - DomTreeUpdater DTU(DT, DomTreeUpdater::UpdateStrategy::Lazy); - - if (!isa(DependentInsts[DependentInsts.size() - 1])) { - return false; - } else { - if (auto *PN = - dyn_cast(DependentInsts[DependentInsts.size() - 1])) { - if (!ParentLoop->contains(PN)) { - return false; - } - if (!getStep(PN, SE)) - return false; - if (isa(PN->getType())) - return false; - } - } - - ExitBlocks.clear(); - ParentLoop->getUniqueExitBlocks(ExitBlocks); - if (HasCatchSwitch) - return false; - - Instruction *CandidateLoad = DependentInsts[0]; - BasicBlock *LoopPreheader = L->getLoopPreheader(); - - // Only consider crc hashed random data accesses. - bool isRandomAccess = false; - for (auto *I : DependentInsts) { - IndirectLoadDependents.insert(I); - Visited.insert(I); - isRandomAccess |= isCrcHashDataAccess(I, CandidateLoad); - } - if (!isRandomAccess) - return false; - - if (!LoopPreheader || !ParentLoop->getLoopPreheader()) - return false; - - if (LoopPreheader->getTerminator() == nullptr || - !isa(LoopPreheader->getTerminator())) - return false; - if (Visited.insert(LoopPreheader->getTerminator()).second) - DependentInsts.insert(LoopPreheader->getTerminator()); - - // Start from target indirect load block, get the list of predecessor blocks - // till loop preheader. And we assign each block with post order number with - // which we can sort. - SmallSetVector BBPredecessors; - BBPredecessors.insert(CandidateLoad->getParent()); - BBPostNumbers.insert({CandidateLoad->getParent(), 0}); - while (BBPredecessors.size()) { - BasicBlock *BBPred = BBPredecessors[0]; - BBPredecessors.remove(BBPred); - int Depth = BBPostNumbers[BBPred]; - // Check all predecessors and add their branch instr into dependent list - for (BasicBlock *Predecessor : predecessors(BBPred)) { - if (LoopPreheader != Predecessor && !DT->dominates(BBPred, Predecessor)) { - if (BBPostNumbers.end() == BBPostNumbers.find(Predecessor)) { - BBPostNumbers.insert({Predecessor, Depth - 1}); - BBPredecessors.insert(Predecessor); - // Check each terminator is a branch instr. - if (Predecessor->getTerminator() == nullptr || - !isa(Predecessor->getTerminator())) - return false; - // Add branch instruction as dependent instr. - if (Visited.insert(Predecessor->getTerminator()).second) - DependentInsts.insert(Predecessor->getTerminator()); - } - } - } - } - - // Loop preheader is last depend block. - BBPostNumbers.insert({LoopPreheader, -1 * BBPostNumbers.size()}); - - // Update DependentInsts to include instructions that branch instruction - // depends. - for (unsigned j = 0; j < DependentInsts.size(); j++) { - Instruction *Inst = DependentInsts[j]; - if (Inst == nullptr) - return false; - - if (auto *PN = dyn_cast(Inst)) { - if (!IndirectLoadDependents.count(Inst)) { - if (0 > PN->getBasicBlockIndex(LoopPreheader)) - return false; - } - } else if (auto *BranchInstr = dyn_cast(Inst)) { - // Add condition of branch instruction into dependent insts. - if (BranchInstr->isConditional()) { - auto *BranchCond = BranchInstr->getCondition(); - if (BranchCond == nullptr) - return false; - if (Instruction *BranchCondInst = dyn_cast(BranchCond)) - if (Visited.insert(BranchCondInst).second) - DependentInsts.insert(BranchCondInst); - } else if (BranchInstr->getSuccessor(0)->isEHPad()) - return false; - } else if (isa(Inst)) { - return false; - } else { - if (CallInst *Call = dyn_cast(Inst)) - if (Inst->mayReadOrWriteMemory() || !Call->doesNotThrow()) - return false; - // Traverse instruction operands and add dependent instructions till - // function argument, constant or value outside current loop. - for (unsigned i = 0; i < Inst->getNumOperands(); i++) { - Value *Operand = Inst->getOperand(i); - if (Operand == nullptr) - return false; - if (isa(Operand) || isa(Operand)) - continue; - if (Instruction *I = dyn_cast(Operand)) - if (L->contains(I) || I->getParent() == LoopPreheader) - if (Visited.insert(I).second) - DependentInsts.insert(I); - } - } - } - - // Sort dependent instruction based on PostNumber id and instruction ordering - // in the same block. - SmallVector, 8> SortedDependentInsts; - DT->updateDFSNumbers(); - SortedDependentInsts.reserve(DependentInsts.size()); - for (auto I : DependentInsts) { - auto *NodeI = DT->getNode(I->getParent()); - SortedDependentInsts.push_back({I, NodeI->getDFSNumIn()}); - } - llvm::sort(SortedDependentInsts, [&](auto const &LHS, auto const &RHS) { - if (get<0>(RHS)->getParent() == get<0>(LHS)->getParent()) - return get<0>(RHS)->comesBefore(get<0>(LHS)); - if (BBPostNumbers.end() == BBPostNumbers.find(get<0>(LHS)->getParent()) || - BBPostNumbers.end() == BBPostNumbers.find(get<0>(RHS)->getParent())) - return get<1>(RHS) < get<1>(LHS); - if (BBPostNumbers[get<0>(LHS)->getParent()] == - BBPostNumbers[get<0>(RHS)->getParent()]) - return get<1>(RHS) < get<1>(LHS); - return BBPostNumbers[get<0>(LHS)->getParent()] > - BBPostNumbers[get<0>(RHS)->getParent()]; - }); - - // Checking all the BasicBlocks have branch instruction - int BBDepth = 0; - for (auto I : SortedDependentInsts) { - if (BBDepth && get<1>(I) != BBDepth) - if (!isa(get<0>(I)) && - BBPostNumbers.end() != BBPostNumbers.find(get<0>(I)->getParent())) - return false; - BBDepth = get<1>(I); - } - - if (!isa(get<0>(SortedDependentInsts[0]))) - return false; - - if (!L->contains(get<0>(SortedDependentInsts[0]))) - return false; - - if (!isa( - get<0>(SortedDependentInsts[SortedDependentInsts.size() - 1]))) - return false; - else if (auto *PN = dyn_cast( - get<0>(SortedDependentInsts[SortedDependentInsts.size() - 1]))) - if (!ParentLoop->contains(PN)) - return false; - - auto cloneInstructionWithBB = [&](llvm::Instruction *Inst, - llvm::Instruction *NewInstr = nullptr) { - Instruction *TransformedInstr = NewInstr; - if (TransformedInstr == nullptr) - TransformedInstr = Inst->clone(); - - BasicBlock *NewBlock; - BasicBlock *OldBlock = Inst->getParent(); - // Check if block had been created before. - if (BBTransforms.count(OldBlock)) { - NewBlock = BBTransforms[OldBlock]; - } else { - NewBlock = BasicBlock::Create(OldBlock->getContext(), - "prefetch." + OldBlock->getName()); - NewBlock->insertInto(OldBlock->getParent(), LoopPreheader); - if (NewRootBB == nullptr) - NewRootBB = NewBlock; - if (!ParentLoop->contains(NewBlock)) - ParentLoop->addBasicBlockToLoop(NewBlock, *LI); - BBTransforms.insert( - std::pair(OldBlock, NewBlock)); - NewBBlocks.push_back(NewBlock); - } - TransformedInstr->insertInto(NewBlock, NewBlock->end()); - if (NewInstr == nullptr) { - for (unsigned i = 0; i < TransformedInstr->getNumOperands(); i++) { - Value *Operand = TransformedInstr->getOperand(i); - if (Transforms[0].count(Operand)) - TransformedInstr->replaceUsesOfWith(Operand, Transforms[0][Operand]); - } - } - Transforms[0].insert(std::pair(Inst, TransformedInstr)); - AllDependentInsts.push_back(TransformedInstr); - return TransformedInstr; - }; - - // We create block and instructions with topdown manner, e.g. from PHI node in - // the parent loop to target indirect load. - bool PositiveStep = true; - int64_t Step; - while (!SortedDependentInsts.empty()) { - Instruction *DependentInst = get<0>(SortedDependentInsts.pop_back_val()); - Instruction *Inst = dyn_cast(DependentInst); - - // For target load related instruction. - switch (Inst->getOpcode()) { - case Instruction::PHI: { - // For non-root phi node, replace phi node with incoming value. - if (!IndirectLoadDependents.count(Inst)) { - if (Transforms[0].count(Inst)) - continue; - auto *PN = dyn_cast(Inst); - Transforms[0].insert(std::pair( - Inst, PN->getIncomingValueForBlock(LoopPreheader))); - break; - } - // Replace root phi node with following value: - // select((phi + step) < bound, (phi + step), bound) - // Get the constant step for the induction phi so we can use it to - // calculate how much we should increase the induction for prefetching - PHINode *PN = dyn_cast(Inst); - Step = getStep(PN, SE); - PositiveStep = isPositiveStep(PN, SE); - Type *InstType = getStepTypeFromPHINode(PN, SE); - if (!PositiveStep) - Step = -Step; - - // Make sure phi node is i64 or i32. - if (!InstType->isIntegerTy(64) && !InstType->isIntegerTy(32)) - return false; - - // Create the bound for this PHI if needed: - if (!AuxIndBounds.count(PN)) - getAuxIndVarBound(ParentLoop, PN, NumIterations, AuxIndBounds); - - // Insert the new instruction after all PHI nodes - auto InsertionPoint = Inst; - if (auto FirstNonPHI = Inst->getParent()->getFirstNonPHI()) - InsertionPoint = FirstNonPHI->getPrevNode(); - - if (Transforms.size() < 1) - Transforms.push_back(DenseMap()); - else if (Transforms[0].count(Inst)) - continue; - - // FullStep is the inital offset for the new value, taking into account, - // both Step and the number of iterations ahead to prefetch. If indirect - // prefetch iteration ahead is enabled, we directly use the supplied - // indirect-prefetch-iters-ahead value. - int64_t FullStep = PrefetchIterationsAhead - ? PrefetchIterationsAhead * Step - : ItersAhead * Step; - - Instruction::BinaryOps BiOp = - PositiveStep ? Instruction::Add : Instruction::Sub; - auto *NewOp = Builder.CreateBinOp( - BiOp, Inst, ConstantInt::get(Inst->getType(), FullStep)); - if (auto NewOpInstr = dyn_cast(NewOp)) { - NewOpInstr->moveAfter(InsertionPoint); - InsertionPoint = NewOpInstr; - AllDependentInsts.push_back(NewOpInstr); - } - - Value *NewCmp = Builder.CreateICmp( - PositiveStep ? CmpInst::ICMP_SLT : CmpInst::ICMP_SGT, NewOp, - AuxIndBounds[cast(Inst)]); - Value *NewSelect = Builder.CreateSelect(NewCmp, NewOp, AuxIndBounds[PN]); - Transforms[0].insert(std::pair(Inst, NewSelect)); - - if (auto NewCmpInstr = dyn_cast(NewCmp)) { - NewCmpInstr->moveAfter(InsertionPoint); - InsertionPoint = NewCmpInstr; - AllDependentInsts.push_back(NewCmpInstr); - } - if (auto NewSelectInstr = dyn_cast(NewSelect)) { - NewSelectInstr->moveAfter(InsertionPoint); - InsertionPoint = NewSelectInstr; - AllDependentInsts.push_back(NewSelectInstr); - } - break; - } - case Instruction::Load: { - LoadInst *LoadI = dyn_cast(Inst); - Value *LoadPtr = LoadI->getPointerOperand(); - auto GeneratePrefetcher = [&](llvm::Value *PrefetchPtr) { - Function *PrefetchFunc = Intrinsic::getDeclaration( - M, Intrinsic::prefetch, LoadPtr->getType()); - Type *I32Ty = - Type::getInt32Ty(CandidateLoad->getParent()->getContext()); - Value *PrefetchArg[] = {PrefetchPtr, ConstantInt::get(I32Ty, 0), - ConstantInt::get(I32Ty, 3), - ConstantInt::get(I32Ty, 1)}; - CallInst *PrefetchCall = CallInst::Create(PrefetchFunc, PrefetchArg); - return PrefetchCall; - }; - - // We clone the intermediate load but prefetch the target load. - if (!SortedDependentInsts.empty()) { - if (Transforms[0].count(LoadI)) - continue; - cloneInstructionWithBB(LoadI); - } else { - CallInst *PrefetchCall = GeneratePrefetcher(Transforms[0][LoadPtr]); - cloneInstructionWithBB(LoadI, PrefetchCall); - InsertedPrefetchCalls.insert(PrefetchCall); - } - break; - } - case Instruction::Br: { - BranchInsts.insert(cloneInstructionWithBB(Inst)); - break; - } - default: { - // For other types of instructions, we make a clone of the instruction and - // replace operands that we already transformed before. - if (Transforms[0].count(Inst)) - continue; - cloneInstructionWithBB(Inst); - break; - } - } - } - - BasicBlock *EndBlock = - BasicBlock::Create(LoopPreheader->getContext(), "prefetch.end"); - ParentLoop->addBasicBlockToLoop(EndBlock, *LI); - EndBlock->insertInto(LoopPreheader->getParent(), LoopPreheader); - - // Create branch from prefetch call block to end block. - for (CallInst *PrefetchCall : InsertedPrefetchCalls) - if (!PrefetchCall->getParent()->getTerminator()) { - AllDependentInsts.push_back( - BranchInst::Create(EndBlock, PrefetchCall->getParent())); - } - - // Checking all the newly created BasicBlock has Terminator instruction. If - // not, considered as incomplete. Delete all new BasicBlocks and return. - for (BasicBlock *BB : NewBBlocks) { - if (BB->getTerminator() == nullptr) { - for (unsigned j = 0; j < AllDependentInsts.size(); j++) { - auto *I = AllDependentInsts[j]; - I->replaceAllUsesWith(UndefValue::get(I->getType())); - I->eraseFromParent(); - } - for (unsigned j = 0; j < NewBBlocks.size(); j++) { - auto *DelBBlock = NewBBlocks[j]; - ParentLoop->removeBlockFromLoop(DelBBlock); - DelBBlock->eraseFromParent(); - } - ParentLoop->removeBlockFromLoop(EndBlock); - EndBlock->eraseFromParent(); - return false; - } - } - - // Updating with branch from Entry to PreHeader to NewRootBB - for (BasicBlock *PredecessorBB : predecessors(LoopPreheader)) { - auto *BrInstr = PredecessorBB->getTerminator(); - for (unsigned i = 0, NumSuccessor = BrInstr->getNumSuccessors(); - i < NumSuccessor; i++) { - auto *OldSuccessor = BrInstr->getSuccessor(i); - if (OldSuccessor == LoopPreheader) { - DTU.applyUpdates( - {{DominatorTree::Delete, PredecessorBB, LoopPreheader}}); - BrInstr->setSuccessor(i, NewRootBB); - DTU.applyUpdates({{DominatorTree::Insert, PredecessorBB, NewRootBB}}); - } - } - } - AllDependentInsts.push_back(BranchInst::Create(LoopPreheader, EndBlock)); - - // Updating with new BasicBlock in all newly created branch instruction. - // Updating DominatorTree for all new BasicBlocks. - for (auto *I : BranchInsts) { - auto *BrInstr = dyn_cast(I); - for (unsigned i = 0, NumSuccessor = BrInstr->getNumSuccessors(); - i < NumSuccessor; i++) { - auto *OldSuccessor = BrInstr->getSuccessor(i); - if (BBTransforms.end() != BBTransforms.find(OldSuccessor)) { - auto *NewSuccessor = BBTransforms[OldSuccessor]; - BrInstr->setSuccessor(i, NewSuccessor); - DTU.applyUpdates( - {{DominatorTree::Insert, BrInstr->getParent(), NewSuccessor}}); - } else { - BrInstr->setSuccessor(i, EndBlock); - DTU.applyUpdates( - {{DominatorTree::Insert, BrInstr->getParent(), EndBlock}}); - } - } - } - - for (CallInst *PrefetchCall : InsertedPrefetchCalls) { - if (!PrefetchCall->getParent()->getTerminator()) { - DTU.applyUpdates( - {{DominatorTree::Insert, PrefetchCall->getParent(), EndBlock}}); - } - } - - auto *InsertPoint = ParentLoop->getLoopPreheader(); - auto *BBTerminator = InsertPoint->getTerminator(); - Instruction *EndPoint = nullptr; - if (InsertPoint) { - for (unsigned j = 0; j < AllDependentInsts.size(); j++) { - auto *I = AllDependentInsts[j]; - if (I->getOpcode() != Instruction::Br) - if (ParentLoop->hasLoopInvariantOperands(I)) { - auto *InvariantInstr = I->clone(); - InvariantInstr->insertInto(InsertPoint, InsertPoint->end()); - EndPoint = InvariantInstr; - I->replaceAllUsesWith(InvariantInstr); - I->eraseFromParent(); - } - } - if (EndPoint) - BBTerminator->moveAfter(EndPoint); - NumOuterLoopPrefetches++; - } - return true; -} - PreservedAnalyses LoopDataPrefetchPass::run(Function &F, FunctionAnalysisManager &AM) { - AliasAnalysis *AA = &AM.getResult(F); DominatorTree *DT = &AM.getResult(F); LoopInfo *LI = &AM.getResult(F); ScalarEvolution *SE = &AM.getResult(F); @@ -1427,16 +179,8 @@ PreservedAnalyses LoopDataPrefetchPass::run(Function &F, &AM.getResult(F); const TargetTransformInfo *TTI = &AM.getResult(F); - // Ensure loops are in simplified form which is a pre-requisite for loop data - // prefetch pass. Added only for new PM since the legacy PM has already added - // LoopSimplify pass as a dependency. - bool Changed = false; - for (auto &L : *LI) { - Changed |= simplifyLoop(L, DT, LI, SE, AC, nullptr, false); - } - - LoopDataPrefetch LDP(AA, AC, DT, LI, SE, TTI, ORE); - Changed |= LDP.run(); + LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE); + bool Changed = LDP.run(); if (Changed) { PreservedAnalyses PA; @@ -1452,7 +196,6 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) { if (skipFunction(F)) return false; - AliasAnalysis *AA = &getAnalysis().getAAResults(); DominatorTree *DT = &getAnalysis().getDomTree(); LoopInfo *LI = &getAnalysis().getLoopInfo(); ScalarEvolution *SE = &getAnalysis().getSE(); @@ -1463,7 +206,7 @@ bool LoopDataPrefetchLegacyPass::runOnFunction(Function &F) { const TargetTransformInfo *TTI = &getAnalysis().getTTI(F); - LoopDataPrefetch LDP(AA, AC, DT, LI, SE, TTI, ORE); + LoopDataPrefetch LDP(AC, DT, LI, SE, TTI, ORE); return LDP.run(); } @@ -1471,8 +214,7 @@ bool LoopDataPrefetch::run() { // If PrefetchDistance is not set, don't run the pass. This gives an // opportunity for targets to run this pass for selected subtargets only // (whose TTI sets PrefetchDistance and CacheLineSize). - if (getPrefetchDistance() == 0 || - (TTI->getCacheLineSize() == 0 && CachelineSize == 0)) { + if (getPrefetchDistance() == 0 || TTI->getCacheLineSize() == 0) { LLVM_DEBUG(dbgs() << "Please set both PrefetchDistance and CacheLineSize " "for loop data prefetch.\n"); return false; @@ -1532,36 +274,13 @@ struct Prefetch { bool LoopDataPrefetch::runOnLoop(Loop *L) { bool MadeChange = false; - if (L->getLoopDepth() < PrefetchLoopDepth) + // Only prefetch in the inner-most loop + if (!L->isInnermost()) return MadeChange; - bool IsInnerMost = true; - // Prefetch outer loop if needed. - if (!L->isInnermost()) { - if (OuterLoopPrefetch) - IsInnerMost = false; - else - return MadeChange; - } - SmallPtrSet EphValues; CodeMetrics::collectEphemeralValues(L, AC, EphValues); - CodeMetrics InnerLoopMetrics; - // Calculate the sub loop size when prefetching outer loops. - SmallPtrSet InnerMostBBs; - if (!IsInnerMost) { - for (Loop *LL : L->getSubLoops()) { - // Make sure all sub loops are inner most loop. - if (!LL->isInnermost()) - return MadeChange; - for (const auto BB : LL->blocks()) { - InnerMostBBs.insert(BB); - InnerLoopMetrics.analyzeBasicBlock(BB, *TTI, EphValues); - } - } - } - // Calculate the number of iterations ahead to prefetch CodeMetrics Metrics; bool HasCall = false; @@ -1590,12 +309,6 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { if (!LoopSize) LoopSize = 1; - // Only prefetch small outer loops with small sub loops. - if (!IsInnerMost) - if (LoopSize - InnerLoopMetrics.NumInsts > 128 || - InnerLoopMetrics.NumInsts > 128) - return MadeChange; - unsigned ItersAhead = getPrefetchDistance() / LoopSize; if (!ItersAhead) ItersAhead = 1; @@ -1610,10 +323,9 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { unsigned NumMemAccesses = 0; unsigned NumStridedMemAccesses = 0; SmallVector Prefetches; - for (const auto BB : L->blocks()) { - // If this is not inner most, we avoid prefetching in sub loops. + for (const auto BB : L->blocks()) for (auto &I : *BB) { - Value *PtrValue = nullptr; + Value *PtrValue; Instruction *MemI; if (LoadInst *LMemI = dyn_cast(&I)) { @@ -1625,11 +337,6 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { PtrValue = SMemI->getPointerOperand(); } else continue; - if (!PtrValue) - continue; - if (getPrefetchDistance() == 0) - continue; - unsigned PtrAddrSpace = PtrValue->getType()->getPointerAddressSpace(); if (!TTI->shouldPrefetchAddressSpace(PtrAddrSpace)) continue; @@ -1643,11 +350,6 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { continue; NumStridedMemAccesses++; - // For outer loops, we only prefetch memory instruction with stride - // depending on the current loop. - if (!IsInnerMost && LSCEVAddRec->getLoop() != L) - continue; - // We don't want to double prefetch individual cache lines. If this // access is known to be within one cache line of some other one that // has already been prefetched, then don't prefetch this one as well. @@ -1657,19 +359,16 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { if (const SCEVConstant *ConstPtrDiff = dyn_cast(PtrDiff)) { int64_t PD = std::abs(ConstPtrDiff->getValue()->getSExtValue()); - int64_t CacheLineSize = - TTI->getCacheLineSize() ? TTI->getCacheLineSize() : CachelineSize; - if (PD < (int64_t)CacheLineSize) { + if (PD < (int64_t) TTI->getCacheLineSize()) { Pref.addInstruction(MemI, DT, PD); DupPref = true; break; } } } - if (!DupPref && !DisableDirectLoadPrefetch) + if (!DupPref) Prefetches.push_back(Prefetch(LSCEVAddRec, MemI)); } - } unsigned TargetMinStride = getMinPrefetchStride(NumMemAccesses, NumStridedMemAccesses, @@ -1687,18 +386,15 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { for (auto &P : Prefetches) { // Check if the stride of the accesses is large enough to warrant a - // prefetch. If MinPrefetchStride <= 1, no need to check if any stride - // goes. - const SCEV *StrideExpr = P.LSCEVAddRec->getStepRecurrence(*SE); + // prefetch. if (!isStrideLargeEnough(P.LSCEVAddRec, TargetMinStride)) continue; BasicBlock *BB = P.InsertPt->getParent(); SCEVExpander SCEVE(*SE, BB->getModule()->getDataLayout(), "prefaddr"); - const SCEV *NextLSCEV = SE->getAddExpr( - P.LSCEVAddRec, - SE->getMulExpr(SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead), - StrideExpr)); + const SCEV *NextLSCEV = SE->getAddExpr(P.LSCEVAddRec, SE->getMulExpr( + SE->getConstant(P.LSCEVAddRec->getType(), ItersAhead), + P.LSCEVAddRec->getStepRecurrence(*SE))); if (!SCEVE.isSafeToExpand(NextLSCEV)) continue; @@ -1711,10 +407,11 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { Type *I32 = Type::getInt32Ty(BB->getContext()); Function *PrefetchFunc = Intrinsic::getDeclaration( M, Intrinsic::prefetch, PrefPtrValue->getType()); - Builder.CreateCall(PrefetchFunc, - {PrefPtrValue, ConstantInt::get(I32, P.Writes), - ConstantInt::get(I32, IsInnerMost ? 3 : 0), - ConstantInt::get(I32, 1)}); + Builder.CreateCall( + PrefetchFunc, + {PrefPtrValue, + ConstantInt::get(I32, P.Writes), + ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)}); ++NumPrefetches; LLVM_DEBUG(dbgs() << " Access: " << *P.MemI->getOperand(isa(P.MemI) ? 0 : 1) @@ -1727,142 +424,5 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) { MadeChange = true; } - if (!IndirectLoadPrefetch) - return MadeChange; - - // List of valid phi nodes that indirect loads can depend on. - SmallPtrSet LoopAuxIndPHINodes; - // Map of valid phi node to its bound value in the preheader. - ValueMap AuxIndBounds; - // Candidate memory loads in the loop. - SmallVector CandidateMemoryLoads; - // List of instruction from phi to load. - std::vector> DependentInstList; - // List of store instr in the loop. - SmallVector LoopStorePtrs; - - // Get loop induction and auxiliary induction phis. (Thye will be candidates - // for phi node matching during construction of the candidate instructions.) - // And we use the phi nodes to determine the loop upperbound. - Value *NumIterations = - getLoopIterationNumber(L, LoopAuxIndPHINodes, AuxIndBounds); - bool PrefetchInOuterLoop = false; - if (NumIterations == nullptr) { - if (!L->isOutermost()) { - NumIterations = getLoopIterationNumber(L->getParentLoop(), - LoopAuxIndPHINodes, AuxIndBounds); - if (NumIterations == nullptr) - return MadeChange; - PrefetchInOuterLoop = true; - } else - return MadeChange; - } - - if (!RandomAccessPrefetchOnly && !PrefetchInOuterLoop && - !canDoIndirectPrefetch(L)) { - cleanLoopIterationNumber(NumIterations); - return MadeChange; - } - - // Find candidate auxiliary induction variables which could be a dependent for - // the indirect load. - BasicBlock *Header = nullptr; - Loop *CurrentLoop = L; - if (PrefetchInOuterLoop) { - Header = L->getParentLoop()->getHeader(); - CurrentLoop = L->getParentLoop(); - } else { - if (getBooleanLoopAttribute(L, "llvm.loop.isvectorized")) - return false; - Header = L->getHeader(); - } - - for (auto &I : *Header) - if (PHINode *PHI = dyn_cast(&I)) { - InductionDescriptor IndDesc; - if (InductionDescriptor::isInductionPHI(PHI, CurrentLoop, SE, IndDesc) && - CurrentLoop->getInductionVariable(*SE) != PHI) { - canGetAuxIndVarBound(CurrentLoop, PHI, LoopAuxIndPHINodes); - } - } - - // Will search for candidates in the parent loop of the current inner most - // loop. This will capture more opportunities in the outer loop. - SmallVector BBList; - for (auto &BB : L->blocks()) - BBList.push_back(BB); - if (L->getParentLoop()) - for (auto &BB : L->getParentLoop()->blocks()) { - // We don't want to repeat blocks in the case of nested loops. - if (L->contains(BB)) - continue; - BBList.push_back(BB); - } - - // Iterate through the loop and keep track of the memory loads and the - // instruction list they depend on. - for (const auto BB : BBList) { - for (auto &I : *BB) - if (LoadInst *LoadI = dyn_cast(&I)) { - SmallSetVector InstList; - SmallSet InstSet; - InstList.insert(LoadI); - InstSet.insert(LoadI); - if (findCandidateMemoryLoads(LoadI, InstList, InstSet, - CandidateMemoryLoads, DependentInstList, - LoopAuxIndPHINodes, PrefetchInOuterLoop, - L)) { - LLVM_DEBUG(dbgs() << "Found load candidate " << *LoadI << "\n"); - CandidateMemoryLoads.push_back(LoadI); - DependentInstList.push_back(InstList); - } - } else if (StoreInst *StoreI = dyn_cast(&I)) { - // Keep track of store insts to avoid conflict. - LoopStorePtrs.push_back(StoreI->getPointerOperand()); - } - } - - // Keep track of previously transformed instrs for offset load and target - // loads so we can reuse them. - SmallVector> Transforms; - for (unsigned i = 0; i < CandidateMemoryLoads.size(); i++) { - SmallSetVector DependentInsts = DependentInstList[i]; - unsigned NumLoads = 0; - bool NoConflict = true; - // Find candidate that contains indirect loads and check load for offset - // doesn't alias with other stores. - for (auto DependentInst : DependentInsts) { - if (LoadInst *LoadI = dyn_cast(DependentInst)) { - NumLoads++; - // For the load of target address offset, we avoid the load being - // conflict with stores in the same loop. - if (NumLoads == IndirectionLevel) { - Value *LoadPtr = LoadI->getPointerOperand(); - for (Value *StorePtr : LoopStorePtrs) - if (AA->isMustAlias(LoadPtr, StorePtr)) { - NoConflict = false; - break; - } - } - } - } - - // Prefetch all indirect loads without conflict to the offset load. - if (NumLoads == IndirectionLevel && NoConflict) { - if (PrefetchInOuterLoop) { - MadeChange |= insertPrefetcherInOuterloopForIndirectLoad( - L, i, NumIterations, CandidateMemoryLoads, DependentInsts, - AuxIndBounds, Transforms, ItersAhead); - break; - } else { - MadeChange |= insertPrefetcherForIndirectLoad( - L, i, NumIterations, CandidateMemoryLoads, DependentInsts, - AuxIndBounds, Transforms, ItersAhead); - } - } - } - - cleanLoopIterationNumber(NumIterations); - return MadeChange; } diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-crc-outerloop.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-crc-outerloop.ll deleted file mode 100644 index afde478f89e7f8512bffcc173f9ea50b57aa2940..0000000000000000000000000000000000000000 --- a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-crc-outerloop.ll +++ /dev/null @@ -1,147 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=unknown -passes=loop-data-prefetch --indirect-load-prefetch=true --prefetch-distance=512 --outer-loop-prefetch=true --random-access-prefetch-only=true -disable-direct-prefetch -S | FileCheck %s - -target datalayout = "e-m:e-p:32:32-Fi8-i64:64:128-a:0:32-n32-S64" -target triple = "armv8a-unknown-linux-gun" - -; Function Attrs: mustprogress nocallback nofree nosync nounwind willreturn memory(none) -declare i32 @llvm.aarch64.crc32w(i32, i32) - -; Function Attrs: mustprogress nofree nosync nounwind willreturn memory(read, inaccessiblemem: none) -define dso_local arm_aapcscc noundef i32 @_z12matchcolumnsPPiS_ii(ptr nocapture noundef readonly %A, ptr nocapture noundef readnone %key, i32 noundef %index, i32 noundef %count) local_unnamed_addr { -; CHECK-LABEL: @_z12matchcolumnsPPiS_ii( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds ptr, ptr [[A:%.*]], i32 0 -; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 4 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret i32 [[SUM_1_LCSSA:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[SUM_040:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[SUM_1_LCSSA]], [[FOR_COND_CLEANUP4:%.*]] ] -; CHECK-NEXT: [[I_039:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[INC17:%.*]], [[FOR_COND_CLEANUP4]] ] -; CHECK-NEXT: [[TMP2:%.*]] = add i32 [[I_039]], 19 -; CHECK-NEXT: [[TMP3:%.*]] = icmp slt i32 [[TMP2]], 99 -; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i32 [[TMP2]], i32 99 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds ptr, ptr [[A]], i32 [[I_039]] -; CHECK-NEXT: [[TMP5:%.*]] = load ptr, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[TMP5]], align 4 -; CHECK-NEXT: [[CMP336:%.*]] = icmp sgt i32 [[TMP6]], 0 -; CHECK-NEXT: br i1 [[CMP336]], label [[PREFETCH_FOR_BODY5_PREHEADER:%.*]], label [[FOR_COND_CLEANUP4]] -; CHECK: prefetch.for.body5.preheader: -; CHECK-NEXT: br label [[PREFETCH_FOR_BODY5:%.*]] -; CHECK: prefetch.for.body5: -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 [[TMP4]] -; CHECK-NEXT: [[TMP8:%.*]] = load i32, ptr [[TMP7]], align 4 -; CHECK-NEXT: [[TMP9:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP8]], i32 -1) -; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 255 -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP1]], i32 [[TMP10]] -; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP11]], i32 0, i32 3, i32 1) -; CHECK-NEXT: br label [[PREFETCH_END:%.*]] -; CHECK: prefetch.end: -; CHECK-NEXT: br label [[FOR_BODY5_PREHEADER:%.*]] -; CHECK: for.body5.preheader: -; CHECK-NEXT: br label [[FOR_BODY5:%.*]] -; CHECK: for.cond.cleanup4.loopexit: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP4]] -; CHECK: for.cond.cleanup4: -; CHECK-NEXT: [[SUM_1_LCSSA]] = phi i32 [ [[SUM_040]], [[FOR_BODY]] ], [ [[ADD:%.*]], [[FOR_COND_CLEANUP4_LOOPEXIT:%.*]] ] -; CHECK-NEXT: [[INC17]] = add nuw nsw i32 [[I_039]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[INC17]], 100 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] -; CHECK: for.body5: -; CHECK-NEXT: [[J_038:%.*]] = phi i32 [ [[INC15:%.*]], [[IF_END:%.*]] ], [ 0, [[FOR_BODY5_PREHEADER]] ] -; CHECK-NEXT: [[SUM_137:%.*]] = phi i32 [ [[ADD]], [[IF_END]] ], [ [[SUM_040]], [[FOR_BODY5_PREHEADER]] ] -; CHECK-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds ptr, ptr [[A]], i32 [[J_038]] -; CHECK-NEXT: [[TMP12:%.*]] = load ptr, ptr [[ARRAYIDX6]], align 4 -; CHECK-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[I_039]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX7]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP13]], i32 -1) -; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP14]], 255 -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[TMP12]], i32 [[AND]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4 -; CHECK-NEXT: [[CMP9_NOT:%.*]] = icmp eq i32 [[TMP15]], [[INDEX:%.*]] -; CHECK-NEXT: br i1 [[CMP9_NOT]], label [[IF_END]], label [[DO_BODY_PREHEADER:%.*]] -; CHECK: do.body.preheader: -; CHECK-NEXT: br label [[DO_BODY:%.*]] -; CHECK: do.body: -; CHECK-NEXT: [[J_1:%.*]] = phi i32 [ [[INC10:%.*]], [[DO_BODY]] ], [ [[J_038]], [[DO_BODY_PREHEADER]] ] -; CHECK-NEXT: [[AKEY_0:%.*]] = phi i32 [ [[INC:%.*]], [[DO_BODY]] ], [ [[AND]], [[DO_BODY_PREHEADER]] ] -; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[AKEY_0]], 1 -; CHECK-NEXT: [[INC10]] = add nsw i32 [[J_1]], 1 -; CHECK-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds ptr, ptr [[A]], i32 [[INC10]] -; CHECK-NEXT: [[TMP16:%.*]] = load ptr, ptr [[ARRAYIDX11]], align 4 -; CHECK-NEXT: [[ARRAYIDX12:%.*]] = getelementptr inbounds i32, ptr [[TMP16]], i32 [[INC]] -; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX12]], align 4 -; CHECK-NEXT: [[CMP13_NOT:%.*]] = icmp eq i32 [[TMP17]], [[INDEX]] -; CHECK-NEXT: br i1 [[CMP13_NOT]], label [[IF_END_LOOPEXIT:%.*]], label [[DO_BODY]] -; CHECK: if.end.loopexit: -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: -; CHECK-NEXT: [[J_2:%.*]] = phi i32 [ [[J_038]], [[FOR_BODY5]] ], [ [[INC10]], [[IF_END_LOOPEXIT]] ] -; CHECK-NEXT: [[B_0:%.*]] = phi ptr [ [[TMP12]], [[FOR_BODY5]] ], [ [[TMP16]], [[IF_END_LOOPEXIT]] ] -; CHECK-NEXT: [[AKEY_1:%.*]] = phi i32 [ [[AND]], [[FOR_BODY5]] ], [ [[INC]], [[IF_END_LOOPEXIT]] ] -; CHECK-NEXT: [[ARRAYIDX14:%.*]] = getelementptr inbounds i32, ptr [[B_0]], i32 [[AKEY_1]] -; CHECK-NEXT: [[TMP18:%.*]] = load i32, ptr [[ARRAYIDX14]], align 4 -; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP18]], [[SUM_137]] -; CHECK-NEXT: [[INC15]] = add nsw i32 [[J_2]], 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp slt i32 [[INC15]], [[TMP6]] -; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_BODY5]], label [[FOR_COND_CLEANUP4_LOOPEXIT]] -; -entry: - br label %for.body - -for.cond.cleanup: - ret i32 %sum.1.lcssa - -for.body: - %sum.040 = phi i32 [ 0, %entry ], [ %sum.1.lcssa, %for.cond.cleanup4 ] - %i.039 = phi i32 [ 0, %entry ], [ %inc17, %for.cond.cleanup4 ] - %arrayidx = getelementptr inbounds ptr, ptr %A, i32 %i.039 - %0 = load ptr, ptr %arrayidx, align 4 - %1 = load i32, ptr %0, align 4 - %cmp336 = icmp sgt i32 %1, 0 - br i1 %cmp336, label %for.body5, label %for.cond.cleanup4 - -for.cond.cleanup4: - %sum.1.lcssa = phi i32 [ %sum.040, %for.body ], [ %add, %if.end ] - %inc17 = add nuw nsw i32 %i.039, 1 - %exitcond.not = icmp eq i32 %inc17, 100 - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body - -for.body5: - %j.038 = phi i32 [ %inc15, %if.end ], [ 0, %for.body ] - %sum.137 = phi i32 [ %add, %if.end ], [ %sum.040, %for.body ] - %arrayidx6 = getelementptr inbounds ptr, ptr %A, i32 %j.038 - %2 = load ptr, ptr %arrayidx6, align 4 - %arrayidx7 = getelementptr inbounds i32, ptr %2, i32 %i.039 - %3 = load i32, ptr %arrayidx7, align 4 - %4 = tail call i32 @llvm.aarch64.crc32w(i32 %3, i32 -1) - %and = and i32 %4, 255 - %arrayidx8 = getelementptr inbounds i32, ptr %2, i32 %and - %5 = load i32, ptr %arrayidx8, align 4 - %cmp9.not = icmp eq i32 %5, %index - br i1 %cmp9.not, label %if.end, label %do.body - -do.body: - %j.1 = phi i32 [ %inc10, %do.body ], [ %j.038, %for.body5 ] - %AKey.0 = phi i32 [ %inc, %do.body ], [ %and, %for.body5 ] - %inc = add nuw nsw i32 %AKey.0, 1 - %inc10 = add nsw i32 %j.1, 1 - %arrayidx11 = getelementptr inbounds ptr, ptr %A, i32 %inc10 - %6 = load ptr, ptr %arrayidx11, align 4 - %arrayidx12 = getelementptr inbounds i32, ptr %6, i32 %inc - %7 = load i32, ptr %arrayidx12, align 4 - %cmp13.not = icmp eq i32 %7, %index - br i1 %cmp13.not, label %if.end, label %do.body - -if.end: - %j.2 = phi i32 [ %j.038, %for.body5 ], [ %inc10, %do.body ] - %B.0 = phi ptr [ %2, %for.body5 ], [ %6, %do.body ] - %AKey.1 = phi i32 [ %and, %for.body5 ], [ %inc, %do.body ] - %arrayidx14 = getelementptr inbounds i32, ptr %B.0, i32 %AKey.1 - %8 = load i32, ptr %arrayidx14, align 4 - %add = add nsw i32 %8, %sum.137 - %inc15 = add nsw i32 %j.2, 1 - %cmp3 = icmp slt i32 %inc15, %1 - br i1 %cmp3, label %for.body5, label %for.cond.cleanup4 -} diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch.ll deleted file mode 100644 index 7d65952e2a2aa013506c7e8dab6446712c776d6e..0000000000000000000000000000000000000000 --- a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch.ll +++ /dev/null @@ -1,80 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=unknown -passes=loop-data-prefetch --indirect-load-prefetch=true --prefetch-distance=512 --min-prefetch-stride=4 -S | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" - -; Function Attrs: nofree norecurse nounwind uwtable -define dso_local void @test(i32 %Num, float* nocapture readonly %TargetArray, i32* nocapture readonly %OffsetArray, float* noalias nocapture %TempArray) local_unnamed_addr #0 { -; CHECK-LABEL: @test( -; CHECK-NEXT: entry: -; CHECK-NEXT: [[CMP13:%.*]] = icmp sgt i32 [[NUM:%.*]], 0 -; CHECK-NEXT: br i1 [[CMP13]], label [[FOR_BODY_PREHEADER:%.*]], label [[FOR_COND_CLEANUP:%.*]] -; CHECK: for.body.preheader: -; CHECK-NEXT: [[WIDE_TRIP_COUNT:%.*]] = zext i32 [[NUM]] to i64 -; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[WIDE_TRIP_COUNT]], 1 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup.loopexit: -; CHECK-NEXT: br label [[FOR_COND_CLEANUP]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret void -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[FOR_BODY_PREHEADER]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] -; CHECK-NEXT: [[TMP1:%.*]] = add i64 [[INDVARS_IV]], 42 -; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i64 [[TMP1]], [[TMP0]] -; CHECK-NEXT: [[TMP3:%.*]] = select i1 [[TMP2]], i64 [[TMP1]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[INDVARS_IV]], 84 -; CHECK-NEXT: [[TMP5:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[TMP5]], 168 -; CHECK-NEXT: [[UGLYGEP1:%.*]] = getelementptr i8, ptr [[OFFSETARRAY:%.*]], i64 [[TMP6]] -; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[TMP8:%.*]] = add i64 [[TMP7]], 168 -; CHECK-NEXT: [[UGLYGEP:%.*]] = getelementptr i8, ptr [[TEMPARRAY:%.*]], i64 [[TMP8]] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[TEMPARRAY]], i64 [[INDVARS_IV]] -; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP]], i32 0, i32 3, i32 1) -; CHECK-NEXT: [[TMP9:%.*]] = load float, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[OFFSETARRAY]], i64 [[INDVARS_IV]] -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[OFFSETARRAY]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[OFFSETARRAY]], i64 [[TMP3]] -; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[UGLYGEP1]], i32 0, i32 3, i32 1) -; CHECK-NEXT: [[TMP12:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[TMP11]], align 4 -; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP10]], i32 0, i32 3, i32 1) -; CHECK-NEXT: [[IDXPROM3:%.*]] = sext i32 [[TMP12]] to i64 -; CHECK-NEXT: [[TMP14:%.*]] = sext i32 [[TMP13]] to i64 -; CHECK-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds float, ptr [[TARGETARRAY:%.*]], i64 [[IDXPROM3]] -; CHECK-NEXT: [[TMP15:%.*]] = getelementptr inbounds float, ptr [[TARGETARRAY]], i64 [[TMP14]] -; CHECK-NEXT: [[TMP16:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[TMP17:%.*]] = load float, ptr [[ARRAYIDX4]], align 4 -; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP15]], i32 0, i32 3, i32 1) -; CHECK-NEXT: [[MUL:%.*]] = fmul contract float [[TMP9]], [[TMP17]] -; CHECK-NEXT: store float [[MUL]], ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT]], [[WIDE_TRIP_COUNT]] -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP_LOOPEXIT:%.*]], label [[FOR_BODY]] -; -entry: - %cmp13 = icmp sgt i32 %Num, 0 - br i1 %cmp13, label %for.body.preheader, label %for.cond.cleanup - -for.body.preheader: - %wide.trip.count = zext i32 %Num to i64 - br label %for.body - -for.cond.cleanup: - ret void - -for.body: - %indvars.iv = phi i64 [ 0, %for.body.preheader ], [ %indvars.iv.next, %for.body ] - %arrayidx = getelementptr inbounds float, float* %TempArray, i64 %indvars.iv - %0 = load float, float* %arrayidx, align 4 - %arrayidx2 = getelementptr inbounds i32, i32* %OffsetArray, i64 %indvars.iv - %1 = load i32, i32* %arrayidx2, align 4 - %idxprom3 = sext i32 %1 to i64 - %arrayidx4 = getelementptr inbounds float, float* %TargetArray, i64 %idxprom3 - %2 = load float, float* %arrayidx4, align 4 - %mul = fmul contract float %0, %2 - store float %mul, float* %arrayidx, align 4 - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next, %wide.trip.count - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body -} diff --git a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll b/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll deleted file mode 100644 index b4d85c62e18fc23ea8ba5d4afcdff8fef5983288..0000000000000000000000000000000000000000 --- a/llvm/test/Transforms/LoopDataPrefetch/AArch64/indirect-load-prefetch_crc.ll +++ /dev/null @@ -1,104 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -mtriple=unknown -passes=loop-data-prefetch --indirect-load-prefetch=true --prefetch-distance=512 --outer-loop-prefetch=true --random-access-prefetch-only=true -disable-direct-prefetch -S | FileCheck %s - -target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" -target triple = "aarch64-unknown-linux-gun" - -declare i32 @llvm.aarch64.crc32w(i32, i32) - -; Function Attrs: mustprogress nofree nosync nounwind willreturn memory(argmem: read) uwtable -define dso_local noundef i32 @_Z12matchcolumnsPiiS_ii(ptr nocapture noundef readonly %A, i32 noundef %B, ptr nocapture noundef readonly %Key, i32 noundef %index, i32 noundef %count) { -; CHECK-LABEL: @_Z12matchcolumnsPiiS_ii( -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.cond.cleanup: -; CHECK-NEXT: ret i32 [[ADD:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[INDVARS_IV22:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT23:%.*]], [[IF_END:%.*]] ] -; CHECK-NEXT: [[SUM_020:%.*]] = phi i32 [ 0, [[ENTRY]] ], [ [[ADD]], [[IF_END]] ] -; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDVARS_IV22]], 22 -; CHECK-NEXT: [[TMP1:%.*]] = icmp slt i64 [[TMP0]], 99 -; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[TMP1]], i64 [[TMP0]], i64 99 -; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[INDVARS_IV22]], 44 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], i64 [[INDVARS_IV22]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP3]] -; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[TMP2]] -; CHECK-NEXT: [[TMP6:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP7:%.*]] = load i32, ptr [[TMP5]], align 4 -; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP4]], i32 0, i32 3, i32 1) -; CHECK-NEXT: [[TMP8:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP6]], i32 -1) -; CHECK-NEXT: [[TMP9:%.*]] = tail call i32 @llvm.aarch64.crc32w(i32 [[TMP7]], i32 -1) -; CHECK-NEXT: [[AND:%.*]] = and i32 [[TMP8]], 255 -; CHECK-NEXT: [[TMP10:%.*]] = and i32 [[TMP9]], 255 -; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[AND]] to i64 -; CHECK-NEXT: [[TMP11:%.*]] = zext i32 [[TMP10]] to i64 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[KEY:%.*]], i64 [[IDXPROM1]] -; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i32, ptr [[KEY]], i64 [[TMP11]] -; CHECK-NEXT: [[TMP13:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP14:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: call void @llvm.prefetch.p0(ptr [[TMP12]], i32 0, i32 3, i32 1) -; CHECK-NEXT: [[CMP3_NOT:%.*]] = icmp eq i32 [[TMP14]], [[B:%.*]] -; CHECK-NEXT: br i1 [[CMP3_NOT]], label [[IF_END]], label [[DO_BODY_PREHEADER:%.*]] -; CHECK: do.body.preheader: -; CHECK-NEXT: br label [[DO_BODY:%.*]] -; CHECK: do.body: -; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[DO_BODY]] ], [ [[IDXPROM1]], [[DO_BODY_PREHEADER]] ] -; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[ARRAYIDX5:%.*]] = getelementptr inbounds i32, ptr [[KEY]], i64 [[INDVARS_IV_NEXT]] -; CHECK-NEXT: [[TMP15:%.*]] = load i32, ptr [[ARRAYIDX5]], align 4 -; CHECK-NEXT: [[CMP6_NOT:%.*]] = icmp eq i32 [[TMP15]], [[B]] -; CHECK-NEXT: br i1 [[CMP6_NOT]], label [[IF_END_LOOPEXIT:%.*]], label [[DO_BODY]] -; CHECK: if.end.loopexit: -; CHECK-NEXT: [[TMP16:%.*]] = trunc i64 [[INDVARS_IV_NEXT]] to i32 -; CHECK-NEXT: br label [[IF_END]] -; CHECK: if.end: -; CHECK-NEXT: [[AKEY_1:%.*]] = phi i32 [ [[AND]], [[FOR_BODY]] ], [ [[TMP16]], [[IF_END_LOOPEXIT]] ] -; CHECK-NEXT: [[IDXPROM7:%.*]] = sext i32 [[AKEY_1]] to i64 -; CHECK-NEXT: [[ARRAYIDX8:%.*]] = getelementptr inbounds i32, ptr [[A]], i64 [[IDXPROM7]] -; CHECK-NEXT: [[TMP17:%.*]] = load i32, ptr [[ARRAYIDX8]], align 4 -; CHECK-NEXT: [[ADD]] = add nsw i32 [[TMP17]], [[SUM_020]] -; CHECK-NEXT: [[INDVARS_IV_NEXT23]] = add nuw nsw i64 [[INDVARS_IV22]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i64 [[INDVARS_IV_NEXT23]], 100 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND_CLEANUP:%.*]], label [[FOR_BODY]] -; -entry: - br label %for.body - -for.cond.cleanup: - ret i32 %add - -for.body: - %indvars.iv22 = phi i64 [ 0, %entry ], [ %indvars.iv.next23, %if.end ] - %sum.020 = phi i32 [ 0, %entry ], [ %add, %if.end ] - %arrayidx = getelementptr inbounds i32, ptr %A, i64 %indvars.iv22 - %0 = load i32, ptr %arrayidx, align 4 - %1 = tail call i32 @llvm.aarch64.crc32w(i32 %0, i32 -1) - %and = and i32 %1, 255 - %idxprom1 = zext i32 %and to i64 - %arrayidx2 = getelementptr inbounds i32, ptr %Key, i64 %idxprom1 - %2 = load i32, ptr %arrayidx2, align 4 - %cmp3.not = icmp eq i32 %2, %B - br i1 %cmp3.not, label %if.end, label %do.body - -do.body: - %indvars.iv = phi i64 [ %idxprom1, %for.body ], [ %indvars.iv.next, %do.body ] - %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 - %arrayidx5 = getelementptr inbounds i32, ptr %Key, i64 %indvars.iv.next - %3 = load i32, ptr %arrayidx5, align 4 - %cmp6.not = icmp eq i32 %3, %B - br i1 %cmp6.not, label %if.end.loopexit, label %do.body - -if.end.loopexit: - %4 = trunc i64 %indvars.iv.next to i32 - br label %if.end - -if.end: - %AKey.1 = phi i32 [ %and, %for.body ], [ %4, %if.end.loopexit ] - %idxprom7 = sext i32 %AKey.1 to i64 - %arrayidx8 = getelementptr inbounds i32, ptr %A, i64 %idxprom7 - %5 = load i32, ptr %arrayidx8, align 4 - %add = add nsw i32 %5, %sum.020 - %indvars.iv.next23 = add nuw nsw i64 %indvars.iv22, 1 - %exitcond.not = icmp eq i64 %indvars.iv.next23, 100 - br i1 %exitcond.not, label %for.cond.cleanup, label %for.body -}