diff --git a/bolt/include/bolt/Core/BinaryBasicBlock.h b/bolt/include/bolt/Core/BinaryBasicBlock.h index db2d7b7e976d5fd308de52be326b6c46657aa1eb..6807f2277f5f04bc9c79169a360aa0f7d310f7aa 100644 --- a/bolt/include/bolt/Core/BinaryBasicBlock.h +++ b/bolt/include/bolt/Core/BinaryBasicBlock.h @@ -49,6 +49,7 @@ public: struct BinaryBranchInfo { uint64_t Count; uint64_t MispredictedCount; /// number of branches mispredicted + int64_t Cycle; bool operator<(const BinaryBranchInfo &Other) const { return (Count < Other.Count) || @@ -57,10 +58,16 @@ public: } }; + struct BinaryCacheMissInfo { + uint64_t MissCount; + uint64_t AccessCount; + }; + static constexpr uint32_t INVALID_OFFSET = std::numeric_limits::max(); using BranchInfoType = SmallVector; + using CacheMissInfoType = DenseMap; private: /// Vector of all instructions in the block. @@ -73,6 +80,7 @@ private: /// Each successor has a corresponding BranchInfo entry in the list. BranchInfoType BranchInfo; + CacheMissInfoType CacheMissInfo; using ExceptionListType = SmallVector; @@ -433,6 +441,17 @@ public: ErrorOr> getBranchStats(const BinaryBasicBlock *Succ) const; + /// Cache Miss Info + uint32_t getNumCacheMisses() { return CacheMissInfo.size(); } + + std::optional getCacheMissInfoAtInstr(const MCInst *Instr); + + std::optional getCacheMissInfoAtInstr(const MCInst *Instr) const; + + void setCacheMissInfoAtInstr(const MCInst *Instr, uint64_t MissCount, + uint64_t AccessCount); + + /// If the basic block ends with a conditional branch (possibly followed by /// an unconditional branch) and thus has 2 successor, reverse the order of /// its successors in CFG, update branch info, and return true. 
If the basic diff --git a/bolt/include/bolt/Core/BinaryData.h b/bolt/include/bolt/Core/BinaryData.h index fe3365f36beda728e037d884a17616613d49c7eb..c55ca79bb572cc88c741388a9bd94c9a990e1a2c 100644 --- a/bolt/include/bolt/Core/BinaryData.h +++ b/bolt/include/bolt/Core/BinaryData.h @@ -237,6 +237,24 @@ inline raw_ostream &operator<<(raw_ostream &OS, return OS; } +struct CacheMissProfile { + // TODO: do we need use next-instr-offset instead of instr-offset + uint64_t InstrOffset; /// Offset of the instruction + uint64_t MissCount; /// Number of cache misses + uint64_t AccessCount; /// Number of accesses + bool operator==(const CacheMissProfile &Other) const { + return InstrOffset == Other.InstrOffset && + MissCount == Other.MissCount && + AccessCount == Other.AccessCount; + } +}; + +inline raw_ostream &operator<<(raw_ostream &OS, + const bolt::CacheMissProfile &CMP) { + OS << "Misses: " << CMP.MissCount << ", Accesses: " << CMP.AccessCount << "\n"; + return OS; +} + } // namespace bolt } // namespace llvm diff --git a/bolt/include/bolt/Core/BinaryFunction.h b/bolt/include/bolt/Core/BinaryFunction.h index 5fe31214721b9b7c0ed8ad9833a963b50da0e438..4e0d032565a2fd47c7b750fdd33fd9b9ac3c714b 100644 --- a/bolt/include/bolt/Core/BinaryFunction.h +++ b/bolt/include/bolt/Core/BinaryFunction.h @@ -331,6 +331,8 @@ private: bool HasMemoryProfile{false}; + bool HasCacheMissProfile{false}; + /// Execution halts whenever this function is entered. bool TrapsOnEntry{false}; @@ -1371,6 +1373,8 @@ public: bool hasMemoryProfile() const { return HasMemoryProfile; } + bool hasCacheMissProfile() const { return HasCacheMissProfile; } + /// Return true if the body of the function was merged into another function. 
bool isFolded() const { return FoldedIntoFunction != nullptr; } diff --git a/bolt/include/bolt/Core/BinaryLoop.h b/bolt/include/bolt/Core/BinaryLoop.h index b425c75715d8b1d02e66655d8ff7e4d9115fffdd..1d64f5ff95ec5773e4b60600e9f672cd9e7cd0b8 100644 --- a/bolt/include/bolt/Core/BinaryLoop.h +++ b/bolt/include/bolt/Core/BinaryLoop.h @@ -15,6 +15,7 @@ #ifndef BOLT_CORE_BINARY_LOOP_H #define BOLT_CORE_BINARY_LOOP_H +#include "llvm/ADT/GraphTraits.h" #include "llvm/Support/GenericLoopInfo.h" namespace llvm { @@ -46,6 +47,7 @@ protected: class BinaryLoopInfo : public LoopInfoBase { public: BinaryLoopInfo() {} + //explicit BinaryLoopInfo(const DominatorTreeBase &DomTree); unsigned OuterLoops{0}; unsigned TotalLoops{0}; @@ -55,6 +57,25 @@ public: }; } // namespace bolt + +template <> struct GraphTraits { + typedef const bolt::BinaryLoop *NodeRef; + typedef bolt::BinaryLoopInfo::iterator ChildIteratorType; + + static NodeRef getEntryNode(const bolt::BinaryLoop *L) { return L; } + static ChildIteratorType child_begin(NodeRef N) { return N->begin(); } + static ChildIteratorType child_end(NodeRef N) { return N->end(); } +}; + +template <> struct GraphTraits { + typedef bolt::BinaryLoop *NodeRef; + typedef bolt::BinaryLoopInfo::iterator ChildIteratorType; + + static NodeRef getEntryNode(bolt::BinaryLoop *L) { return L; } + static ChildIteratorType child_begin(NodeRef N) { return N->begin(); } + static ChildIteratorType child_end(NodeRef N) { return N->end(); } +}; + } // namespace llvm #endif diff --git a/bolt/include/bolt/Core/MCPlusBuilder.h b/bolt/include/bolt/Core/MCPlusBuilder.h index 5a8a4f6e391c9dcac77e53d033f4184f1e4f0258..bf148eb5497e54c698d532d2fae4683bb5a278d3 100644 --- a/bolt/include/bolt/Core/MCPlusBuilder.h +++ b/bolt/include/bolt/Core/MCPlusBuilder.h @@ -598,6 +598,16 @@ public: return 0; } + virtual bool isCMP(const MCInst &Inst) const { + llvm_unreachable("not implemented"); + return false; + } + + virtual bool isADD(const MCInst &Inst) const { + 
llvm_unreachable("not implemented"); + return false; + } + + virtual bool isSUB(const MCInst &Inst) const { llvm_unreachable("not implemented"); return false; @@ -1646,6 +1656,24 @@ public: llvm_unreachable("not implemented"); } + virtual bool createPrefetchMui(MCInst &Inst, const MCPhysReg &BaseReg, StringRef PrfOp, unsigned Offset) const { + llvm_unreachable("not implemented"); + } + + virtual bool createPrefetch(MCInst &Inst, const MCPhysReg &BaseReg, const MCPhysReg &IndexReg, + unsigned Opcode, StringRef PrfOp, unsigned Offset) const { + llvm_unreachable("not implemented"); + } + + virtual bool createADD64ri(MCInst &Inst, const MCPhysReg &DstReg, const MCPhysReg &SrcReg, int64_t Imm) const { + llvm_unreachable("not implemented"); + } + + virtual bool createADD64rs(MCInst &Inst, const MCPhysReg &DstReg, const MCPhysReg &SrcReg1, + const MCPhysReg &SrcReg2, int64_t Imm) const { + llvm_unreachable("not implemented"); + } + /// Create a fragment of code (sequence of instructions) that load a 32-bit /// address from memory, zero-extends it to 64 and jump to it (indirect jump). virtual void diff --git a/bolt/include/bolt/Passes/LoopDataPrefetch.h b/bolt/include/bolt/Passes/LoopDataPrefetch.h new file mode 100644 index 0000000000000000000000000000000000000000..3e921d459e4c4e2a390d6da08b769dce711f51f6 --- /dev/null +++ b/bolt/include/bolt/Passes/LoopDataPrefetch.h @@ -0,0 +1,142 @@ +//===- bolt/Passes/LoopDataPrefetch.h ---------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef BOLT_PASSES_LOOPDATAPREFETCH_H +#define BOLT_PASSES_LOOPDATAPREFETCH_H + +#include "bolt/Passes/BinaryPasses.h" + +namespace llvm { +namespace bolt { + +struct Prefetch { + Prefetch(MCInst *MemI, BinaryBasicBlock *CacheMissBB, BinaryLoop *CurL, BinaryLoop *InsertL, bool IsBatch) : + MemI(MemI), CacheMissBB(CacheMissBB), CurL(CurL), InsertL(InsertL), IsBatch(IsBatch) {}; + MCInst *MemI; + BinaryBasicBlock *CacheMissBB; + BinaryLoop *CurL; + BinaryLoop *InsertL; + bool IsBatch; + bool IsIndirectLoad; + bool HasValidDependency; +}; + +struct DependentInfo { + SmallVector DependentInsts; + std::set NotUsedBeforWrittenRegs; + std::set WrittenRegs; + std::set UsedRegs; + // std::set FreeRegs; +}; + +class LoopDataPrefetch : public BinaryFunctionPass { +private: + /// Run pass on Function + void runOnFunction(BinaryFunction &Function, BinaryContext &BC); + + /// + void runOnLoop(BinaryLoop *L, BinaryContext &BC); + + using LoopPrefetchMapTy = std::unordered_map>; + /// + void getPrefetchCandidates(BinaryLoop *L, BinaryContext &BC, + DenseSet IsVisited, + LoopPrefetchMapTy &PrefetchCandidates); + + std::pair selectLoopForInsertPrefetch(BinaryLoop *L); + + /// + MCInst* getLoopInductionInstr(BinaryContext &BC, BinaryLoop *L); + + MCInst* getSpecifyLoopIndInstr(BinaryContext &BC, Prefetch &P); + + void doPrefetchForLoop(BinaryContext &BC, BinaryLoop *L, SmallVectorImpl &PrefetchCands); + + /// + bool insertPrefetchInstr(); + + /// + bool insertPrefetchForOuterLoop(); + + BinaryBasicBlock* createPrefetchEntryBB(BinaryFunction* BF, BinaryBasicBlock* LatchBB, BinaryBasicBlock* HeaderBB, + SmallVectorImpl &CommonDependentInsts, std::set WrittenRegs, + int Distance); + + SmallVector + createPrefetchBBs(BinaryFunction* BF, SmallVectorImpl &DependentInsts, + SmallVectorImpl &CommonDependentInsts, BinaryBasicBlock* 
PrefetchEntryBB, + SmallVectorImpl &PrefetchCands, std::set WrittenRegs); + + /// + BinaryBasicBlock* createPrefetchExitBB(BinaryFunction* BF, BinaryBasicBlock* HeaderBB, + SmallVectorImpl &PredBBs, + std::set WrittenRegs); + + + + SmallVector mergeAllDependents(SmallVectorImpl> &AllDependentInsts); + + SmallVector getCommonDependents(SmallVectorImpl> &AllDependentInsts); + + bool getDependentBBs(BinaryBasicBlock* LoopInductionBB, BinaryBasicBlock* PrefetchCandBB, + SmallVectorImpl &DependentBBs); + + // bool getDependentInsts(BinaryContext &BC, MCInst* LoopInductionInst, + // MCInst* PrefetchCandInst, BinaryBasicBlock* LoopInductionBB, + // BinaryBasicBlock* PrefetchCandBB, SmallVectorImpl &DependentInsts, + // bool &IsIndirectLoad, SmallVectorImpl &WrittenRegs, std::set &OnlyWrittenRegs); + bool getDependentInsts(BinaryContext &BC, MCInst* LoopInductionInst, MCInst* PrefetchCandInst, + BinaryBasicBlock* LoopInductionBB, BinaryBasicBlock* PrefetchCandBB, + DependentInfo &Dependent, bool &IsIndirectLoad); + + /// + int calculatePrefetchDistance(BinaryContext &BC, BinaryLoop *L, bool IsPositive); + + /// + int predictPrefetchDistance(BinaryContext &BC, BinaryLoop *L, bool IsPositive); + + void getCallerSavedRegs(MCInst &Inst, BitVector &Regs, BinaryContext &BC); + + /// + // void findDependentBBs(BinaryContext &BC, BinaryLoop *L, + // SmallVectorImpl> DependentBBs) + bool regIsPossiblyOverwritten(MCInst &Inst, unsigned Reg, BinaryContext &BC); + + bool regIsDefinitelyOverwritten(MCInst &Inst, unsigned Reg, BinaryContext &BC); + + bool regIsUsed(MCInst &Inst, unsigned Reg, BinaryContext &BC); + /// + void findInstrDependences(); + + bool isHotMiss(uint64_t MissCount, uint64_t AccessCount); + + void findLargeSubLoop(DenseSet BL); + +public: + explicit LoopDataPrefetch(const cl::opt &PrintPass) + : BinaryFunctionPass(PrintPass) {} + + const char *getName() const override { return "loop data prefetch"; } + + // WIP: Only add BF.hasCacheMissProfile(), then add 
BF.hasProfile() + bool shouldOptimize(const BinaryFunction &BF) const override { + return BF.isSimple() && !BF.isIgnored() && BF.hasCacheMissProfile() && + !BF.hasUnknownControlFlow(); + } + + Error runOnFunctions(BinaryContext &BC) override; + + //DenseMap PrefetchCandidates; + + //SmallSet HasLargeSubLoops; +}; + +} // namespace bolt +} // namespace llvm + +#endif // BOLT_PASSES_LOOPDATAPREFETCH_H diff --git a/bolt/include/bolt/Profile/DataAggregator.h b/bolt/include/bolt/Profile/DataAggregator.h index 6453b3070ceb8d23983e09c77c3ef3f5e29764be..9881b4a8b73889bf6caaed3e4d9e8762989ec061 100644 --- a/bolt/include/bolt/Profile/DataAggregator.h +++ b/bolt/include/bolt/Profile/DataAggregator.h @@ -91,6 +91,7 @@ private: struct PerfMemSample { uint64_t PC; uint64_t Addr; + bool isMiss; }; /// Used for parsing specific pre-aggregated input files. @@ -126,6 +127,15 @@ private: struct TakenBranchInfo { uint64_t TakenCount{0}; uint64_t MispredCount{0}; + // FIXME: Maybe we need to use double instead of int64_t. If the cycles + // is very small, it may cause large error. + int64_t Cycles{0}; + }; + + struct BranchCycleInfo { + int64_t DiffSumCycles{0}; + int64_t LastTotalCycles{0}; + int Occurances{0}; }; /// Intermediate storage for profile data. We save the results of parsing @@ -257,16 +267,17 @@ private: /// Register an intraprocedural branch \p Branch. bool doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To, - uint64_t Count, uint64_t Mispreds); + uint64_t Count, uint64_t Mispreds, int64_t Cycles); /// Register an interprocedural branch from \p FromFunc to \p ToFunc with /// offsets \p From and \p To, respectively. bool doInterBranch(BinaryFunction *FromFunc, BinaryFunction *ToFunc, uint64_t From, uint64_t To, uint64_t Count, - uint64_t Mispreds); + uint64_t Mispreds, int64_t Cycles); /// Register a \p Branch. 
- bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds); + bool doBranch(uint64_t From, uint64_t To, uint64_t Count, uint64_t Mispreds, + int64_t Cycles); /// Register a trace between two LBR entries supplied in execution order. bool doTrace(const LBREntry &First, const LBREntry &Second, @@ -301,6 +312,10 @@ private: /// binary. Used exclusively for pre-aggregated LBR samples. ErrorOr parseLocationOrOffset(); + bool hasCacheMissEvent(StringRef event); + + bool isCacheMissEvent(StringRef event); + /// Check if a field separator is the next char to parse and, if yes, consume /// it and return true bool checkAndConsumeFS(); @@ -345,6 +360,9 @@ private: /// Process parsed memory events profile. void processMemEvents(); + /// + void processCacheMissEvents(); + /// Parse a single line of a PERF_RECORD_MMAP2 event looking for a mapping /// between the binary name and its memory layout in a process with a given /// PID. diff --git a/bolt/include/bolt/Profile/DataReader.h b/bolt/include/bolt/Profile/DataReader.h index 314dcc911558633ab8e1c030bf2d9ebef53a2986..ce53397d9c2d6560ce0ea920cf4699178ced3850 100644 --- a/bolt/include/bolt/Profile/DataReader.h +++ b/bolt/include/bolt/Profile/DataReader.h @@ -36,6 +36,7 @@ struct LBREntry { uint64_t From; uint64_t To; bool Mispred; + int64_t Cycles; }; inline raw_ostream &operator<<(raw_ostream &OS, const LBREntry &LBR) { @@ -80,10 +81,11 @@ struct BranchInfo { Location To; int64_t Mispreds; int64_t Branches; + int64_t Cycles; - BranchInfo(Location From, Location To, int64_t Mispreds, int64_t Branches) + BranchInfo(Location From, Location To, int64_t Mispreds, int64_t Branches, int64_t Cycles) : From(std::move(From)), To(std::move(To)), Mispreds(Mispreds), - Branches(Branches) {} + Branches(Branches), Cycles(Cycles) {} bool operator==(const BranchInfo &RHS) const { return From == RHS.From && To == RHS.To; @@ -148,7 +150,7 @@ struct FuncBranchData { DenseMap> EntryIndex; void bumpBranchCount(uint64_t OffsetFrom, 
uint64_t OffsetTo, uint64_t Count, - uint64_t Mispreds); + uint64_t Mispreds, int64_t Cycles); void bumpCallCount(uint64_t OffsetFrom, const Location &To, uint64_t Count, uint64_t Mispreds); void bumpEntryCount(const Location &From, uint64_t OffsetTo, uint64_t Count, @@ -258,6 +260,45 @@ struct FuncSampleData { void bumpCount(uint64_t Offset, uint64_t Count); }; +struct CacheMissInfo { + Location Offset; + uint64_t MissCount; + uint64_t AccessCount; + + CacheMissInfo(Location Offset, uint64_t MissCount, uint64_t AccessCount) + : Offset(Offset), MissCount(MissCount), AccessCount(AccessCount) {} + + bool operator==(const CacheMissInfo &RHS) const { return Offset == RHS.Offset; } + + bool operator<(const CacheMissInfo &RHS) const { + if (Offset < RHS.Offset) + return true; + + return false; + } + + void print(raw_ostream &OS) const; + + void mergeWith(const CacheMissInfo &SI); +}; + +struct FuncCacheMissData { + typedef std::vector ContainerTy; + + StringRef Name; + ContainerTy Data; + + /// Indicate if the data was used. + bool Used{false}; + + FuncCacheMissData(StringRef Name, ContainerTy Data) + : Name(Name), Data(std::move(Data)) {} + + DenseMap Index; + + void bumpCount(uint64_t Offset, uint64_t MissCount, uint64_t AccessCount); +}; + /// DataReader Class /// class DataReader : public ProfileReaderBase { @@ -299,6 +340,9 @@ protected: /// creation of basic blocks. void matchProfileMemData(BinaryFunction &BF); + /// + void matchProfileCacheMissData(BinaryFunction &BF); + /// Check how closely \p BranchData matches the function \p BF. /// Return accuracy (ranging from 0.0 to 1.0) of the matching. float evaluateProfileData(BinaryFunction &BF, @@ -321,7 +365,7 @@ protected: /// /// Return true if the branch is valid, false otherwise. 
bool recordBranch(BinaryFunction &BF, uint64_t From, uint64_t To, - uint64_t Count = 1, uint64_t Mispreds = 0) const; + uint64_t Count = 1, uint64_t Mispreds = 0, int64_t Cycles = 0) const; /// Parses the input bolt data file into internal data structures. We expect /// the file format to follow the syntax below. @@ -387,6 +431,8 @@ protected: FuncSampleData *getFuncSampleData(const std::vector &FuncNames); + FuncCacheMissData *getCacheMissDataForNames(const std::vector &FuncNames); + /// Return a vector of all FuncBranchData matching the list of names. /// Internally use fuzzy matching to match special names like LTO-generated /// function names. @@ -399,6 +445,10 @@ protected: std::vector getMemDataForNamesRegex(const std::vector &FuncNames); + /// + std::vector + getCacheMissDataForNamesRegex(const std::vector &FuncNames); + /// Return branch data profile associated with function \p BF or nullptr /// if the function has no associated profile. FuncBranchData *getBranchData(const BinaryFunction &BF) const { @@ -427,13 +477,30 @@ protected: FuncsToMemData[&BF] = FMD; } + /// + FuncCacheMissData *getCacheMissData(const BinaryFunction &BF) const { + auto FCMDI = FuncsToCacheMisses.find(&BF); + if(FCMDI == FuncsToCacheMisses.end()) { + return nullptr; + } + return FCMDI->second; + } + + /// + void setCacheMissData(const BinaryFunction &BF, FuncCacheMissData *FCMD) { + FuncsToCacheMisses[&BF] = FCMD; + } + using NamesToBranchesMapTy = std::map; using NamesToSamplesMapTy = std::map; using NamesToMemEventsMapTy = std::map; + using NamesToCacheMissEventsMapTy = std::map; using FuncsToBranchesMapTy = std::unordered_map; using FuncsToMemDataMapTy = std::unordered_map; + using FuncsToCacheMissesMapTy = + std::unordered_map; /// Dumps the entire data structures parsed. Used for debugging. 
void dump() const; @@ -469,20 +536,26 @@ protected: ErrorOr parseString(char EndChar, bool EndNl = false); ErrorOr parseNumberField(char EndChar, bool EndNl = false); ErrorOr parseHexField(char EndChar, bool EndNl = false); - ErrorOr parseLocation(char EndChar, bool EndNl, bool ExpectMemLoc); + ErrorOr parseLocation(char EndChar, bool EndNl, bool ExpectMemLoc, + bool ExpectCacheMissLoc); ErrorOr parseLocation(char EndChar, bool EndNl = false) { - return parseLocation(EndChar, EndNl, false); + return parseLocation(EndChar, EndNl, false, false); } ErrorOr parseMemLocation(char EndChar, bool EndNl = false) { - return parseLocation(EndChar, EndNl, true); + return parseLocation(EndChar, EndNl, true, false); + } + ErrorOr parseCacheMissLocation(char EndChar, bool EndNl = false) { + return parseLocation(EndChar, EndNl, false, true); } ErrorOr parseBranchInfo(); ErrorOr parseSampleInfo(); ErrorOr parseMemInfo(); + ErrorOr parseCacheMissInfo(); ErrorOr maybeParseNoLBRFlag(); ErrorOr maybeParseBATFlag(); bool hasBranchData(); bool hasMemData(); + bool hasCacheMissData(); /// An in-memory copy of the input data file - owns strings used in reader. std::unique_ptr FileBuf; @@ -493,8 +566,10 @@ protected: NamesToBranchesMapTy NamesToBranches; NamesToSamplesMapTy NamesToSamples; NamesToMemEventsMapTy NamesToMemEvents; + NamesToCacheMissEventsMapTy NamesToCacheMissEvents; FuncsToBranchesMapTy FuncsToBranches; FuncsToMemDataMapTy FuncsToMemData; + FuncsToCacheMissesMapTy FuncsToCacheMisses; bool NoLBRMode{false}; bool BATMode{false}; StringSet<> EventNames; @@ -503,6 +578,7 @@ protected: /// Maps of common LTO names to possible matching profiles. 
StringMap> LTOCommonNameMap; StringMap> LTOCommonNameMemMap; + StringMap> LTOCommonNameCacheMissMap; public: void setParsingBuffer(StringRef Buffer) { ParsingBuf = Buffer; } diff --git a/bolt/lib/Core/BinaryBasicBlock.cpp b/bolt/lib/Core/BinaryBasicBlock.cpp index 7da836a0bfffb4dc968e1edd562292f1d8a8d664..7d4d19c5dfed350254ad2f1cbb651d7872084069 100644 --- a/bolt/lib/Core/BinaryBasicBlock.cpp +++ b/bolt/lib/Core/BinaryBasicBlock.cpp @@ -557,6 +557,35 @@ BinaryBasicBlock::getBranchInfo(const BinaryBasicBlock &Succ) const { return std::get<1>(*Result); } +std::optional +BinaryBasicBlock::getCacheMissInfoAtInstr(const MCInst *Instr) { + return static_cast(*this).getCacheMissInfoAtInstr(Instr); +} + +std::optional +BinaryBasicBlock::getCacheMissInfoAtInstr(const MCInst *Instr) const { + auto Iter = CacheMissInfo.find(Instr); + if (Iter != CacheMissInfo.end()) { + return Iter->second; + } else { + return std::nullopt; + } +} + +void BinaryBasicBlock::setCacheMissInfoAtInstr(const MCInst *Instr, uint64_t MissCount, + uint64_t AccessCount) { + // TODO: More real cases are needed to determine whether we should do overlay or accumulation. 
+ auto Iter = CacheMissInfo.find(Instr); + if (Iter != CacheMissInfo.end()) { + Iter->second.MissCount = MissCount; + Iter->second.AccessCount = AccessCount; + return; + } + + BinaryCacheMissInfo NewBCMI{MissCount, AccessCount}; + CacheMissInfo.insert({Instr, NewBCMI}); +} + BinaryBasicBlock *BinaryBasicBlock::splitAt(iterator II) { assert(II != end() && "expected iterator pointing to instruction"); diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp index 7c3b4a87cfac8b63f4ca98e0dcf76bcdeecf7351..a43df59b9cbce6976b21bd651648c808a8b3e9df 100644 --- a/bolt/lib/Core/BinaryContext.cpp +++ b/bolt/lib/Core/BinaryContext.cpp @@ -69,6 +69,11 @@ PrintMemData("print-mem-data", cl::ZeroOrMore, cl::cat(BoltCategory)); +static cl::opt +PrintCacheMiss("print-cache-miss", cl::Hidden, cl::init(false), + cl::desc("print cache miss info when printing functions"), + cl::ZeroOrMore, cl::cat(BoltCategory)); + cl::opt CompDirOverride( "comp-dir-override", cl::desc("overrides DW_AT_comp_dir, and provides an alternative base " @@ -2006,6 +2011,18 @@ void BinaryContext::printInstruction(raw_ostream &OS, const MCInst &Instruction, Function->printRelocations(OS, Offset, Size); } + if (opts::PrintCacheMiss) { + if (Function) { + if (const BinaryBasicBlock *BB = Function->getBasicBlockContainingOffset(Offset)) { + auto BCMI = BB->getCacheMissInfoAtInstr(&Instruction); + if (BCMI) { + OS << "# MissCount: " << BCMI->MissCount + << ", AccessCount: " << BCMI->AccessCount; + } + } + } + } + OS << Endl; if (PrintMCInst) { diff --git a/bolt/lib/Core/BinaryFunction.cpp b/bolt/lib/Core/BinaryFunction.cpp index 193b8a5404ab16d9f18748424cd2283330ce93e6..feadd3eedbb6af183b35f3e190566aea088b3059 100644 --- a/bolt/lib/Core/BinaryFunction.cpp +++ b/bolt/lib/Core/BinaryFunction.cpp @@ -589,7 +589,7 @@ void BinaryFunction::print(raw_ostream &OS, std::string Annotation) { if (ExecutionCount != COUNT_NO_PROFILE && BI.MispredictedCount != BinaryBasicBlock::COUNT_INFERRED) { OS << " 
(mispreds: " << BI.MispredictedCount - << ", count: " << BI.Count << ")"; + << ", count: " << BI.Count << ", cycles: " << BI.Cycle << ")"; } else if (ExecutionCount != COUNT_NO_PROFILE && BI.Count != BinaryBasicBlock::COUNT_NO_PROFILE) { OS << " (inferred count: " << BI.Count << ")"; diff --git a/bolt/lib/Passes/CMakeLists.txt b/bolt/lib/Passes/CMakeLists.txt index 407d8b03f739776b0e191c7d7c995e2af0c5c314..836147cccf6e252b296fa5824cbcd468207036f1 100644 --- a/bolt/lib/Passes/CMakeLists.txt +++ b/bolt/lib/Passes/CMakeLists.txt @@ -20,6 +20,7 @@ add_llvm_library(LLVMBOLTPasses Instrumentation.cpp JTFootprintReduction.cpp LongJmp.cpp + LoopDataPrefetch.cpp LoopInversionPass.cpp LivenessAnalysis.cpp MCF.cpp diff --git a/bolt/lib/Passes/LoopDataPrefetch.cpp b/bolt/lib/Passes/LoopDataPrefetch.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6d96cdcbbacd30a6fd2c6dbd9818966574475281 --- /dev/null +++ b/bolt/lib/Passes/LoopDataPrefetch.cpp @@ -0,0 +1,888 @@ +//===- bolt/Passes/LoopDataPrefetch.cpp ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This file implements the LoopDataPrefetchPass class. 
+// +//===----------------------------------------------------------------------===// + +#include "bolt/Passes/LoopDataPrefetch.h" +#include + +#define DEBUG_TYPE "loop-data-prefetch" + +using namespace llvm; +using namespace bolt; + +namespace opts { +extern cl::OptionCategory BoltOptCategory; +extern llvm::cl::opt AlignText; + +static cl::opt CacheMissCutOff( + "cache-miss-cutoff", cl::Hidden, cl::init(2000), + cl::desc("cutoff for cache misses"), cl::cat(BoltOptCategory)); + +static cl::opt MinInnerLoopPrefetchThreshold( + "min-inner-loop-prefetch-threshold", cl::Hidden, cl::init(200.0), + cl::desc("the minimum count for inner loops to insert the " + "prefetch into the nested loops based on the trip count."), + cl::cat(BoltOptCategory)); + +static cl::opt MaxOuterLoopPrefetchThreshold( + "max-outer-loop-prefetch-threshold", cl::Hidden, cl::init(5.0), + cl::desc("the maximum count for outer loops to insert the " + "prefetch into the nested loops based on the trip count."), + cl::cat(BoltOptCategory)); + +static cl::opt EnableBatchPrefetch( + "enable-batch-prefetch", cl::Hidden, cl::init(true), + cl::desc("Enable batch prefetch in outer loop for small inner loop"), + cl::cat(BoltOptCategory)); +} + +namespace llvm { +namespace bolt { + +bool LoopDataPrefetch::isHotMiss(uint64_t MissCount, uint64_t AccessCount) { + if (double(MissCount)/double(AccessCount) > double(opts::CacheMissCutOff)/10000) + return true; + + return false; +} + +std::pair +LoopDataPrefetch::selectLoopForInsertPrefetch(BinaryLoop *L) { + bool IsBatch = false; + if (L->isOutermost()) + return {L, IsBatch}; + + BinaryLoop *ParentL = L->getParentLoop(); + if (ParentL) { + double TripCount = ParentL->TotalBackEdgeCount ? 
(double)ParentL->TotalBackEdgeCount : 1.0; + double Ratio = double(L->TotalBackEdgeCount) / TripCount; + if (Ratio <= opts::MaxOuterLoopPrefetchThreshold) { + return selectLoopForInsertPrefetch(ParentL); + } else if (Ratio >= opts::MinInnerLoopPrefetchThreshold) { + return {L, IsBatch}; + } else { + if (opts::EnableBatchPrefetch) { + IsBatch = true; + } + return {ParentL, IsBatch}; + } + } + + return {L, IsBatch}; +} + +void LoopDataPrefetch::getPrefetchCandidates( + BinaryLoop *L, BinaryContext &BC, DenseSet IsVisited, + LoopDataPrefetch::LoopPrefetchMapTy &PrefetchCandidates) { + for (BinaryBasicBlock *BB : L->blocks()) { + for (MCInst &MI : *BB) { + // If instruction has visited in inner loops, skip it. + if (!IsVisited.insert(&MI).second) + continue; + + if (!BC.MIB->mayLoad(MI)) + continue; + + auto ErrorOrCacheMissProfile = + BC.MIB->tryGetAnnotationAs(MI, "CacheMissProfile"); + if (!ErrorOrCacheMissProfile) + continue; + + CacheMissProfile &CMP = ErrorOrCacheMissProfile.get(); + if (isHotMiss(CMP.MissCount, CMP.AccessCount)) { + auto InsertL = selectLoopForInsertPrefetch(L); + auto It = PrefetchCandidates.find(InsertL.first); + if (It != PrefetchCandidates.end()) { + It->second.push_back(Prefetch(&MI, BB, L, InsertL.first, InsertL.second)); + } else { + SmallVector CandidateInit = {Prefetch(&MI, BB, L, InsertL.first, InsertL.second)}; + PrefetchCandidates.insert(std::make_pair(InsertL.first, CandidateInit)); + } + LLVM_DEBUG(dbgs() << "Find the data prefetch candidate, cache miss ratio: " + << double(CMP.MissCount)/double(CMP.AccessCount) + << "\n"); + LLVM_DEBUG(BC.printInstruction(dbgs(), MI)); + } + } + } +} + +MCInst* LoopDataPrefetch::getLoopInductionInstr(BinaryContext &BC, BinaryLoop* L) { + BinaryBasicBlock* HeaderBB = L->getHeader(); + SmallVector Latches; + L->getLoopLatches(Latches); + + if (Latches.size() != 1) + return nullptr; + + BinaryBasicBlock* LatchBB = Latches[0]; + if (!LatchBB) + return nullptr; + + bool hasCondBranch = false; + MCInst* 
LoopGuardCMPInstr = nullptr; + for (auto I = LatchBB->rbegin(); I != LatchBB->rend(); I++) { + if (BC.MIB->isConditionalBranch(*I)) { + hasCondBranch = true; + } + // BC.printInstruction(dbgs(), *I); + if (BC.MIB->isCMP(*I)) { + if (hasCondBranch) { + LoopGuardCMPInstr = &*I; + } + } else if (BC.MIB->isSUB(*I)) { + if (hasCondBranch ) + return &*I; + break; + } else if (BC.MIB->isADD(*I)) { + if (!LoopGuardCMPInstr) + continue; + // MCInst* CmpInst = &*I; + for (unsigned i = 0; i < LoopGuardCMPInstr->getNumOperands(); i++) { + auto Op = LoopGuardCMPInstr->getOperand(i); + if (!Op.isReg()) + continue; + auto IndReg = Op.getReg(); + if (I->getOperand(0).getReg() == IndReg) { + return &*I; + } + } + } + } + + return nullptr; +} + +MCInst* LoopDataPrefetch::getSpecifyLoopIndInstr(BinaryContext &BC, Prefetch &P) { + BinaryLoop* CurL = P.CurL; + BinaryLoop* InsertL = P.InsertL; + MCInst* InductionInstr = nullptr; + if (CurL == InsertL) { + InductionInstr = getLoopInductionInstr(BC, InsertL); + LLVM_DEBUG(dbgs() << "Finding the loop induction instruction for loop: " + << InsertL->getHeader()->getName() + << "\n -- loop induction instruction: "); + if (InductionInstr) + LLVM_DEBUG(BC.printInstruction(dbgs(), *InductionInstr)); + else + LLVM_DEBUG(dbgs() << "NULL"); + LLVM_DEBUG(dbgs() << "\n"); + } else { + + } + return InductionInstr; +} + +int LoopDataPrefetch::predictPrefetchDistance(BinaryContext &BC, BinaryLoop *L, bool IsPositive) { + // TODO: support static heuristic prediction + int Distance = 0; + return Distance; +} + +int LoopDataPrefetch::calculatePrefetchDistance( + BinaryContext &BC, BinaryLoop *L, bool IsPositive) { + int PrefetchLatency = 100; + int Offset = 10; + int Distance = 0; + // FIXME: maybe we should use max iteration cycles instead of average iteration cycles. 
+ SmallVector Latches; + L->getLoopLatches(Latches); + int64_t LoopIterCycles = 0; + int64_t LoopCyclesSum = 0; + for (BinaryBasicBlock *Latch : Latches) { + auto BI = Latch->branch_info_begin(); + for (BinaryBasicBlock *Succ : Latch->successors()) { + if (Succ == L->getHeader()) { + LoopCyclesSum += BI->Count * BI->Cycle; + } + ++BI; + } + } + if (LoopCyclesSum == 0) { + // No cycle info is recorded, so we use a static heuristic algorithm to predict the distance. + Distance = predictPrefetchDistance(BC, L, IsPositive); + } + LoopIterCycles = LoopCyclesSum / L->TotalBackEdgeCount; + + Distance = PrefetchLatency / LoopIterCycles + Offset; + return IsPositive ? Distance : ~(Distance - 1); +} + +void LoopDataPrefetch::getCallerSavedRegs(MCInst &Inst, BitVector &Regs, + BinaryContext &BC) { + if (!BC.MIB->isCall(Inst)) + return; + BitVector CallRegs = BitVector(BC.MRI->getNumRegs(), false); + BC.MIB->getCalleeSavedRegs(CallRegs); + CallRegs.flip(); + Regs |= CallRegs; +} + +bool LoopDataPrefetch::regIsPossiblyOverwritten(MCInst &Inst, unsigned Reg, + BinaryContext &BC) { + BitVector WrittenRegs = BitVector(BC.MRI->getNumRegs(), false); + BC.MIB->getWrittenRegs(Inst, WrittenRegs); + getCallerSavedRegs(Inst, WrittenRegs, BC); + if (BC.MIB->isRep(Inst)) + BC.MIB->getRepRegs(WrittenRegs); + WrittenRegs &= BC.MIB->getAliases(Reg, false); + return WrittenRegs.any(); +} + +bool LoopDataPrefetch::regIsDefinitelyOverwritten(MCInst &Inst, + unsigned Reg, + BinaryContext &BC) { + BitVector WrittenRegs = BitVector(BC.MRI->getNumRegs(), false); + BC.MIB->getWrittenRegs(Inst, WrittenRegs); + getCallerSavedRegs(Inst, WrittenRegs, BC); + if (BC.MIB->isRep(Inst)) + BC.MIB->getRepRegs(WrittenRegs); + return (!regIsUsed(Inst, Reg, BC) && WrittenRegs.test(Reg)); +} + +bool LoopDataPrefetch::regIsUsed(MCInst &Inst, unsigned Reg, + BinaryContext &BC) { + BitVector SrcRegs = BitVector(BC.MRI->getNumRegs(), false); + BC.MIB->getSrcRegs(Inst, SrcRegs); + SrcRegs &= BC.MIB->getAliases(Reg, 
true); + return SrcRegs.any(); +} + +bool LoopDataPrefetch::getDependentBBs(BinaryBasicBlock* LoopInductionBB, BinaryBasicBlock* PrefetchCandBB, + SmallVectorImpl &DependentBBs) { + DenseSet LoopInductionForwards; + DenseSet PrefetchCandBackwards; + for (df_iterator I = df_begin(LoopInductionBB), + E = df_end(LoopInductionBB); I != E; ++I) { + LoopInductionForwards.insert(*I); + } + for (df_iterator I = df_begin(PrefetchCandBB), + E = df_end(PrefetchCandBB); I != E; ++I) { + PrefetchCandBackwards.insert(*I); + } + + ReversePostOrderTraversal RPO(LoopInductionBB); + for (BinaryBasicBlock *BB : RPO) { + if (LoopInductionForwards.count(BB) && PrefetchCandBackwards.count(BB)) + DependentBBs.push_back(BB); + if (BB == PrefetchCandBB) { + LLVM_DEBUG(dbgs() << "Have arrived at the PrefetchBB\n"); + break; + } + } + + if (LoopInductionBB != DependentBBs.front() || + PrefetchCandBB != DependentBBs.back()) { + // BC.outs() << "Find invalid dependent BBs\n"; + return false; + } + return true; +} + +// bool LoopDataPrefetch::getDependentInsts(BinaryContext &BC, MCInst* LoopInductionInst, +// MCInst* PrefetchCandInst, BinaryBasicBlock* LoopInductionBB, +// BinaryBasicBlock* PrefetchCandBB, SmallVectorImpl &DependentInsts, +// bool &IsIndirectLoad, SmallVectorImpl &WrittenRegs, +// std::set &OnlyWrittenRegs) { +bool LoopDataPrefetch::getDependentInsts(BinaryContext &BC, MCInst* LoopInductionInst, MCInst* PrefetchCandInst, + BinaryBasicBlock* LoopInductionBB, BinaryBasicBlock* PrefetchCandBB, + DependentInfo &Dependent, bool &IsIndirectLoad) { + SmallVector &DependentInsts = Dependent.DependentInsts; + std::set &NUBWRegs = Dependent.NotUsedBeforWrittenRegs; + std::set &WrittenRegs = Dependent.WrittenRegs; + std::set &UsedRegs = Dependent.UsedRegs; + + //MCInst* FromInst = LoopInductionInst; + //MCInst* EndInst = PrefetchCandInst; + SmallVector AllDependents; + // DependentInsts.push_back(PrefetchCandInst); + + IsIndirectLoad = false; + + auto findInstrLoc = [&](MCInst *MI, 
BinaryBasicBlock *BB) { + for (auto I = BB->begin(); I != BB->end(); I++) { + if (&(*I) == MI) { + return I; + } + } + return BB->end(); + }; + auto LocStart = findInstrLoc(LoopInductionInst, LoopInductionBB); + auto LocEnd = findInstrLoc(PrefetchCandInst, PrefetchCandBB); + + if (LoopInductionBB == PrefetchCandBB) { + auto I = LocStart; + AllDependents.push_back(&(*I)); + while (I++ != LocEnd) { + if (I == LoopInductionBB->end()) + I = LoopInductionBB->begin(); + AllDependents.push_back(&(*I)); + } + } else { + SmallVector DependentBBs; + if (!getDependentBBs(LoopInductionBB, PrefetchCandBB, DependentBBs)) { + return false; + } + + for (auto *BB : llvm::reverse(DependentBBs)) { + if (BB == PrefetchCandBB) { + for (auto I = BB->begin(); I != LocEnd + 1; I++) { + AllDependents.push_back(&(*I)); + } + } else if (BB == LoopInductionBB) { + for (auto I = LocStart; I != BB->end(); I++) { + AllDependents.push_back(&(*I)); + } + } else { + for (auto I = BB->begin(); I != BB->end(); I++) { + AllDependents.push_back(&(*I)); + } + } + } + } + + // TODO: remove unused MCInst + // For `ldr q2, [x19, x0, lsl #4]` + MCPhysReg Reg = PrefetchCandInst->getOperand(2).getReg(); + SmallSetVector DependentRegs; + DependentRegs.insert(Reg); + for (int I = AllDependents.size() - 1; I >= 0; I--) { + MCInst *MI = AllDependents[I]; + SmallSetVector RemovedRegs; + SmallSetVector AddedRegs; + + for (int J = 0; J < MI->getNumOperands(); J++) { + if (MI->getOperand(J).isReg()) { + MCPhysReg Reg = MI->getOperand(J).getReg(); + + if (regIsDefinitelyOverwritten(*MI, Reg, BC)) { + NUBWRegs.insert(Reg); + } + UsedRegs.insert(Reg); + } + } + auto It = NUBWRegs.begin(); + while (It != NUBWRegs.end()) { + if (regIsUsed(*MI, Reg, BC)) { + NUBWRegs.erase(It++); + } else { + It++; + } + } + + for (auto Reg : DependentRegs) { + if (regIsPossiblyOverwritten(*MI, Reg, BC)) { + WrittenRegs.insert(Reg); + NUBWRegs.erase(Reg); + if (BC.MIB->isCall(*MI)) + return false; + + DependentInsts.push_back(MI); + if 
(BC.MIB->mayLoad(*MI)) { + IsIndirectLoad = true; + } + RemovedRegs.insert(Reg); + + for (int J = 1; J < MI->getNumOperands(); J++) { + auto Op = MI->getOperand(J); + if (!Op.isReg()) + continue; + MCPhysReg NewReg = Op.getReg(); + BitVector SrcRegs = BitVector(BC.MRI->getNumRegs(), false); + BC.MIB->getSrcRegs(*MI, SrcRegs); + SrcRegs &= BC.MIB->getAliases(NewReg, true); + if (!SrcRegs.any()) + continue; + AddedRegs.insert(NewReg); + } + } + } + for (auto Reg : RemovedRegs) { + DependentRegs.remove(Reg); + } + for (auto Reg : AddedRegs) { + DependentRegs.insert(Reg); + } + } + + std::reverse(DependentInsts.begin(), DependentInsts.end()); + + return true; +} + +// By constructing a topological sequence, obtain the union of multiple dependents that preserve order. +SmallVector +LoopDataPrefetch::mergeAllDependents(SmallVectorImpl> &AllDependentInsts) { + SmallVector DependentInsts; + std::unordered_map> Graph; + std::unordered_map InDegree; + std::unordered_set Nodes; + + // Build graph, and compute indegree + for (auto DependentInsts : AllDependentInsts) { + for (size_t I = 0; I < DependentInsts.size(); I++) { + MCInst* Cur = DependentInsts[I]; + Nodes.insert(Cur); + if (InDegree.find(Cur) == InDegree.end()) + InDegree[Cur] = 0; + + if (I > 0) { + MCInst* Prev = DependentInsts[I - 1]; + if (!Graph[Prev].insert(Cur).second) { + InDegree[Cur]++; + } + } + } + } + + std::queue Queue; + for (auto Node : Nodes) { + if (InDegree[Node] == 0) + Queue.push(Node); + } + + while(!Queue.empty()) { + MCInst* Cur = Queue.front(); + Queue.pop(); + DependentInsts.push_back(Cur); + for (auto Neighbor : Graph[Cur]) { + InDegree[Neighbor]--; + if (InDegree[Neighbor] == 0) { + Queue.push(Neighbor); + } + } + } + return DependentInsts; +} + +SmallVector +LoopDataPrefetch::getCommonDependents(SmallVectorImpl> &AllDependentInsts) { + SmallVector CommonDependents; + if (AllDependentInsts.empty()) + return CommonDependents; + + // get the minius length + size_t MinLength = 
AllDependentInsts[0].size(); + for (auto DependentInsts : AllDependentInsts) { + MinLength = std::min(MinLength, DependentInsts.size()); + } + + for (size_t Pos = 0; Pos < MinLength; Pos++) { + MCInst* FirstElem = AllDependentInsts[0][Pos]; + bool AllSame = true; + + for (size_t I = 1; I < AllDependentInsts.size(); I++) { + if (AllDependentInsts[I][Pos] != FirstElem) { + AllSame = false; + break; + } + } + + if (AllSame) { + CommonDependents.push_back(FirstElem); + } else { + break; + } + } + return CommonDependents; +} + +BinaryBasicBlock* +LoopDataPrefetch::createPrefetchEntryBB(BinaryFunction* BF, BinaryBasicBlock* LatchBB, BinaryBasicBlock* HeaderBB, + SmallVectorImpl &CommonDependentInsts, + std::set WrittenRegs, int Distance) { + BinaryContext &BC = BF->getBinaryContext(); + MCSymbol *PrefetchEntryLabel = BC.Ctx->createNamedTempSymbol("PrefetchEntryBB"); + + std::vector> PrefetchEntryBBs; + PrefetchEntryBBs.emplace_back(BF->createBasicBlock(PrefetchEntryLabel)); + + LatchBB->replaceSuccessor(HeaderBB, &*PrefetchEntryBBs.back()); + + // Add instructions for push registers + for (auto Reg : WrittenRegs) { + MCInst PushInst; + BC.MIB->createPushRegister(PushInst, Reg, 64); + PrefetchEntryBBs.back()->addInstruction(PushInst); + } + + // Add index + offset(distance) + // The first of CommonDependentInsts must be InductionInst + auto InductionInst = *CommonDependentInsts.begin(); + MCPhysReg SrcReg; + if (InductionInst->getOperand(0).isReg()) { + SrcReg = InductionInst->getOperand(0).getReg(); + } else { + BC.outs() << "Invalid induction instruction\n"; + } + MCInst AddInst; + BC.MIB->createADD64ri(AddInst, SrcReg, SrcReg, Distance); + PrefetchEntryBBs.back()->addInstruction(AddInst); + + // Add common dependent instructions + for (size_t I = 1; I < CommonDependentInsts.size(); I++) { + PrefetchEntryBBs.back()->addInstruction(*CommonDependentInsts[I]); + } + // for (auto Inst : CommonDependentInsts) { + // PrefetchEntryBBs.back()->addInstruction(*Inst); + // } + 
//PrefetchEntryBBs.back()->addInstructions(CommonDependentInsts.begin(), CommonDependentInsts.end()); + + // insert this Basic Block to binary function + BF->insertBasicBlocks(LatchBB, std::move(PrefetchEntryBBs)); + + BinaryBasicBlock* PrefetchEntryBB = BF->getBasicBlockForLabel(PrefetchEntryLabel); + + return PrefetchEntryBB; +} + +SmallVector +LoopDataPrefetch::createPrefetchBBs(BinaryFunction* BF, SmallVectorImpl &DependentInsts, + SmallVectorImpl &CommonDependentInsts, BinaryBasicBlock* PrefetchEntryBB, + SmallVectorImpl &PrefetchCands, std::set WrittenRegs) { + + SmallVector InsertedPrefetchBBs; + + BinaryContext &BC = BF->getBinaryContext(); + MCSymbol *PrefetchLabel = BC.Ctx->createNamedTempSymbol("PrefetchBB"); + std::vector> PrefetchBBs; + PrefetchBBs.emplace_back(BF->createBasicBlock(PrefetchLabel)); + + size_t StartPos = CommonDependentInsts.size(); + for (size_t I = StartPos; I < DependentInsts.size(); I++) { + PrefetchBBs.back()->addInstruction(*DependentInsts[I]); + } + + MCPhysReg FreeReg; + bool FoundFreeReg = false; + for (auto Reg : WrittenRegs) { + bool IsUsed = false; + for (size_t I = StartPos; I < DependentInsts.size(); I++) { + if (regIsUsed(*DependentInsts[I], Reg, BC)) { + IsUsed = true; + break; + } + } + + if (!IsUsed && PrefetchCands.size() == 1) { + FreeReg = Reg; + FoundFreeReg = true; + break; + } + for (auto Cand : PrefetchCands) { + if (!Cand.HasValidDependency) + continue; + + if (regIsUsed(*Cand.MemI, Reg, BC)) { + IsUsed = true; + break; + } + } + if (!IsUsed) { + FreeReg = Reg; + FoundFreeReg = true; + break; + } + } + + if (!FoundFreeReg) { + BC.outs() << "Not find the valid free register\n"; + return InsertedPrefetchBBs; + } + + for (auto Cand: PrefetchCands) { + if (!Cand.HasValidDependency) + continue; + + // ldr q2, [x19, x0, lsl #4] + if (Cand.MemI->getOperand(1).isReg() && Cand.MemI->getOperand(2).isReg() && + Cand.MemI->getOperand(3).isImm() && Cand.MemI->getOperand(4).isImm()) { + // create add instruction as base 
address of prefetch + MCInst AddInst; + MCPhysReg SrcReg1 = Cand.MemI->getOperand(1).getReg(); + MCPhysReg SrcReg2 = Cand.MemI->getOperand(2).getReg(); + int64_t Imm = Cand.MemI->getOperand(4).getImm(); + BC.MIB->createADD64rs(AddInst, FreeReg, SrcReg1, SrcReg2, Imm); + PrefetchBBs.back()->addInstruction(AddInst); + + // create prefetch instruction + MCInst PrefetchInst; + BC.MIB->createPrefetchMui(PrefetchInst, FreeReg, "pldl1keep", 0); + PrefetchBBs.back()->addInstruction(PrefetchInst); + } + } + + BF->insertBasicBlocks(PrefetchEntryBB, std::move(PrefetchBBs)); + BinaryBasicBlock* PrefetchBB = BF->getBasicBlockForLabel(PrefetchLabel); + + PrefetchEntryBB->addSuccessor(PrefetchBB, 0, 0); + InsertedPrefetchBBs.push_back(PrefetchBB); + + return InsertedPrefetchBBs; +} + +BinaryBasicBlock* +LoopDataPrefetch::createPrefetchExitBB(BinaryFunction* BF, BinaryBasicBlock* SuccBB, + SmallVectorImpl &PredBBs, + std::set WrittenRegs) { + BinaryContext &BC = BF->getBinaryContext(); + MCSymbol *PrefetchExitLabel = BC.Ctx->createNamedTempSymbol("PrefetchExitBB"); + + std::vector> PrefetchExitBBs; + PrefetchExitBBs.emplace_back(BF->createBasicBlock(PrefetchExitLabel)); + + // set successor + PrefetchExitBBs.back()->addSuccessor(SuccBB, 0, 0); + + // add instructions + // create pop registers + for (auto Reg : llvm::reverse(WrittenRegs)) { + MCInst PopInst; + BC.MIB->createPopRegister(PopInst, Reg, 64); + PrefetchExitBBs.back()->addInstruction(PopInst); + } + + BF->insertBasicBlocks(PredBBs.back(), std::move(PrefetchExitBBs)); + + BinaryBasicBlock* PrefetchExitBB = BF->getBasicBlockForLabel(PrefetchExitLabel); + // set predecessor + for (auto* PredBB : PredBBs) { + + // PrefetchExitBBs.back()->addPredecessor(PredBB); + PredBB->addSuccessor(PrefetchExitBB, 0, 0); + } + + return PrefetchExitBB; +} + +void LoopDataPrefetch::doPrefetchForLoop(BinaryContext &BC, BinaryLoop *L, + SmallVectorImpl &PrefetchCands){ + auto InductionInstr = getLoopInductionInstr(BC, L); + BinaryBasicBlock* 
HeaderBB = L->getHeader(); + SmallVector Latches; + L->getLoopLatches(Latches); + + BinaryBasicBlock* LatchBB = Latches[0]; + BinaryFunction* BF = LatchBB->getFunction(); + if (!InductionInstr){ + BC.outs() << "BOLT-INFO: not found the loop induction instruction, loop header: " + << L->getHeader()->getName()<<"\n"; + return; + } + + bool IsPositive = false; + // check the direction of the loop + if (BC.MIB->isSUB(*InductionInstr)) { + auto Imm = InductionInstr->getOperand(2).getImm(); + if (Imm > 0) { + IsPositive = true; + } + } else if (BC.MIB->isADD(*InductionInstr)) { + auto Imm = InductionInstr->getOperand(2).getImm(); + if (Imm > 0) { + IsPositive = true; + } + } + + LLVM_DEBUG(dbgs() << "Finding the loop induction instruction for loop: " + << L->getHeader()->getName() + <<"\n -- loop induction instruction: "); + if (InductionInstr) + LLVM_DEBUG(BC.printInstruction(dbgs(),*InductionInstr)); + else + LLVM_DEBUG(dbgs()<<"NULL"); + LLVM_DEBUG(dbgs()<<"\n"); + + SmallVector, 8> AllDependentInsts; + // SmallVector, 8> AllNotUsedBeforWrittenRegs; + std::set AllNotUsedBeforWrittenRegs; + //SmallVector, 8> AllWrittenRegs; + std::set AllWrittenRegs; + std::set AllUsedRegs; + + for (auto &It : PrefetchCands) { + //SmallVector DependentInsts; + //std::set NotUsedBeforWrittenRegs; + //std::set WrittenRegs; + //std::set UsedRegs; + // std::set FreeRegs; + DependentInfo Dependent; + It.HasValidDependency = true; + if (!getDependentInsts(BC, InductionInstr, It.MemI, LatchBB, It.CacheMissBB, Dependent, + It.IsIndirectLoad)) { + It.HasValidDependency = false; + BC.outs() << "Not found the valid dependent instructions\n"; + continue; + } else { + BC.outs() << "Found the valid dependent instructions\n"; + } + AllDependentInsts.push_back(Dependent.DependentInsts); + for (auto Reg : Dependent.NotUsedBeforWrittenRegs) { + AllNotUsedBeforWrittenRegs.insert(Reg); + } + for (auto Reg : Dependent.WrittenRegs) { + AllWrittenRegs.insert(Reg); + } + } + + SmallVector DependentInsts = 
mergeAllDependents(AllDependentInsts); + if (*DependentInsts.begin() != InductionInstr) + return; + + SmallVector CommonDependentInsts = getCommonDependents(AllDependentInsts); + int Dis = calculatePrefetchDistance(BC, L, IsPositive); + + BinaryBasicBlock* PrefetchEntryBB = createPrefetchEntryBB(BF, LatchBB, HeaderBB, CommonDependentInsts, AllWrittenRegs, Dis); + + // BinaryBasicBlock* PrefetchBB = createPrefetchBB(BF, DependentInsts, CommonDependentInsts, PrefetchEntryBB, PrefetchCands); + + SmallVector PrefetchBBs = createPrefetchBBs(BF, DependentInsts, CommonDependentInsts, PrefetchEntryBB, PrefetchCands, AllWrittenRegs); + + if (PrefetchBBs.empty()) + return; + + BinaryBasicBlock* PrefetchExitBB = createPrefetchExitBB(BF, HeaderBB, PrefetchBBs, AllWrittenRegs); + + // std::set VisitedBB; + // for (auto &It : PrefetchCands){ + // bool IsIndirectLoad = false; + // auto Res = VisitedBB.insert(It.CacheMissBB); + // if (!Res.second) { + // LLVM_DEBUG(dbgs() << " -- Already inserted prefetch in this BB\n"); + // continue; + // } + // if (It.CurL == It.InsertL){ + // LLVM_DEBUG(dbgs() << " -- doing prefetch in inner loop for instruction: \n"); + // LLVM_DEBUG(BC.printInstruction(dbgs(), *(It.MemI))); + // // TODO + // SmallVector DependentInsts; + // // Registers that may be rewritten in the instruction dependency sequence. + // SmallVector WrittenRegs; + // // In the BB dependency sequence, the register that will be overwritten has + // // not been used prior to the overwrite. 
+ // std::set OnlyWrittenRegs; + // if(!getDependentInsts(BC, InductionInstr, It.MemI, LatchBB, It.CacheMissBB, + // DependentInsts, IsIndirectLoad, WrittenRegs, OnlyWrittenRegs)) { + // BC.outs() << "Not found the valid dependent instructions\n"; + // continue; + // } + + // int Dis = calculatePrefetchDistance(BC, L, IsPositive); + // LLVM_DEBUG(dbgs() << " prefetch distance: " << Dis << "\n"); + + // if (DependentInsts.size() == 2) { + // MCInst PrefetchInst; + // MCPhysReg BaseReg = DependentInsts[1]->getOperand(1).getReg(); + // MCPhysReg IndexReg = DependentInsts[0]->getOperand(0).getReg(); + // MCPhysReg NewReg; + // // find the free register. + + // auto InsertLoc = It.CacheMissBB->end(); + // for (auto I = It.CacheMissBB->begin(); I != It.CacheMissBB->end(); I++) { + // if (&(*I) == It.MemI) { + // InsertLoc = I; + // break; + // } + // } + // if (InsertLoc == It.CacheMissBB->end()) + // continue; + // // push register + // // TODO: use createPushRegister or createPushRegisters to store register(s). 
+ // LLVM_DEBUG(dbgs() << "Written registers size: " << WrittenRegs.size() << "\n"); + // for (auto Reg : WrittenRegs) { + // MCInst PushInst; + // BC.MIB->createPushRegister(PushInst, Reg, 64); + // InsertLoc = It.CacheMissBB->insertInstruction(InsertLoc, PushInst); + // InsertLoc++; + // } + + // if (!OnlyWrittenRegs.empty()) + // NewReg = *OnlyWrittenRegs.begin(); + // else { + // NewReg = DependentInsts[0]->getOperand(0).getReg(); + // } + + // MCInst AddInst; + // LLVM_DEBUG(dbgs() << "Add instruction newreg: " << NewReg << " , basereg: " << BaseReg << " , indexreg: " << IndexReg << "\n"); + // BC.MIB->createADD64rs(AddInst, NewReg, BaseReg, IndexReg); + // InsertLoc = It.CacheMissBB->insertInstruction(InsertLoc, AddInst); + // InsertLoc++; + + + // BC.MIB->createPrefetchMui(PrefetchInst, NewReg, "pldl1keep", Dis); + // InsertLoc = It.CacheMissBB->insertInstruction(InsertLoc, PrefetchInst); + // InsertLoc++; + // LLVM_DEBUG(dbgs() << " inserted a prefetch instruction\n"); + + // // pop register + // // TODO: use createPopRegister or createPopRegisters to read stack. 
+ // for (auto Reg : llvm::reverse(WrittenRegs)) { + // MCInst PopInst; + // BC.MIB->createPopRegister(PopInst, Reg, 64); + // InsertLoc = It.CacheMissBB->insertInstruction(InsertLoc, PopInst); + // InsertLoc++; + // } + // } + // } else { + // LLVM_DEBUG(dbgs() << " -- doing prefetch in outer loop for instruction: \n"); + // LLVM_DEBUG(BC.printInstruction(dbgs(), *(It.MemI))); + // } + // } +} + +void LoopDataPrefetch::runOnLoop(BinaryLoop *L, BinaryContext &BC) { + LoopDataPrefetch::LoopPrefetchMapTy PrefetchCandidates; + // get prefetch candidates + DenseSet IsVisited; + SmallVector Loops = L->getLoopsInPreorder(); + for (auto *SubL : llvm::reverse(Loops)) { + getPrefetchCandidates(SubL, BC, IsVisited, PrefetchCandidates); + } + + MCInst* LoopInductionInstr = nullptr; + MCPhysReg* LoopInductionVar = nullptr; + + for (auto &It : PrefetchCandidates) { + doPrefetchForLoop(BC, It.first, It.second); + } +} + +void LoopDataPrefetch::runOnFunction(BinaryFunction &Function, + BinaryContext &BC) { + BC.outs() << "BOLT-INFO: doing loop data prefetch for function: " + << Function.getOneName() << "\n"; + Function.calculateLoopInfo(); + // get cache miss info with loops + if (!Function.hasLoopInfo()) + return; + + const BinaryLoopInfo &BLI = Function.getLoopInfo(); + for (BinaryLoop *L : BLI) { + runOnLoop(L, BC); + } +} + +Error LoopDataPrefetch::runOnFunctions(BinaryContext &BC) { + // TODO: consider to do it by parallel. 
+ for (auto &It : BC.getBinaryFunctions()) { + BinaryFunction &Function = It.second; + if (!shouldOptimize(Function)) + continue; + runOnFunction(Function, BC); + } + + BC.outs() << "BOLT-INFO: number of prefetch instructions inserted: " + << "TODO" << "\n"; + return Error::success(); +} + +} // namespace bolt +} // namespace llvm diff --git a/bolt/lib/Profile/DataAggregator.cpp b/bolt/lib/Profile/DataAggregator.cpp index c9668bddd0d7e63a5ddb02c6e27bd295457ba3d3..b6b6e496037b9653a05e7c7d8d147056fdad31cc 100644 --- a/bolt/lib/Profile/DataAggregator.cpp +++ b/bolt/lib/Profile/DataAggregator.cpp @@ -110,6 +110,17 @@ static cl::opt WriteAutoFDOData( "autofdo", cl::desc("generate autofdo textual data instead of bolt data"), cl::cat(AggregatorCategory)); +enum CacheMissKind {NoCache_Miss, TLB_Miss, L1D_Miss, LLC_Miss}; + +static cl::opt CacheMissSample( + "cache-miss", cl::init(NoCache_Miss), + cl::desc("generate data cache miss into fdata"), + cl::values(clEnumValN(NoCache_Miss, "none", "No cache miss statistics"), + clEnumValN(TLB_Miss, "tlb_miss", "TLB cache miss statistics"), + clEnumValN(L1D_Miss, "l1d_miss", "L1D cache miss statistics"), + clEnumValN(LLC_Miss, "llc_miss", "LLC miss statistics")), + cl::cat(AggregatorCategory)); + } // namespace opts namespace { @@ -630,6 +641,8 @@ void DataAggregator::processProfile(BinaryContext &BC) { else processBranchEvents(); + processCacheMissEvents(); + processMemEvents(); // Mark all functions with registered events as having a valid profile. 
@@ -718,7 +731,7 @@ bool DataAggregator::doSample(BinaryFunction &OrigFunc, uint64_t Address, bool DataAggregator::doIntraBranch(BinaryFunction &Func, uint64_t From, uint64_t To, uint64_t Count, - uint64_t Mispreds) { + uint64_t Mispreds, int64_t Cycles) { FuncBranchData *AggrData = getBranchData(Func); if (!AggrData) { AggrData = &NamesToBranches[Func.getOneName()]; @@ -728,14 +741,14 @@ bool DataAggregator::doIntraBranch(BinaryFunction &Func, uint64_t From, LLVM_DEBUG(dbgs() << "BOLT-DEBUG: bumpBranchCount: " << formatv("{0} @ {1:x} -> {0} @ {2:x}\n", Func, From, To)); - AggrData->bumpBranchCount(From, To, Count, Mispreds); + AggrData->bumpBranchCount(From, To, Count, Mispreds, Cycles); return true; } bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, BinaryFunction *ToFunc, uint64_t From, uint64_t To, uint64_t Count, - uint64_t Mispreds) { + uint64_t Mispreds, int64_t Cycles) { FuncBranchData *FromAggrData = nullptr; FuncBranchData *ToAggrData = nullptr; StringRef SrcFunc; @@ -773,7 +786,7 @@ bool DataAggregator::doInterBranch(BinaryFunction *FromFunc, } bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, - uint64_t Mispreds) { + uint64_t Mispreds, int64_t Cycles) { bool IsReturn = false; auto handleAddress = [&](uint64_t &Addr, bool IsFrom) -> BinaryFunction * { if (BinaryFunction *Func = getBinaryFunctionContainingAddress(Addr)) { @@ -812,11 +825,11 @@ bool DataAggregator::doBranch(uint64_t From, uint64_t To, uint64_t Count, // Treat recursive control transfers as inter-branches. 
if (FromFunc == ToFunc && To != 0) { - recordBranch(*FromFunc, From, To, Count, Mispreds); - return doIntraBranch(*FromFunc, From, To, Count, Mispreds); + recordBranch(*FromFunc, From, To, Count, Mispreds, Cycles); + return doIntraBranch(*FromFunc, From, To, Count, Mispreds, Cycles); } - return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds); + return doInterBranch(FromFunc, ToFunc, From, To, Count, Mispreds, Cycles); } bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, @@ -869,7 +882,8 @@ bool DataAggregator::doTrace(const LBREntry &First, const LBREntry &Second, From = BAT->translate(FromFunc->getAddress(), From, /*IsBranchSrc=*/true); To = BAT->translate(FromFunc->getAddress(), To, /*IsBranchSrc=*/false); } - doIntraBranch(ParentFunc ? *ParentFunc : *FromFunc, From, To, Count, false); + // TODO: assume cycles is 0; will fix this if needed. + doIntraBranch(ParentFunc ? *ParentFunc : *FromFunc, From, To, Count, false, 0); } return true; @@ -1040,13 +1054,20 @@ ErrorOr DataAggregator::parseLBREntry() { MispredWarning = false; } - ErrorOr Rest = parseString(FieldSeparator, true); - if (std::error_code EC = Rest.getError()) + ErrorOr IntxRes = parseString('/'); + if (std::error_code EC = IntxRes.getError()) return EC; - if (Rest.get().size() < 5) { - reportError("expected rest of LBR entry"); - Diag << "Found: " << Rest.get() << "\n"; - return make_error_code(llvm::errc::io_error); + + ErrorOr AbortRes = parseString('/'); + if (std::error_code EC = AbortRes.getError()) + return EC; + + ErrorOr CyclesRes = parseNumberField('/', true); + if (std::error_code EC = CyclesRes.getError()) + return EC; + Res.Cycles = CyclesRes.get(); + + while (checkAndConsumeFS()) { } return Res; } @@ -1160,8 +1181,25 @@ ErrorOr DataAggregator::parseBasicSample() { return PerfBasicSample{Event.get(), Address}; } +bool DataAggregator::hasCacheMissEvent(StringRef Event) { + if (Event == "tlb-miss:" || Event == "tlb-access:" || + Event == "l1d-miss:" || 
Event == "l1d-access:" || + Event == "llc-miss:" || Event == "llc-access:") + return true; + + return false; +} + +bool DataAggregator::isCacheMissEvent(StringRef Event) { + if (Event == "tlb-miss:" || Event == "l1d-miss:" || Event == "llc-miss:") + return true; + + return false; +} + ErrorOr DataAggregator::parseMemSample() { - PerfMemSample Res{0, 0}; + PerfMemSample Res{0, 0, 0}; + bool isMiss = false; while (checkAndConsumeFS()) { } @@ -1182,10 +1220,6 @@ ErrorOr DataAggregator::parseMemSample() { ErrorOr Event = parseString(FieldSeparator); if (std::error_code EC = Event.getError()) return EC; - if (!Event.get().contains("mem-loads")) { - consumeRestOfLine(); - return Res; - } while (checkAndConsumeFS()) { } @@ -1212,7 +1246,17 @@ ErrorOr DataAggregator::parseMemSample() { if (!BC->HasFixedLoadAddress) adjustAddress(Address, MMapInfoIter->second); - return PerfMemSample{PCRes.get(), Address}; + uint64_t PC = *PCRes; + if (!BC->HasFixedLoadAddress) + adjustAddress(PC, MMapInfoIter->second); + + if (opts::CacheMissSample != opts::CacheMissKind::NoCache_Miss) { + if (!hasCacheMissEvent(Event.get())) + return Res; + isMiss = isCacheMissEvent(Event.get()); + } + + return PerfMemSample{PC, Address, isMiss}; } ErrorOr DataAggregator::parseLocationOrOffset() { @@ -1410,8 +1454,12 @@ uint64_t DataAggregator::parseLBRSample(const PerfBranchSample &Sample, // recorded executed PC. uint64_t NextPC = opts::UseEventPC ? Sample.PC : 0; uint32_t NumEntry = 0; + + int64_t TotalCycles = 0; + std::unordered_map BranchLBRCycles; for (const LBREntry &LBR : Sample.LBR) { ++NumEntry; + TotalCycles += LBR.Cycles; // Hardware bug workaround: Intel Skylake (which has 32 LBR entries) // sometimes record entry 32 as an exact copy of entry 31. 
This will cause // us to likely record an invalid trace and generate a stale function for @@ -1467,6 +1515,26 @@ uint64_t DataAggregator::parseLBRSample(const PerfBranchSample &Sample, TakenBranchInfo &Info = BranchLBRs[Trace(From, To)]; ++Info.TakenCount; Info.MispredCount += LBR.Mispred; + + BranchCycleInfo &CycleInfo = BranchLBRCycles[Trace(From, To)]; + if (CycleInfo.Occurances++ == 0) { + CycleInfo.LastTotalCycles = TotalCycles; + continue; + } + int64_t CurDiff = TotalCycles - CycleInfo.LastTotalCycles; + CycleInfo.LastTotalCycles = TotalCycles; + CycleInfo.DiffSumCycles += CurDiff; + } + + for (auto It : BranchLBRCycles) { + TakenBranchInfo &Info = BranchLBRs[It.first]; + int64_t AvgCycles = It.second.DiffSumCycles / (It.second.Occurances - 1); + if (It.second.DiffSumCycles != 0 && AvgCycles == 0) + AvgCycles = 1; + if (Info.Cycles == 0) + Info.Cycles = AvgCycles; + else + Info.Cycles = (Info.Cycles + AvgCycles) / 2; } return NumTraces; } @@ -1610,7 +1678,7 @@ void DataAggregator::processBranchEvents() { for (const auto &AggrLBR : BranchLBRs) { const Trace &Loc = AggrLBR.first; const TakenBranchInfo &Info = AggrLBR.second; - doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount); + doBranch(Loc.From, Loc.To, Info.TakenCount, Info.MispredCount, Info.Cycles); } } @@ -1742,6 +1810,51 @@ void DataAggregator::processMemEvents() { } } +void DataAggregator::processCacheMissEvents() { + outs() << "PERF2BOLT: processing cache miss events...\n"; + NamedRegionTimer T("ProcessCacheMissEvent", "Processing Cache Miss Event", + TimerGroupName, TimerGroupDesc, opts::TimeAggregator); + + uint64_t NumMisses = 0; + uint64_t NumAccesses = 0; + for (auto &Sample : MemSamples) { + uint64_t PC = Sample.PC; + uint64_t Addr = Sample.Addr; + bool isMiss = Sample.isMiss; + if (isMiss) + NumMisses++; + else + NumAccesses++; + BinaryFunction *Func = getBinaryFunctionContainingAddress(PC); + if (!Func) { + LLVM_DEBUG(if (PC != 0) { + dbgs() << formatv("Skip cache miss event: {0: 
x} => {1:x}", PC, Addr); + }); + continue; + } + + StringRef FuncName = Func->getOneName(); + + BinaryFunction *ParentFunc = getBATParentFunction(*Func); + BinaryFunction &NewFunc = ParentFunc ? *ParentFunc : *Func; + + auto I = NamesToCacheMissEvents.find(NewFunc.getOneName()); + if (I == NamesToCacheMissEvents.end()) { + bool Success; + StringRef LocName = getLocationName(NewFunc, BAT); + std::tie(I, Success) = NamesToCacheMissEvents.insert( + std::make_pair(NewFunc.getOneName(), + FuncCacheMissData(LocName, FuncCacheMissData::ContainerTy()))); + } + + PC -= NewFunc.getAddress(); + if (BAT) + PC = BAT->translate(NewFunc.getAddress(), PC, /*IsBranchSrc=*/false); + + I->second.bumpCount(PC, NumMisses, NumAccesses); + } +} + std::error_code DataAggregator::parsePreAggregatedLBRSamples() { outs() << "PERF2BOLT: parsing pre-aggregated profile...\n"; NamedRegionTimer T("parseAggregated", "Parsing aggregated branch events", @@ -1770,8 +1883,9 @@ void DataAggregator::processPreAggregated() { for (const AggregatedLBREntry &AggrEntry : AggregatedLBRs) { switch (AggrEntry.EntryType) { case AggregatedLBREntry::BRANCH: + // TODO: assume cycles is 0; will fix this if needed. doBranch(AggrEntry.From.Offset, AggrEntry.To.Offset, AggrEntry.Count, - AggrEntry.Mispreds); + AggrEntry.Mispreds, 0); break; case AggregatedLBREntry::FT: case AggregatedLBREntry::FT_EXTERNAL_ORIGIN: { @@ -2221,9 +2335,12 @@ DataAggregator::writeAggregatedFile(StringRef OutputFilename) const { return EC; bool WriteMemLocs = false; + bool WriteCacheMissLocs = false; - auto writeLocation = [&OutFile, &WriteMemLocs](const Location &Loc) { - if (WriteMemLocs) + auto writeLocation = [&OutFile, &WriteMemLocs, &WriteCacheMissLocs](const Location &Loc) { + if (WriteCacheMissLocs) + OutFile << (Loc.IsSymbol ? "7 " : "6 "); + else if (WriteMemLocs) OutFile << (Loc.IsSymbol ? "4 " : "3 "); else OutFile << (Loc.IsSymbol ? 
"1 " : "0 "); @@ -2233,6 +2350,7 @@ DataAggregator::writeAggregatedFile(StringRef OutputFilename) const { uint64_t BranchValues = 0; uint64_t MemValues = 0; + uint64_t CacheMissValues = 0; if (BAT) OutFile << "boltedcollection\n"; @@ -2242,6 +2360,8 @@ DataAggregator::writeAggregatedFile(StringRef OutputFilename) const { OutFile << " " << Entry.getKey(); OutFile << "\n"; + + // TODO: Distinguish between basic sampling and memory sampling for (const auto &KV : NamesToSamples) { const FuncSampleData &FSD = KV.second; for (const SampleInfo &SI : FSD.Data) { @@ -2256,7 +2376,7 @@ DataAggregator::writeAggregatedFile(StringRef OutputFilename) const { for (const BranchInfo &BI : FBD.Data) { writeLocation(BI.From); writeLocation(BI.To); - OutFile << BI.Mispreds << " " << BI.Branches << "\n"; + OutFile << BI.Mispreds << " " << BI.Branches << " " << BI.Cycles << "\n"; ++BranchValues; } for (const BranchInfo &BI : FBD.EntryData) { @@ -2266,11 +2386,22 @@ DataAggregator::writeAggregatedFile(StringRef OutputFilename) const { continue; writeLocation(BI.From); writeLocation(BI.To); - OutFile << BI.Mispreds << " " << BI.Branches << "\n"; + OutFile << BI.Mispreds << " " << BI.Branches << " " << BI.Cycles << "\n"; ++BranchValues; } } + WriteCacheMissLocs = true; + for (const auto &KV : NamesToCacheMissEvents) { + const FuncCacheMissData &FCMD = KV.second; + for (const CacheMissInfo &CMI : FCMD.Data) { + writeLocation(CMI.Offset); + OutFile << CMI.MissCount << " " << CMI.AccessCount << "\n"; + ++CacheMissValues; + } + } + + WriteCacheMissLocs = false; WriteMemLocs = true; for (const auto &KV : NamesToMemEvents) { const FuncMemData &FMD = KV.second; @@ -2283,7 +2414,8 @@ DataAggregator::writeAggregatedFile(StringRef OutputFilename) const { } } - outs() << "PERF2BOLT: wrote " << BranchValues << " objects and " << MemValues + outs() << "PERF2BOLT: wrote " << BranchValues << " objects, " + << CacheMissValues << " cache miss objects and " << MemValues << " memory objects to " << 
OutputFilename << "\n"; return std::error_code(); diff --git a/bolt/lib/Profile/DataReader.cpp b/bolt/lib/Profile/DataReader.cpp index f2e999bbfdc6dca0bbfc6df5918bd62b99859f68..f9d874aa503dddaaa7d7ad8c298ed8e7216c24e8 100644 --- a/bolt/lib/Profile/DataReader.cpp +++ b/bolt/lib/Profile/DataReader.cpp @@ -139,25 +139,55 @@ void FuncSampleData::bumpCount(uint64_t Offset, uint64_t Count) { SI.Hits += Count; } +void CacheMissInfo::mergeWith(const CacheMissInfo &CMI) { + MissCount += CMI.MissCount; + AccessCount += CMI.AccessCount; +} + +void CacheMissInfo::print(raw_ostream &OS) const { + OS << (Offset.IsSymbol + 5) << " " << Offset.Name << " " + << Twine::utohexstr(Offset.Offset) << " " << MissCount + << " " << AccessCount << "\n"; +} + +void FuncCacheMissData::bumpCount(uint64_t Offset, uint64_t MissCount, + uint64_t AccessCount) { + auto Iter = Index.find(Offset); + if (Iter == Index.end()) { + Data.emplace_back(Location(true, Name, Offset), MissCount, AccessCount); + Index[Offset] = Data.size() - 1; + return; + } + + CacheMissInfo &CMI = Data[Iter->second]; + CMI.MissCount += MissCount; + CMI.AccessCount += AccessCount; +} + void FuncBranchData::bumpBranchCount(uint64_t OffsetFrom, uint64_t OffsetTo, - uint64_t Count, uint64_t Mispreds) { + uint64_t Count, uint64_t Mispreds, int64_t Cycles) { auto Iter = IntraIndex[OffsetFrom].find(OffsetTo); if (Iter == IntraIndex[OffsetFrom].end()) { Data.emplace_back(Location(true, Name, OffsetFrom), - Location(true, Name, OffsetTo), Mispreds, Count); + Location(true, Name, OffsetTo), Mispreds, Count, Cycles); IntraIndex[OffsetFrom][OffsetTo] = Data.size() - 1; return; } BranchInfo &BI = Data[Iter->second]; BI.Branches += Count; BI.Mispreds += Mispreds; + if (BI.Cycles == 0) { + BI.Cycles = Cycles; + } else { + BI.Cycles = (BI.Cycles + Cycles) / 2; + } } void FuncBranchData::bumpCallCount(uint64_t OffsetFrom, const Location &To, uint64_t Count, uint64_t Mispreds) { auto Iter = InterIndex[OffsetFrom].find(To); if (Iter == 
InterIndex[OffsetFrom].end()) { - Data.emplace_back(Location(true, Name, OffsetFrom), To, Mispreds, Count); + Data.emplace_back(Location(true, Name, OffsetFrom), To, Mispreds, Count, 0); InterIndex[OffsetFrom][To] = Data.size() - 1; return; } @@ -171,7 +201,7 @@ void FuncBranchData::bumpEntryCount(const Location &From, uint64_t OffsetTo, auto Iter = EntryIndex[OffsetTo].find(From); if (Iter == EntryIndex[OffsetTo].end()) { EntryData.emplace_back(From, Location(true, Name, OffsetTo), Mispreds, - Count); + Count, 0); EntryIndex[OffsetTo][From] = EntryData.size() - 1; return; } @@ -183,6 +213,11 @@ void FuncBranchData::bumpEntryCount(const Location &From, uint64_t OffsetTo, void BranchInfo::mergeWith(const BranchInfo &BI) { Branches += BI.Branches; Mispreds += BI.Mispreds; + if (BI.Cycles == 0 || Cycles == 0) { + Cycles = BI.Cycles; + } else { + Cycles = (Cycles + BI.Cycles) / 2; + } } void BranchInfo::print(raw_ostream &OS) const { @@ -264,11 +299,17 @@ Error DataReader::preprocessProfile(BinaryContext &BC) { Function.ExecutionCount = FuncData->ExecutionCount; FuncData->Used = true; } + if (FuncCacheMissData *CacheMissData = getCacheMissDataForNames(Function.getNames())) { + setCacheMissData(Function, CacheMissData); + Function.HasCacheMissProfile = true; + CacheMissData->Used = true; + } } for (auto &BFI : BC.getBinaryFunctions()) { BinaryFunction &Function = BFI.second; matchProfileMemData(Function); + matchProfileCacheMissData(Function); } return Error::success(); @@ -278,32 +319,56 @@ Error DataReader::readProfilePreCFG(BinaryContext &BC) { for (auto &BFI : BC.getBinaryFunctions()) { BinaryFunction &Function = BFI.second; FuncMemData *MemoryData = getMemData(Function); - if (!MemoryData) - continue; + FuncCacheMissData *CacheMissData = getCacheMissData(Function); + + if (MemoryData) { + for (MemInfo &MI : MemoryData->Data) { + const uint64_t Offset = MI.Offset.Offset; + auto II = Function.Instructions.find(Offset); + if (II == Function.Instructions.end()) { + // 
Ignore bad instruction address. + continue; + } - for (MemInfo &MI : MemoryData->Data) { - const uint64_t Offset = MI.Offset.Offset; - auto II = Function.Instructions.find(Offset); - if (II == Function.Instructions.end()) { - // Ignore bad instruction address. - continue; + auto &MemAccessProfile = + BC.MIB->getOrCreateAnnotationAs( + II->second, "MemoryAccessProfile"); + BinaryData *BD = nullptr; + if (MI.Addr.IsSymbol) + BD = BC.getBinaryDataByName(MI.Addr.Name); + MemAccessProfile.AddressAccessInfo.push_back( + {BD, MI.Addr.Offset, MI.Count}); + auto NextII = std::next(II); + if (NextII == Function.Instructions.end()) + MemAccessProfile.NextInstrOffset = Function.getSize(); + else + MemAccessProfile.NextInstrOffset = II->first; } + Function.HasMemoryProfile = true; + } - auto &MemAccessProfile = - BC.MIB->getOrCreateAnnotationAs( - II->second, "MemoryAccessProfile"); - BinaryData *BD = nullptr; - if (MI.Addr.IsSymbol) - BD = BC.getBinaryDataByName(MI.Addr.Name); - MemAccessProfile.AddressAccessInfo.push_back( - {BD, MI.Addr.Offset, MI.Count}); - auto NextII = std::next(II); - if (NextII == Function.Instructions.end()) - MemAccessProfile.NextInstrOffset = Function.getSize(); - else - MemAccessProfile.NextInstrOffset = II->first; + // We must read the cache miss information before constructing + // the CFG; otherwise, after building the CFG, some instructions + // may lack offset information, making it impossible to read the + // profile. + if (CacheMissData) { + for (const CacheMissInfo &CMI : CacheMissData->Data) { + const uint64_t Offset = CMI.Offset.Offset; + auto II = Function.Instructions.find(Offset); + if (II == Function.Instructions.end()) { + // Ignore bad instruction address. 
+ continue; + } + + auto &CacheMissProf = + BC.MIB->getOrCreateAnnotationAs( + II->second, "CacheMissProfile"); + CacheMissProf.InstrOffset = II->first; + CacheMissProf.MissCount = CMI.MissCount; + CacheMissProf.AccessCount = CMI.AccessCount; + } + Function.HasCacheMissProfile = true; } - Function.HasMemoryProfile = true; } return Error::success(); @@ -361,40 +426,66 @@ void DataReader::readProfile(BinaryFunction &BF) { // Possibly assign/re-assign branch profile data. matchProfileData(BF); - FuncBranchData *FBD = getBranchData(BF); - if (!FBD) - return; + // TODO: do we need to assign/re-assign cache miss profile data as branch? + // matchProfileCacheMissData(BF); - // Assign basic block counts to function entry points. These only include - // counts for outside entries. - // - // There is a slight skew introduced here as branches originated from RETs - // may be accounted for in the execution count of an entry block if the last - // instruction in a predecessor fall-through block is a call. This situation - // should rarely happen because there are few multiple-entry functions. - for (const BranchInfo &BI : FBD->EntryData) { - BinaryBasicBlock *BB = BF.getBasicBlockAtOffset(BI.To.Offset); - if (BB && (BB->isEntryPoint() || BB->isLandingPad())) { - uint64_t Count = BB->getExecutionCount(); - if (Count == BinaryBasicBlock::COUNT_NO_PROFILE) - Count = 0; - BB->setExecutionCount(Count + BI.Branches); + FuncBranchData *FBD = getBranchData(BF); + if (FBD) { + // Assign basic block counts to function entry points. These only include + // counts for outside entries. + // + // There is a slight skew introduced here as branches originated from RETs + // may be accounted for in the execution count of an entry block if the last + // instruction in a predecessor fall-through block is a call. This situation + // should rarely happen because there are few multiple-entry functions. 
+ for (const BranchInfo &BI : FBD->EntryData) { + BinaryBasicBlock *BB = BF.getBasicBlockAtOffset(BI.To.Offset); + if (BB && (BB->isEntryPoint() || BB->isLandingPad())) { + uint64_t Count = BB->getExecutionCount(); + if (Count == BinaryBasicBlock::COUNT_NO_PROFILE) + Count = 0; + BB->setExecutionCount(Count + BI.Branches); + } } - } - for (const BranchInfo &BI : FBD->Data) { - if (BI.From.Name != BI.To.Name) - continue; + for (const BranchInfo &BI : FBD->Data) { + if (BI.From.Name != BI.To.Name) + continue; - if (!recordBranch(BF, BI.From.Offset, BI.To.Offset, BI.Branches, - BI.Mispreds)) { - LLVM_DEBUG(dbgs() << "bad branch : " << BI.From.Offset << " -> " - << BI.To.Offset << '\n'); + if (!recordBranch(BF, BI.From.Offset, BI.To.Offset, BI.Branches, + BI.Mispreds, BI.Cycles)) { + LLVM_DEBUG(dbgs() << "bad branch : " << BI.From.Offset << " -> " + << BI.To.Offset << '\n'); + } } + + // Convert branch data into annotations. + convertBranchData(BF); } - // Convert branch data into annotations. - convertBranchData(BF); + // FuncCacheMissData *FCMD = getCacheMissData(BF); + // if (FCMD) { + // + // for (const CacheMissInfo &CMI : FCMD->Data) { + // BinaryBasicBlock *BB = BF.getBasicBlockAtOffset(CMI.Offset.Offset); + // if (!BB) + // continue; +// + // const MCInst *MCInstr = BF.getInstructionAtOffset(CMI.Offset.Offset); + // BinaryContext &BC = BF.getBinaryContext(); + // if (!MCInstr || !BC.MIB->mayLoad(*MCInstr)) + // continue; + + // auto BCMI = BB->getCacheMissInfoAtInstr(MCInstr); + // if (BCMI) { + // // Now, setCacheMissInfoAtInstr will overwrite original MissCount/AccessCount. + // BB->setCacheMissInfoAtInstr(MCInstr, CMI.MissCount + BCMI->MissCount, + // CMI.AccessCount + BCMI->AccessCount); + // } + // Now, setCacheMissInfoAtInstr will overwrite original MissCount/AccessCount. 
+ // BB->setCacheMissInfoAtInstr(MCInstr, CMI.MissCount, CMI.AccessCount); + // } + //} } void DataReader::matchProfileData(BinaryFunction &BF) { @@ -465,6 +556,24 @@ void DataReader::matchProfileMemData(BinaryFunction &BF) { } } +void DataReader::matchProfileCacheMissData(BinaryFunction &BF) { + const std::vector AllCacheMissData = + getCacheMissDataForNamesRegex(BF.getNames()); + for (FuncCacheMissData *NewCacheMissData : AllCacheMissData) { + // Prevent functions from sharing the same profile. + if (NewCacheMissData->Used) + continue; + + if (FuncCacheMissData *CMD = getCacheMissData(BF)) + CMD->Used = false; + + // Update function data with the new set. + setCacheMissData(BF, NewCacheMissData); + NewCacheMissData->Used = true; + break; + } +} + bool DataReader::fetchProfileForOtherEntryPoints(BinaryFunction &BF) { BinaryContext &BC = BF.getBinaryContext(); @@ -658,7 +767,7 @@ void DataReader::convertBranchData(BinaryFunction &BF) const { } bool DataReader::recordBranch(BinaryFunction &BF, uint64_t From, uint64_t To, - uint64_t Count, uint64_t Mispreds) const { + uint64_t Count, uint64_t Mispreds, int64_t Cycles) const { BinaryContext &BC = BF.getBinaryContext(); BinaryBasicBlock *FromBB = BF.getBasicBlockContainingOffset(From); @@ -780,8 +889,14 @@ bool DataReader::recordBranch(BinaryFunction &BF, uint64_t From, uint64_t To, BinaryBasicBlock::BinaryBranchInfo &FTBI = FTSuccessor->getBranchInfo(*ToBB); FTBI.Count += Count; - if (Count) + if (Count) { FTBI.MispredictedCount += Mispreds; + if (FTBI.Cycle == 0) { + FTBI.Cycle = Cycles; + } else { + FTBI.Cycle = (FTBI.Cycle + Cycles) / 2; + } + } ToBB = FTSuccessor; } else { LLVM_DEBUG(dbgs() << "invalid branch in " << BF @@ -795,6 +910,11 @@ bool DataReader::recordBranch(BinaryFunction &BF, uint64_t From, uint64_t To, // Only update mispredicted count if it the count was real. 
if (Count) { BI.MispredictedCount += Mispreds; + if (BI.Cycle == 0) { + BI.Cycle = Cycles; + } else { + BI.Cycle = (BI.Cycle + Cycles) / 2; + } } return true; @@ -903,28 +1023,36 @@ ErrorOr DataReader::parseHexField(char EndChar, bool EndNl) { return Num; } -ErrorOr DataReader::parseLocation(char EndChar, bool EndNl, - bool ExpectMemLoc) { +ErrorOr +DataReader::parseLocation(char EndChar, bool EndNl, bool ExpectMemLoc, + bool ExpectCacheMissLoc) { // Read whether the location of the branch should be DSO or a symbol // 0 means it is a DSO. 1 means it is a global symbol. 2 means it is a local // symbol. // The symbol flag is also used to tag memory load events by adding 3 to the // base values, i.e. 3 not a symbol, 4 global symbol and 5 local symbol. - if (!ExpectMemLoc && ParsingBuf[0] != '0' && ParsingBuf[0] != '1' && + if (!ExpectMemLoc && !ExpectCacheMissLoc && ParsingBuf[0] != '0' && ParsingBuf[0] != '1' && ParsingBuf[0] != '2') { reportError("expected 0, 1 or 2"); return make_error_code(llvm::errc::io_error); } - if (ExpectMemLoc && ParsingBuf[0] != '3' && ParsingBuf[0] != '4' && + if (ExpectMemLoc && !ExpectCacheMissLoc && ParsingBuf[0] != '3' && ParsingBuf[0] != '4' && ParsingBuf[0] != '5') { reportError("expected 3, 4 or 5"); return make_error_code(llvm::errc::io_error); } + if (!ExpectMemLoc && ExpectCacheMissLoc && ParsingBuf[0] != '6' && ParsingBuf[0] != '7' && + ParsingBuf[0] != '8') { + reportError("expected 6, 7 or 8"); + return make_error_code(llvm::errc::io_error); + } + bool IsSymbol = - (!ExpectMemLoc && (ParsingBuf[0] == '1' || ParsingBuf[0] == '2')) || - (ExpectMemLoc && (ParsingBuf[0] == '4' || ParsingBuf[0] == '5')); + (!ExpectMemLoc && !ExpectCacheMissLoc && (ParsingBuf[0] == '1' || ParsingBuf[0] == '2')) || + (ExpectMemLoc && !ExpectCacheMissLoc && (ParsingBuf[0] == '4' || ParsingBuf[0] == '5')) || + (!ExpectMemLoc && ExpectCacheMissLoc && (ParsingBuf[0] == '7' || ParsingBuf[0] == '8')); ParsingBuf = ParsingBuf.drop_front(1); Col += 1; @@ 
-966,18 +1094,24 @@ ErrorOr DataReader::parseBranchInfo() { int64_t NumMispreds = MRes.get(); consumeAllRemainingFS(); - ErrorOr BRes = parseNumberField(FieldSeparator, /* EndNl = */ true); + ErrorOr BRes = parseNumberField(FieldSeparator); if (std::error_code EC = BRes.getError()) return EC; int64_t NumBranches = BRes.get(); consumeAllRemainingFS(); + ErrorOr CRes = parseNumberField(FieldSeparator, /* EndNl = */ true); + if (std::error_code EC = CRes.getError()) + return EC; + int64_t NumCycles = CRes.get(); + if (!checkAndConsumeNewLine()) { reportError("expected end of line"); return make_error_code(llvm::errc::io_error); } - return BranchInfo(std::move(From), std::move(To), NumMispreds, NumBranches); + // TODO: support to parse cycles + return BranchInfo(std::move(From), std::move(To), NumMispreds, NumBranches, NumCycles); } ErrorOr DataReader::parseMemInfo() { @@ -1006,6 +1140,34 @@ ErrorOr DataReader::parseMemInfo() { return MemInfo(Offset, Addr, CountRes.get()); } +ErrorOr DataReader::parseCacheMissInfo() { + ErrorOr Res = parseCacheMissLocation(FieldSeparator); + if (std::error_code EC = Res.getError()) + return EC; + + Location Offset = Res.get(); + + consumeAllRemainingFS(); + ErrorOr MissRes = parseNumberField(FieldSeparator, /* EndNl = */ true); + if (std::error_code EC = MissRes.getError()) + return EC; + int64_t MissCount = MissRes.get(); + + consumeAllRemainingFS(); + ErrorOr AccessRes = parseNumberField(FieldSeparator, /* EndNl = */ true); + if (std::error_code EC = AccessRes.getError()) + return EC; + int64_t AccessCount = AccessRes.get(); + + consumeAllRemainingFS(); + if (!checkAndConsumeNewLine()) { + reportError("expected end of line"); + return make_error_code(llvm::errc::io_error); + } + + return CacheMissInfo(std::move(Offset), MissCount, AccessCount); +} + ErrorOr DataReader::parseSampleInfo() { ErrorOr Res = parseLocation(FieldSeparator); if (std::error_code EC = Res.getError()) @@ -1081,6 +1243,15 @@ bool DataReader::hasMemData() { return 
false; } +bool DataReader::hasCacheMissData() { + if (ParsingBuf.size() == 0) + return false; + + if (ParsingBuf[0] == '6' || ParsingBuf[0] == '7' || ParsingBuf[0] == '8') + return true; + return false; +} + std::error_code DataReader::parseInNoLBRMode() { auto GetOrCreateFuncEntry = [&](StringRef Name) { auto I = NamesToSamples.find(Name); @@ -1168,6 +1339,17 @@ std::error_code DataReader::parse() { return I; }; + auto GetOrCreateFuncCacheMissEntry = [&](StringRef Name) { + auto I = NamesToCacheMissEvents.find(Name); + if (I == NamesToCacheMissEvents.end()) { + bool Success; + std::tie(I, Success) = NamesToCacheMissEvents.insert(std::make_pair( + Name, FuncCacheMissData(Name, FuncCacheMissData::ContainerTy()))); + assert(Success && "unexpected result of insert"); + } + return I; + }; + Col = 0; Line = 1; ErrorOr FlagOrErr = maybeParseNoLBRFlag(); @@ -1180,7 +1362,7 @@ std::error_code DataReader::parse() { return BATFlagOrErr.getError(); BATMode = *BATFlagOrErr; - if (!hasBranchData() && !hasMemData()) { + if (!hasBranchData() && !hasMemData() && !hasCacheMissData()) { Diag << "ERROR: no valid profile data found\n"; return make_error_code(llvm::errc::io_error); } @@ -1218,6 +1400,21 @@ std::error_code DataReader::parse() { } } + while(hasCacheMissData()) { + ErrorOr Res = parseCacheMissInfo(); + if (std::error_code EC = Res.getError()) + return EC; + + CacheMissInfo CMI = Res.get(); + + // Ignore memory events not involving known pc. 
+ if (!CMI.Offset.IsSymbol) + continue; + + auto I = GetOrCreateFuncCacheMissEntry(CMI.Offset.Name); + I->second.Data.emplace_back(std::move(CMI)); + } + while (hasMemData()) { ErrorOr Res = parseMemInfo(); if (std::error_code EC = Res.getError()) @@ -1239,6 +1436,9 @@ std::error_code DataReader::parse() { for (auto &MemEvents : NamesToMemEvents) llvm::stable_sort(MemEvents.second.Data); + for (auto &CacheMissEvents : NamesToCacheMissEvents) + llvm::stable_sort(CacheMissEvents.second.Data); + return std::error_code(); } @@ -1256,6 +1456,13 @@ void DataReader::buildLTONameMaps() { if (CommonName) LTOCommonNameMemMap[*CommonName].push_back(&FuncData.second); } + + for (auto &FuncData : NamesToCacheMissEvents) { + const StringRef FuncName = FuncData.first; + const std::optional CommonName = getLTOCommonName(FuncName); + if (CommonName) + LTOCommonNameCacheMissMap[*CommonName].push_back(&FuncData.second); + } } template @@ -1357,6 +1564,12 @@ DataReader::getFuncSampleData(const std::vector &FuncNames) { return fetchMapEntry(NamesToSamples, FuncNames); } +FuncCacheMissData * +DataReader::getCacheMissDataForNames(const std::vector &FuncNames) { + return fetchMapEntry(NamesToCacheMissEvents, + FuncNames); +} + std::vector DataReader::getBranchDataForNamesRegex( const std::vector &FuncNames) { return fetchMapEntriesRegex(NamesToBranches, LTOCommonNameMap, FuncNames); @@ -1367,6 +1580,11 @@ DataReader::getMemDataForNamesRegex(const std::vector &FuncNames) { return fetchMapEntriesRegex(NamesToMemEvents, LTOCommonNameMemMap, FuncNames); } +std::vector +DataReader::getCacheMissDataForNamesRegex(const std::vector &FuncNames) { + return fetchMapEntriesRegex(NamesToCacheMissEvents, LTOCommonNameCacheMissMap, FuncNames); +} + bool DataReader::hasLocalsWithFileName() const { for (const auto &Func : NamesToBranches) { const StringRef &FuncName = Func.first; @@ -1416,6 +1634,16 @@ void DataReader::dump() const { } Diag << "\n"; } + + for (const auto &KV : NamesToCacheMissEvents) { + 
const StringRef Name = KV.first; + const FuncCacheMissData &FCMD = KV.second; + Diag << "Cache Miss events for " << Name << "\n"; + for (const CacheMissInfo &CMI : FCMD.Data) { + Diag << CMI.Offset << ": " << CMI.MissCount << ", " + << CMI.AccessCount << "\n"; + } + } } } // namespace bolt diff --git a/bolt/lib/Rewrite/BinaryPassManager.cpp b/bolt/lib/Rewrite/BinaryPassManager.cpp index 9ea382ad246c7f7a3cb4348e9c6736f8a8a860c1..2baca2900e73de0362930a5c636977b77762b3b8 100644 --- a/bolt/lib/Rewrite/BinaryPassManager.cpp +++ b/bolt/lib/Rewrite/BinaryPassManager.cpp @@ -22,6 +22,7 @@ #include "bolt/Passes/Instrumentation.h" #include "bolt/Passes/JTFootprintReduction.h" #include "bolt/Passes/LongJmp.h" +#include "bolt/Passes/LoopDataPrefetch.h" #include "bolt/Passes/LoopInversionPass.h" #include "bolt/Passes/MCF.h" #include "bolt/Passes/PLTCall.h" @@ -135,6 +136,11 @@ static cl::opt cl::desc("print functions after longjmp pass"), cl::Hidden, cl::cat(BoltOptCategory)); +static cl::opt + PrintLoopDataPrefetch("loop-data-prefetch", cl::Hidden, + cl::desc("print functions after loopdataprefetch pass"), + cl::cat(BoltOptCategory)); + cl::opt PrintNormalized("print-normalized", cl::desc("print functions after CFG is normalized"), @@ -433,6 +439,9 @@ Error BinaryFunctionPassManager::runAllPasses(BinaryContext &BC) { Manager.registerPass(std::make_unique(PrintNormalized)); + if (!opts::Instrument) + Manager.registerPass(std::make_unique(PrintLoopDataPrefetch)); + if (BC.isX86()) Manager.registerPass(std::make_unique(NeverPrint), opts::StripRepRet); diff --git a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp index 7ad19ef1e74bb9591761b87bbbcce34f53ced579..f3cfca65f65a8c01652faea78596b3dd657a20b6 100644 --- a/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp +++ b/bolt/lib/Target/AArch64/AArch64MCPlusBuilder.cpp @@ -68,6 +68,7 @@ static void createPopRegisters(MCInst &Inst, MCPhysReg Reg1, MCPhysReg Reg2) { 
Inst.addOperand(MCOperand::createImm(2)); } + static void loadReg(MCInst &Inst, MCPhysReg To, MCPhysReg From) { Inst.setOpcode(AArch64::LDRXui); Inst.clear(); @@ -189,6 +190,32 @@ public: Inst.getOpcode() == AArch64::MOVZWi); } + bool isSUB(const MCInst &Inst) const { + return (Inst.getOpcode() == AArch64::SUBWri || + Inst.getOpcode() == AArch64::SUBWrs || + Inst.getOpcode() == AArch64::SUBWrx || + Inst.getOpcode() == AArch64::SUBXri || + Inst.getOpcode() == AArch64::SUBXrs || + Inst.getOpcode() == AArch64::SUBXrx || + Inst.getOpcode() == AArch64::SUBXrx64); + } + + // do we need consider select compare. + bool isCMP(const MCInst &Inst) const { + return (Inst.getOpcode() == AArch64::SUBSWri || + Inst.getOpcode() == AArch64::SUBSWrs || + Inst.getOpcode() == AArch64::SUBSWrx || + Inst.getOpcode() == AArch64::SUBSXri || + Inst.getOpcode() == AArch64::SUBSXrs || + Inst.getOpcode() == AArch64::SUBSXrx || + Inst.getOpcode() == AArch64::SUBSXrx64 || + Inst.getOpcode() == AArch64::CBZW || + Inst.getOpcode() == AArch64::CBNZW || + Inst.getOpcode() == AArch64::CBZX || + Inst.getOpcode() == AArch64::CBNZX); + } + + // bool isADD(const MCInst &Inst) const { return (Inst.getOpcode() == AArch64::ADDSWri || Inst.getOpcode() == AArch64::ADDSWrr || @@ -228,6 +255,14 @@ public: Inst.getOpcode() == AArch64::LDRSBXui); } + bool isLDRD(const MCInst &Inst) const { + return (Inst.getOpcode() == AArch64::LDRDpost || + Inst.getOpcode() == AArch64::LDRDpre || + Inst.getOpcode() == AArch64::LDRDroW || + Inst.getOpcode() == AArch64::LDRDroX || + Inst.getOpcode() == AArch64::LDRDui); + } + bool isLDRH(const MCInst &Inst) const { return (Inst.getOpcode() == AArch64::LDRHHpost || Inst.getOpcode() == AArch64::LDRHHpre || @@ -246,6 +281,14 @@ public: Inst.getOpcode() == AArch64::LDRSHXui); } + bool isLDRQ(const MCInst &Inst) const { + return (Inst.getOpcode() == AArch64::LDRQpost || + Inst.getOpcode() == AArch64::LDRQpre || + Inst.getOpcode() == AArch64::LDRQroW || + Inst.getOpcode() == 
AArch64::LDRQroX || + Inst.getOpcode() == AArch64::LDRQui); + } + bool isLDRW(const MCInst &Inst) const { return (Inst.getOpcode() == AArch64::LDRWpost || Inst.getOpcode() == AArch64::LDRWpre || @@ -263,7 +306,8 @@ public: } bool mayLoad(const MCInst &Inst) const override { - return isLDRB(Inst) || isLDRH(Inst) || isLDRW(Inst) || isLDRX(Inst); + return isLDRB(Inst) || isLDRD(Inst) || isLDRQ(Inst) || isLDRH(Inst) || + isLDRW(Inst) || isLDRX(Inst); } bool isAArch64ExclusiveLoad(const MCInst &Inst) const override { @@ -1542,6 +1586,80 @@ public: return Insts; } + void createPushRegister(MCInst &Inst, MCPhysReg Reg, unsigned Size) const override { + storeReg(Inst, Reg, AArch64::SP); + } + + void createPopRegister(MCInst &Inst, MCPhysReg Reg, unsigned Size) const override { + loadReg(Inst, Reg, AArch64::SP); + } + + bool createADD64ri(MCInst &Inst, const MCPhysReg &DstReg, const MCPhysReg &SrcReg, + int64_t Imm) const override { + Inst.setOpcode(AArch64::ADDXri); + Inst.addOperand(MCOperand::createReg(DstReg)); + Inst.addOperand(MCOperand::createReg(SrcReg)); + Inst.addOperand(MCOperand::createImm(Imm)); + Inst.addOperand(MCOperand::createImm(0)); + return true; + } + + bool createADD64rs(MCInst &Inst, const MCPhysReg &DstReg, const MCPhysReg &SrcReg1, + const MCPhysReg &SrcReg2, int64_t Imm) const override { + Inst.setOpcode(AArch64::ADDXrs); + Inst.addOperand(MCOperand::createReg(DstReg)); + Inst.addOperand(MCOperand::createReg(SrcReg1)); + Inst.addOperand(MCOperand::createReg(SrcReg2)); + Inst.addOperand(MCOperand::createImm(AArch64_AM::ShiftExtendType::LSL)); // Shifter + Inst.addOperand(MCOperand::createImm(Imm)); // imm6 + return true; + } + + bool createPrefetchMui(MCInst &Inst, const MCPhysReg &BaseReg, StringRef PrfOp, + unsigned Offset) const override { + return createPrefetch(Inst, BaseReg, AArch64::NoRegister, AArch64::PRFMui, PrfOp, Offset); + } + + // Opcode: + // 1. AArch64::PRFMui + // 2. AArch64::PRFMroX + // 3. AArch64::PRFMroW + // 4. 
AArch64::PRFMl + // 5. AArch64::PRFUMi + bool createPrefetch(MCInst &Inst, const MCPhysReg &BaseReg, const MCPhysReg &IndexReg, + unsigned Opcode, StringRef PrfOp, unsigned Offset) const override { + auto PrefetchOp = AArch64PRFM::lookupPRFMByName(PrfOp); + + switch(Opcode) { + case AArch64::PRFMui: + Inst.setOpcode(Opcode); + Inst.addOperand(MCOperand::createImm(AArch64PRFM::lookupPRFMByName("pldl1keep")->Encoding)); + // Inst.addOperand(MCOperand::createImm(AArch64PRFM::PRFMValues::pldl1keep)); + Inst.addOperand(MCOperand::createReg(BaseReg)); + Inst.addOperand(llvm::MCOperand::createImm(Offset)); + return true; + case AArch64::PRFMroX: + // TODO: + return false; + case AArch64::PRFMroW: + // TODO: + return false; + case AArch64::PRFUMi: + // TODO: + return false; + } + // Inst.setOpcode(AArch64::PRFMui); + // // auto Res = AArch64PRFM::lookupPRFMByName("pldl1keep"); + // // Inst.addOperand(MCOperand::createImm(Res->Encoding)); + // Inst.addOperand(MCOperand::createImm(AArch64PRFM::PRFMValues::pldl1keep)); + // Inst.addOperand(MCOperand::createReg(BaseReg)); + // //Inst.addOperand(MCOperand::createReg(IndexReg)); + // //unsigned Ext = llvm::AArch64_AM::getExtendEncoding(llvm::AArch64_AM::UXTX, Offset); + // //unsigned Ext = llvm::AArch64_AM::getArithExtendImm(llvm::AArch64_AM::UXTX, 1); + // Inst.addOperand(llvm::MCOperand::createImm(Offset)); + // //Inst.addOperand(MCOperand::createImm(Offset)); + } + void createIndirectCallInst(MCInst &Inst, bool IsTailCall, MCPhysReg Reg) const { Inst.clear(); diff --git a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp index 9b51ab0763e3500a3e52c50d013554efaa63f1ad..3ac60fd7e410182a33ae2e522e37048ccf0f087f 100644 --- a/bolt/lib/Target/X86/X86MCPlusBuilder.cpp +++ b/bolt/lib/Target/X86/X86MCPlusBuilder.cpp @@ -306,6 +306,14 @@ public: return 0; } + bool isCMP(const MCInst &Inst) const override { + return X86::isCMP(Inst.getOpcode()); + } + + bool isADD(const MCInst &Inst) const override { + 
return X86::isADD(Inst.getOpcode()); + } + bool isSUB(const MCInst &Inst) const override { return X86::isSUB(Inst.getOpcode()); } diff --git a/llvm/utils/gn/secondary/bolt/lib/Passes/BUILD.gn b/llvm/utils/gn/secondary/bolt/lib/Passes/BUILD.gn index 620c9e8927faaf1d2c758870acd7d7374c9784f4..bd14763b684815b6cf190369ac2c67081fd649bd 100644 --- a/llvm/utils/gn/secondary/bolt/lib/Passes/BUILD.gn +++ b/llvm/utils/gn/secondary/bolt/lib/Passes/BUILD.gn @@ -34,6 +34,7 @@ static_library("Passes") { "JTFootprintReduction.cpp", "LivenessAnalysis.cpp", "LongJmp.cpp", + "LoopDataPrefetch.cpp", "LoopInversionPass.cpp", "MCF.cpp", "PLTCall.cpp",