diff --git a/bolt/include/bolt/Passes/ReorderFunctions.h b/bolt/include/bolt/Passes/ReorderFunctions.h
index 52156a600791cb6871bb6ad34cedfa51574b3896..27094bee771ad5293693d553d1d4e59eff31029f 100644
--- a/bolt/include/bolt/Passes/ReorderFunctions.h
+++ b/bolt/include/bolt/Passes/ReorderFunctions.h
@@ -32,6 +32,7 @@ public:
     RT_EXEC_COUNT,
     RT_HFSORT,
     RT_HFSORT_PLUS,
+    RT_CDS,
     RT_PETTIS_HANSEN,
     RT_RANDOM,
     RT_USER
diff --git a/bolt/lib/Passes/ReorderAlgorithm.cpp b/bolt/lib/Passes/ReorderAlgorithm.cpp
index b5052cdaddb13e38fd8b8d7a3f3d5b999ad90ad9..3c3365e1d3d711321c3eda520012d5cbb64e0507 100644
--- a/bolt/lib/Passes/ReorderAlgorithm.cpp
+++ b/bolt/lib/Passes/ReorderAlgorithm.cpp
@@ -531,21 +531,21 @@ void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
   }
 
   // Initialize CFG edges
-  using JumpT = std::pair<uint64_t, uint64_t>;
-  std::vector<std::pair<JumpT, uint64_t>> JumpCounts;
+  std::vector<codelayout::EdgeCount> JumpCounts;
   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
     auto BI = BB->branch_info_begin();
     for (BinaryBasicBlock *SuccBB : BB->successors()) {
       assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
              "missing profile for a jump");
-      auto It = std::make_pair(BB->getLayoutIndex(), SuccBB->getLayoutIndex());
-      JumpCounts.push_back(std::make_pair(It, BI->Count));
+      JumpCounts.push_back(
+          {BB->getLayoutIndex(), SuccBB->getLayoutIndex(), BI->Count});
       ++BI;
     }
   }
 
   // Run the layout algorithm
-  auto Result = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
+  auto Result =
+      codelayout::computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
   Order.reserve(BF.getLayout().block_size());
   for (uint64_t R : Result)
     Order.push_back(OrigOrder[R]);
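The hunk above is the whole BOLT-side migration to the renamed interface: the jump list changes from nested std::pair values to codelayout::EdgeCount records, and applyExtTspLayout becomes codelayout::computeExtTspLayout. As a quick orientation, here is a minimal standalone sketch (not part of the patch) of how the refactored entry point is driven; the {source, destination, count} field order of EdgeCount is an assumption inferred from the brace-initialization in the hunk.

// Toy CFG with three blocks: sizes in bytes, execution counts, and profiled
// jumps. The result is a permutation of {0, 1, 2} that the ExtTSP model
// scores as more cache-friendly.
#include "llvm/Transforms/Utils/CodeLayout.h"

#include <cstdint>
#include <vector>

std::vector<uint64_t> layoutToyCFG() {
  std::vector<uint64_t> BlockSizes = {16, 32, 8};
  std::vector<uint64_t> BlockCounts = {100, 90, 100};
  // One record per profiled jump: {source block, destination block, count}.
  std::vector<llvm::codelayout::EdgeCount> JumpCounts = {
      {0, 1, 90}, {0, 2, 10}, {1, 2, 90}};
  // The vectors convert implicitly to the ArrayRef parameters.
  return llvm::codelayout::computeExtTspLayout(BlockSizes, BlockCounts,
                                               JumpCounts);
}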
"use hfsort algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, "hfsort+", + "use hfsort+ algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_CDS, "cds", + "use cache-directed sort"), + clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN, + "pettis-hansen", "use Pettis-Hansen algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_RANDOM, "random", + "reorder functions randomly"), + clEnumValN(bolt::ReorderFunctions::RT_USER, "user", + "use function order specified by -function-order")), + cl::ZeroOrMore, cl::cat(BoltOptCategory)); static cl::opt ReorderFunctionsUseHotSize( "reorder-functions-use-hot-size", @@ -323,6 +318,34 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) { case RT_HFSORT_PLUS: Clusters = hfsortPlus(Cg); break; + case RT_CDS: { + // It is required that the sum of incoming arc weights is not greater + // than the number of samples for every function. Ensuring the call graph + // obeys the property before running the algorithm. + Cg.adjustArcWeights(); + + // Initialize CFG nodes and their data + std::vector FuncSizes; + std::vector FuncCounts; + std::vector CallCounts; + std::vector CallOffsets; + for (NodeId F = 0; F < Cg.numNodes(); ++F) { + FuncSizes.push_back(Cg.size(F)); + FuncCounts.push_back(Cg.samples(F)); + for (NodeId Succ : Cg.successors(F)) { + const Arc &Arc = *Cg.findArc(F, Succ); + CallCounts.push_back({F, Succ, uint64_t(Arc.weight())}); + CallOffsets.push_back(uint64_t(Arc.avgCallOffset())); + } + } + + // Run the layout algorithm. + std::vector Result = codelayout::computeCacheDirectedLayout( + FuncSizes, FuncCounts, CallCounts, CallOffsets); + + // Create a single cluster from the computed order of hot functions. + Clusters.emplace_back(Cluster(Result, Cg)); + } break; case RT_PETTIS_HANSEN: Clusters = pettisAndHansen(Cg); break; diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index c8971ea5036dc026689899884b8540391ddc341a..7eeda0ec3efa8a38686a9a7cc72353f5f7758e77 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -174,7 +174,7 @@ private: /// Track symbols symbols processed during and after the registration /// to avoid infinite loops between type conversions and global variable /// creation. - llvm::SmallSetVector seen; + llvm::SmallSetVector seen; }; class DispatchTableConverter { diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt index 8e6a746d219ed366f8fbe03cef9893498956e77b..3a571b8d7b78bc56edb0def8abba6d8a89303e29 100644 --- a/lld/ELF/CMakeLists.txt +++ b/lld/ELF/CMakeLists.txt @@ -72,6 +72,7 @@ add_lld_library(lldELF Passes Support TargetParser + TransformUtils LINK_LIBS lldCommon diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp index ff72731b1f38d65a6896b109b62e72b20aea94fc..5e36964da94fc52328f66d978a65ee6d18a1e0f8 100644 --- a/lld/ELF/CallGraphSort.cpp +++ b/lld/ELF/CallGraphSort.cpp @@ -6,38 +6,21 @@ // //===----------------------------------------------------------------------===// /// -/// Implementation of Call-Chain Clustering from: Optimizing Function Placement -/// for Large-Scale Data-Center Applications -/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf -/// -/// The goal of this algorithm is to improve runtime performance of the final -/// executable by arranging code sections such that page table and i-cache -/// misses are minimized. -/// -/// Definitions: -/// * Cluster -/// * An ordered list of input sections which are laid out as a unit. 
diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp
index ff72731b1f38d65a6896b109b62e72b20aea94fc..5e36964da94fc52328f66d978a65ee6d18a1e0f8 100644
--- a/lld/ELF/CallGraphSort.cpp
+++ b/lld/ELF/CallGraphSort.cpp
@@ -6,38 +6,21 @@
 //
 //===----------------------------------------------------------------------===//
 ///
-/// Implementation of Call-Chain Clustering from: Optimizing Function Placement
-/// for Large-Scale Data-Center Applications
-/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
-///
-/// The goal of this algorithm is to improve runtime performance of the final
-/// executable by arranging code sections such that page table and i-cache
-/// misses are minimized.
-///
-/// Definitions:
-/// * Cluster
-///   * An ordered list of input sections which are laid out as a unit. At the
-///     beginning of the algorithm each input section has its own cluster and
-///     the weight of the cluster is the sum of the weight of all incoming
-///     edges.
-/// * Call-Chain Clustering (C³) Heuristic
-///   * Defines when and how clusters are combined. Pick the highest weighted
-///     input section then add it to its most likely predecessor if it wouldn't
-///     penalize it too much.
-/// * Density
-///   * The weight of the cluster divided by the size of the cluster. This is a
-///     proxy for the amount of execution time spent per byte of the cluster.
-///
-/// It does so given a call graph profile by the following:
-/// * Build a weighted call graph from the call graph profile
-/// * Sort input sections by weight
-/// * For each input section starting with the highest weight
-///   * Find its most likely predecessor cluster
-///   * Check if the combined cluster would be too large, or would have too low
-///     a density.
-///   * If not, then combine the clusters.
-/// * Sort non-empty clusters by density
+/// This file is responsible for sorting sections using LLVM call graph profile
+/// data by placing frequently executed code sections together. The goal of the
+/// placement is to improve the runtime performance of the final executable by
+/// arranging code sections so that i-TLB misses and i-cache misses are reduced.
 ///
+/// The algorithm first builds a call graph based on the profile data and then
+/// iteratively merges "chains" (ordered lists) of input sections which will be
+/// laid out as a unit. There are two implementations for deciding how to
+/// merge a pair of chains:
+/// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows
+///   "Optimizing Function Placement for Large-Scale Data-Center Applications"
+///   https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+/// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which
+///   typically produces layouts with higher locality, and hence, yields fewer
+///   instruction cache misses on large binaries.
 //===----------------------------------------------------------------------===//
 
 #include "CallGraphSort.h"
@@ -45,6 +28,7 @@
 #include "InputSection.h"
 #include "Symbols.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Transforms/Utils/CodeLayout.h"
 #include <numeric>
 
@@ -75,6 +59,33 @@ struct Cluster {
   Edge bestPred = {-1, 0};
 };
 
+/// Implementation of the Call-Chain Clustering (C^3). The goal of this
+/// algorithm is to improve runtime performance of the executable by arranging
+/// code sections such that page table and i-cache misses are minimized.
+///
+/// Definitions:
+/// * Cluster
+///   * An ordered list of input sections which are laid out as a unit. At the
+///     beginning of the algorithm each input section has its own cluster and
+///     the weight of the cluster is the sum of the weight of all incoming
+///     edges.
+/// * Call-Chain Clustering (C³) Heuristic
+///   * Defines when and how clusters are combined. Pick the highest weighted
+///     input section then add it to its most likely predecessor if it wouldn't
+///     penalize it too much.
+/// * Density
+///   * The weight of the cluster divided by the size of the cluster. This is a
+///     proxy for the amount of execution time spent per byte of the cluster.
+///
+/// It does so given a call graph profile by the following:
+/// * Build a weighted call graph from the call graph profile
+/// * Sort input sections by weight
+/// * For each input section starting with the highest weight
+///   * Find its most likely predecessor cluster
+///   * Check if the combined cluster would be too large, or would have too low
+///     a density.
+///   * If not, then combine the clusters.
+/// * Sort non-empty clusters by density
 class CallGraphSort {
 public:
   CallGraphSort();
@@ -260,11 +271,74 @@ DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
   return orderMap;
 }
 
+// Sort sections by the profile data using the Cache-Directed Sort algorithm.
+// The placement optimizes locality by co-locating frequently executed code
+// sections together.
+DenseMap<const InputSectionBase *, int> elf::computeCacheDirectedSortOrder() {
+  SmallVector<uint64_t, 0> funcSizes;
+  SmallVector<uint64_t, 0> funcCounts;
+  SmallVector<codelayout::EdgeCount, 0> callCounts;
+  SmallVector<uint64_t, 0> callOffsets;
+  SmallVector<const InputSectionBase *, 0> sections;
+  DenseMap<const InputSectionBase *, size_t> secToTargetId;
+
+  auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t {
+    auto res = secToTargetId.try_emplace(inSec, sections.size());
+    if (res.second) {
+      // inSec does not appear before in the graph.
+      sections.push_back(inSec);
+      assert(inSec->getSize() > 0 && "found a function with zero size");
+      funcSizes.push_back(inSec->getSize());
+      funcCounts.push_back(0);
+    }
+    return res.first->second;
+  };
+
+  // Create the graph.
+  for (std::pair<SectionPair, uint64_t> &c : config->callGraphProfile) {
+    const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first);
+    const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second);
+    // Ignore edges between input sections belonging to different sections.
+    if (fromSB->getOutputSection() != toSB->getOutputSection())
+      continue;
+
+    uint64_t weight = c.second;
+    // Ignore edges with zero weight.
+    if (weight == 0)
+      continue;
+
+    size_t from = getOrCreateNode(fromSB);
+    size_t to = getOrCreateNode(toSB);
+    // Ignore self-edges (recursive calls).
+    if (from == to)
+      continue;
+
+    callCounts.push_back({from, to, weight});
+    // Assume that the jump is at the middle of the input section. The profile
+    // data does not contain jump offsets.
+    callOffsets.push_back((funcSizes[from] + 1) / 2);
+    funcCounts[to] += weight;
+  }
+
+  // Run the layout algorithm.
+  std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout(
+      funcSizes, funcCounts, callCounts, callOffsets);
+
+  // Create the final order.
+  DenseMap<const InputSectionBase *, int> orderMap;
+  int curOrder = 1;
+  for (uint64_t secIdx : sortedSections)
+    orderMap[sections[secIdx]] = curOrder++;
+
+  return orderMap;
+}
+
 // Sort sections by the profile data provided by --callgraph-profile-file.
 //
 // This first builds a call graph based on the profile data then merges sections
-// according to the C³ heuristic. All clusters are then sorted by a density
-// metric to further improve locality.
+// according to either the C³ or the Cache-Directed-Sort ordering algorithm.
 DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
+  if (config->callGraphProfileSort == CGProfileSortKind::Cdsort)
+    return computeCacheDirectedSortOrder();
   return CallGraphSort().run();
 }
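The C³ heuristic restated in the header comment above reduces to a single merge predicate: append a section's cluster to the cluster of its most likely predecessor, unless the combined cluster would be too large or its density would fall too far below the predecessor's. Here is a compact, self-contained sketch of that predicate; the types, constants, and function names are illustrative only, not lld's.

#include <cstdint>
#include <vector>

namespace {
constexpr uint64_t kMaxClusterSize = 1024 * 1024; // illustrative cap
constexpr double kMaxDensityDegradation = 8.0;    // allow up to an 8x drop

struct ToyCluster {
  std::vector<int> sections; // section indices, in layout order
  uint64_t size = 0;         // total bytes
  uint64_t weight = 0;       // total incoming profile weight
  double density() const { return size ? double(weight) / size : 0.0; }
};
} // namespace

// Merge the cluster currently holding `sec` into the cluster holding its
// most likely predecessor `pred`, if the C³ conditions allow it.
// `leader[s]` maps each section to the index of its current cluster.
void mergeC3(std::vector<ToyCluster> &clusters, std::vector<int> &leader,
             int sec, int pred) {
  int from = leader[sec], to = leader[pred];
  if (from == to)
    return; // already co-located
  ToyCluster &src = clusters[from];
  ToyCluster &dst = clusters[to];
  if (dst.size + src.size > kMaxClusterSize)
    return; // combined cluster would be too large
  double combined = double(dst.weight + src.weight) / (dst.size + src.size);
  if (combined * kMaxDensityDegradation < dst.density())
    return; // merging would dilute the predecessor's density too much
  for (int s : src.sections)
    leader[s] = to;
  dst.sections.insert(dst.sections.end(), src.sections.begin(),
                      src.sections.end());
  dst.size += src.size;
  dst.weight += src.weight;
  src = ToyCluster{};
}

Driving this over sections sorted by decreasing weight, then sorting the surviving clusters by density, yields the ordering the comment describes.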
diff --git a/lld/ELF/CallGraphSort.h b/lld/ELF/CallGraphSort.h
index 4997cb102c326402480c3c418e0b34a2f652bba0..1b54f2b62482284bb2d02581dc7481b367ff1760 100644
--- a/lld/ELF/CallGraphSort.h
+++ b/lld/ELF/CallGraphSort.h
@@ -14,6 +14,8 @@ namespace lld::elf {
 class InputSectionBase;
 
+llvm::DenseMap<const InputSectionBase *, int> computeCacheDirectedSortOrder();
+
 llvm::DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder();
 
 } // namespace lld::elf
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 706f17b764c88f08c8eda37bcfb38d25f6fc20fb..90a1e312735f090af82a2dcd8048875994e651a8 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -57,6 +57,9 @@ enum class BsymbolicKind { None, NonWeakFunctions, Functions, All };
 // For --build-id.
 enum class BuildIdKind { None, Fast, Md5, Sha1, Hexstring, Uuid };
 
+// For --call-graph-profile-sort={none,hfsort,cdsort}.
+enum class CGProfileSortKind { None, Hfsort, Cdsort };
+
 // For --discard-{all,locals,none}.
 enum class DiscardPolicy { Default, All, Locals, None };
 
@@ -193,7 +196,7 @@ struct Config {
   bool armJ1J2BranchEncoding = false;
   bool asNeeded = false;
   BsymbolicKind bsymbolic = BsymbolicKind::None;
-  bool callGraphProfileSort;
+  CGProfileSortKind callGraphProfileSort;
   bool checkSections;
   bool checkDynamicRelocs;
   llvm::DebugCompressionType compressDebugSections;
@@ -225,6 +228,7 @@ struct Config {
   bool ltoDebugPassManager;
   bool ltoEmitAsm;
   bool ltoUniqueBasicBlockSectionNames;
+  bool ltoValidateAllVtablesHaveTypeInfos;
   bool ltoWholeProgramVisibility;
   bool mergeArmExidx;
   bool mipsN32Abi = false;
@@ -441,6 +445,9 @@ struct Ctx {
   std::atomic<bool> hasTlsIe{false};
   // True if we need to reserve two .got entries for local-dynamic TLS model.
   std::atomic<bool> needsTlsLd{false};
+  // True if all native vtable symbols have corresponding type info symbols
+  // during LTO.
+  bool ltoAllVtablesHaveTypeInfos;
 
   void reset();
 };
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 7e2a72acf8f64e28f6ae0bf8f6934f99f48b4f28..b666602a558647d7be51b25cf1d4716f98a28510 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -104,6 +104,7 @@ void Ctx::reset() {
   backwardReferences.clear();
   hasSympart.store(false, std::memory_order_relaxed);
   needsTlsLd.store(false, std::memory_order_relaxed);
+  ltoAllVtablesHaveTypeInfos = false;
 }
 
 bool elf::link(ArrayRef<const char *> args, llvm::raw_ostream &stdoutOS,
@@ -974,21 +975,87 @@ template <class ELFT> static void readCallGraphsFromObjectFiles() {
   }
 }
 
-static DebugCompressionType getCompressDebugSections(opt::InputArgList &args) {
-  StringRef s = args.getLastArgValue(OPT_compress_debug_sections, "none");
-  if (s == "zlib") {
-    if (!compression::zlib::isAvailable())
-      error("--compress-debug-sections: zlib is not available");
-    return DebugCompressionType::Zlib;
+template <class ELFT>
+static void ltoValidateAllVtablesHaveTypeInfos(opt::InputArgList &args) {
+  DenseSet<StringRef> typeInfoSymbols;
+  SmallSetVector<StringRef, 0> vtableSymbols;
+  auto processVtableAndTypeInfoSymbols = [&](StringRef name) {
+    if (name.consume_front("_ZTI"))
+      typeInfoSymbols.insert(name);
+    else if (name.consume_front("_ZTV"))
+      vtableSymbols.insert(name);
+  };
+
+  // Examine all native symbol tables.
+  for (ELFFileBase *f : ctx.objectFiles) {
+    using Elf_Sym = typename ELFT::Sym;
+    for (const Elf_Sym &s : f->template getGlobalELFSyms<ELFT>()) {
+      if (s.st_shndx != SHN_UNDEF) {
+        StringRef name = check(s.getName(f->getStringTable()));
+        processVtableAndTypeInfoSymbols(name);
+      }
+    }
+  }
+
+  for (SharedFile *f : ctx.sharedFiles) {
+    using Elf_Sym = typename ELFT::Sym;
+    for (const Elf_Sym &s : f->template getELFSyms<ELFT>()) {
+      if (s.st_shndx != SHN_UNDEF) {
+        StringRef name = check(s.getName(f->getStringTable()));
+        processVtableAndTypeInfoSymbols(name);
+      }
+    }
   }
-  if (s == "zstd") {
-    if (!compression::zstd::isAvailable())
-      error("--compress-debug-sections: zstd is not available");
-    return DebugCompressionType::Zstd;
+
+  SmallSetVector<StringRef, 0> vtableSymbolsWithNoRTTI;
+  for (StringRef s : vtableSymbols)
+    if (!typeInfoSymbols.count(s))
+      vtableSymbolsWithNoRTTI.insert(s);
+
+  // Remove known safe symbols.
+  for (auto *arg : args.filtered(OPT_lto_known_safe_vtables)) {
+    StringRef knownSafeName = arg->getValue();
+    if (!knownSafeName.consume_front("_ZTV"))
+      error("--lto-known-safe-vtables=: expected symbol to start with _ZTV, "
+            "but got " +
+            knownSafeName);
+    vtableSymbolsWithNoRTTI.remove(knownSafeName);
   }
+
+  ctx.ltoAllVtablesHaveTypeInfos = vtableSymbolsWithNoRTTI.empty();
+  // Check for unmatched RTTI symbols
+  for (StringRef s : vtableSymbolsWithNoRTTI) {
+    message(
+        "--lto-validate-all-vtables-have-type-infos: RTTI missing for vtable "
+        "_ZTV" +
+        s + ", --lto-whole-program-visibility disabled");
+  }
+}
+
+static CGProfileSortKind getCGProfileSortKind(opt::InputArgList &args) {
+  StringRef s = args.getLastArgValue(OPT_call_graph_profile_sort, "hfsort");
+  if (s == "hfsort")
+    return CGProfileSortKind::Hfsort;
+  if (s == "cdsort")
+    return CGProfileSortKind::Cdsort;
   if (s != "none")
-    error("unknown --compress-debug-sections value: " + s);
-  return DebugCompressionType::None;
+    error("unknown --call-graph-profile-sort= value: " + s);
+  return CGProfileSortKind::None;
+}
+
+static DebugCompressionType getCompressionType(StringRef s, StringRef option) {
+  DebugCompressionType type = StringSwitch<DebugCompressionType>(s)
+                                  .Case("zlib", DebugCompressionType::Zlib)
+                                  .Case("zstd", DebugCompressionType::Zstd)
+                                  .Default(DebugCompressionType::None);
+  if (type == DebugCompressionType::None) {
+    if (s != "none")
+      error("unknown " + option + " value: " + s);
+  } else if (const char *reason = compression::getReasonIfUnsupported(
+                 compression::formatFor(type))) {
+    error(option + ": " + reason);
+  }
+  return type;
 }
 
 static StringRef getAliasSpelling(opt::Arg *arg) {
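The validation helper above keys vtables and type infos by their Itanium-mangled suffix: _ZTV<type> and _ZTI<type> share the same <type> string, so stripping the four-character prefix yields a common lookup key. A hedged standalone illustration of just that matching step (the function name is hypothetical; only llvm/ADT utilities are used):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"

// Returns the stripped names of vtables that lack a matching type info.
llvm::SmallVector<llvm::StringRef>
findVtablesWithoutRTTI(llvm::ArrayRef<llvm::StringRef> symbolNames) {
  llvm::DenseSet<llvm::StringRef> typeInfos;
  llvm::SmallVector<llvm::StringRef> vtables;
  for (llvm::StringRef name : symbolNames) {
    if (name.consume_front("_ZTI"))
      typeInfos.insert(name); // "_ZTI1A" -> "1A"
    else if (name.consume_front("_ZTV"))
      vtables.push_back(name); // "_ZTV1A" -> "1A"
  }
  llvm::SmallVector<llvm::StringRef> missing;
  for (llvm::StringRef v : vtables)
    if (!typeInfos.contains(v))
      missing.push_back(v);
  return missing;
}

For example, given {"_ZTV1A", "_ZTI1A", "_ZTV6Native"} this returns {"6Native"}, mirroring the diagnostic emitted for _ZTV6Native in the tests below.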
@@ -1078,10 +1145,13 @@ static void readConfigs(opt::InputArgList &args) {
     else if (arg->getOption().matches(OPT_Bsymbolic))
       config->bsymbolic = BsymbolicKind::All;
   }
+  config->callGraphProfileSort = getCGProfileSortKind(args);
   config->checkSections =
       args.hasFlag(OPT_check_sections, OPT_no_check_sections, true);
   config->chroot = args.getLastArgValue(OPT_chroot);
-  config->compressDebugSections = getCompressDebugSections(args);
+  config->compressDebugSections = getCompressionType(
+      args.getLastArgValue(OPT_compress_debug_sections, "none"),
+      "--compress-debug-sections");
   config->cref = args.hasArg(OPT_cref);
   config->optimizeBBJumps =
       args.hasFlag(OPT_optimize_bb_jumps, OPT_no_optimize_bb_jumps, false);
@@ -1096,8 +1166,6 @@ static void readConfigs(opt::InputArgList &args) {
       args.hasFlag(OPT_eh_frame_hdr, OPT_no_eh_frame_hdr, false);
   config->emitLLVM = args.hasArg(OPT_plugin_opt_emit_llvm, false);
   config->emitRelocs = args.hasArg(OPT_emit_relocs);
-
-  config->callGraphProfileSort = args.hasFlag(
-      OPT_call_graph_profile_sort, OPT_no_call_graph_profile_sort, true);
   config->enableNewDtags =
       args.hasFlag(OPT_enable_new_dtags, OPT_disable_new_dtags, true);
   config->entry = args.getLastArgValue(OPT_entry);
@@ -1138,6 +1206,9 @@ static void readConfigs(opt::InputArgList &args) {
   config->ltoWholeProgramVisibility =
       args.hasFlag(OPT_lto_whole_program_visibility,
                    OPT_no_lto_whole_program_visibility, false);
+  config->ltoValidateAllVtablesHaveTypeInfos =
+      args.hasFlag(OPT_lto_validate_all_vtables_have_type_infos,
+                   OPT_no_lto_validate_all_vtables_have_type_infos, false);
   config->ltoo = args::getInteger(args, OPT_lto_O, 2);
   config->ltoObjPath = args.getLastArgValue(OPT_lto_obj_path_eq);
   config->ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1);
@@ -1465,7 +1536,7 @@ static void readConfigs(opt::InputArgList &args) {
       config->symbolOrderingFile = getSymbolOrderingFile(*buffer);
       // Also need to disable CallGraphProfileSort to prevent
       // LLD order symbols with CGProfile
-      config->callGraphProfileSort = false;
+      config->callGraphProfileSort = CGProfileSortKind::None;
     }
   }
 
@@ -2666,6 +2737,10 @@ void LinkerDriver::link(opt::InputArgList &args) {
                       config->ltoEmitAsm ||
                       !config->thinLTOModulesToCompile.empty();
 
+  // Handle --lto-validate-all-vtables-have-type-infos.
+  if (config->ltoValidateAllVtablesHaveTypeInfos)
+    invokeELFT(ltoValidateAllVtablesHaveTypeInfos, args);
+
   // Do link-time optimization if given files are LLVM bitcode files.
   // This compiles bitcode files into real object files.
   //
@@ -2849,7 +2924,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
   }
 
   // Read the callgraph now that we know what was gced or icfed
-  if (config->callGraphProfileSort) {
+  if (config->callGraphProfileSort != CGProfileSortKind::None) {
     if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file))
       if (std::optional<MemoryBufferRef> buffer = readFile(arg->getValue()))
         readCallGraph(*buffer);
diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp
index b80f1f48f768aa8210377d8c26b8bf2654467481..e8c0e9778c5fdf671e4e25ee95e9bd9e00f39393 100644
--- a/lld/ELF/LTO.cpp
+++ b/lld/ELF/LTO.cpp
@@ -153,6 +153,9 @@ static lto::Config createConfig() {
   c.DwoDir = std::string(config->dwoDir);
 
   c.HasWholeProgramVisibility = config->ltoWholeProgramVisibility;
+  c.ValidateAllVtablesHaveTypeInfos =
+      config->ltoValidateAllVtablesHaveTypeInfos;
+  c.AllVtablesHaveTypeInfos = ctx.ltoAllVtablesHaveTypeInfos;
   c.AlwaysEmitRegularLTOObj = !config->ltoObjPath.empty();
 
   for (const llvm::StringRef &name : config->thinLTOModulesToCompile)
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index b6a6ef64d017a4778094326ed2c31e77ca3e9fcb..c91111006942c53dc3bed5143ed96c84b0116efc 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -111,9 +111,12 @@ defm as_needed: B<"as-needed",
 defm call_graph_ordering_file: Eq<"call-graph-ordering-file",
   "Layout sections to optimize the given callgraph">;
 
-defm call_graph_profile_sort: BB<"call-graph-profile-sort",
-  "Reorder sections with call graph profile (default)",
-  "Do not reorder sections with call graph profile">;
+def call_graph_profile_sort: JJ<"call-graph-profile-sort=">,
+  HelpText<"Reorder input sections with call graph profile using the specified algorithm (default: hfsort)">,
+  MetaVarName<"[none,hfsort,cdsort]">,
+  Values<"none,hfsort,cdsort">;
+def : FF<"no-call-graph-profile-sort">, Alias<call_graph_profile_sort>,
+  AliasArgs<["none"]>, Flags<[HelpHidden]>;
 
 // --chroot doesn't have a help text because it is an internal option.
 def chroot: Separate<["--"], "chroot">;
@@ -569,9 +572,14 @@ def lto_cs_profile_file: JJ<"lto-cs-profile-file=">,
 defm lto_pgo_warn_mismatch: BB<"lto-pgo-warn-mismatch",
   "turn on warnings about profile cfg mismatch (default)",
   "turn off warnings about profile cfg mismatch">;
+defm lto_known_safe_vtables : EEq<"lto-known-safe-vtables",
+  "When --lto-validate-all-vtables-have-type-infos is enabled, skip validation on these vtables (_ZTV symbols)">;
 def lto_obj_path_eq: JJ<"lto-obj-path=">;
 def lto_sample_profile: JJ<"lto-sample-profile=">,
   HelpText<"Sample profile file path">;
+defm lto_validate_all_vtables_have_type_infos: BB<"lto-validate-all-vtables-have-type-infos",
+  "Validate that all vtables have type infos for LTO link",
+  "Do not validate that all vtables have type infos for LTO link">;
 defm lto_whole_program_visibility: BB<"lto-whole-program-visibility",
   "Asserts that the LTO link has whole program visibility",
   "Asserts that the LTO link does not have whole program visibility">;
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index edeb7c4bfe37ca6c876fc598da008358c0a0af1d..35ee7ff0447debcc9e58d28260ac00fdef842bf2 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -115,6 +115,19 @@ is not intended to be cryptographically secure.
 .It Fl -build-id
 Synonym for
 .Fl -build-id Ns = Ns Cm fast .
+.It Fl -call-graph-profile-sort Ns = Ns Ar algorithm
+.Ar algorithm
+may be:
+.Pp
+.Bl -tag -width 2n -compact
+.It Cm none
+Ignore call graph profile.
+.It Cm hfsort
+Use hfsort (default).
+.It Cm cdsort
+Use cdsort.
+.El
+.Pp
 .It Fl -color-diagnostics Ns = Ns Ar value
 Use colors in diagnostics.
 .Ar value
diff --git a/lld/test/ELF/cgprofile-obj.s b/lld/test/ELF/cgprofile-obj.s
index f56f3bcbf0c3c5e92c11e83b8c692f0cb17450e9..0848adc5e4279a7edbb8fdc3730f104ed711819b 100644
--- a/lld/test/ELF/cgprofile-obj.s
+++ b/lld/test/ELF/cgprofile-obj.s
@@ -3,8 +3,11 @@
 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o
 # RUN: ld.lld -e A %t.o -o %t
 # RUN: llvm-nm --no-sort %t | FileCheck %s
-# RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t
+# RUN: ld.lld --call-graph-profile-sort=none -e A %t.o -o %t
 # RUN: llvm-nm --no-sort %t | FileCheck %s --check-prefix=NO-CG
+## --no-call-graph-profile-sort is an alias for --call-graph-profile-sort=none.
+# RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t1
+# RUN: cmp %t %t1
 
 .section .text.D,"ax",@progbits
 D:
diff --git a/lld/test/ELF/cgprofile-txt.s b/lld/test/ELF/cgprofile-txt.s
index 99cbfa574532523a842e8fa539598c08b2e61ae1..c9194bbbc43cbe0284091ef63b66ebdedc4e5813 100644
--- a/lld/test/ELF/cgprofile-txt.s
+++ b/lld/test/ELF/cgprofile-txt.s
@@ -24,8 +24,19 @@
 # RUN: echo "TooManyPreds8 TooManyPreds 10" >> %t.call_graph
 # RUN: echo "TooManyPreds9 TooManyPreds 10" >> %t.call_graph
 # RUN: echo "TooManyPreds10 TooManyPreds 11" >> %t.call_graph
-# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=hfsort -o %t2
 # RUN: llvm-readobj --symbols %t2 | FileCheck %s
+## --call-graph-profile-sort=hfsort is the default.
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2b
+# RUN: cmp %t2 %t2b
+
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2
+# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CDSORT
+
+# RUN: not ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=sort \
+# RUN:   -o /dev/null 2>&1 | FileCheck %s --check-prefix=UNKNOWN
+
+# UNKNOWN: error: unknown --call-graph-profile-sort= value: sort
 
 .section .text.D,"ax",@progbits
 D:
@@ -159,6 +170,31 @@ TooManyPreds10:
 # CHECK: Name: _init2
 # CHECK-NEXT: Value: 0x201141
 
+# CDSORT: Name: D
+# CDSORT-NEXT: Value: 0x201123
+# CDSORT: Name: TooManyPreds
+# CDSORT-NEXT: Value: 0x20112F
+# CDSORT: Name: TooManyPreds10
+# CDSORT-NEXT: Value: 0x20112E
+# CDSORT: Name: C
+# CDSORT-NEXT: Value: 0x201122
+# CDSORT: Name: B
+# CDSORT-NEXT: Value: 0x201121
+# CDSORT: Name: A
+# CDSORT-NEXT: Value: 0x201120
+# CDSORT: Name: TS
+# CDSORT-NEXT: Value: 0x20113D
+# CDSORT: Name: PP
+# CDSORT-NEXT: Value: 0x20113C
+# CDSORT: Name: QC
+# CDSORT-NEXT: Value: 0x20113E
+# CDSORT: Name: GB
+# CDSORT-NEXT: Value: 0x20113F
+# CDSORT: Name: _init
+# CDSORT-NEXT: Value: 0x201140
+# CDSORT: Name: _init2
+# CDSORT-NEXT: Value: 0x201141
+
 # NOSORT: Name: D
 # NOSORT-NEXT: Value: 0x201120
 # NOSORT: Name: TooManyPreds
diff --git a/lld/test/ELF/cgprofile-txt2.s b/lld/test/ELF/cgprofile-txt2.s
index 91961db39c3a883fc948c3b609e2e8b95a1f4e4c..b59b6eeb292fabff00e32148498208b799d3cf46 100644
--- a/lld/test/ELF/cgprofile-txt2.s
+++ b/lld/test/ELF/cgprofile-txt2.s
@@ -5,17 +5,28 @@
 # RUN: echo "B C 50" >> %t.call_graph
 # RUN: echo "C D 40" >> %t.call_graph
 # RUN: echo "D B 10" >> %t.call_graph
-# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2
-# RUN: llvm-readobj --symbols %t2 | FileCheck %s
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=hfsort -o %t2
+# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKC3
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2
+# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKCDS
 
-# CHECK: Name: A
-# CHECK-NEXT: Value: 0x201123
-# CHECK: Name: B
-# CHECK-NEXT: Value: 0x201120
-# CHECK: Name: C
-# CHECK-NEXT: Value: 0x201121
-# CHECK: Name: D
-# CHECK-NEXT: Value: 0x201122
+# CHECKC3: Name: A
+# CHECKC3-NEXT: Value: 0x201123
+# CHECKC3: Name: B
+# CHECKC3-NEXT: Value: 0x201120
+# CHECKC3: Name: C
+# CHECKC3-NEXT: Value: 0x201121
+# CHECKC3: Name: D
+# CHECKC3-NEXT: Value: 0x201122
+
+# CHECKCDS: Name: A
+# CHECKCDS-NEXT: Value: 0x201120
+# CHECKCDS: Name: B
+# CHECKCDS-NEXT: Value: 0x201121
+# CHECKCDS: Name: C
+# CHECKCDS-NEXT: Value: 0x201122
+# CHECKCDS: Name: D
+# CHECKCDS-NEXT: Value: 0x201123
 
 .section .text.A,"ax",@progbits
 .globl A
diff --git a/lld/test/ELF/compress-sections-err.s b/lld/test/ELF/compress-sections-err.s
new file mode 100644
index 0000000000000000000000000000000000000000..09780380708319c99a848e46f8b682301bc64ce2
--- /dev/null
+++ b/lld/test/ELF/compress-sections-err.s
@@ -0,0 +1,12 @@
+# REQUIRES: x86
+# UNSUPPORTED: zlib
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld %t.o --compress-debug-sections=zlib --compress-debug-sections=none -o /dev/null 2>&1 | count 0
+# RUN: not ld.lld %t.o --compress-debug-sections=zlib -o /dev/null 2>&1 | \
+# RUN:   FileCheck %s --implicit-check-not=error:
+
+# CHECK: error: --compress-debug-sections: LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time
+
+.globl _start
+_start:
diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fb357831d6f21a97f34d9a4bf09e70818669bbc4
--- /dev/null
+++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll
@@ -0,0 +1,26 @@
+; REQUIRES: x86
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { ptr }
+%struct.Native = type { %struct.A }
+
+@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI6Native, ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] }
+@_ZTS6Native = linkonce_odr constant [8 x i8] c"6Native\00"
+@_ZTI6Native = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS6Native, ptr @_ZTI1A }
+
+; Base type A does not need to emit a vtable if it's never instantiated. However, RTTI still gets generated
+@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00"
+@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A }
+
+
+define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 {
+  ret i32 1;
+}
+
+define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+attributes #0 = { noinline optnone }
diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4533504c601803158a2ecbc550163c32fc21620a
--- /dev/null
+++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll
@@ -0,0 +1,19 @@
+; REQUIRES: x86
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { ptr }
+%struct.Native = type { %struct.A }
+
+@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] }
+
+define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 {
+  ret i32 1;
+}
+
+define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+attributes #0 = { noinline optnone }
diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll
new file mode 100644
index 0000000000000000000000000000000000000000..43df8366aa2ae0c68e9a5531a0661f90e897ae2e
--- /dev/null
+++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll
@@ -0,0 +1,68 @@
+;; Source code:
+;; cat > a.h <<'eof'
+;; struct A { virtual int foo(); };
+;; int bar(A *a);
+;; eof
+;; cat > b.cc <<'eof'
+;; #include "a.h"
+;; struct B : A { int foo() { return 2; } };
+;; int baz() { B b; return bar(&b); }
+;; eof
+;; clang++ -flto=thin b.cc -c
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.B = type { %struct.A }
+%struct.A = type { ptr }
+
+@_ZTV1B = linkonce_odr dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B3fooEv] }, !type !0, !type !1, !type !2, !type !3
+@_ZTS1B = linkonce_odr dso_local constant [3 x i8] c"1B\00"
+@_ZTI1A = external constant ptr
+@_ZTI1B = linkonce_odr dso_local constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A }
+@_ZTV1A = external unnamed_addr constant { [3 x ptr] }
+
+define dso_local noundef i32 @_Z3bazv() #0 {
+entry:
+  %b = alloca %struct.B
+  call void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %b)
+  %call = call noundef i32 @_Z3barP1A(ptr noundef %b)
+  ret i32 %call
+}
+
+define linkonce_odr dso_local void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 {
+entry:
+  %this.addr = alloca ptr
+  store ptr %this, ptr %this.addr
+  %this1 = load ptr, ptr %this.addr
+  call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this1)
+  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1
+  ret void
+}
+
+declare i32 @_Z3barP1A(ptr noundef)
+
+define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 {
+entry:
+  %this.addr = alloca ptr
+  store ptr %this, ptr %this.addr
+  %this1 = load ptr, ptr %this.addr
+  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1
+  ret void
+}
+
+define linkonce_odr i32 @_ZN1B3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 {
+entry:
+  %this.addr = alloca ptr
+  store ptr %this, ptr %this.addr
+  %this1 = load ptr, ptr %this.addr
+  ret i32 2
+}
+
+;; Make sure we don't inline or otherwise optimize out the direct calls.
+attributes #0 = { noinline optnone }
+
+!0 = !{i64 16, !"_ZTS1A"}
+!1 = !{i64 16, !"_ZTSM1AFivE.virtual"}
+!2 = !{i64 16, !"_ZTS1B"}
+!3 = !{i64 16, !"_ZTSM1BFivE.virtual"}
diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6cc55df82e2f2814b1717a0ad09c55a81030ed95
--- /dev/null
+++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll
@@ -0,0 +1,16 @@
+; REQUIRES: x86
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@_ZTV1B = external unnamed_addr constant { [4 x ptr] }
+
+define linkonce_odr void @_ZN1BC2Ev(ptr %this) #0 {
+  %this.addr = alloca ptr, align 8
+  store ptr %this, ptr %this.addr, align 8
+  %this1 = load ptr, ptr %this.addr, align 8
+  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1, align 8
+  ret void
+}
+
+attributes #0 = { noinline optnone }
diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d6ac53f9fb936b0d1eb4f86549242288613dcf26
--- /dev/null
+++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll
@@ -0,0 +1,263 @@
+; REQUIRES: x86
+
+;; Common artifacts
+; RUN: opt --thinlto-bc -o %t1.o %s
+; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s
+; RUN: cp %s %t1_regular.ll
+; RUN: echo '!llvm.module.flags = !{!12, !13}' >> %t1_regular.ll
+; RUN: echo '!12 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll
+; RUN: echo '!13 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll
+; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll
+
+; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos.ll -o %t2.bc
+; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o
+; RUN: ld.lld %t2.o -o %t2.so -shared
+
+; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll -o %t2_nortti.bc
+; RUN: llc -relocation-model=pic -filetype=obj %t2_nortti.bc -o %t2_nortti.o
+; RUN: ld.lld %t2_nortti.o -o %t2_nortti.so -shared
+
+; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_undef.ll -o %t2_undef.bc
+; RUN: llc -relocation-model=pic -filetype=obj %t2_undef.bc -o %t2_undef.o
+; RUN: ld.lld %t2_undef.o -o %t2_undef.so -shared
+
+;; With --lto-whole-program-visibility, we assume no native types can interfere
+;; and thus proceed with devirtualization even in the presence of native types
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi
+
+;; With --lto-validate-all-vtables-have-type-infos, the linker checks for the presence of vtables
+;; and RTTI in native files and blocks devirtualization to be conservative on correctness
+;; for these types.
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2.o -o %t4_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2.o -o %t4_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2.o -o %t4_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
+; RUN: llvm-dis %t4_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR
+
+;; DSOs behave similarly
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2.so -o %t5_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2.so -o %t5_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2.so -o %t5_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
+; RUN: llvm-dis %t5_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR
+
+; VALIDATE-NOT: single-impl:
+; VALIDATE: single-impl: devirtualized a call to _ZN1D1mEi
+; VALIDATE-NOT: single-impl:
+
+;; When vtables without type infos are detected in native files, we have a hole in our knowledge so
+;; --lto-validate-all-vtables-have-type-infos conservatively disables --lto-whole-program-visibility
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2_nortti.o -o %t6_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t6_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t6_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI
+; RUN: llvm-dis %t6_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR
+
+;; DSOs behave similarly
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2_nortti.so -o %t7_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2_nortti.so -o %t7_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2_nortti.so -o %t7_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI
+; RUN: llvm-dis %t7_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR
+
+; NO-RTTI-DAG: --lto-validate-all-vtables-have-type-infos: RTTI missing for vtable _ZTV6Native, --lto-whole-program-visibility disabled
+; NO-RTTI-DAG: single-impl: devirtualized a call to _ZN1D1mEi
+
+;; --lto-known-safe-vtables=* can be used to specifically allow types to participate in WPD
+;; even if they don't have corresponding RTTI
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2_nortti.o -o %t8_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t8_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t8_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t8_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Only check for definitions of vtable symbols; just having a reference does not allow a type to
+;; be derived from
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2_undef.o -o %t9_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2_undef.o -o %t9_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2_undef.o -o %t9_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t9_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { ptr }
+%struct.B = type { %struct.A }
+%struct.C = type { %struct.A }
+%struct.D = type { ptr }
+
+@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5
+@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8
+@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11
+
+@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00"
+@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A }
+
+@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00"
+@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A }
+
+@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00"
+@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A }
+
+@_ZTS1D = internal constant [3 x i8] c"1D\00"
+@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D }
+
+;; Prevent the vtables from being dead code eliminated.
+@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ]
+
+; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start
+define i32 @_start(ptr %obj, ptr %obj2, i32 %a) {
+entry:
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr ptr, ptr %vtable, i32 1
+  %fptr1 = load ptr, ptr %fptrptr, align 8
+
+  ;; Check that the call was devirtualized.
+  ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi
+  ;; --lto-whole-program-visibility disabled so no devirtualization
+  ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1
+  ; CHECK-NO-RTTI-IR: %call = tail call i32 %fptr1
+  %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a)
+
+  %fptr22 = load ptr, ptr %vtable, align 8
+
+  ;; We still have to call it as virtual.
+  ; CHECK-IR: %call2 = tail call i32 %fptr22
+  ; CHECK-VALIDATE-IR: %call2 = tail call i32 %fptr22
+  ; CHECK-NO-RTTI-IR: %call2 = tail call i32 %fptr22
+  %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call)
+
+  %vtable2 = load ptr, ptr %obj2
+  %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10)
+  call void @llvm.assume(i1 %p2)
+
+  %fptr33 = load ptr, ptr %vtable2, align 8
+
+  ;; Check that the call was devirtualized.
+  ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi
+  ;; Types not present in native files can still be devirtualized
+  ; CHECK-VALIDATE-IR: %call3 = tail call i32 @_ZN1D1mEi
+  ;; --lto-whole-program-visibility disabled but being local this
+  ;; has VCallVisibilityTranslationUnit visibility so it's still devirtualized
+  ; CHECK-NO-RTTI-IR: %call3 = tail call i32 @_ZN1D1mEi
+  %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2)
+
+  ret i32 %call3
+}
+; CHECK-COMMON-IR-LABEL: ret i32
+; CHECK-COMMON-IR-LABEL: }
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.assume(i1)
+
+define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+;; Make sure we don't inline or otherwise optimize out the direct calls.
+attributes #0 = { noinline optnone }
+
+!0 = !{i64 16, !"_ZTS1A"}
+!1 = !{i64 16, !"_ZTSM1AFviE.virtual"}
+!2 = !{i64 24, !"_ZTSM1AFviE.virtual"}
+!3 = !{i64 16, !"_ZTS1B"}
+!4 = !{i64 16, !"_ZTSM1BFviE.virtual"}
+!5 = !{i64 24, !"_ZTSM1BFviE.virtual"}
+!6 = !{i64 16, !"_ZTS1C"}
+!7 = !{i64 16, !"_ZTSM1CFviE.virtual"}
+!8 = !{i64 24, !"_ZTSM1CFviE.virtual"}
+!9 = !{i64 16, !10}
+!10 = distinct !{}
+!11 = !{i64 2}
diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll
new file mode 100644
index 0000000000000000000000000000000000000000..15040b8707aede995aea588638eb7c7c3eafafaf
--- /dev/null
+++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll
@@ -0,0 +1,183 @@
+; REQUIRES: x86
+
+; RUN: rm -rf %t.dir
+; RUN: split-file %s %t.dir
+; RUN: cd %t.dir
+
+;; Common artifacts
+; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1.o ThinLTO.ll
+; RUN: opt -module-summary -o %t2.o RegularLTO.ll
+
+;; --lto-whole-program-visibility when there's split ThinLTO and a RegularLTO with summary optimizes
+;; using the combined index.
+; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR
+; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR
+
+;; --lto-validate-all-vtables-have-type-infos when there's split ThinLTO and a RegularLTO with summary behaves the same
+;; as everything is present in the combined index.
+; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR
+; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR
+
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi
+
+;--- ThinLTO.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { ptr }
+%struct.B = type { %struct.A }
+%struct.C = type { %struct.A }
+%struct.D = type { ptr }
+
+@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5
+@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8
+@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11
+
+@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00"
+@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A }
+
+@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00"
+@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A }
+
+@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00"
+@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A }
+
+@_ZTS1D = internal constant [3 x i8] c"1D\00"
+@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D }
+
+;; Prevent the vtables from being dead code eliminated.
+@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ], section "llvm.metadata"
+
+; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start
+define i32 @_start(ptr %obj, ptr %obj2, i32 %a) {
+  ;; Call function built with RegularLTO
+  %RegularLTOResult = call i32 @RegularLTO(ptr %obj, i32 %a)
+
+  ;; ThinLTO code starts here
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr ptr, ptr %vtable, i32 1
+  %fptr1 = load ptr, ptr %fptrptr, align 8
+
+  ;; Check that the call was devirtualized.
+  ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi
+  %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a)
+
+  %fptr22 = load ptr, ptr %vtable, align 8
+
+  ;; Check that the call was not devirtualized.
+  ; CHECK-IR: %call2 = tail call i32 %fptr22
+  %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call)
+
+  %vtable2 = load ptr, ptr %obj2
+  %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10)
+  call void @llvm.assume(i1 %p2)
+
+  %fptr33 = load ptr, ptr %vtable2, align 8
+
+  ;; Check that the call was devirtualized.
+  ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi
+  %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2)
+
+  ret i32 %call3
+}
+; CHECK-COMMON-IR-LABEL: ret i32
+; CHECK-COMMON-IR-LABEL: }
+
+declare i32 @RegularLTO(ptr)
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.assume(i1)
+
+define linkonce_odr i32 @_ZN1A1fEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+;; Make sure we don't inline or otherwise optimize out the direct calls.
+attributes #0 = { noinline optnone }
+
+!0 = !{i64 16, !"_ZTS1A"}
+!1 = !{i64 16, !"_ZTSM1AFviE.virtual"}
+!2 = !{i64 24, !"_ZTSM1AFviE.virtual"}
+!3 = !{i64 16, !"_ZTS1B"}
+!4 = !{i64 16, !"_ZTSM1BFviE.virtual"}
+!5 = !{i64 24, !"_ZTSM1BFviE.virtual"}
+!6 = !{i64 16, !"_ZTS1C"}
+!7 = !{i64 16, !"_ZTSM1CFviE.virtual"}
+!8 = !{i64 24, !"_ZTSM1CFviE.virtual"}
+!9 = !{i64 16, !10}
+!10 = distinct !{}
+!11 = !{i64 2}
+
+;--- RegularLTO.ll
+; REQUIRES: x86
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { ptr }
+%struct.Native = type { %struct.A }
+
+@_ZTV7Regular = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI7Regular, ptr @_ZN7Regular1fEi, ptr @_ZN1A1nEi] } , !type !0, !type !1, !type !2, !type !3, !type !4, !type !5
+@_ZTS7Regular = linkonce_odr constant [9 x i8] c"7Regular\00"
+@_ZTI7Regular = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS7Regular, ptr @_ZTI1A }
+
+; Base type A does not need to emit a vtable if it's never instantiated. However, RTTI still gets generated
+@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00"
+@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A }
+
+;; Prevent the vtables from being dead code eliminated.
+@llvm.used = appending global [1 x ptr] [ ptr @_ZTV7Regular ], section "llvm.metadata"
+
+; CHECK-COMMON-REGULAR-IR-LABEL: define dso_local i32 @RegularLTO
+define i32 @RegularLTO(ptr %obj, i32 %a) #0 {
+entry:
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A")
+  call void @llvm.assume(i1 %p)
+  %fptr1 = load ptr, ptr %vtable, align 8
+
+  ;; Check that the call was not devirtualized.
+  ; CHECK-REGULAR-IR: %call = tail call i32 %fptr1
+  %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a)
+
+  ret i32 %call
+}
+; CHECK-COMMON-REGULAR-IR-LABEL: ret i32
+; CHECK-COMMON-REGULAR-IR-LABEL: }
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.assume(i1)
+
+define linkonce_odr i32 @_ZN7Regular1fEi(ptr %this, i32 %a) #0 {
+  ret i32 1;
+}
+
+define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+attributes #0 = { noinline optnone }
+!llvm.module.flags = !{!6, !7}
+
+!0 = !{i64 16, !"_ZTS1A"}
+!1 = !{i64 16, !"_ZTSM1AFviE.virtual"}
+!2 = !{i64 24, !"_ZTSM1AFviE.virtual"}
+!3 = !{i64 16, !"_ZTS7Regular"}
+!4 = !{i64 16, !"_ZTSM7RegularFviE.virtual"}
+!5 = !{i64 24, !"_ZTSM7RegularFviE.virtual"}
+!6 = !{i32 1, !"ThinLTO", i32 0}
+!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1}
diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll
new file mode 100644
index 0000000000000000000000000000000000000000..30bd75606f7d2d0aeb4bfeb2e82f289941101d0a
--- /dev/null
+++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll
@@ -0,0 +1,136 @@
+; REQUIRES: x86
+
+;; Common artifacts
+; RUN: opt --thinlto-bc -o %t1.o %s
+; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s
+; RUN: cp %s %t1_regular.ll
+; RUN: echo '!llvm.module.flags = !{!6, !7}' >> %t1_regular.ll
+; RUN: echo '!6 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll
+; RUN: echo '!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll
+; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll
+
+;; With --lto-whole-program-visibility, we assume no native types can interfere
+;; and thus proceed with devirtualization even in the presence of native types
+
+;; Index based WPD
+; RUN: ld.lld %t1.o -o %t3_index -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi
+
+;; With --lto-whole-program-visibility and --lto-validate-all-vtables-have-type-infos
+;; we rely on resolutions on the typename symbol to inform us of what's outside the summary.
+;; Without the typename symbol in the LTO unit (e.g. RTTI disabled) this causes
+;; conservative disablement of WPD on these types unless it's local
+
+;; Index based WPD
+; RUN: ld.lld %t1.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +; VALIDATE-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !2 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN1D1mEi] }, !type !3, !vcall_visibility !5 + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ] + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + ;; No resolution for _ZTS1A means we don't devirtualize + ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; We still have to call it as virtual. + ; CHECK-IR: %call3 = tail call i32 %fptr22 + ; CHECK-VALIDATE-IR: %call3 = tail call i32 %fptr22 + %call3 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !4) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. 
+ ; CHECK-IR: %call4 = tail call i32 @_ZN1D1mEi + ;; Being local this has VCallVisibilityTranslationUnit + ;; visibility so it's still devirtualized + ; CHECK-VALIDATE-IR: %call4 = tail call i32 @_ZN1D1mEi + %call4 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call3) + ret i32 %call4 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTS1B"} +!2 = !{i64 16, !"_ZTS1C"} +!3 = !{i64 16, !4} +!4 = distinct !{} +!5 = !{i64 2} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll new file mode 100644 index 0000000000000000000000000000000000000000..4ef048d6b6c601b9bf174c24f3c8f4372814d0bc --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll @@ -0,0 +1,130 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!2, !3}' >> %t1_regular.ll +; RUN: echo '!2 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!3 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_ref.ll -o %t2.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o + +;; Native objects can contain only a reference to the base type infos if the base declaration has no key functions. +;; Because of that, --lto-validate-all-vtables-have-type-infos needs to query for the type info symbol inside native files rather than the +;; type name symbol that's used as the key in !type metadata to correctly stop devirtualization on the native type. + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +; CHECK-NOT: single-impl: devirtualized a call to _ZN1A3fooEv + +;; Source code: +;; cat > a.h <<'eof' +;; struct A { virtual int foo(); }; +;; int bar(A *a); +;; eof +;; cat > main.cc <<'eof' +;; #include "a.h" +;; +;; int A::foo() { return 1; } +;; int bar(A *a) { return a->foo(); } +;; +;; extern int baz(); +;; int main() { +;; A a; +;; int i = bar(&a); +;; int j = baz(); +;; return i + j; +;; } +;; eof +;; clang++ -fwhole-program-vtables -fno-split-lto-unit -flto=thin main.cc -c + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { %struct.Abase } +%struct.Abase = type { ptr } + +@_ZTV1A = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A3fooEv] }, align 8, !type !0, !type !1 +@_ZTS1A = dso_local constant [3 x i8] c"1A\00", align 1 +@_ZTI1A = dso_local constant { ptr, ptr } { ptr null, ptr @_ZTS1A }, align 8 + +define dso_local noundef i32 @_ZN1A3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + ret i32 1 +} + +; CHECK-IR: define dso_local noundef i32 @_Z3barP1A +define dso_local noundef i32 @_Z3barP1A(ptr noundef %a) #0 { +entry: + %a.addr = alloca ptr + store ptr %a, ptr %a.addr + %0 = load ptr, ptr %a.addr + %vtable = load ptr, ptr %0 + %1 = call i1 @llvm.public.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %1) + %vfn = getelementptr inbounds ptr, ptr %vtable, i64 0 + %fptr = load ptr, ptr %vfn + ;; Check that the call was not devirtualized. + ; CHECK-IR: %call = call noundef i32 %fptr + %call = call noundef i32 %fptr(ptr noundef nonnull align 8 dereferenceable(8) %0) + ret i32 %call +} +; CHECK-IR: ret i32 +; CHECK-IR: } + +declare i1 @llvm.public.type.test(ptr, metadata) +declare void @llvm.assume(i1 noundef) + +define dso_local noundef i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %a = alloca %struct.A, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 0, ptr %retval, align 4 + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %a) + %call = call noundef i32 @_Z3barP1A(ptr noundef %a) + store i32 %call, ptr %i, align 4 + %call1 = call noundef i32 @_Z3bazv() + store i32 %call1, ptr %j, align 4 + %0 = load i32, ptr %i, align 4 + %1 = load i32, ptr %j, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +declare noundef i32 @_Z3bazv() + +;; Make sure we don't inline or otherwise optimize out the direct calls. 
+attributes #0 = { noinline optnone }
+
+!0 = !{i64 16, !"_ZTS1A"}
+!1 = !{i64 16, !"_ZTSM1AFivE.virtual"}
diff --git a/llvm/include/llvm/ADT/SetVector.h b/llvm/include/llvm/ADT/SetVector.h
index 37509e28f89100789720f326593077d204d805e9..2eabe578a479cf052c8199cf6d9ce32ef5f1d3f7 100644
--- a/llvm/include/llvm/ADT/SetVector.h
+++ b/llvm/include/llvm/ADT/SetVector.h
@@ -35,9 +35,30 @@ namespace llvm {
 /// This adapter class provides a way to keep a set of things that also has the
 /// property of a deterministic iteration order. The order of iteration is the
 /// order of insertion.
+///
+/// The key and value types are derived from the Set and Vector types
+/// respectively. This allows the vector-type operations and set-type operations
+/// to have different types. In particular, this is useful when storing pointers
+/// as "Foo *" values but looking them up as "const Foo *" keys.
+///
+/// No constraint is placed on the key and value types, although it is assumed
+/// that value_type can be converted into key_type for insertion. Users must be
+/// aware of any loss of information in this conversion. For example, setting
+/// value_type to float and key_type to int can produce very surprising results,
+/// but it is not explicitly disallowed.
+///
+/// The parameter N specifies the "small" size of the container, which is the
+/// number of elements up to which a linear scan over the Vector will be used
+/// when searching for elements instead of checking Set, due to it being better
+/// for performance. A value of 0 means that this mode of operation is not used,
+/// and is the default value.
 template <typename T, typename Vector = std::vector<T>,
-          typename Set = DenseSet<T>>
+          typename Set = DenseSet<T>, unsigned N = 0>
 class SetVector {
+  // Much like in SmallPtrSet, this value should not be too high to prevent
+  // excessively long linear scans from occurring.
+  static_assert(N <= 32, "Small size should be less than or equal to 32!");
+
 public:
   using value_type = T;
   using key_type = T;
@@ -139,6 +160,17 @@ public:
   /// Insert a new element into the SetVector.
   /// \returns true if the element was inserted into the SetVector.
   bool insert(const value_type &X) {
+    if constexpr (canBeSmall())
+      if (isSmall()) {
+        if (llvm::find(vector_, X) == vector_.end()) {
+          vector_.push_back(X);
+          if (vector_.size() > N)
+            makeBig();
+          return true;
+        }
+        return false;
+      }
+
     bool result = set_.insert(X).second;
     if (result)
       vector_.push_back(X);
@@ -149,12 +181,21 @@ public:
   template <typename It> void insert(It Start, It End) {
     for (; Start != End; ++Start)
-      if (set_.insert(*Start).second)
-        vector_.push_back(*Start);
+      insert(*Start);
   }
 
   /// Remove an item from the set vector.
   bool remove(const value_type& X) {
+    if constexpr (canBeSmall())
+      if (isSmall()) {
+        typename vector_type::iterator I = find(vector_, X);
+        if (I != vector_.end()) {
+          vector_.erase(I);
+          return true;
+        }
+        return false;
+      }
+
     if (set_.erase(X)) {
       typename vector_type::iterator I = find(vector_, X);
       assert(I != vector_.end() && "Corrupted SetVector instances!");
@@ -169,6 +210,10 @@ public:
   /// element erased. This is the end of the SetVector if the last element is
   /// erased.
   iterator erase(const_iterator I) {
+    if constexpr (canBeSmall())
+      if (isSmall())
+        return vector_.erase(I);
+
     const key_type &V = *I;
     assert(set_.count(V) && "Corrupted SetVector instances!");
     set_.erase(V);
@@ -190,8 +235,15 @@ public:
   /// \returns true if any element is removed.
   template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) {
-    typename vector_type::iterator I =
-        llvm::remove_if(vector_, TestAndEraseFromSet<UnaryPredicate>(P, set_));
+    typename vector_type::iterator I = [this, P] {
+      if constexpr (canBeSmall())
+        if (isSmall())
+          return llvm::remove_if(vector_, P);
+
+      return llvm::remove_if(vector_,
+                             TestAndEraseFromSet<UnaryPredicate>(P, set_));
+    }();
+
     if (I == vector_.end())
       return false;
     vector_.erase(I, vector_.end());
@@ -200,12 +252,20 @@ public:
   /// Check if the SetVector contains the given key.
   bool contains(const key_type &key) const {
+    if constexpr (canBeSmall())
+      if (isSmall())
+        return is_contained(vector_, key);
+
     return set_.find(key) != set_.end();
   }
 
   /// Count the number of elements of a given key in the SetVector.
   /// \returns 0 if the element is not in the SetVector, 1 if it is.
   size_type count(const key_type &key) const {
+    if constexpr (canBeSmall())
+      if (isSmall())
+        return is_contained(vector_, key);
+
     return set_.count(key);
   }
@@ -261,7 +321,7 @@ public:
       remove(*SI);
   }
 
-  void swap(SetVector<T, Vector, Set> &RHS) {
+  void swap(SetVector<T, Vector, Set, N> &RHS) {
     set_.swap(RHS.set_);
     vector_.swap(RHS.vector_);
   }
@@ -290,6 +350,16 @@ private:
     }
   };
 
+  [[nodiscard]] static constexpr bool canBeSmall() { return N != 0; }
+
+  [[nodiscard]] bool isSmall() const { return set_.empty(); }
+
+  void makeBig() {
+    if constexpr (canBeSmall())
+      for (const auto &entry : vector_)
+        set_.insert(entry);
+  }
+
   set_type set_;         ///< The set.
   vector_type vector_;   ///< The vector.
 };
@@ -297,8 +367,7 @@ private:
 /// A SetVector that performs no allocations if smaller than
 /// a certain size.
 template <typename T, unsigned N>
-class SmallSetVector
-    : public SetVector<T, SmallVector<T, N>, SmallDenseSet<T, N>> {
+class SmallSetVector : public SetVector<T, SmallVector<T, N>, DenseSet<T>, N> {
 public:
   SmallSetVector() = default;
@@ -314,9 +383,9 @@
 namespace std {
 
 /// Implement std::swap in terms of SetVector swap.
-template<typename T, typename V, typename S>
-inline void
-swap(llvm::SetVector<T, V, S> &LHS, llvm::SetVector<T, V, S> &RHS) {
+template <typename T, typename V, typename S, unsigned N>
+inline void swap(llvm::SetVector<T, V, S, N> &LHS,
+                 llvm::SetVector<T, V, S, N> &RHS) {
   LHS.swap(RHS);
 }
diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h
index 7a746592c9fcd79897a98a35f2f45687f2f2bc7a..cb825926982040a0285b9ea6636bcdc8ad855f89 100644
--- a/llvm/include/llvm/LTO/Config.h
+++ b/llvm/include/llvm/LTO/Config.h
@@ -79,6 +79,12 @@ struct Config {
   /// link.
   bool HasWholeProgramVisibility = false;
 
+  /// We're validating that all native vtables have corresponding type infos.
+  bool ValidateAllVtablesHaveTypeInfos = false;
+  /// If all native vtables have corresponding type infos, allow
+  /// usage of RTTI to block devirtualization on types used in native files.
+  bool AllVtablesHaveTypeInfos = false;
+
   /// Always emit a Regular LTO object even when it is empty because no Regular
   /// LTO modules were linked. This option is useful for some build system which
   /// want to know a priori all possible output files.
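A short usage sketch of the small-size mode added to SetVector above (illustrative only; the function name and values are made up, not part of the patch). With N > 0, the container is "small" while the backing set is empty, so lookups scan the vector linearly; the first insertion that grows the vector past N triggers makeBig(), which populates the set lazily.

#include "llvm/ADT/SetVector.h"
#include <cassert>

void smallSetVectorSketch() {
  // N = 4: membership is checked by scanning the vector while the container
  // holds at most four elements; afterwards the DenseSet takes over.
  llvm::SmallSetVector<int, 4> SV;
  for (int V : {3, 1, 4, 1, 5}) // the duplicate 1 is rejected in small mode
    SV.insert(V);
  assert(SV.size() == 4 && SV.contains(4));
  SV.insert(9); // fifth distinct element: makeBig() populates the set here
  bool Removed = SV.remove(9); // big mode: erases from both set and vector
  assert(Removed && !SV.contains(9));
}

Note the design choice visible in isSmall(): instead of a separate mode flag, an empty set_ means "still small", so no extra state is carried once the container grows.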
diff --git a/llvm/include/llvm/TableGen/DirectiveEmitter.h b/llvm/include/llvm/TableGen/DirectiveEmitter.h
index e85c13f4b7cce27ae67cb47058eb52d53151e81f..4bca4b13d729ab6445cd4349962a22f357a2310e 100644
--- a/llvm/include/llvm/TableGen/DirectiveEmitter.h
+++ b/llvm/include/llvm/TableGen/DirectiveEmitter.h
@@ -1,8 +1,13 @@
 #ifndef LLVM_TABLEGEN_DIRECTIVEEMITTER_H
 #define LLVM_TABLEGEN_DIRECTIVEEMITTER_H
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/TableGen/Record.h"
+#include <algorithm>
+#include <string>
+#include <vector>
 
 namespace llvm {
diff --git a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
index a2296a064213641e2e3eb706eae864a51b50189f..4932157a7a3dc54f57f652d8a2e6fbfed0c7615d 100644
--- a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
+++ b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
@@ -244,10 +244,18 @@ void updatePublicTypeTestCalls(Module &M,
                                bool WholeProgramVisibilityEnabledInLTO);
 void updateVCallVisibilityInModule(
     Module &M, bool WholeProgramVisibilityEnabledInLTO,
-    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols);
+    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols,
+    bool ValidateAllVtablesHaveTypeInfos,
+    function_ref<bool(StringRef)> IsVisibleToRegularObj);
 void updateVCallVisibilityInIndex(
     ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO,
-    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols);
+    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols,
+    const DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols);
+
+void getVisibleToRegularObjVtableGUIDs(
+    ModuleSummaryIndex &Index,
+    DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols,
+    function_ref<bool(StringRef)> IsVisibleToRegularObj);
 
 /// Perform index-based whole program devirtualization on the \p Summary
 /// index. Any devirtualized targets used by a type test in another module
diff --git a/llvm/include/llvm/Transforms/Utils/CodeLayout.h b/llvm/include/llvm/Transforms/Utils/CodeLayout.h
index e8106e474332199a9e49a19b04fe0d91725a90ca..f5127cff24af0dfd3901d19706db6f36656adcb8 100644
--- a/llvm/include/llvm/Transforms/Utils/CodeLayout.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeLayout.h
@@ -14,14 +14,21 @@
 #ifndef LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
 #define LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include <cstdint>
 #include <vector>
 
-namespace llvm {
+namespace llvm::codelayout {
 
 using EdgeT = std::pair<uint64_t, uint64_t>;
-using EdgeCountT = std::pair<EdgeT, uint64_t>;
+
+struct EdgeCount {
+  uint64_t src;
+  uint64_t dst;
+  uint64_t count;
+};
 
 /// Find a layout of nodes (basic blocks) of a given CFG optimizing jump
 /// locality and thus processor I-cache utilization. This is achieved via
@@ -34,25 +41,55 @@ using EdgeCountT = std::pair<EdgeT, uint64_t>;
 /// \p EdgeCounts: The execution counts of every edge (jump) in the profile. The
 /// map also defines the edges in CFG and should include 0-count edges.
 /// \returns The best block order found.
-std::vector<uint64_t>
-applyExtTspLayout(const std::vector<uint64_t> &NodeSizes,
-                  const std::vector<uint64_t> &NodeCounts,
-                  const std::vector<EdgeCountT> &EdgeCounts);
+std::vector<uint64_t> computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
+                                          ArrayRef<uint64_t> NodeCounts,
+                                          ArrayRef<EdgeCount> EdgeCounts);
 
 /// Estimate the "quality" of a given node order in CFG. The higher the score,
 /// the better the order is. The score is designed to reflect the locality of
 /// the given order, which is anti-correlated with the number of I-cache misses
 /// in a typical execution of the function.
-double calcExtTspScore(const std::vector<uint64_t> &Order,
-                       const std::vector<uint64_t> &NodeSizes,
-                       const std::vector<uint64_t> &NodeCounts,
-                       const std::vector<EdgeCountT> &EdgeCounts);
+double calcExtTspScore(ArrayRef<uint64_t> Order, ArrayRef<uint64_t> NodeSizes,
+                       ArrayRef<uint64_t> NodeCounts,
+                       ArrayRef<EdgeCount> EdgeCounts);
 
 /// Estimate the "quality" of the current node order in CFG.
-double calcExtTspScore(const std::vector<uint64_t> &NodeSizes,
-                       const std::vector<uint64_t> &NodeCounts,
-                       const std::vector<EdgeCountT> &EdgeCounts);
+double calcExtTspScore(ArrayRef<uint64_t> NodeSizes,
+                       ArrayRef<uint64_t> NodeCounts,
+                       ArrayRef<EdgeCount> EdgeCounts);
+
+/// Algorithm-specific params for Cache-Directed Sort. The values are tuned for
+/// the best performance of large-scale front-end bound binaries.
+struct CDSortConfig {
+  /// The size of the cache.
+  unsigned CacheEntries = 16;
+  /// The size of a line in the cache.
+  unsigned CacheSize = 2048;
+  /// The power exponent for the distance-based locality.
+  double DistancePower = 0.25;
+  /// The scale factor for the frequency-based locality.
+  double FrequencyScale = 0.25;
+};
+
+/// Apply a Cache-Directed Sort for functions represented by a call graph.
+/// The placement optimizes call locality by co-locating frequently executed
+/// functions.
+/// \p FuncSizes: The sizes of the nodes (in bytes).
+/// \p FuncCounts: The execution counts of the nodes in the profile.
+/// \p CallCounts: The execution counts of every edge (call) in the profile. The
+/// map also defines the edges in the call graph and should include 0-count
+/// edges.
+/// \p CallOffsets: The offsets of the calls from their source nodes.
+/// \returns The best function order found.
+std::vector<uint64_t> computeCacheDirectedLayout(
+    ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts,
+    ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets);
+
+/// Apply a Cache-Directed Sort with a custom config.
+std::vector<uint64_t> computeCacheDirectedLayout(
+    const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes,
+    ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts,
+    ArrayRef<uint64_t> CallOffsets);
 
-} // end namespace llvm
+} // namespace llvm::codelayout
 
 #endif // LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 7bbc347a8cf88c3b3dca4705fb833bae9ec28e68..b6fbc65d83b800266bb711d42b58a917e4f397a5 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -3502,7 +3502,7 @@ void MachineBlockPlacement::applyExtTsp() {
 
   auto BlockSizes = std::vector<uint64_t>(F->size());
   auto BlockCounts = std::vector<uint64_t>(F->size());
-  std::vector<EdgeCountT> JumpCounts;
+  std::vector<EdgeCount> JumpCounts;
 
   for (MachineBasicBlock &MBB : *F) {
     // Getting the block frequency.
     BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
@@ -3521,8 +3521,8 @@
     for (MachineBasicBlock *Succ : MBB.successors()) {
       auto EP = MBPI->getEdgeProbability(&MBB, Succ);
       BlockFrequency JumpFreq = BlockFreq * EP;
-      auto Jump = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]);
-      JumpCounts.push_back(std::make_pair(Jump, JumpFreq.getFrequency()));
+      JumpCounts.push_back(
+          {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
     }
   }
 
@@ -3535,7 +3535,7 @@
                     calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
 
   // Run the layout algorithm.
-  auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
+  auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
   std::vector<const MachineBasicBlock *> NewBlockOrder;
   NewBlockOrder.reserve(F->size());
   for (uint64_t Node : NewOrder) {
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 1cd48adac3f0a11b263c289bccb771ec03fc828c..0e5eeb6ff978e8c987c110fc6631119ad3bc4c7a 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1134,13 +1134,27 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
     }
   }
 
+  bool WholeProgramVisibilityEnabledInLTO =
+      Conf.HasWholeProgramVisibility &&
+      // If validation is enabled, upgrade visibility only when all vtables
+      // have typeinfos.
+      (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos);
+
+  // This returns true when the name is local or not defined. Locals are
+  // expected to be handled separately.
+  auto IsVisibleToRegularObj = [&](StringRef name) {
+    auto It = GlobalResolutions.find(name);
+    return (It == GlobalResolutions.end() || It->second.VisibleOutsideSummary);
+  };
+
   // If allowed, upgrade public vcall visibility metadata to linkage unit
   // visibility before whole program devirtualization in the optimizer.
-  updateVCallVisibilityInModule(*RegularLTO.CombinedModule,
-                                Conf.HasWholeProgramVisibility,
-                                DynamicExportSymbols);
+  updateVCallVisibilityInModule(
+      *RegularLTO.CombinedModule, WholeProgramVisibilityEnabledInLTO,
+      DynamicExportSymbols, Conf.ValidateAllVtablesHaveTypeInfos,
+      IsVisibleToRegularObj);
   updatePublicTypeTestCalls(*RegularLTO.CombinedModule,
-                            Conf.HasWholeProgramVisibility);
+                            WholeProgramVisibilityEnabledInLTO);
 
   if (Conf.PreOptModuleHook &&
       !Conf.PreOptModuleHook(0, *RegularLTO.CombinedModule))
@@ -1521,13 +1535,38 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
 
   std::set<GlobalValue::GUID> ExportedGUIDs;
 
-  if (hasWholeProgramVisibility(Conf.HasWholeProgramVisibility))
+  bool WholeProgramVisibilityEnabledInLTO =
+      Conf.HasWholeProgramVisibility &&
+      // If validation is enabled, upgrade visibility only when all vtables
+      // have typeinfos.
+      (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos);
+  if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
     ThinLTO.CombinedIndex.setWithWholeProgramVisibility();
+
+  // If we're validating, get the vtable symbols that should not be
+  // upgraded because they correspond to typeIDs outside of index-based
+  // WPD info.
+  DenseSet<GlobalValue::GUID> VisibleToRegularObjSymbols;
+  if (WholeProgramVisibilityEnabledInLTO &&
+      Conf.ValidateAllVtablesHaveTypeInfos) {
+    // This returns true when the name is local or not defined. Locals are
+    // expected to be handled separately.
+    auto IsVisibleToRegularObj = [&](StringRef name) {
+      auto It = GlobalResolutions.find(name);
+      return (It == GlobalResolutions.end() ||
+              It->second.VisibleOutsideSummary);
+    };
+
+    getVisibleToRegularObjVtableGUIDs(ThinLTO.CombinedIndex,
+                                      VisibleToRegularObjSymbols,
+                                      IsVisibleToRegularObj);
+  }
+
   // If allowed, upgrade public vcall visibility to linkage unit visibility in
   // the summaries before whole program devirtualization below.
-  updateVCallVisibilityInIndex(ThinLTO.CombinedIndex,
-                               Conf.HasWholeProgramVisibility,
-                               DynamicExportSymbols);
+  updateVCallVisibilityInIndex(
+      ThinLTO.CombinedIndex, WholeProgramVisibilityEnabledInLTO,
+      DynamicExportSymbols, VisibleToRegularObjSymbols);
 
   // Perform index-based WPD. This will return immediately if there are
   // no index entries in the typeIdMetadata map (e.g.
if we are instead diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index ae7b7e4b548124ad88f433253fba3a6b79c26911..d7aed2fbc2a1cae090b96beba9cd62f066106472 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -604,11 +604,14 @@ bool LTOCodeGenerator::optimize() { // pipeline run below. updatePublicTypeTestCalls(*MergedModule, /* WholeProgramVisibilityEnabledInLTO */ false); - updateVCallVisibilityInModule(*MergedModule, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a - // TBD new interface. - /* DynamicExportSymbols */ {}); + updateVCallVisibilityInModule( + *MergedModule, + /* WholeProgramVisibilityEnabledInLTO */ false, + // FIXME: These need linker information via a + // TBD new interface. + /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); // We always run the verifier once on the merged module, the `DisableVerify` // parameter only applies to subsequent verify. diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 5b137a8f8cb344a227439526c16335a1206beb90..0d2e66008f1f6c676d5f013149c5036cfe0963c1 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -1053,11 +1053,14 @@ void ThinLTOCodeGenerator::run() { // via the internal option. Must be done before WPD below. if (hasWholeProgramVisibility(/* WholeProgramVisibilityEnabledInLTO */ false)) Index->setWithWholeProgramVisibility(); + + // FIXME: This needs linker information via a TBD new interface updateVCallVisibilityInIndex(*Index, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a // TBD new interface. - /* DynamicExportSymbols */ {}); + /*DynamicExportSymbols=*/{}, + /*VisibleToRegularObjSymbols=*/{}); // Perform index-based WPD. This will return immediately if there are // no index entries in the typeIdMetadata map (e.g. 
if we are instead
diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp
index 1d5f130737ee5783ef88840e2af0bcc7078731a3..2f9ac86e1f07bfac01a94095c2a0cb88d2a5deac 100644
--- a/llvm/lib/TableGen/Main.cpp
+++ b/llvm/lib/TableGen/Main.cpp
@@ -15,15 +15,25 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/TableGen/Main.h"
+#include "TGLexer.h"
 #include "TGParser.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
-#include
+#include
+#include
 #include
+#include
+#include
 
 using namespace llvm;
 
 static cl::opt<std::string>
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 487a0a4a97f7f8e439ebda6299f96f92607ccb8d..f60cd1c2b2eca6953d4725b1bf1a1b92678b1a82 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -780,12 +780,52 @@ bool hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) {
          !DisableWholeProgramVisibility;
 }
 
+static bool
+typeIDVisibleToRegularObj(StringRef TypeID,
+                          function_ref<bool(StringRef)> IsVisibleToRegularObj) {
+  // TypeID for member function pointer type is an internal construct
+  // and won't exist in IsVisibleToRegularObj. The full TypeID
+  // will be present and participate in invalidation.
+  if (TypeID.ends_with(".virtual"))
+    return false;
+
+  // TypeIDs that don't start with the Itanium mangling prefix (_ZTS) belong
+  // to non-externally visible types, which cannot interact with
+  // external native files. See CodeGenModule::CreateMetadataIdentifierImpl.
+  if (!TypeID.consume_front("_ZTS"))
+    return false;
+
+  // TypeID is keyed off the type name symbol (_ZTS). However, the native
+  // object may not contain this symbol if it does not contain a key
+  // function for the base type and thus only contains a reference to the
+  // type info (_ZTI). To catch this case we query using the type info
+  // symbol corresponding to the TypeID.
+  std::string typeInfo = ("_ZTI" + TypeID).str();
+  return IsVisibleToRegularObj(typeInfo);
+}
+
+static bool
+skipUpdateDueToValidation(GlobalVariable &GV,
+                          function_ref<bool(StringRef)> IsVisibleToRegularObj) {
+  SmallVector<MDNode *, 2> Types;
+  GV.getMetadata(LLVMContext::MD_type, Types);
+
+  for (auto Type : Types)
+    if (auto *TypeID = dyn_cast<MDString>(Type->getOperand(1).get()))
+      return typeIDVisibleToRegularObj(TypeID->getString(),
+                                       IsVisibleToRegularObj);
+
+  return false;
+}
+
 /// If whole program visibility asserted, then upgrade all public vcall
 /// visibility metadata on vtable definitions to linkage unit visibility in
 /// Module IR (for regular or hybrid LTO).
void updateVCallVisibilityInModule(
     Module &M, bool WholeProgramVisibilityEnabledInLTO,
-    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols) {
+    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols,
+    bool ValidateAllVtablesHaveTypeInfos,
+    function_ref<bool(StringRef)> IsVisibleToRegularObj) {
   if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
     return;
   for (GlobalVariable &GV : M.globals()) {
@@ -796,7 +836,13 @@ void updateVCallVisibilityInModule(
         GV.getVCallVisibility() == GlobalObject::VCallVisibilityPublic &&
         // Don't upgrade the visibility for symbols exported to the dynamic
         // linker, as we have no information on their eventual use.
-        !DynamicExportSymbols.count(GV.getGUID()))
+        !DynamicExportSymbols.count(GV.getGUID()) &&
+        // With validation enabled, we want to exclude symbols visible to
+        // regular objects. Local symbols will be in this group due to the
+        // current implementation but those with VCallVisibilityTranslationUnit
+        // will have already been marked in clang so are unaffected.
+        !(ValidateAllVtablesHaveTypeInfos &&
+          skipUpdateDueToValidation(GV, IsVisibleToRegularObj)))
       GV.setVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit);
   }
 }
@@ -828,12 +874,26 @@ void updatePublicTypeTestCalls(Module &M,
   }
 }
 
+/// Based on the typeID string, get all associated vtable GUIDs that are
+/// visible to regular objects.
+void getVisibleToRegularObjVtableGUIDs(
+    ModuleSummaryIndex &Index,
+    DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols,
+    function_ref<bool(StringRef)> IsVisibleToRegularObj) {
+  for (const auto &typeID : Index.typeIdCompatibleVtableMap()) {
+    if (typeIDVisibleToRegularObj(typeID.first, IsVisibleToRegularObj))
+      for (const TypeIdOffsetVtableInfo &P : typeID.second)
+        VisibleToRegularObjSymbols.insert(P.VTableVI.getGUID());
+  }
+}
+
 /// If whole program visibility asserted, then upgrade all public vcall
 /// visibility metadata on vtable definition summaries to linkage unit
 /// visibility in Module summary index (for ThinLTO).
 void updateVCallVisibilityInIndex(
     ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO,
-    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols) {
+    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols,
+    const DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols) {
   if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
     return;
   for (auto &P : Index) {
@@ -846,6 +906,12 @@ void updateVCallVisibilityInIndex(
     if (!GVar ||
         GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic)
       continue;
+    // With validation enabled, we want to exclude symbols visible to regular
+    // objects. Local symbols will be in this group due to the current
+    // implementation but those with VCallVisibilityTranslationUnit will have
+    // already been marked in clang so are unaffected.
+    if (VisibleToRegularObjSymbols.count(P.first))
+      continue;
     GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit);
   }
 }
@@ -1032,8 +1098,8 @@ bool DevirtModule::tryFindVirtualCallTargets(
 }
 
 bool DevirtIndex::tryFindVirtualCallTargets(
-    std::vector<ValueInfo> &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo,
-    uint64_t ByteOffset) {
+    std::vector<ValueInfo> &TargetsForSlot,
+    const TypeIdCompatibleVtableInfo TIdInfo, uint64_t ByteOffset) {
   for (const TypeIdOffsetVtableInfo &P : TIdInfo) {
     // Find a representative copy of the vtable initializer.
    // We can have multiple available_externally, linkonce_odr and weak_odr
diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp
index 9eb3aff3ffe8f8e87708d434dd2519b6dfab9324..f4a820918ee8bb72bbe6affde7efd9ce0a99efde 100644
--- a/llvm/lib/Transforms/Utils/CodeLayout.cpp
+++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp
@@ -6,7 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// ExtTSP - layout of basic blocks with i-cache optimization.
+// The file implements "cache-aware" layout algorithms of basic blocks and
+// functions in a binary.
 //
 // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
 // optimizing jump locality and thus processor I-cache utilization. This is
@@ -41,10 +42,14 @@
 
 #include "llvm/Transforms/Utils/CodeLayout.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 
 #include <cmath>
+#include <set>
 
 using namespace llvm;
+using namespace llvm::codelayout;
+
 #define DEBUG_TYPE "code-layout"
 
 cl::opt<bool> EnableExtTspBlockPlacement(
@@ -57,8 +62,8 @@ cl::opt<bool> ApplyExtTspWithoutProfile(
     cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
     cl::init(true), cl::Hidden);
 
-// Algorithm-specific params. The values are tuned for the best performance
-// of large-scale front-end bound binaries.
+// Algorithm-specific params for Ext-TSP. The values are tuned for the best
+// performance of large-scale front-end bound binaries.
 static cl::opt<double> ForwardWeightCond(
     "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1),
     cl::desc("The weight of conditional forward jumps for ExtTSP value"));
@@ -69,11 +74,11 @@ static cl::opt<double> ForwardWeightUncond(
 
 static cl::opt<double> BackwardWeightCond(
     "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),
-    cl::desc("The weight of conditonal backward jumps for ExtTSP value"));
+    cl::desc("The weight of conditional backward jumps for ExtTSP value"));
 
 static cl::opt<double> BackwardWeightUncond(
     "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
-    cl::desc("The weight of unconditonal backward jumps for ExtTSP value"));
+    cl::desc("The weight of unconditional backward jumps for ExtTSP value"));
 
 static cl::opt<double> FallthroughWeightCond(
     "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),
@@ -109,6 +114,21 @@ static cl::opt<bool> EnableChainSplitAlongJumps(
     "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true),
     cl::desc("The maximum size of a chain to apply splitting"));
 
+// Algorithm-specific options for CDS.
+static cl::opt<unsigned> CacheEntries("cds-cache-entries", cl::ReallyHidden,
+                                      cl::desc("The size of the cache"));
+
+static cl::opt<unsigned> CacheSize("cds-cache-size", cl::ReallyHidden,
+                                   cl::desc("The size of a line in the cache"));
+
+static cl::opt<double> DistancePower(
+    "cds-distance-power", cl::ReallyHidden,
+    cl::desc("The power exponent for the distance-based locality"));
+
+static cl::opt<double> FrequencyScale(
+    "cds-frequency-scale", cl::ReallyHidden,
+    cl::desc("The scale factor for the frequency-based locality"));
+
 namespace {
 
 // Epsilon for comparison of doubles.
@@ -149,29 +169,30 @@ double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
 
 /// A type of merging two chains, X and Y. The former chain is split into
 /// X1 and X2 and then concatenated with Y in the order specified by the type.
-enum class MergeTypeTy : int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y }; +enum class MergeTypeT : int { X_Y, Y_X, X1_Y_X2, Y_X2_X1, X2_X1_Y }; /// The gain of merging two chains, that is, the Ext-TSP score of the merge -/// together with the corresponfiding merge 'type' and 'offset'. -class MergeGainTy { -public: - explicit MergeGainTy() = default; - explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType) +/// together with the corresponding merge 'type' and 'offset'. +struct MergeGainT { + explicit MergeGainT() = default; + explicit MergeGainT(double Score, size_t MergeOffset, MergeTypeT MergeType) : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {} double score() const { return Score; } size_t mergeOffset() const { return MergeOffset; } - MergeTypeTy mergeType() const { return MergeType; } + MergeTypeT mergeType() const { return MergeType; } + + void setMergeType(MergeTypeT Ty) { MergeType = Ty; } // Returns 'true' iff Other is preferred over this. - bool operator<(const MergeGainTy &Other) const { + bool operator<(const MergeGainT &Other) const { return (Other.Score > EPS && Other.Score > Score + EPS); } // Update the current gain if Other is preferred over this. - void updateIfLessThan(const MergeGainTy &Other) { + void updateIfLessThan(const MergeGainT &Other) { if (*this < Other) *this = Other; } @@ -179,114 +200,110 @@ public: private: double Score{-1.0}; size_t MergeOffset{0}; - MergeTypeTy MergeType{MergeTypeTy::X_Y}; + MergeTypeT MergeType{MergeTypeT::X_Y}; }; -class Jump; -class Chain; -class ChainEdge; +struct JumpT; +struct ChainT; +struct ChainEdge; -/// A node in the graph, typically corresponding to a basic block in CFG. -class Block { -public: - Block(const Block &) = delete; - Block(Block &&) = default; - Block &operator=(const Block &) = delete; - Block &operator=(Block &&) = default; +/// A node in the graph, typically corresponding to a basic block in the CFG or +/// a function in the call graph. +struct NodeT { + NodeT(const NodeT &) = delete; + NodeT(NodeT &&) = default; + NodeT &operator=(const NodeT &) = delete; + NodeT &operator=(NodeT &&) = default; - // The original index of the block in CFG. + explicit NodeT(size_t Index, uint64_t Size, uint64_t EC) + : Index(Index), Size(Size), ExecutionCount(EC) {} + + bool isEntry() const { return Index == 0; } + + // The total execution count of outgoing jumps. + uint64_t outCount() const; + + // The total execution count of incoming jumps. + uint64_t inCount() const; + + // The original index of the node in graph. size_t Index{0}; - // The index of the block in the current chain. + // The index of the node in the current chain. size_t CurIndex{0}; - // Size of the block in the binary. + // The size of the node in the binary. uint64_t Size{0}; - // Execution count of the block in the profile data. + // The execution count of the node in the profile data. uint64_t ExecutionCount{0}; - // Current chain of the node. - Chain *CurChain{nullptr}; - // An offset of the block in the current chain. + // The current chain of the node. + ChainT *CurChain{nullptr}; + // The offset of the node in the current chain. mutable uint64_t EstimatedAddr{0}; - // Forced successor of the block in CFG. - Block *ForcedSucc{nullptr}; - // Forced predecessor of the block in CFG. - Block *ForcedPred{nullptr}; - // Outgoing jumps from the block. - std::vector OutJumps; - // Incoming jumps to the block. 
- std::vector InJumps; - -public: - explicit Block(size_t Index, uint64_t Size, uint64_t EC) - : Index(Index), Size(Size), ExecutionCount(EC) {} - bool isEntry() const { return Index == 0; } + // Forced successor of the node in the graph. + NodeT *ForcedSucc{nullptr}; + // Forced predecessor of the node in the graph. + NodeT *ForcedPred{nullptr}; + // Outgoing jumps from the node. + std::vector OutJumps; + // Incoming jumps to the node. + std::vector InJumps; }; -/// An arc in the graph, typically corresponding to a jump between two blocks. -class Jump { -public: - Jump(const Jump &) = delete; - Jump(Jump &&) = default; - Jump &operator=(const Jump &) = delete; - Jump &operator=(Jump &&) = default; - - // Source block of the jump. - Block *Source; - // Target block of the jump. - Block *Target; +/// An arc in the graph, typically corresponding to a jump between two nodes. +struct JumpT { + JumpT(const JumpT &) = delete; + JumpT(JumpT &&) = default; + JumpT &operator=(const JumpT &) = delete; + JumpT &operator=(JumpT &&) = default; + + explicit JumpT(NodeT *Source, NodeT *Target, uint64_t ExecutionCount) + : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {} + + // Source node of the jump. + NodeT *Source; + // Target node of the jump. + NodeT *Target; // Execution count of the arc in the profile data. uint64_t ExecutionCount{0}; // Whether the jump corresponds to a conditional branch. bool IsConditional{false}; - -public: - explicit Jump(Block *Source, Block *Target, uint64_t ExecutionCount) - : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {} + // The offset of the jump from the source node. + uint64_t Offset{0}; }; -/// A chain (ordered sequence) of blocks. -class Chain { -public: - Chain(const Chain &) = delete; - Chain(Chain &&) = default; - Chain &operator=(const Chain &) = delete; - Chain &operator=(Chain &&) = default; +/// A chain (ordered sequence) of nodes in the graph. 
+struct ChainT { + ChainT(const ChainT &) = delete; + ChainT(ChainT &&) = default; + ChainT &operator=(const ChainT &) = delete; + ChainT &operator=(ChainT &&) = default; + + explicit ChainT(uint64_t Id, NodeT *Node) + : Id(Id), ExecutionCount(Node->ExecutionCount), Size(Node->Size), + Nodes(1, Node) {} - explicit Chain(uint64_t Id, Block *Block) - : Id(Id), Score(0), Blocks(1, Block) {} + size_t numBlocks() const { return Nodes.size(); } - uint64_t id() const { return Id; } + double density() const { return static_cast(ExecutionCount) / Size; } - bool isEntry() const { return Blocks[0]->Index == 0; } + bool isEntry() const { return Nodes[0]->Index == 0; } bool isCold() const { - for (auto *Block : Blocks) { - if (Block->ExecutionCount > 0) + for (NodeT *Node : Nodes) { + if (Node->ExecutionCount > 0) return false; } return true; } - double score() const { return Score; } - - void setScore(double NewScore) { Score = NewScore; } - - const std::vector &blocks() const { return Blocks; } - - size_t numBlocks() const { return Blocks.size(); } - - const std::vector> &edges() const { - return Edges; - } - - ChainEdge *getEdge(Chain *Other) const { - for (auto It : Edges) { - if (It.first == Other) - return It.second; + ChainEdge *getEdge(ChainT *Other) const { + for (const auto &[Chain, ChainEdge] : Edges) { + if (Chain == Other) + return ChainEdge; } return nullptr; } - void removeEdge(Chain *Other) { + void removeEdge(ChainT *Other) { auto It = Edges.begin(); while (It != Edges.end()) { if (It->first == Other) { @@ -297,63 +314,68 @@ public: } } - void addEdge(Chain *Other, ChainEdge *Edge) { + void addEdge(ChainT *Other, ChainEdge *Edge) { Edges.push_back(std::make_pair(Other, Edge)); } - void merge(Chain *Other, const std::vector &MergedBlocks) { - Blocks = MergedBlocks; - // Update the block's chains - for (size_t Idx = 0; Idx < Blocks.size(); Idx++) { - Blocks[Idx]->CurChain = this; - Blocks[Idx]->CurIndex = Idx; + void merge(ChainT *Other, std::vector MergedBlocks) { + Nodes = std::move(MergedBlocks); + // Update the chain's data. + ExecutionCount += Other->ExecutionCount; + Size += Other->Size; + Id = Nodes[0]->Index; + // Update the node's data. + for (size_t Idx = 0; Idx < Nodes.size(); Idx++) { + Nodes[Idx]->CurChain = this; + Nodes[Idx]->CurIndex = Idx; } } - void mergeEdges(Chain *Other); + void mergeEdges(ChainT *Other); void clear() { - Blocks.clear(); - Blocks.shrink_to_fit(); + Nodes.clear(); + Nodes.shrink_to_fit(); Edges.clear(); Edges.shrink_to_fit(); } -private: // Unique chain identifier. uint64_t Id; // Cached ext-tsp score for the chain. - double Score; - // Blocks of the chain. - std::vector Blocks; + double Score{0}; + // The total execution count of the chain. + uint64_t ExecutionCount{0}; + // The total size of the chain. + uint64_t Size{0}; + // Nodes of the chain. + std::vector Nodes; // Adjacent chains and corresponding edges (lists of jumps). - std::vector> Edges; + std::vector> Edges; }; -/// An edge in CFG representing jumps between two chains. -/// When blocks are merged into chains, the edges are combined too so that -/// there is always at most one edge between a pair of chains -class ChainEdge { -public: +/// An edge in the graph representing jumps between two chains. +/// When nodes are merged into chains, the edges are combined too so that +/// there is always at most one edge between a pair of chains. 
+struct ChainEdge { ChainEdge(const ChainEdge &) = delete; ChainEdge(ChainEdge &&) = default; ChainEdge &operator=(const ChainEdge &) = delete; - ChainEdge &operator=(ChainEdge &&) = default; + ChainEdge &operator=(ChainEdge &&) = delete; - explicit ChainEdge(Jump *Jump) + explicit ChainEdge(JumpT *Jump) : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain), Jumps(1, Jump) {} - const std::vector &jumps() const { return Jumps; } + ChainT *srcChain() const { return SrcChain; } - void changeEndpoint(Chain *From, Chain *To) { - if (From == SrcChain) - SrcChain = To; - if (From == DstChain) - DstChain = To; - } + ChainT *dstChain() const { return DstChain; } + + bool isSelfEdge() const { return SrcChain == DstChain; } + + const std::vector &jumps() const { return Jumps; } - void appendJump(Jump *Jump) { Jumps.push_back(Jump); } + void appendJump(JumpT *Jump) { Jumps.push_back(Jump); } void moveJumps(ChainEdge *Other) { Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end()); @@ -361,15 +383,22 @@ public: Other->Jumps.shrink_to_fit(); } - bool hasCachedMergeGain(Chain *Src, Chain *Dst) const { + void changeEndpoint(ChainT *From, ChainT *To) { + if (From == SrcChain) + SrcChain = To; + if (From == DstChain) + DstChain = To; + } + + bool hasCachedMergeGain(ChainT *Src, ChainT *Dst) const { return Src == SrcChain ? CacheValidForward : CacheValidBackward; } - MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst) const { + MergeGainT getCachedMergeGain(ChainT *Src, ChainT *Dst) const { return Src == SrcChain ? CachedGainForward : CachedGainBackward; } - void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) { + void setCachedMergeGain(ChainT *Src, ChainT *Dst, MergeGainT MergeGain) { if (Src == SrcChain) { CachedGainForward = MergeGain; CacheValidForward = true; @@ -384,57 +413,74 @@ public: CacheValidBackward = false; } + void setMergeGain(MergeGainT Gain) { CachedGain = Gain; } + + MergeGainT getMergeGain() const { return CachedGain; } + + double gain() const { return CachedGain.score(); } + private: // Source chain. - Chain *SrcChain{nullptr}; + ChainT *SrcChain{nullptr}; // Destination chain. - Chain *DstChain{nullptr}; - // Original jumps in the binary with correspinding execution counts. - std::vector Jumps; - // Cached ext-tsp value for merging the pair of chains. - // Since the gain of merging (Src, Dst) and (Dst, Src) might be different, - // we store both values here. - MergeGainTy CachedGainForward; - MergeGainTy CachedGainBackward; + ChainT *DstChain{nullptr}; + // Original jumps in the binary with corresponding execution counts. + std::vector Jumps; + // Cached gain value for merging the pair of chains. + MergeGainT CachedGain; + + // Cached gain values for merging the pair of chains. Since the gain of + // merging (Src, Dst) and (Dst, Src) might be different, we store both values + // here and a flag indicating which of the options results in a higher gain. + // Cached gain values. + MergeGainT CachedGainForward; + MergeGainT CachedGainBackward; // Whether the cached value must be recomputed. 
bool CacheValidForward{false}; bool CacheValidBackward{false}; }; -void Chain::mergeEdges(Chain *Other) { - assert(this != Other && "cannot merge a chain with itself"); +uint64_t NodeT::outCount() const { + uint64_t Count = 0; + for (JumpT *Jump : OutJumps) + Count += Jump->ExecutionCount; + return Count; +} - // Update edges adjacent to chain Other - for (auto EdgeIt : Other->Edges) { - Chain *DstChain = EdgeIt.first; - ChainEdge *DstEdge = EdgeIt.second; - Chain *TargetChain = DstChain == Other ? this : DstChain; +uint64_t NodeT::inCount() const { + uint64_t Count = 0; + for (JumpT *Jump : InJumps) + Count += Jump->ExecutionCount; + return Count; +} + +void ChainT::mergeEdges(ChainT *Other) { + // Update edges adjacent to chain Other. + for (const auto &[DstChain, DstEdge] : Other->Edges) { + ChainT *TargetChain = DstChain == Other ? this : DstChain; ChainEdge *CurEdge = getEdge(TargetChain); if (CurEdge == nullptr) { DstEdge->changeEndpoint(Other, this); this->addEdge(TargetChain, DstEdge); - if (DstChain != this && DstChain != Other) { + if (DstChain != this && DstChain != Other) DstChain->addEdge(this, DstEdge); - } } else { CurEdge->moveJumps(DstEdge); } - // Cleanup leftover edge - if (DstChain != Other) { + // Cleanup leftover edge. + if (DstChain != Other) DstChain->removeEdge(Other); - } } } -using BlockIter = std::vector::const_iterator; +using NodeIter = std::vector::const_iterator; -/// A wrapper around three chains of blocks; it is used to avoid extra +/// A wrapper around three chains of nodes; it is used to avoid extra /// instantiation of the vectors. -class MergedChain { -public: - MergedChain(BlockIter Begin1, BlockIter End1, BlockIter Begin2 = BlockIter(), - BlockIter End2 = BlockIter(), BlockIter Begin3 = BlockIter(), - BlockIter End3 = BlockIter()) +struct MergedChain { + MergedChain(NodeIter Begin1, NodeIter End1, NodeIter Begin2 = NodeIter(), + NodeIter End2 = NodeIter(), NodeIter Begin3 = NodeIter(), + NodeIter End3 = NodeIter()) : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3), End3(End3) {} @@ -447,8 +493,8 @@ public: Func(*It); } - std::vector getBlocks() const { - std::vector Result; + std::vector getNodes() const { + std::vector Result; Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) + std::distance(Begin3, End3)); Result.insert(Result.end(), Begin1, End1); @@ -457,165 +503,189 @@ public: return Result; } - const Block *getFirstBlock() const { return *Begin1; } + const NodeT *getFirstNode() const { return *Begin1; } private: - BlockIter Begin1; - BlockIter End1; - BlockIter Begin2; - BlockIter End2; - BlockIter Begin3; - BlockIter End3; + NodeIter Begin1; + NodeIter End1; + NodeIter Begin2; + NodeIter End2; + NodeIter Begin3; + NodeIter End3; }; +/// Merge two chains of nodes respecting a given 'type' and 'offset'. +/// +/// If MergeType == 0, then the result is a concatenation of two chains. +/// Otherwise, the first chain is cut into two sub-chains at the offset, +/// and merged using all possible ways of concatenating three chains. +MergedChain mergeNodes(const std::vector &X, + const std::vector &Y, size_t MergeOffset, + MergeTypeT MergeType) { + // Split the first chain, X, into X1 and X2. + NodeIter BeginX1 = X.begin(); + NodeIter EndX1 = X.begin() + MergeOffset; + NodeIter BeginX2 = X.begin() + MergeOffset; + NodeIter EndX2 = X.end(); + NodeIter BeginY = Y.begin(); + NodeIter EndY = Y.end(); + + // Construct a new chain from the three existing ones. 
+ switch (MergeType) { + case MergeTypeT::X_Y: + return MergedChain(BeginX1, EndX2, BeginY, EndY); + case MergeTypeT::Y_X: + return MergedChain(BeginY, EndY, BeginX1, EndX2); + case MergeTypeT::X1_Y_X2: + return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2); + case MergeTypeT::Y_X2_X1: + return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1); + case MergeTypeT::X2_X1_Y: + return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY); + } + llvm_unreachable("unexpected chain merge type"); +} + /// The implementation of the ExtTSP algorithm. class ExtTSPImpl { - using EdgeT = std::pair; - using EdgeCountMap = std::vector>; - public: - ExtTSPImpl(size_t NumNodes, const std::vector &NodeSizes, - const std::vector &NodeCounts, - const EdgeCountMap &EdgeCounts) - : NumNodes(NumNodes) { + ExtTSPImpl(ArrayRef NodeSizes, ArrayRef NodeCounts, + ArrayRef EdgeCounts) + : NumNodes(NodeSizes.size()) { initialize(NodeSizes, NodeCounts, EdgeCounts); } - /// Run the algorithm and return an optimized ordering of blocks. - void run(std::vector &Result) { - // Pass 1: Merge blocks with their mutually forced successors + /// Run the algorithm and return an optimized ordering of nodes. + std::vector run() { + // Pass 1: Merge nodes with their mutually forced successors mergeForcedPairs(); // Pass 2: Merge pairs of chains while improving the ExtTSP objective mergeChainPairs(); - // Pass 3: Merge cold blocks to reduce code size + // Pass 3: Merge cold nodes to reduce code size mergeColdChains(); - // Collect blocks from all chains - concatChains(Result); + // Collect nodes from all chains + return concatChains(); } private: /// Initialize the algorithm's data structures. - void initialize(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const EdgeCountMap &EdgeCounts) { - // Initialize blocks - AllBlocks.reserve(NumNodes); - for (uint64_t Node = 0; Node < NumNodes; Node++) { - uint64_t Size = std::max(NodeSizes[Node], 1ULL); - uint64_t ExecutionCount = NodeCounts[Node]; - // The execution count of the entry block is set to at least 1 - if (Node == 0 && ExecutionCount == 0) + void initialize(const ArrayRef &NodeSizes, + const ArrayRef &NodeCounts, + const ArrayRef &EdgeCounts) { + // Initialize nodes + AllNodes.reserve(NumNodes); + for (uint64_t Idx = 0; Idx < NumNodes; Idx++) { + uint64_t Size = std::max(NodeSizes[Idx], 1ULL); + uint64_t ExecutionCount = NodeCounts[Idx]; + // The execution count of the entry node is set to at least one. + if (Idx == 0 && ExecutionCount == 0) ExecutionCount = 1; - AllBlocks.emplace_back(Node, Size, ExecutionCount); + AllNodes.emplace_back(Idx, Size, ExecutionCount); } - // Initialize jumps between blocks + // Initialize jumps between nodes SuccNodes.resize(NumNodes); PredNodes.resize(NumNodes); std::vector OutDegree(NumNodes, 0); AllJumps.reserve(EdgeCounts.size()); - for (auto It : EdgeCounts) { - auto Pred = It.first.first; - auto Succ = It.first.second; - OutDegree[Pred]++; - // Ignore self-edges - if (Pred == Succ) + for (auto Edge : EdgeCounts) { + ++OutDegree[Edge.src]; + // Ignore self-edges. 
+ if (Edge.src == Edge.dst) continue; - SuccNodes[Pred].push_back(Succ); - PredNodes[Succ].push_back(Pred); - auto ExecutionCount = It.second; - if (ExecutionCount > 0) { - auto &Block = AllBlocks[Pred]; - auto &SuccBlock = AllBlocks[Succ]; - AllJumps.emplace_back(&Block, &SuccBlock, ExecutionCount); - SuccBlock.InJumps.push_back(&AllJumps.back()); - Block.OutJumps.push_back(&AllJumps.back()); + SuccNodes[Edge.src].push_back(Edge.dst); + PredNodes[Edge.dst].push_back(Edge.src); + if (Edge.count > 0) { + NodeT &PredNode = AllNodes[Edge.src]; + NodeT &SuccNode = AllNodes[Edge.dst]; + AllJumps.emplace_back(&PredNode, &SuccNode, Edge.count); + SuccNode.InJumps.push_back(&AllJumps.back()); + PredNode.OutJumps.push_back(&AllJumps.back()); } } - for (auto &Jump : AllJumps) { + for (JumpT &Jump : AllJumps) { assert(OutDegree[Jump.Source->Index] > 0); Jump.IsConditional = OutDegree[Jump.Source->Index] > 1; } - // Initialize chains + // Initialize chains. AllChains.reserve(NumNodes); HotChains.reserve(NumNodes); - for (Block &Block : AllBlocks) { - AllChains.emplace_back(Block.Index, &Block); - Block.CurChain = &AllChains.back(); - if (Block.ExecutionCount > 0) { + for (NodeT &Node : AllNodes) { + AllChains.emplace_back(Node.Index, &Node); + Node.CurChain = &AllChains.back(); + if (Node.ExecutionCount > 0) HotChains.push_back(&AllChains.back()); - } } - // Initialize chain edges + // Initialize chain edges. AllEdges.reserve(AllJumps.size()); - for (Block &Block : AllBlocks) { - for (auto &Jump : Block.OutJumps) { - auto SuccBlock = Jump->Target; - ChainEdge *CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain); - // this edge is already present in the graph + for (NodeT &PredNode : AllNodes) { + for (JumpT *Jump : PredNode.OutJumps) { + NodeT *SuccNode = Jump->Target; + ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain); + // this edge is already present in the graph. if (CurEdge != nullptr) { - assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr); + assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr); CurEdge->appendJump(Jump); continue; } - // this is a new edge + // this is a new edge. AllEdges.emplace_back(Jump); - Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back()); - SuccBlock->CurChain->addEdge(Block.CurChain, &AllEdges.back()); + PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back()); + SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back()); } } } - /// For a pair of blocks, A and B, block B is the forced successor of A, + /// For a pair of nodes, A and B, node B is the forced successor of A, /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps - /// to B are from A. Such blocks should be adjacent in the optimal ordering; - /// the method finds and merges such pairs of blocks. + /// to B are from A. Such nodes should be adjacent in the optimal ordering; + /// the method finds and merges such pairs of nodes. void mergeForcedPairs() { - // Find fallthroughs based on edge weights - for (auto &Block : AllBlocks) { - if (SuccNodes[Block.Index].size() == 1 && - PredNodes[SuccNodes[Block.Index][0]].size() == 1 && - SuccNodes[Block.Index][0] != 0) { - size_t SuccIndex = SuccNodes[Block.Index][0]; - Block.ForcedSucc = &AllBlocks[SuccIndex]; - AllBlocks[SuccIndex].ForcedPred = &Block; + // Find fallthroughs based on edge weights. 
+ for (NodeT &Node : AllNodes) { + if (SuccNodes[Node.Index].size() == 1 && + PredNodes[SuccNodes[Node.Index][0]].size() == 1 && + SuccNodes[Node.Index][0] != 0) { + size_t SuccIndex = SuccNodes[Node.Index][0]; + Node.ForcedSucc = &AllNodes[SuccIndex]; + AllNodes[SuccIndex].ForcedPred = &Node; } } // There might be 'cycles' in the forced dependencies, since profile // data isn't 100% accurate. Typically this is observed in loops, when the // loop edges are the hottest successors for the basic blocks of the loop. - // Break the cycles by choosing the block with the smallest index as the + // Break the cycles by choosing the node with the smallest index as the // head. This helps to keep the original order of the loops, which likely // have already been rotated in the optimized manner. - for (auto &Block : AllBlocks) { - if (Block.ForcedSucc == nullptr || Block.ForcedPred == nullptr) + for (NodeT &Node : AllNodes) { + if (Node.ForcedSucc == nullptr || Node.ForcedPred == nullptr) continue; - auto SuccBlock = Block.ForcedSucc; - while (SuccBlock != nullptr && SuccBlock != &Block) { - SuccBlock = SuccBlock->ForcedSucc; + NodeT *SuccNode = Node.ForcedSucc; + while (SuccNode != nullptr && SuccNode != &Node) { + SuccNode = SuccNode->ForcedSucc; } - if (SuccBlock == nullptr) + if (SuccNode == nullptr) continue; - // Break the cycle - AllBlocks[Block.ForcedPred->Index].ForcedSucc = nullptr; - Block.ForcedPred = nullptr; + // Break the cycle. + AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr; + Node.ForcedPred = nullptr; } - // Merge blocks with their fallthrough successors - for (auto &Block : AllBlocks) { - if (Block.ForcedPred == nullptr && Block.ForcedSucc != nullptr) { - auto CurBlock = &Block; + // Merge nodes with their fallthrough successors. + for (NodeT &Node : AllNodes) { + if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) { + const NodeT *CurBlock = &Node; while (CurBlock->ForcedSucc != nullptr) { - const auto NextBlock = CurBlock->ForcedSucc; - mergeChains(Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y); + const NodeT *NextBlock = CurBlock->ForcedSucc; + mergeChains(Node.CurChain, NextBlock->CurChain, 0, MergeTypeT::X_Y); CurBlock = NextBlock; } } @@ -624,35 +694,32 @@ private: /// Merge pairs of chains while improving the ExtTSP objective. void mergeChainPairs() { - /// Deterministically compare pairs of chains - auto compareChainPairs = [](const Chain *A1, const Chain *B1, - const Chain *A2, const Chain *B2) { + /// Deterministically compare pairs of chains. + auto compareChainPairs = [](const ChainT *A1, const ChainT *B1, + const ChainT *A2, const ChainT *B2) { if (A1 != A2) - return A1->id() < A2->id(); - return B1->id() < B2->id(); + return A1->Id < A2->Id; + return B1->Id < B2->Id; }; while (HotChains.size() > 1) { - Chain *BestChainPred = nullptr; - Chain *BestChainSucc = nullptr; - auto BestGain = MergeGainTy(); - // Iterate over all pairs of chains - for (Chain *ChainPred : HotChains) { - // Get candidates for merging with the current chain - for (auto EdgeIter : ChainPred->edges()) { - Chain *ChainSucc = EdgeIter.first; - class ChainEdge *ChainEdge = EdgeIter.second; - // Ignore loop edges + ChainT *BestChainPred = nullptr; + ChainT *BestChainSucc = nullptr; + MergeGainT BestGain; + // Iterate over all pairs of chains. + for (ChainT *ChainPred : HotChains) { + // Get candidates for merging with the current chain. + for (const auto &[ChainSucc, Edge] : ChainPred->Edges) { + // Ignore loop edges. 
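+          // Each outer iteration rescans all chain pairs and applies only the
+          // single best merge; this is the quadratic but precise counterpart
+          // of the priority-queue scheme used by CDSortImpl further below.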
          if (ChainPred == ChainSucc)
            continue;

-          // Stop early if the combined chain violates the maximum allowed size
+          // Stop early if the combined chain violates the maximum allowed size.
          if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
            continue;

-          // Compute the gain of merging the two chains
-          MergeGainTy CurGain =
-              getBestMergeGain(ChainPred, ChainSucc, ChainEdge);
+          // Compute the gain of merging the two chains.
+          MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge);
          if (CurGain.score() <= EPS)
            continue;
@@ -667,53 +734,53 @@ private:
        }
      }

-      // Stop merging when there is no improvement
+      // Stop merging when there is no improvement.
      if (BestGain.score() <= EPS)
        break;

-      // Merge the best pair of chains
+      // Merge the best pair of chains.
      mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
                  BestGain.mergeType());
    }
  }

-  /// Merge remaining blocks into chains w/o taking jump counts into
-  /// consideration. This allows to maintain the original block order in the
-  /// absense of profile data
+  /// Merge remaining nodes into chains w/o taking jump counts into
+  /// consideration. This allows maintaining the original node order in the
+  /// absence of profile data.
  void mergeColdChains() {
    for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
      // Iterating in reverse order to make sure original fallthrough jumps are
      // merged first; this might be beneficial for code size.
      size_t NumSuccs = SuccNodes[SrcBB].size();
      for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
-        auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
-        auto SrcChain = AllBlocks[SrcBB].CurChain;
-        auto DstChain = AllBlocks[DstBB].CurChain;
+        size_t DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
+        ChainT *SrcChain = AllNodes[SrcBB].CurChain;
+        ChainT *DstChain = AllNodes[DstBB].CurChain;
        if (SrcChain != DstChain && !DstChain->isEntry() &&
-            SrcChain->blocks().back()->Index == SrcBB &&
-            DstChain->blocks().front()->Index == DstBB &&
+            SrcChain->Nodes.back()->Index == SrcBB &&
+            DstChain->Nodes.front()->Index == DstBB &&
            SrcChain->isCold() == DstChain->isCold()) {
-          mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y);
+          mergeChains(SrcChain, DstChain, 0, MergeTypeT::X_Y);
        }
      }
    }
  }

-  /// Compute the Ext-TSP score for a given block order and a list of jumps.
+  /// Compute the Ext-TSP score for a given node order and a list of jumps.
  double extTSPScore(const MergedChain &MergedBlocks,
-                     const std::vector<Jump *> &Jumps) const {
+                     const std::vector<JumpT *> &Jumps) const {
    if (Jumps.empty())
      return 0.0;
    uint64_t CurAddr = 0;
-    MergedBlocks.forEach([&](const Block *BB) {
-      BB->EstimatedAddr = CurAddr;
-      CurAddr += BB->Size;
+    MergedBlocks.forEach([&](const NodeT *Node) {
+      Node->EstimatedAddr = CurAddr;
+      CurAddr += Node->Size;
    });

    double Score = 0;
-    for (auto &Jump : Jumps) {
-      const Block *SrcBlock = Jump->Source;
-      const Block *DstBlock = Jump->Target;
+    for (JumpT *Jump : Jumps) {
+      const NodeT *SrcBlock = Jump->Source;
+      const NodeT *DstBlock = Jump->Target;
      Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
                             DstBlock->EstimatedAddr, Jump->ExecutionCount,
                             Jump->IsConditional);
@@ -727,13 +794,13 @@ private:
  /// computes the one having the largest increase in ExtTSP objective. The
  /// result is a pair with the first element being the gain and the second
  /// element being the corresponding merging type.
-  MergeGainTy getBestMergeGain(Chain *ChainPred, Chain *ChainSucc,
-                               ChainEdge *Edge) const {
+  MergeGainT getBestMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
+                              ChainEdge *Edge) const {
    if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) {
      return Edge->getCachedMergeGain(ChainPred, ChainSucc);
    }

-    // Precompute jumps between ChainPred and ChainSucc
+    // Precompute jumps between ChainPred and ChainSucc.
    auto Jumps = Edge->jumps();
    ChainEdge *EdgePP = ChainPred->getEdge(ChainPred);
    if (EdgePP != nullptr) {
@@ -741,60 +808,60 @@
    }
    assert(!Jumps.empty() && "trying to merge chains w/o jumps");

-    // The object holds the best currently chosen gain of merging the two chains
-    MergeGainTy Gain = MergeGainTy();
+    // This object holds the best chosen gain of merging two chains.
+    MergeGainT Gain = MergeGainT();

    /// Given a merge offset and a list of merge types, try to merge two chains
-    /// and update Gain with a better alternative
+    /// and update Gain with a better alternative.
    auto tryChainMerging = [&](size_t Offset,
-                               const std::vector<MergeTypeTy> &MergeTypes) {
-      // Skip merging corresponding to concatenation w/o splitting
-      if (Offset == 0 || Offset == ChainPred->blocks().size())
+                               const std::vector<MergeTypeT> &MergeTypes) {
+      // Skip merging corresponding to concatenation w/o splitting.
+      if (Offset == 0 || Offset == ChainPred->Nodes.size())
        return;
-      // Skip merging if it breaks Forced successors
-      auto BB = ChainPred->blocks()[Offset - 1];
-      if (BB->ForcedSucc != nullptr)
+      // Skip merging if it breaks Forced successors.
+      NodeT *Node = ChainPred->Nodes[Offset - 1];
+      if (Node->ForcedSucc != nullptr)
        return;
      // Apply the merge, compute the corresponding gain, and update the best
-      // value, if the merge is beneficial
-      for (const auto &MergeType : MergeTypes) {
+      // value, if the merge is beneficial.
+      for (const MergeTypeT &MergeType : MergeTypes) {
        Gain.updateIfLessThan(
            computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
      }
    };

-    // Try to concatenate two chains w/o splitting
+    // Try to concatenate two chains w/o splitting.
    Gain.updateIfLessThan(
-        computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y));
+        computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y));

    if (EnableChainSplitAlongJumps) {
-      // Attach (a part of) ChainPred before the first block of ChainSucc
-      for (auto &Jump : ChainSucc->blocks().front()->InJumps) {
-        const auto SrcBlock = Jump->Source;
+      // Attach (a part of) ChainPred before the first node of ChainSucc.
+      for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {
+        const NodeT *SrcBlock = Jump->Source;
        if (SrcBlock->CurChain != ChainPred)
          continue;
        size_t Offset = SrcBlock->CurIndex + 1;
-        tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::X2_X1_Y});
+        tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});
      }

-      // Attach (a part of) ChainPred after the last block of ChainSucc
-      for (auto &Jump : ChainSucc->blocks().back()->OutJumps) {
-        const auto DstBlock = Jump->Source;
+      // Attach (a part of) ChainPred after the last node of ChainSucc.
+      for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {
+        const NodeT *DstBlock = Jump->Target;
        if (DstBlock->CurChain != ChainPred)
          continue;
        size_t Offset = DstBlock->CurIndex;
-        tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1});
+        tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
      }
    }

-    // Try to break ChainPred in various ways and concatenate with ChainSucc
-    if (ChainPred->blocks().size() <= ChainSplitThreshold) {
-      for (size_t Offset = 1; Offset < ChainPred->blocks().size(); Offset++) {
+    // Try to break ChainPred in various ways and concatenate with ChainSucc.
+    if (ChainPred->Nodes.size() <= ChainSplitThreshold) {
+      for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) {
        // Try to split the chain in different ways. In practice, applying
        // X2_Y_X1 merging is almost never provides benefits; thus, we exclude
-        // it from consideration to reduce the search space
-        tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1,
-                                 MergeTypeTy::X2_X1_Y});
+        // it from consideration to reduce the search space.
+        tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1,
+                                 MergeTypeT::X2_X1_Y});
      }
    }
    Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
@@ -805,127 +872,471 @@ private:
  /// merge 'type' and 'offset'.
  ///
  /// The two chains are not modified in the method.
-  MergeGainTy computeMergeGain(const Chain *ChainPred, const Chain *ChainSucc,
-                               const std::vector<Jump *> &Jumps,
-                               size_t MergeOffset,
-                               MergeTypeTy MergeType) const {
-    auto MergedBlocks = mergeBlocks(ChainPred->blocks(), ChainSucc->blocks(),
-                                    MergeOffset, MergeType);
-
-    // Do not allow a merge that does not preserve the original entry block
-    if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
-        !MergedBlocks.getFirstBlock()->isEntry())
-      return MergeGainTy();
+  MergeGainT computeMergeGain(const ChainT *ChainPred, const ChainT *ChainSucc,
+                              const std::vector<JumpT *> &Jumps,
+                              size_t MergeOffset, MergeTypeT MergeType) const {
+    auto MergedBlocks =
+        mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);

-    // The gain for the new chain
-    auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->score();
-    return MergeGainTy(NewGainScore, MergeOffset, MergeType);
-  }
+    // Do not allow a merge that does not preserve the original entry point.
+    if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
+        !MergedBlocks.getFirstNode()->isEntry())
+      return MergeGainT();

-  /// Merge two chains of blocks respecting a given merge 'type' and 'offset'.
-  ///
-  /// If MergeType == 0, then the result is a concatenation of two chains.
-  /// Otherwise, the first chain is cut into two sub-chains at the offset,
-  /// and merged using all possible ways of concatenating three chains.
-  MergedChain mergeBlocks(const std::vector<Block *> &X,
-                          const std::vector<Block *> &Y, size_t MergeOffset,
-                          MergeTypeTy MergeType) const {
-    // Split the first chain, X, into X1 and X2
-    BlockIter BeginX1 = X.begin();
-    BlockIter EndX1 = X.begin() + MergeOffset;
-    BlockIter BeginX2 = X.begin() + MergeOffset;
-    BlockIter EndX2 = X.end();
-    BlockIter BeginY = Y.begin();
-    BlockIter EndY = Y.end();
-
-    // Construct a new chain from the three existing ones
-    switch (MergeType) {
-    case MergeTypeTy::X_Y:
-      return MergedChain(BeginX1, EndX2, BeginY, EndY);
-    case MergeTypeTy::X1_Y_X2:
-      return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
-    case MergeTypeTy::Y_X2_X1:
-      return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
-    case MergeTypeTy::X2_X1_Y:
-      return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
-    }
-    llvm_unreachable("unexpected chain merge type");
+    // The gain for the new chain.
+    auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->Score;
+    return MergeGainT(NewGainScore, MergeOffset, MergeType);
  }

  /// Merge chain From into chain Into, update the list of active chains,
  /// adjacency information, and the corresponding cached values.
-  void mergeChains(Chain *Into, Chain *From, size_t MergeOffset,
-                   MergeTypeTy MergeType) {
+  void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
+                   MergeTypeT MergeType) {
    assert(Into != From && "a chain cannot be merged with itself");

-    // Merge the blocks
-    MergedChain MergedBlocks =
-        mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType);
-    Into->merge(From, MergedBlocks.getBlocks());
+    // Merge the nodes.
+    MergedChain MergedNodes =
+        mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
+    Into->merge(From, MergedNodes.getNodes());
+
+    // Merge the edges.
    Into->mergeEdges(From);
    From->clear();

-    // Update cached ext-tsp score for the new chain
+    // Update cached ext-tsp score for the new chain.
    ChainEdge *SelfEdge = Into->getEdge(Into);
    if (SelfEdge != nullptr) {
-      MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end());
-      Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps()));
+      MergedNodes = MergedChain(Into->Nodes.begin(), Into->Nodes.end());
+      Into->Score = extTSPScore(MergedNodes, SelfEdge->jumps());
    }

-    // Remove chain From from the list of active chains
+    // Remove the chain from the list of active chains.
    llvm::erase_value(HotChains, From);

-    // Invalidate caches
-    for (auto EdgeIter : Into->edges()) {
-      EdgeIter.second->invalidateCache();
-    }
+    // Invalidate caches.
+    for (auto EdgeIt : Into->Edges)
+      EdgeIt.second->invalidateCache();
  }

-  /// Concatenate all chains into a final order of blocks.
-  void concatChains(std::vector<uint64_t> &Order) {
-    // Collect chains and calculate some stats for their sorting
-    std::vector<Chain *> SortedChains;
-    DenseMap<const Chain *, double> ChainDensity;
-    for (auto &Chain : AllChains) {
-      if (!Chain.blocks().empty()) {
+  /// Concatenate all chains into the final order.
+  std::vector<uint64_t> concatChains() {
+    // Collect chains and calculate density stats for their sorting.
+    std::vector<ChainT *> SortedChains;
+    DenseMap<const ChainT *, double> ChainDensity;
+    for (ChainT &Chain : AllChains) {
+      if (!Chain.Nodes.empty()) {
        SortedChains.push_back(&Chain);
-        // Using doubles to avoid overflow of ExecutionCount
+        // Using doubles to avoid overflow of ExecutionCounts.
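+        // (Chain density is total samples over total size; denser, i.e.
+        // hotter-per-byte, chains are emitted first when the order is built.)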
        double Size = 0;
        double ExecutionCount = 0;
-        for (auto *Block : Chain.blocks()) {
-          Size += static_cast<double>(Block->Size);
-          ExecutionCount += static_cast<double>(Block->ExecutionCount);
+        for (NodeT *Node : Chain.Nodes) {
+          Size += static_cast<double>(Node->Size);
+          ExecutionCount += static_cast<double>(Node->ExecutionCount);
        }
        assert(Size > 0 && "a chain of zero size");
        ChainDensity[&Chain] = ExecutionCount / Size;
      }
    }

-    // Sorting chains by density in the decreasing order
-    std::stable_sort(SortedChains.begin(), SortedChains.end(),
-                     [&](const Chain *C1, const Chain *C2) {
-                       // Make sure the original entry block is at the
-                       // beginning of the order
-                       if (C1->isEntry() != C2->isEntry()) {
-                         return C1->isEntry();
-                       }
-
-                       const double D1 = ChainDensity[C1];
-                       const double D2 = ChainDensity[C2];
-                       // Compare by density and break ties by chain identifiers
-                       return (D1 != D2) ? (D1 > D2) : (C1->id() < C2->id());
-                     });
-
-    // Collect the blocks in the order specified by their chains
+    // Sort chains by density in decreasing order.
+    std::sort(SortedChains.begin(), SortedChains.end(),
+              [&](const ChainT *L, const ChainT *R) {
+                // Place the entry point at the beginning of the order.
+                if (L->isEntry() != R->isEntry())
+                  return L->isEntry();
+
+                const double DL = ChainDensity[L];
+                const double DR = ChainDensity[R];
+                // Compare by density and break ties by chain identifiers.
+                return std::make_tuple(-DL, L->Id) <
+                       std::make_tuple(-DR, R->Id);
+              });
+
+    // Collect the nodes in the order specified by their chains.
+    std::vector<uint64_t> Order;
    Order.reserve(NumNodes);
-    for (Chain *Chain : SortedChains) {
-      for (Block *Block : Chain->blocks()) {
-        Order.push_back(Block->Index);
+    for (const ChainT *Chain : SortedChains)
+      for (NodeT *Node : Chain->Nodes)
+        Order.push_back(Node->Index);
+    return Order;
+  }
+
+private:
+  /// The number of nodes in the graph.
+  const size_t NumNodes;
+
+  /// Successors of each node.
+  std::vector<std::vector<uint64_t>> SuccNodes;
+
+  /// Predecessors of each node.
+  std::vector<std::vector<uint64_t>> PredNodes;
+
+  /// All nodes (basic blocks) in the graph.
+  std::vector<NodeT> AllNodes;
+
+  /// All jumps between the nodes.
+  std::vector<JumpT> AllJumps;
+
+  /// All chains of nodes.
+  std::vector<ChainT> AllChains;
+
+  /// All edges between the chains.
+  std::vector<ChainEdge> AllEdges;
+
+  /// Active chains. The vector gets updated at runtime when chains are merged.
+  std::vector<ChainT *> HotChains;
+};
+
+/// The implementation of the Cache-Directed Sort (CDS) algorithm for ordering
+/// functions represented by a call graph.
+class CDSortImpl {
+public:
+  CDSortImpl(const CDSortConfig &Config, ArrayRef<uint64_t> NodeSizes,
+             ArrayRef<uint64_t> NodeCounts, ArrayRef<EdgeCount> EdgeCounts,
+             ArrayRef<uint64_t> EdgeOffsets)
+      : Config(Config), NumNodes(NodeSizes.size()) {
+    initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);
+  }
+
+  /// Run the algorithm and return an ordered set of function clusters.
+  std::vector<uint64_t> run() {
+    // Merge pairs of chains while improving the objective.
+    mergeChainPairs();
+
+    LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"
+                      << " of chains from " << NumNodes << " to "
+                      << HotChains.size() << "\n");
+
+    // Collect nodes from all the chains.
+    return concatChains();
+  }
+
+private:
+  /// Initialize the algorithm's data structures.
+  void initialize(const ArrayRef<uint64_t> &NodeSizes,
+                  const ArrayRef<uint64_t> &NodeCounts,
+                  const ArrayRef<EdgeCount> &EdgeCounts,
+                  const ArrayRef<uint64_t> &EdgeOffsets) {
+    // Initialize nodes.
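+    // (Unlike ExtTSPImpl::initialize, this also accumulates TotalSamples and
+    // TotalSize, which feed the locality models below.)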
+    AllNodes.reserve(NumNodes);
+    for (uint64_t Node = 0; Node < NumNodes; Node++) {
+      uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
+      uint64_t ExecutionCount = NodeCounts[Node];
+      AllNodes.emplace_back(Node, Size, ExecutionCount);
+      TotalSamples += ExecutionCount;
+      if (ExecutionCount > 0)
+        TotalSize += Size;
+    }
+
+    // Initialize jumps between the nodes.
+    SuccNodes.resize(NumNodes);
+    PredNodes.resize(NumNodes);
+    AllJumps.reserve(EdgeCounts.size());
+    for (size_t I = 0; I < EdgeCounts.size(); I++) {
+      auto [Pred, Succ, Count] = EdgeCounts[I];
+      // Ignore recursive calls.
+      if (Pred == Succ)
+        continue;
+
+      SuccNodes[Pred].push_back(Succ);
+      PredNodes[Succ].push_back(Pred);
+      if (Count > 0) {
+        NodeT &PredNode = AllNodes[Pred];
+        NodeT &SuccNode = AllNodes[Succ];
+        AllJumps.emplace_back(&PredNode, &SuccNode, Count);
+        AllJumps.back().Offset = EdgeOffsets[I];
+        SuccNode.InJumps.push_back(&AllJumps.back());
+        PredNode.OutJumps.push_back(&AllJumps.back());
+      }
+    }
+
+    // Initialize chains.
+    AllChains.reserve(NumNodes);
+    HotChains.reserve(NumNodes);
+    for (NodeT &Node : AllNodes) {
+      // Adjust execution counts.
+      Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());
+      Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());
+      // Create chain.
+      AllChains.emplace_back(Node.Index, &Node);
+      Node.CurChain = &AllChains.back();
+      if (Node.ExecutionCount > 0)
+        HotChains.push_back(&AllChains.back());
+    }
+
+    // Initialize chain edges.
+    AllEdges.reserve(AllJumps.size());
+    for (NodeT &PredNode : AllNodes) {
+      for (JumpT *Jump : PredNode.OutJumps) {
+        NodeT *SuccNode = Jump->Target;
+        ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
+        // this edge is already present in the graph.
+        if (CurEdge != nullptr) {
+          assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
+          CurEdge->appendJump(Jump);
+          continue;
+        }
+        // this is a new edge.
+        AllEdges.emplace_back(Jump);
+        PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
+        SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
+      }
+    }
+  }
+
+  /// Merge pairs of chains while there is an improvement in the objective.
+  void mergeChainPairs() {
+    // Create a priority queue containing all edges ordered by the merge gain.
+    auto GainComparator = [](ChainEdge *L, ChainEdge *R) {
+      return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) <
+             std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id);
+    };
+    std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator);
+
+    // Insert the edges into the queue.
+    for (ChainT *ChainPred : HotChains) {
+      for (const auto &[Chain, Edge] : ChainPred->Edges) {
+        // Ignore self-edges.
+        if (Edge->isSelfEdge())
+          continue;
+        // Ignore already processed edges.
+        if (Edge->gain() != -1.0)
+          continue;
+
+        // Compute the gain of merging the two chains.
+        MergeGainT Gain = getBestMergeGain(Edge);
+        Edge->setMergeGain(Gain);
+
+        if (Edge->gain() > EPS)
+          Queue.insert(Edge);
+      }
+    }
+
+    // Merge the chains while the gain of merging is positive.
+    while (!Queue.empty()) {
+      // Extract the best (top) edge for merging.
+      ChainEdge *BestEdge = *Queue.begin();
+      Queue.erase(Queue.begin());
+      // Ignore self-edges.
+      if (BestEdge->isSelfEdge())
+        continue;
+      // Ignore edges with non-positive gains.
+      if (BestEdge->gain() <= EPS)
+        continue;
+
+      ChainT *BestSrcChain = BestEdge->srcChain();
+      ChainT *BestDstChain = BestEdge->dstChain();
+
+      // Remove outdated edges from the queue.
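+      // (Their cached gains reference the chains about to be merged and would
+      // go stale; surviving edges are recomputed and reinserted below.)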
+      for (const auto &[Chain, ChainEdge] : BestSrcChain->Edges)
+        Queue.erase(ChainEdge);
+      for (const auto &[Chain, ChainEdge] : BestDstChain->Edges)
+        Queue.erase(ChainEdge);
+
+      // Merge the best pair of chains.
+      MergeGainT BestGain = BestEdge->getMergeGain();
+      mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(),
+                  BestGain.mergeType());
+
+      // Insert newly created edges into the queue.
+      for (const auto &[Chain, Edge] : BestSrcChain->Edges) {
+        // Ignore loop edges.
+        if (Edge->isSelfEdge())
+          continue;
+
+        // Compute the gain of merging the two chains.
+        MergeGainT Gain = getBestMergeGain(Edge);
+        Edge->setMergeGain(Gain);
+
+        if (Edge->gain() > EPS)
+          Queue.insert(Edge);
+      }
+    }
+  }
+
+  /// Compute the gain of merging two chains.
+  ///
+  /// The function considers all possible ways of merging two chains and
+  /// computes the one having the largest increase in the objective. The
+  /// result holds the gain together with the corresponding merge type.
+  MergeGainT getBestMergeGain(ChainEdge *Edge) const {
+    // Precompute jumps between ChainPred and ChainSucc.
+    auto Jumps = Edge->jumps();
+    assert(!Jumps.empty() && "trying to merge chains w/o jumps");
+    ChainT *SrcChain = Edge->srcChain();
+    ChainT *DstChain = Edge->dstChain();
+
+    // This object holds the best currently chosen gain of merging two chains.
+    MergeGainT Gain = MergeGainT();
+
+    /// Given a list of merge types, try to merge two chains and update Gain
+    /// with a better alternative.
+    auto tryChainMerging = [&](const std::vector<MergeTypeT> &MergeTypes) {
+      // Apply the merge, compute the corresponding gain, and update the best
+      // value, if the merge is beneficial.
+      for (const MergeTypeT &MergeType : MergeTypes) {
+        MergeGainT NewGain =
+            computeMergeGain(SrcChain, DstChain, Jumps, MergeType);
+
+        // When forward and backward gains are the same, prioritize merging that
+        // preserves the original order of the functions in the binary.
+        if (std::abs(Gain.score() - NewGain.score()) < EPS) {
+          if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) ||
+              (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) {
+            Gain = NewGain;
+          }
+        } else if (NewGain.score() > Gain.score() + EPS) {
+          Gain = NewGain;
+        }
+      }
+    };
+
+    // Try to concatenate two chains w/o splitting.
+    tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X});
+
+    return Gain;
+  }
+
+  /// Compute the score gain of merging two chains, respecting a given type.
+  ///
+  /// The two chains are not modified in the method.
+  MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
+                              const std::vector<JumpT *> &Jumps,
+                              MergeTypeT MergeType) const {
+    // This doesn't depend on the ordering of the nodes.
+    double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc);
+
+    // Merge offset is always 0, as the chains are not split.
+    size_t MergeOffset = 0;
+    auto MergedBlocks =
+        mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
+    double DistGain = distBasedLocalityGain(MergedBlocks, Jumps);
+
+    double GainScore = DistGain + Config.FrequencyScale * FreqGain;
+    // Scale the result to increase the importance of merging short chains.
+    if (GainScore >= 0.0)
+      GainScore /= std::min(ChainPred->Size, ChainSucc->Size);
+
+    return MergeGainT(GainScore, MergeOffset, MergeType);
+  }
+
+  /// Compute the change of the frequency locality after merging the chains.
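+  ///
+  /// The model below treats a chain as cache-resident in proportion to its
+  /// share of samples: with P = density * CacheSize / TotalSamples, the
+  /// expected miss probability is (1 - P) ^ CacheEntries.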
+  double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const {
+    auto missProbability = [&](double ChainDensity) {
+      double PageSamples = ChainDensity * Config.CacheSize;
+      if (PageSamples >= TotalSamples)
+        return 0.0;
+      double P = PageSamples / TotalSamples;
+      return pow(1.0 - P, static_cast<double>(Config.CacheEntries));
+    };
+
+    // Cache misses on the chains before merging.
+    double CurScore =
+        ChainPred->ExecutionCount * missProbability(ChainPred->density()) +
+        ChainSucc->ExecutionCount * missProbability(ChainSucc->density());
+
+    // Cache misses on the merged chain.
+    double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount;
+    double MergedSize = ChainPred->Size + ChainSucc->Size;
+    double MergedDensity = static_cast<double>(MergedCounts) / MergedSize;
+    double NewScore = MergedCounts * missProbability(MergedDensity);
+
+    return CurScore - NewScore;
+  }
+
+  /// Compute the distance locality for a jump / call.
+  double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const {
+    uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr;
+    double D = Dist == 0 ? 0.1 : static_cast<double>(Dist);
+    return static_cast<double>(Count) * std::pow(D, -Config.DistancePower);
+  }
+
+  /// Compute the change of the distance locality after merging the chains.
+  double distBasedLocalityGain(const MergedChain &MergedBlocks,
+                               const std::vector<JumpT *> &Jumps) const {
+    if (Jumps.empty())
+      return 0.0;
+    uint64_t CurAddr = 0;
+    MergedBlocks.forEach([&](const NodeT *Node) {
+      Node->EstimatedAddr = CurAddr;
+      CurAddr += Node->Size;
+    });
+
+    double CurScore = 0;
+    double NewScore = 0;
+    for (const JumpT *Arc : Jumps) {
+      uint64_t SrcAddr = Arc->Source->EstimatedAddr + Arc->Offset;
+      uint64_t DstAddr = Arc->Target->EstimatedAddr;
+      NewScore += distScore(SrcAddr, DstAddr, Arc->ExecutionCount);
+      CurScore += distScore(0, TotalSize, Arc->ExecutionCount);
+    }
+    return NewScore - CurScore;
+  }
+
+  /// Merge chain From into chain Into, update the list of active chains,
+  /// adjacency information, and the corresponding cached values.
+  void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
+                   MergeTypeT MergeType) {
+    assert(Into != From && "a chain cannot be merged with itself");
+
+    // Merge the nodes.
+    MergedChain MergedNodes =
+        mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
+    Into->merge(From, MergedNodes.getNodes());
+
+    // Merge the edges.
+    Into->mergeEdges(From);
+    From->clear();
+
+    // Remove the chain from the list of active chains.
+    llvm::erase_value(HotChains, From);
+  }
+
+  /// Concatenate all chains into the final order.
+  std::vector<uint64_t> concatChains() {
+    // Collect chains and calculate density stats for their sorting.
+    std::vector<ChainT *> SortedChains;
+    DenseMap<const ChainT *, double> ChainDensity;
+    for (ChainT &Chain : AllChains) {
+      if (!Chain.Nodes.empty()) {
+        SortedChains.push_back(&Chain);
+        // Using doubles to avoid overflow of ExecutionCounts.
+        double Size = 0;
+        double ExecutionCount = 0;
+        for (NodeT *Node : Chain.Nodes) {
+          Size += static_cast<double>(Node->Size);
+          ExecutionCount += static_cast<double>(Node->ExecutionCount);
+        }
+        assert(Size > 0 && "a chain of zero size");
+        ChainDensity[&Chain] = ExecutionCount / Size;
+      }
+    }
+
+    // Sort chains by density in decreasing order.
+    std::sort(SortedChains.begin(), SortedChains.end(),
+              [&](const ChainT *L, const ChainT *R) {
+                const double DL = ChainDensity[L];
+                const double DR = ChainDensity[R];
+                // Compare by density and break ties by chain identifiers.
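+                // (Densities are negated so the ascending tuple comparison
+                // yields a descending sort by density.)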
+                return std::make_tuple(-DL, L->Id) <
+                       std::make_tuple(-DR, R->Id);
+              });
+
+    // Collect the nodes in the order specified by their chains.
+    std::vector<uint64_t> Order;
+    Order.reserve(NumNodes);
+    for (const ChainT *Chain : SortedChains)
+      for (NodeT *Node : Chain->Nodes)
+        Order.push_back(Node->Index);
+    return Order;
+  }
+
 private:
+  /// Config for the algorithm.
+  const CDSortConfig Config;
+
   /// The number of nodes in the graph.
   const size_t NumNodes;
@@ -935,80 +1346,108 @@ private:
   /// Predecessors of each node.
   std::vector<std::vector<uint64_t>> PredNodes;

-  /// All basic blocks.
-  std::vector<Block> AllBlocks;
+  /// All nodes (functions) in the graph.
+  std::vector<NodeT> AllNodes;

-  /// All jumps between blocks.
-  std::vector<Jump> AllJumps;
+  /// All jumps (function calls) between the nodes.
+  std::vector<JumpT> AllJumps;

-  /// All chains of basic blocks.
-  std::vector<Chain> AllChains;
+  /// All chains of nodes.
+  std::vector<ChainT> AllChains;

-  /// All edges between chains.
+  /// All edges between the chains.
   std::vector<ChainEdge> AllEdges;

   /// Active chains. The vector gets updated at runtime when chains are merged.
-  std::vector<Chain *> HotChains;
+  std::vector<ChainT *> HotChains;
+
+  /// The total number of samples in the graph.
+  uint64_t TotalSamples{0};
+
+  /// The total size of the nodes in the graph.
+  uint64_t TotalSize{0};
 };

 } // end of anonymous namespace

-std::vector<uint64_t> llvm::applyExtTspLayout(
-    const std::vector<uint64_t> &NodeSizes,
-    const std::vector<uint64_t> &NodeCounts,
-    const std::vector<std::pair<std::pair<uint64_t, uint64_t>, uint64_t>> &EdgeCounts) {
-  size_t NumNodes = NodeSizes.size();
-
+std::vector<uint64_t>
+codelayout::computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
+                                ArrayRef<uint64_t> NodeCounts,
+                                ArrayRef<EdgeCount> EdgeCounts) {
   // Verify correctness of the input data.
   assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
-  assert(NumNodes > 2 && "Incorrect input");
+  assert(NodeSizes.size() > 2 && "Incorrect input");

   // Apply the reordering algorithm.
-  auto Alg = ExtTSPImpl(NumNodes, NodeSizes, NodeCounts, EdgeCounts);
-  std::vector<uint64_t> Result;
-  Alg.run(Result);
+  ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts);
+  std::vector<uint64_t> Result = Alg.run();

   // Verify correctness of the output.
   assert(Result.front() == 0 && "Original entry point is not preserved");
-  assert(Result.size() == NumNodes && "Incorrect size of reordered layout");
+  assert(Result.size() == NodeSizes.size() && "Incorrect size of layout");
   return Result;
 }

-double llvm::calcExtTspScore(
-    const std::vector<uint64_t> &Order, const std::vector<uint64_t> &NodeSizes,
-    const std::vector<uint64_t> &NodeCounts,
-    const std::vector<std::pair<std::pair<uint64_t, uint64_t>, uint64_t>> &EdgeCounts) {
-  // Estimate addresses of the blocks in memory
+double codelayout::calcExtTspScore(ArrayRef<uint64_t> Order,
+                                   ArrayRef<uint64_t> NodeSizes,
+                                   ArrayRef<uint64_t> NodeCounts,
+                                   ArrayRef<EdgeCount> EdgeCounts) {
+  // Estimate addresses of the blocks in memory.
   std::vector<uint64_t> Addr(NodeSizes.size(), 0);
   for (size_t Idx = 1; Idx < Order.size(); Idx++) {
     Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
   }
   std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
-  for (auto It : EdgeCounts) {
-    auto Pred = It.first.first;
-    OutDegree[Pred]++;
-  }
+  for (auto Edge : EdgeCounts)
+    ++OutDegree[Edge.src];

-  // Increase the score for each jump
+  // Increase the score for each jump.
   double Score = 0;
-  for (auto It : EdgeCounts) {
-    auto Pred = It.first.first;
-    auto Succ = It.first.second;
-    uint64_t Count = It.second;
-    bool IsConditional = OutDegree[Pred] > 1;
-    Score += ::extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count,
-                           IsConditional);
+  for (auto Edge : EdgeCounts) {
+    bool IsConditional = OutDegree[Edge.src] > 1;
+    Score += ::extTSPScore(Addr[Edge.src], NodeSizes[Edge.src], Addr[Edge.dst],
+                           Edge.count, IsConditional);
   }
   return Score;
 }

-double llvm::calcExtTspScore(
-    const std::vector<uint64_t> &NodeSizes,
-    const std::vector<uint64_t> &NodeCounts,
-    const std::vector<std::pair<std::pair<uint64_t, uint64_t>, uint64_t>> &EdgeCounts) {
+double codelayout::calcExtTspScore(ArrayRef<uint64_t> NodeSizes,
+                                   ArrayRef<uint64_t> NodeCounts,
+                                   ArrayRef<EdgeCount> EdgeCounts) {
   std::vector<uint64_t> Order(NodeSizes.size());
   for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
     Order[Idx] = Idx;
   }
   return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
 }
+
+std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
+    const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes,
+    ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts,
+    ArrayRef<uint64_t> CallOffsets) {
+  // Verify correctness of the input data.
+  assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input");
+
+  // Apply the reordering algorithm.
+  CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
+  std::vector<uint64_t> Result = Alg.run();
+  assert(Result.size() == FuncSizes.size() && "Incorrect size of layout");
+  return Result;
+}
+
+std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
+    ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts,
+    ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets) {
+  CDSortConfig Config;
+  // Populate the config from the command-line options.
+  if (CacheEntries.getNumOccurrences() > 0)
+    Config.CacheEntries = CacheEntries;
+  if (CacheSize.getNumOccurrences() > 0)
+    Config.CacheSize = CacheSize;
+  if (DistancePower.getNumOccurrences() > 0)
+    Config.DistancePower = DistancePower;
+  if (FrequencyScale.getNumOccurrences() > 0)
+    Config.FrequencyScale = FrequencyScale;
+  return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts,
+                                    CallOffsets);
+}
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index 40632b43e73bfe4e4f81a8f19f7244f28ce5eb27..f0459f47605463c498d4de745de06507bea8e2fc 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -582,9 +582,14 @@ int main(int argc, char **argv) {
   // the facility for updating public visibility to linkage unit visibility when
   // specified by an internal option. This is normally done during LTO which is
   // not performed via opt.
-  updateVCallVisibilityInModule(*M,
-                                /* WholeProgramVisibilityEnabledInLTO */ false,
-                                /* DynamicExportSymbols */ {});
+  updateVCallVisibilityInModule(
+      *M,
+      /*WholeProgramVisibilityEnabledInLTO=*/false,
+      // FIXME: These need linker information via a
+      // TBD new interface.
+      /*DynamicExportSymbols=*/{},
+      /*ValidateAllVtablesHaveTypeInfos=*/false,
+      /*IsVisibleToRegularObj=*/[](StringRef) { return true; });

   // Figure out what stream we are supposed to write to...
std::unique_ptr Out; diff --git a/llvm/utils/TableGen/AsmWriterInst.cpp b/llvm/utils/TableGen/AsmWriterInst.cpp index 4a78108d6f4a15253b27b686fd87ec30676674c3..c9558593e142cde9eb6b800f5e63fce1810f0ac6 100644 --- a/llvm/utils/TableGen/AsmWriterInst.cpp +++ b/llvm/utils/TableGen/AsmWriterInst.cpp @@ -12,7 +12,6 @@ #include "AsmWriterInst.h" #include "CodeGenInstruction.h" -#include "CodeGenTarget.h" #include "llvm/ADT/StringExtras.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" diff --git a/llvm/utils/TableGen/AsmWriterInst.h b/llvm/utils/TableGen/AsmWriterInst.h index fe2b934e266f18b1df504b2522d5e6456f33a89d..9c93e82b611b6b5603b024dd1d1002b2318f52ff 100644 --- a/llvm/utils/TableGen/AsmWriterInst.h +++ b/llvm/utils/TableGen/AsmWriterInst.h @@ -21,7 +21,6 @@ namespace llvm { class CodeGenInstruction; - class Record; struct AsmWriterOperand { enum OpType { diff --git a/llvm/utils/TableGen/CTagsEmitter.cpp b/llvm/utils/TableGen/CTagsEmitter.cpp index fe62d6a9b67f2d9f80f17207220575e623be3527..b4ffbfa2012cb8f14dd4fc879b4a2f131b3a9950 100644 --- a/llvm/utils/TableGen/CTagsEmitter.cpp +++ b/llvm/utils/TableGen/CTagsEmitter.cpp @@ -17,7 +17,6 @@ #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include -#include #include using namespace llvm; diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp index dc4fd589eaa83fd0e44b689bb6ec7f2768294f9e..65befa0473fc89f570ee7ba1ede7d7285bdfe6a5 100644 --- a/llvm/utils/TableGen/CodeEmitterGen.cpp +++ b/llvm/utils/TableGen/CodeEmitterGen.cpp @@ -12,10 +12,10 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenHwModes.h" #include "CodeGenInstruction.h" #include "CodeGenTarget.h" -#include "SubtargetFeatureInfo.h" -#include "Types.h" +#include "InfoByHwMode.h" #include "VarLenCodeEmitterGen.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp index dd04778e2dbe7903047a0b06566159706b9ae787..2713e7a1a8ed5b663567ad67eddc7c1350ab2802 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp @@ -13,6 +13,7 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" +#include "CodeGenRegisters.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" diff --git a/llvm/utils/TableGen/CodeGenHwModes.h b/llvm/utils/TableGen/CodeGenHwModes.h index 55507cbca37dddd8fe84f200e27465ec18d39275..335e918bfe73815caa79664b787ced1b720e4bf9 100644 --- a/llvm/utils/TableGen/CodeGenHwModes.h +++ b/llvm/utils/TableGen/CodeGenHwModes.h @@ -12,9 +12,11 @@ #define LLVM_UTILS_TABLEGEN_CODEGENHWMODES_H #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include #include #include +#include #include // HwModeId -> list of predicates (definition) diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp index 02695942f5c12017adc5203ee8c8e163601eae21..fd375735dfd2ad739bda1c9ba05c14873620ecc4 100644 --- a/llvm/utils/TableGen/CodeGenMapTable.cpp +++ b/llvm/utils/TableGen/CodeGenMapTable.cpp @@ -78,6 +78,7 @@ #include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" using namespace llvm; typedef std::map > InstrRelMapTy; diff --git a/llvm/utils/TableGen/CodeGenRegisters.h b/llvm/utils/TableGen/CodeGenRegisters.h index 
765425ed68cbdafb5de5db268a9eab1246651a3c..7638816811e8f0b22259c0358830ee690de092d6 100644 --- a/llvm/utils/TableGen/CodeGenRegisters.h +++ b/llvm/utils/TableGen/CodeGenRegisters.h @@ -14,6 +14,7 @@ #ifndef LLVM_UTILS_TABLEGEN_CODEGENREGISTERS_H #define LLVM_UTILS_TABLEGEN_CODEGENREGISTERS_H +#include "CodeGenHwModes.h" #include "InfoByHwMode.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" @@ -32,8 +33,11 @@ #include #include #include +#include #include #include +#include +#include #include #include #include @@ -41,7 +45,6 @@ namespace llvm { class CodeGenRegBank; - template class SetVector; /// Used to encode a step in a register lane mask transformation. /// Mask the bits specified in Mask, then rotate them Rol bits to the left diff --git a/llvm/utils/TableGen/CodeGenSchedule.h b/llvm/utils/TableGen/CodeGenSchedule.h index bbf5381ad086b3f40072f2e673742b5b759ed477..76ef1e43953078e77add8458838ad3e5e5875ef7 100644 --- a/llvm/utils/TableGen/CodeGenSchedule.h +++ b/llvm/utils/TableGen/CodeGenSchedule.h @@ -15,10 +15,17 @@ #define LLVM_UTILS_TABLEGEN_CODEGENSCHEDULE_H #include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/SetTheory.h" +#include +#include +#include +#include namespace llvm { diff --git a/llvm/utils/TableGen/CompressInstEmitter.cpp b/llvm/utils/TableGen/CompressInstEmitter.cpp index a18d6a6b8854f41dc900b11753cb84c3b75d1e61..1f9c39c08e677a88bae4fde26eac7cf5de0ce20b 100644 --- a/llvm/utils/TableGen/CompressInstEmitter.cpp +++ b/llvm/utils/TableGen/CompressInstEmitter.cpp @@ -65,6 +65,7 @@ //===----------------------------------------------------------------------===// #include "CodeGenInstruction.h" +#include "CodeGenRegisters.h" #include "CodeGenTarget.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/utils/TableGen/DAGISelEmitter.cpp b/llvm/utils/TableGen/DAGISelEmitter.cpp index d012a0172a8fd245a87ad97d81dffd467afe86a4..70738c7adca8d61b2288fa49bfa6ed0aa037752a 100644 --- a/llvm/utils/TableGen/DAGISelEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelEmitter.cpp @@ -12,6 +12,7 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" +#include "CodeGenTarget.h" #include "DAGISelMatcher.h" #include "llvm/Support/Debug.h" #include "llvm/TableGen/Record.h" diff --git a/llvm/utils/TableGen/DAGISelMatcher.cpp b/llvm/utils/TableGen/DAGISelMatcher.cpp index e436a931a9f556ca578550c388b2478f32262435..c08c6a9a30a290f09ab425e711f110e75b90bdf5 100644 --- a/llvm/utils/TableGen/DAGISelMatcher.cpp +++ b/llvm/utils/TableGen/DAGISelMatcher.cpp @@ -8,6 +8,7 @@ #include "DAGISelMatcher.h" #include "CodeGenDAGPatterns.h" +#include "CodeGenRegisters.h" #include "CodeGenTarget.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Record.h" diff --git a/llvm/utils/TableGen/DAGISelMatcher.h b/llvm/utils/TableGen/DAGISelMatcher.h index 77280acaf4cac3622c24393d4a64f5623479df29..c9094f5675e63334ccc708b5a2af7be9ddb4c2f2 100644 --- a/llvm/utils/TableGen/DAGISelMatcher.h +++ b/llvm/utils/TableGen/DAGISelMatcher.h @@ -14,6 +14,11 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "llvm/Support/MachineValueType.h" +#include +#include +#include +#include +#include namespace llvm { struct CodeGenRegister; diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp index 
777e75dcd92911e5da04e71f00bf460b4bf30203..2b876c2f7496923e3f030b968a2af5b710af2662 100644 --- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -11,7 +11,10 @@ //===----------------------------------------------------------------------===// #include "CodeGenDAGPatterns.h" +#include "CodeGenRegisters.h" +#include "CodeGenTarget.h" #include "DAGISelMatcher.h" +#include "SDNodeProperties.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringMap.h" diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp index 44bff4c67ab31c0172c9b3a684c5c36c4e893725..03f7bc4ff51914c1aea179b3b9905deba88a4e2c 100644 --- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp @@ -9,7 +9,10 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" #include "CodeGenRegisters.h" +#include "CodeGenTarget.h" #include "DAGISelMatcher.h" +#include "InfoByHwMode.h" +#include "SDNodeProperties.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/TableGen/Error.h" diff --git a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp index 4273bd69b87d3d138b85ce1d0dbb138da4379aaa..764b86c97dbf831d942ef8816c10273faad7a422 100644 --- a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp @@ -12,6 +12,7 @@ #include "DAGISelMatcher.h" #include "CodeGenDAGPatterns.h" +#include "SDNodeProperties.h" #include "llvm/ADT/StringSet.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/utils/TableGen/DFAEmitter.h b/llvm/utils/TableGen/DFAEmitter.h index 44e5d97d544ffe4716480932df92caf2a0913be6..c831a65a73cdf64a1cd0c2dd2cb5aeeb06a235b0 100644 --- a/llvm/utils/TableGen/DFAEmitter.h +++ b/llvm/utils/TableGen/DFAEmitter.h @@ -21,6 +21,8 @@ #include "llvm/ADT/UniqueVector.h" #include #include +#include +#include namespace llvm { diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp index 44c1df3e9ac4693e1b45d383d44b889ffbef47f3..c9cd5b0d7ec6e6df9b4377452637ed03f6ec0b21 100644 --- a/llvm/utils/TableGen/DXILEmitter.cpp +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/StringSet.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/DXILOperationCommon.h" -#include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" using namespace llvm; diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 8f816744370c046e5775d5bbd5f31480b8d919fc..eabc158ab91eda124b2528460ea79aa7ee5fd00a 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenHwModes.h" #include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "InfoByHwMode.h" diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp index 0a88f67be168ac0e00f43e1d0df05d16d9795cad..ef501f86f291587d082944153fc2d71273669c78 100644 --- a/llvm/utils/TableGen/FastISelEmitter.cpp +++ b/llvm/utils/TableGen/FastISelEmitter.cpp @@ -18,6 +18,9 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" +#include "CodeGenRegisters.h" +#include "CodeGenTarget.h" +#include "InfoByHwMode.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" #include 
"llvm/TableGen/Error.h" diff --git a/llvm/utils/TableGen/GICombinerEmitter.cpp b/llvm/utils/TableGen/GICombinerEmitter.cpp index 2ae313081a6f161b47b248d13de87ce595223d3a..927fb81dc74baf559eb37b0dc5b07d78970b1b6c 100644 --- a/llvm/utils/TableGen/GICombinerEmitter.cpp +++ b/llvm/utils/TableGen/GICombinerEmitter.cpp @@ -15,6 +15,9 @@ #include "GlobalISel/CodeExpander.h" #include "GlobalISel/CodeExpansions.h" #include "GlobalISel/GIMatchDag.h" +#include "GlobalISel/GIMatchDagEdge.h" +#include "GlobalISel/GIMatchDagInstr.h" +#include "GlobalISel/GIMatchDagOperands.h" #include "GlobalISel/GIMatchDagPredicate.h" #include "GlobalISel/GIMatchTree.h" #include "llvm/ADT/SmallSet.h" diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index c79c79948a80d42d23ee9166d2f2ddaed9ede433..360d42f3978aced6de6e7657a23d88dc0a865e82 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -31,6 +31,10 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" +#include "CodeGenIntrinsics.h" +#include "CodeGenRegisters.h" +#include "CodeGenTarget.h" +#include "InfoByHwMode.h" #include "SubtargetFeatureInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CodeGenCoverage.h" diff --git a/llvm/utils/TableGen/InfoByHwMode.cpp b/llvm/utils/TableGen/InfoByHwMode.cpp index 73c4fbf0a5eb5024695e8ec3d8995b27dd2d02a0..5140c5a0d20f0b20eb60e6a5cff4089b972077d4 100644 --- a/llvm/utils/TableGen/InfoByHwMode.cpp +++ b/llvm/utils/TableGen/InfoByHwMode.cpp @@ -17,7 +17,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" - +#include "llvm/TableGen/Record.h" #include using namespace llvm; diff --git a/llvm/utils/TableGen/InfoByHwMode.h b/llvm/utils/TableGen/InfoByHwMode.h index 44927d0bf0df9a2bfc7b8907da202a128c494fc9..6cfd6e8bb49373ee1524a1402cadb756d2053519 100644 --- a/llvm/utils/TableGen/InfoByHwMode.h +++ b/llvm/utils/TableGen/InfoByHwMode.h @@ -16,10 +16,16 @@ #include "CodeGenHwModes.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/MachineValueType.h" - +#include +#include #include #include +#include +#include namespace llvm { diff --git a/llvm/utils/TableGen/PredicateExpander.cpp b/llvm/utils/TableGen/PredicateExpander.cpp index b129401461b51e9eb974ce1194ea2487ff585263..8f96d3307ded8beccd3bad1f973eae18889f978b 100644 --- a/llvm/utils/TableGen/PredicateExpander.cpp +++ b/llvm/utils/TableGen/PredicateExpander.cpp @@ -12,6 +12,7 @@ #include "PredicateExpander.h" #include "CodeGenSchedule.h" // Definition of STIPredicateFunction. 
+#include "llvm/TableGen/Record.h" namespace llvm { diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp index e6689b211a7d3a0abaae84d085cabae3dfef843a..01f2f7864d8dd3371ac3d338f76f34d963c3ab8a 100644 --- a/llvm/utils/TableGen/RegisterBankEmitter.cpp +++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp @@ -11,15 +11,15 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenRegisters.h" +#include "CodeGenTarget.h" +#include "InfoByHwMode.h" #include "llvm/ADT/BitVector.h" #include "llvm/Support/Debug.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" -#include "CodeGenRegisters.h" -#include "CodeGenTarget.h" - #define DEBUG_TYPE "register-bank-emitter" using namespace llvm; diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 113cebf8a08e9c0019b498a8b1965b6f966a647b..5715dc1deb3062e72177e05e7dacf5692020b341 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -12,8 +12,10 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenHwModes.h" #include "CodeGenRegisters.h" #include "CodeGenTarget.h" +#include "InfoByHwMode.h" #include "SequenceToOffsetTable.h" #include "Types.h" #include "llvm/ADT/ArrayRef.h" diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 8afe6d37d0e0d5a5f145159cb3ef6859bf44d4f5..ec26e1c41f85495a50e1a22a9c3923e51ecbae90 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenHwModes.h" #include "CodeGenSchedule.h" #include "CodeGenTarget.h" #include "PredicateExpander.h" diff --git a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp index 2a63fc490380fb3f002ae40cc68b7ab28d21d5c5..1abcf485f8564dde37a60612f4eecfecd6f63f29 100644 --- a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp +++ b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp @@ -11,7 +11,6 @@ #include "llvm/Config/llvm-config.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" -#include using namespace llvm; diff --git a/llvm/utils/TableGen/SubtargetFeatureInfo.h b/llvm/utils/TableGen/SubtargetFeatureInfo.h index 8c8a4487934cd7856ee58038b9a8fc759ba067bb..e6a3f82d9bb83f79da6f314d0fa9bb132972b126 100644 --- a/llvm/utils/TableGen/SubtargetFeatureInfo.h +++ b/llvm/utils/TableGen/SubtargetFeatureInfo.h @@ -9,9 +9,11 @@ #ifndef LLVM_UTIL_TABLEGEN_SUBTARGETFEATUREINFO_H #define LLVM_UTIL_TABLEGEN_SUBTARGETFEATUREINFO_H +#include "llvm/ADT/StringRef.h" #include "llvm/TableGen/Record.h" #include #include +#include #include namespace llvm { diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp index 746e2dd1db16a67646fa8f05f435af078365dcbe..4117ed5a3f6a97641b32584eaf1cec5afbeff1d0 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/TableGen.cpp @@ -13,9 +13,13 @@ #include "TableGenBackends.h" // Declares all backends. 
#include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Main.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/SetTheory.h" +#include +#include +#include using namespace llvm; diff --git a/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp b/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp index 2c1acd8d910c4ab1f0c617aef37feb37ba39131e..85da547d04c1336db74d346bc53fae103bdcdccd 100644 --- a/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp +++ b/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp @@ -58,6 +58,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" using namespace llvm; diff --git a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp b/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp index 1384330ee8a12ae08b640726e58678eb42079f99..b42ffa2aec1a3616b69e3b0e60aa42bbda307c1c 100644 --- a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp @@ -15,6 +15,7 @@ #include "CodeGenTarget.h" #include "X86RecognizableInstr.h" #include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" using namespace llvm; diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp index 5b3f11848de6c781eb5bebf0fff5a56898cdbb5a..7bc17e6f2b64d2140baa13847634b4ae6b025855 100644 --- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "X86RecognizableInstr.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" using namespace llvm; diff --git a/llvm/utils/TableGen/X86MnemonicTables.cpp b/llvm/utils/TableGen/X86MnemonicTables.cpp index f405e051e3559060a1e93640417c01a49093bdc7..b5405488de0e5b7d3784722631b085886cf8fb50 100644 --- a/llvm/utils/TableGen/X86MnemonicTables.cpp +++ b/llvm/utils/TableGen/X86MnemonicTables.cpp @@ -14,7 +14,7 @@ #include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "X86RecognizableInstr.h" -#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" using namespace llvm; diff --git a/llvm/utils/TableGen/X86ModRMFilters.h b/llvm/utils/TableGen/X86ModRMFilters.h index e2d0907b4f8b5bea9c49db7c3244a25427b6f508..d2169a8e879b6fc82671f0373afae5dc361ad8a6 100644 --- a/llvm/utils/TableGen/X86ModRMFilters.h +++ b/llvm/utils/TableGen/X86ModRMFilters.h @@ -17,7 +17,7 @@ #ifndef LLVM_UTILS_TABLEGEN_X86MODRMFILTERS_H #define LLVM_UTILS_TABLEGEN_X86MODRMFILTERS_H -#include "llvm/Support/DataTypes.h" +#include namespace llvm { diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h index ea56a9d7d994ed7fe7ae6545d220f6ec7fac3897..f389ff01670c091209366e858342d43a504a08ae 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.h +++ b/llvm/utils/TableGen/X86RecognizableInstr.h @@ -17,8 +17,10 @@ #define LLVM_UTILS_TABLEGEN_X86RECOGNIZABLEINSTR_H #include "CodeGenInstruction.h" -#include "llvm/Support/DataTypes.h" #include "llvm/Support/X86DisassemblerDecoderCommon.h" +#include +#include +#include struct InstructionSpecifier; diff --git a/mlir/include/mlir/Support/LLVM.h 
b/mlir/include/mlir/Support/LLVM.h
index 7acf97b5a2531e5002233ffe93243fe82446e30d..48d569b0ca756279ca862c56220e1f9fcfa929e9 100644
--- a/mlir/include/mlir/Support/LLVM.h
+++ b/mlir/include/mlir/Support/LLVM.h
@@ -61,7 +61,7 @@ class MutableArrayRef;
 template <typename T>
 using Optional = std::optional<T>;
 template <typename... PT> class PointerUnion;
-template <typename T, typename Vector, typename Set>
+template <typename T, typename Vector, typename Set, unsigned N>
 class SetVector;
 template <typename T, unsigned N>
 class SmallPtrSet;
@@ -123,8 +123,8 @@
 using DenseMap = llvm::DenseMap<KeyT, ValueT, KeyInfoT, BucketT>;
 template <typename ValueT, typename ValueInfoT = DenseMapInfo<ValueT>>
 using DenseSet = llvm::DenseSet<ValueT, ValueInfoT>;
 template <typename T, typename Vector = llvm::SmallVector<T, 0>,
-          typename Set = DenseSet<T>>
-using SetVector = llvm::SetVector<T, Vector, Set>;
+          typename Set = DenseSet<T>, unsigned N = 0>
+using SetVector = llvm::SetVector<T, Vector, Set, N>;
 template <typename AllocatorTy = llvm::MallocAllocator>
 using StringSet = llvm::StringSet<AllocatorTy>;
 using llvm::MutableArrayRef;
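A usage note, not part of the patch: to make the renamed code-layout entry
point concrete, below is a minimal caller sketch. It assumes the EdgeCount
aggregate ({src, dst, count}) lives in llvm::codelayout next to the entry
points, as the qualified calls in the hunks above suggest, and uses the same
header path that this patch adds to BOLT's ReorderFunctions.cpp.

// Sketch: computing an ExtTSP layout for a tiny three-node CFG.
#include "llvm/Transforms/Utils/CodeLayout.h"

#include <cstdint>
#include <vector>

using namespace llvm;

std::vector<uint64_t> exampleExtTspLayout() {
  // Node 0 is the entry; sizes are in bytes, counts are execution counts.
  std::vector<uint64_t> NodeSizes = {16, 32, 8};
  std::vector<uint64_t> NodeCounts = {100, 100, 40};
  // Profiled jumps: 0->1 taken 60 times, 0->2 taken 40, 2->1 taken 40.
  std::vector<codelayout::EdgeCount> EdgeCounts = {
      {0, 1, 60}, {0, 2, 40}, {2, 1, 40}};
  // Returns a permutation of {0, 1, 2}; the entry node stays first, and the
  // hot jump 2->1 is likely to become a fallthrough in the new order.
  return codelayout::computeExtTspLayout(NodeSizes, NodeCounts, EdgeCounts);
}

The CDSort entry point, computeCacheDirectedLayout, has the same shape but
additionally takes per-call offsets (CallOffsets), so its distance model can
measure from the call site rather than from the start of the caller.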