diff --git a/bolt/include/bolt/Passes/ReorderFunctions.h b/bolt/include/bolt/Passes/ReorderFunctions.h
index 52156a600791cb6871bb6ad34cedfa51574b3896..27094bee771ad5293693d553d1d4e59eff31029f 100644
--- a/bolt/include/bolt/Passes/ReorderFunctions.h
+++ b/bolt/include/bolt/Passes/ReorderFunctions.h
@@ -32,6 +32,7 @@ public:
     RT_EXEC_COUNT,
     RT_HFSORT,
     RT_HFSORT_PLUS,
+    RT_CDS,
     RT_PETTIS_HANSEN,
     RT_RANDOM,
     RT_USER
diff --git a/bolt/lib/Passes/ReorderAlgorithm.cpp b/bolt/lib/Passes/ReorderAlgorithm.cpp
index b5052cdaddb13e38fd8b8d7a3f3d5b999ad90ad9..3c3365e1d3d711321c3eda520012d5cbb64e0507 100644
--- a/bolt/lib/Passes/ReorderAlgorithm.cpp
+++ b/bolt/lib/Passes/ReorderAlgorithm.cpp
@@ -531,21 +531,21 @@ void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF,
   }
 
   // Initialize CFG edges
-  using JumpT = std::pair<uint64_t, uint64_t>;
-  std::vector<std::pair<JumpT, uint64_t>> JumpCounts;
+  std::vector<codelayout::EdgeCount> JumpCounts;
   for (BinaryBasicBlock *BB : BF.getLayout().blocks()) {
     auto BI = BB->branch_info_begin();
     for (BinaryBasicBlock *SuccBB : BB->successors()) {
       assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE &&
              "missing profile for a jump");
-      auto It = std::make_pair(BB->getLayoutIndex(), SuccBB->getLayoutIndex());
-      JumpCounts.push_back(std::make_pair(It, BI->Count));
+      JumpCounts.push_back(
+          {BB->getLayoutIndex(), SuccBB->getLayoutIndex(), BI->Count});
       ++BI;
     }
   }
 
   // Run the layout algorithm
-  auto Result = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
+  auto Result =
+      codelayout::computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
   Order.reserve(BF.getLayout().block_size());
   for (uint64_t R : Result)
     Order.push_back(OrigOrder[R]);
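The hunk above is the whole BOLT-side migration to the renamed interface: the jump list changes from nested std::pair values to codelayout::EdgeCount records, and applyExtTspLayout becomes codelayout::computeExtTspLayout. As a quick orientation, here is a minimal standalone sketch (not part of the patch) of how the refactored entry point is driven; the {source, destination, count} field order of EdgeCount is an assumption inferred from the brace-initialization in the hunk.

// Toy CFG with three blocks: sizes in bytes, execution counts, and profiled
// jumps. The result is a permutation of {0, 1, 2} that the ExtTSP model
// scores as more cache-friendly.
#include "llvm/Transforms/Utils/CodeLayout.h"

#include <cstdint>
#include <vector>

std::vector<uint64_t> layoutToyCFG() {
  std::vector<uint64_t> BlockSizes = {16, 32, 8};
  std::vector<uint64_t> BlockCounts = {100, 90, 100};
  // One record per profiled jump: {source block, destination block, count}.
  std::vector<llvm::codelayout::EdgeCount> JumpCounts = {
      {0, 1, 90}, {0, 2, 10}, {1, 2, 90}};
  // The vectors convert implicitly to the ArrayRef parameters.
  return llvm::codelayout::computeExtTspLayout(BlockSizes, BlockCounts,
                                               JumpCounts);
}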
"use hfsort algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, "hfsort+", + "use hfsort+ algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_CDS, "cds", + "use cache-directed sort"), + clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN, + "pettis-hansen", "use Pettis-Hansen algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_RANDOM, "random", + "reorder functions randomly"), + clEnumValN(bolt::ReorderFunctions::RT_USER, "user", + "use function order specified by -function-order")), + cl::ZeroOrMore, cl::cat(BoltOptCategory)); static cl::opt ReorderFunctionsUseHotSize( "reorder-functions-use-hot-size", @@ -323,6 +318,34 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) { case RT_HFSORT_PLUS: Clusters = hfsortPlus(Cg); break; + case RT_CDS: { + // It is required that the sum of incoming arc weights is not greater + // than the number of samples for every function. Ensuring the call graph + // obeys the property before running the algorithm. + Cg.adjustArcWeights(); + + // Initialize CFG nodes and their data + std::vector FuncSizes; + std::vector FuncCounts; + std::vector CallCounts; + std::vector CallOffsets; + for (NodeId F = 0; F < Cg.numNodes(); ++F) { + FuncSizes.push_back(Cg.size(F)); + FuncCounts.push_back(Cg.samples(F)); + for (NodeId Succ : Cg.successors(F)) { + const Arc &Arc = *Cg.findArc(F, Succ); + CallCounts.push_back({F, Succ, uint64_t(Arc.weight())}); + CallOffsets.push_back(uint64_t(Arc.avgCallOffset())); + } + } + + // Run the layout algorithm. + std::vector Result = codelayout::computeCacheDirectedLayout( + FuncSizes, FuncCounts, CallCounts, CallOffsets); + + // Create a single cluster from the computed order of hot functions. + Clusters.emplace_back(Cluster(Result, Cg)); + } break; case RT_PETTIS_HANSEN: Clusters = pettisAndHansen(Cg); break; diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index c8971ea5036dc026689899884b8540391ddc341a..7eeda0ec3efa8a38686a9a7cc72353f5f7758e77 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -174,7 +174,7 @@ private: /// Track symbols symbols processed during and after the registration /// to avoid infinite loops between type conversions and global variable /// creation. - llvm::SmallSetVector seen; + llvm::SmallSetVector seen; }; class DispatchTableConverter { diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt index 8e6a746d219ed366f8fbe03cef9893498956e77b..3a571b8d7b78bc56edb0def8abba6d8a89303e29 100644 --- a/lld/ELF/CMakeLists.txt +++ b/lld/ELF/CMakeLists.txt @@ -72,6 +72,7 @@ add_lld_library(lldELF Passes Support TargetParser + TransformUtils LINK_LIBS lldCommon diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp index ff72731b1f38d65a6896b109b62e72b20aea94fc..5e36964da94fc52328f66d978a65ee6d18a1e0f8 100644 --- a/lld/ELF/CallGraphSort.cpp +++ b/lld/ELF/CallGraphSort.cpp @@ -6,38 +6,21 @@ // //===----------------------------------------------------------------------===// /// -/// Implementation of Call-Chain Clustering from: Optimizing Function Placement -/// for Large-Scale Data-Center Applications -/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf -/// -/// The goal of this algorithm is to improve runtime performance of the final -/// executable by arranging code sections such that page table and i-cache -/// misses are minimized. -/// -/// Definitions: -/// * Cluster -/// * An ordered list of input sections which are laid out as a unit. 
diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp
index ff72731b1f38d65a6896b109b62e72b20aea94fc..5e36964da94fc52328f66d978a65ee6d18a1e0f8 100644
--- a/lld/ELF/CallGraphSort.cpp
+++ b/lld/ELF/CallGraphSort.cpp
@@ -6,38 +6,21 @@
 //
 //===----------------------------------------------------------------------===//
 ///
-/// Implementation of Call-Chain Clustering from: Optimizing Function Placement
-/// for Large-Scale Data-Center Applications
-/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
-///
-/// The goal of this algorithm is to improve runtime performance of the final
-/// executable by arranging code sections such that page table and i-cache
-/// misses are minimized.
-///
-/// Definitions:
-/// * Cluster
-///   * An ordered list of input sections which are laid out as a unit. At the
-///     beginning of the algorithm each input section has its own cluster and
-///     the weight of the cluster is the sum of the weight of all incoming
-///     edges.
-/// * Call-Chain Clustering (C³) Heuristic
-///   * Defines when and how clusters are combined. Pick the highest weighted
-///     input section then add it to its most likely predecessor if it wouldn't
-///     penalize it too much.
-/// * Density
-///   * The weight of the cluster divided by the size of the cluster. This is a
-///     proxy for the amount of execution time spent per byte of the cluster.
-///
-/// It does so given a call graph profile by the following:
-/// * Build a weighted call graph from the call graph profile
-/// * Sort input sections by weight
-/// * For each input section starting with the highest weight
-///   * Find its most likely predecessor cluster
-///   * Check if the combined cluster would be too large, or would have too low
-///     a density.
-///   * If not, then combine the clusters.
-/// * Sort non-empty clusters by density
+/// This file is responsible for sorting sections using LLVM call graph profile
+/// data by placing frequently executed code sections together. The goal of the
+/// placement is to improve the runtime performance of the final executable by
+/// arranging code sections so that i-TLB misses and i-cache misses are reduced.
 ///
+/// The algorithm first builds a call graph based on the profile data and then
+/// iteratively merges "chains" (ordered lists) of input sections which will be
+/// laid out as a unit. There are two implementations for deciding how to
+/// merge a pair of chains:
+/// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows
+///   "Optimizing Function Placement for Large-Scale Data-Center Applications"
+///   https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf
+/// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which
+///   typically produces layouts with higher locality, and hence, yields fewer
+///   instruction cache misses on large binaries.
 //===----------------------------------------------------------------------===//
 
 #include "CallGraphSort.h"
@@ -45,6 +28,7 @@
 #include "InputSection.h"
 #include "Symbols.h"
 #include "llvm/Support/FileSystem.h"
+#include "llvm/Transforms/Utils/CodeLayout.h"
 #include <numeric>
 
@@ -75,6 +59,33 @@ struct Cluster {
   Edge bestPred = {-1, 0};
 };
 
+/// Implementation of the Call-Chain Clustering (C^3). The goal of this
+/// algorithm is to improve runtime performance of the executable by arranging
+/// code sections such that page table and i-cache misses are minimized.
+///
+/// Definitions:
+/// * Cluster
+///   * An ordered list of input sections which are laid out as a unit. At the
+///     beginning of the algorithm each input section has its own cluster and
+///     the weight of the cluster is the sum of the weight of all incoming
+///     edges.
+/// * Call-Chain Clustering (C³) Heuristic
+///   * Defines when and how clusters are combined. Pick the highest weighted
+///     input section then add it to its most likely predecessor if it wouldn't
+///     penalize it too much.
+/// * Density
+///   * The weight of the cluster divided by the size of the cluster. This is a
+///     proxy for the amount of execution time spent per byte of the cluster.
+///
+/// It does so given a call graph profile by the following:
+/// * Build a weighted call graph from the call graph profile
+/// * Sort input sections by weight
+/// * For each input section starting with the highest weight
+///   * Find its most likely predecessor cluster
+///   * Check if the combined cluster would be too large, or would have too low
+///     a density.
+///   * If not, then combine the clusters.
+/// * Sort non-empty clusters by density
 class CallGraphSort {
 public:
   CallGraphSort();
@@ -260,11 +271,74 @@ DenseMap<const InputSectionBase *, int> CallGraphSort::run() {
   return orderMap;
 }
 
+// Sort sections by the profile data using the Cache-Directed Sort algorithm.
+// The placement optimizes locality by co-locating frequently executed code
+// sections together.
+DenseMap<const InputSectionBase *, int> elf::computeCacheDirectedSortOrder() {
+  SmallVector<uint64_t, 0> funcSizes;
+  SmallVector<uint64_t, 0> funcCounts;
+  SmallVector<codelayout::EdgeCount, 0> callCounts;
+  SmallVector<uint64_t, 0> callOffsets;
+  SmallVector<const InputSectionBase *, 0> sections;
+  DenseMap<const InputSectionBase *, size_t> secToTargetId;
+
+  auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t {
+    auto res = secToTargetId.try_emplace(inSec, sections.size());
+    if (res.second) {
+      // inSec does not appear before in the graph.
+      sections.push_back(inSec);
+      assert(inSec->getSize() > 0 && "found a function with zero size");
+      funcSizes.push_back(inSec->getSize());
+      funcCounts.push_back(0);
+    }
+    return res.first->second;
+  };
+
+  // Create the graph.
+  for (std::pair<SectionPair, uint64_t> &c : config->callGraphProfile) {
+    const InputSectionBase *fromSB = cast<InputSectionBase>(c.first.first);
+    const InputSectionBase *toSB = cast<InputSectionBase>(c.first.second);
+    // Ignore edges between input sections belonging to different sections.
+    if (fromSB->getOutputSection() != toSB->getOutputSection())
+      continue;
+
+    uint64_t weight = c.second;
+    // Ignore edges with zero weight.
+    if (weight == 0)
+      continue;
+
+    size_t from = getOrCreateNode(fromSB);
+    size_t to = getOrCreateNode(toSB);
+    // Ignore self-edges (recursive calls).
+    if (from == to)
+      continue;
+
+    callCounts.push_back({from, to, weight});
+    // Assume that the jump is at the middle of the input section. The profile
+    // data does not contain jump offsets.
+    callOffsets.push_back((funcSizes[from] + 1) / 2);
+    funcCounts[to] += weight;
+  }
+
+  // Run the layout algorithm.
+  std::vector<uint64_t> sortedSections = codelayout::computeCacheDirectedLayout(
+      funcSizes, funcCounts, callCounts, callOffsets);
+
+  // Create the final order.
+  DenseMap<const InputSectionBase *, int> orderMap;
+  int curOrder = 1;
+  for (uint64_t secIdx : sortedSections)
+    orderMap[sections[secIdx]] = curOrder++;
+
+  return orderMap;
+}
+
 // Sort sections by the profile data provided by --callgraph-profile-file.
 //
 // This first builds a call graph based on the profile data then merges sections
-// according to the C³ heuristic. All clusters are then sorted by a density
-// metric to further improve locality.
+// according to either the C³ or the Cache-Directed-Sort ordering algorithm.
 DenseMap<const InputSectionBase *, int> elf::computeCallGraphProfileOrder() {
+  if (config->callGraphProfileSort == CGProfileSortKind::Cdsort)
+    return computeCacheDirectedSortOrder();
   return CallGraphSort().run();
 }
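The C³ heuristic restated in the header comment above reduces to a single merge predicate: append a section's cluster to the cluster of its most likely predecessor, unless the combined cluster would be too large or its density would fall too far below the predecessor's. Here is a compact, self-contained sketch of that predicate; the types, constants, and function names are illustrative only, not lld's.

#include <cstdint>
#include <vector>

namespace {
constexpr uint64_t kMaxClusterSize = 1024 * 1024; // illustrative cap
constexpr double kMaxDensityDegradation = 8.0;    // allow up to an 8x drop

struct ToyCluster {
  std::vector<int> sections; // section indices, in layout order
  uint64_t size = 0;         // total bytes
  uint64_t weight = 0;       // total incoming profile weight
  double density() const { return size ? double(weight) / size : 0.0; }
};
} // namespace

// Merge the cluster currently holding `sec` into the cluster holding its
// most likely predecessor `pred`, if the C³ conditions allow it.
// `leader[s]` maps each section to the index of its current cluster.
void mergeC3(std::vector<ToyCluster> &clusters, std::vector<int> &leader,
             int sec, int pred) {
  int from = leader[sec], to = leader[pred];
  if (from == to)
    return; // already co-located
  ToyCluster &src = clusters[from];
  ToyCluster &dst = clusters[to];
  if (dst.size + src.size > kMaxClusterSize)
    return; // combined cluster would be too large
  double combined = double(dst.weight + src.weight) / (dst.size + src.size);
  if (combined * kMaxDensityDegradation < dst.density())
    return; // merging would dilute the predecessor's density too much
  for (int s : src.sections)
    leader[s] = to;
  dst.sections.insert(dst.sections.end(), src.sections.begin(),
                      src.sections.end());
  dst.size += src.size;
  dst.weight += src.weight;
  src = ToyCluster{};
}

Driving this over sections sorted by decreasing weight, then sorting the surviving clusters by density, yields the ordering the comment describes.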
diff --git a/lld/ELF/CallGraphSort.h b/lld/ELF/CallGraphSort.h
index 4997cb102c326402480c3c418e0b34a2f652bba0..1b54f2b62482284bb2d02581dc7481b367ff1760 100644
--- a/lld/ELF/CallGraphSort.h
+++ b/lld/ELF/CallGraphSort.h
@@ -14,6 +14,8 @@ namespace lld::elf {
 class InputSectionBase;
 
+llvm::DenseMap<const InputSectionBase *, int> computeCacheDirectedSortOrder();
+
 llvm::DenseMap<const InputSectionBase *, int> computeCallGraphProfileOrder();
 
 } // namespace lld::elf
diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h
index 706f17b764c88f08c8eda37bcfb38d25f6fc20fb..90a1e312735f090af82a2dcd8048875994e651a8 100644
--- a/lld/ELF/Config.h
+++ b/lld/ELF/Config.h
@@ -57,6 +57,9 @@ enum class BsymbolicKind { None, NonWeakFunctions, Functions, All };
 // For --build-id.
 enum class BuildIdKind { None, Fast, Md5, Sha1, Hexstring, Uuid };
 
+// For --call-graph-profile-sort={none,hfsort,cdsort}.
+enum class CGProfileSortKind { None, Hfsort, Cdsort };
+
 // For --discard-{all,locals,none}.
 enum class DiscardPolicy { Default, All, Locals, None };
 
@@ -193,7 +196,7 @@ struct Config {
   bool armJ1J2BranchEncoding = false;
   bool asNeeded = false;
   BsymbolicKind bsymbolic = BsymbolicKind::None;
-  bool callGraphProfileSort;
+  CGProfileSortKind callGraphProfileSort;
   bool checkSections;
   bool checkDynamicRelocs;
   llvm::DebugCompressionType compressDebugSections;
@@ -225,6 +228,7 @@ struct Config {
   bool ltoDebugPassManager;
   bool ltoEmitAsm;
   bool ltoUniqueBasicBlockSectionNames;
+  bool ltoValidateAllVtablesHaveTypeInfos;
   bool ltoWholeProgramVisibility;
   bool mergeArmExidx;
   bool mipsN32Abi = false;
@@ -441,6 +445,9 @@ struct Ctx {
   std::atomic<bool> hasTlsIe{false};
   // True if we need to reserve two .got entries for local-dynamic TLS model.
   std::atomic<bool> needsTlsLd{false};
+  // True if all native vtable symbols have corresponding type info symbols
+  // during LTO.
+  bool ltoAllVtablesHaveTypeInfos;
 
   void reset();
 };
diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp
index 7e2a72acf8f64e28f6ae0bf8f6934f99f48b4f28..b666602a558647d7be51b25cf1d4716f98a28510 100644
--- a/lld/ELF/Driver.cpp
+++ b/lld/ELF/Driver.cpp
@@ -104,6 +104,7 @@ void Ctx::reset() {
   backwardReferences.clear();
   hasSympart.store(false, std::memory_order_relaxed);
   needsTlsLd.store(false, std::memory_order_relaxed);
+  ltoAllVtablesHaveTypeInfos = false;
 }
 
 bool elf::link(ArrayRef<const char *> args, llvm::raw_ostream &stdoutOS,
@@ -974,21 +975,87 @@ template <class ELFT> static void readCallGraphsFromObjectFiles() {
   }
 }
 
-static DebugCompressionType getCompressDebugSections(opt::InputArgList &args) {
-  StringRef s = args.getLastArgValue(OPT_compress_debug_sections, "none");
-  if (s == "zlib") {
-    if (!compression::zlib::isAvailable())
-      error("--compress-debug-sections: zlib is not available");
-    return DebugCompressionType::Zlib;
+template <class ELFT>
+static void ltoValidateAllVtablesHaveTypeInfos(opt::InputArgList &args) {
+  DenseSet<StringRef> typeInfoSymbols;
+  SmallSetVector<StringRef, 0> vtableSymbols;
+  auto processVtableAndTypeInfoSymbols = [&](StringRef name) {
+    if (name.consume_front("_ZTI"))
+      typeInfoSymbols.insert(name);
+    else if (name.consume_front("_ZTV"))
+      vtableSymbols.insert(name);
+  };
+
+  // Examine all native symbol tables.
+  for (ELFFileBase *f : ctx.objectFiles) {
+    using Elf_Sym = typename ELFT::Sym;
+    for (const Elf_Sym &s : f->template getGlobalELFSyms<ELFT>()) {
+      if (s.st_shndx != SHN_UNDEF) {
+        StringRef name = check(s.getName(f->getStringTable()));
+        processVtableAndTypeInfoSymbols(name);
+      }
+    }
+  }
+
+  for (SharedFile *f : ctx.sharedFiles) {
+    using Elf_Sym = typename ELFT::Sym;
+    for (const Elf_Sym &s : f->template getELFSyms<ELFT>()) {
+      if (s.st_shndx != SHN_UNDEF) {
+        StringRef name = check(s.getName(f->getStringTable()));
+        processVtableAndTypeInfoSymbols(name);
+      }
+    }
   }
-  if (s == "zstd") {
-    if (!compression::zstd::isAvailable())
-      error("--compress-debug-sections: zstd is not available");
-    return DebugCompressionType::Zstd;
+
+  SmallSetVector<StringRef, 0> vtableSymbolsWithNoRTTI;
+  for (StringRef s : vtableSymbols)
+    if (!typeInfoSymbols.count(s))
+      vtableSymbolsWithNoRTTI.insert(s);
+
+  // Remove known safe symbols.
+  for (auto *arg : args.filtered(OPT_lto_known_safe_vtables)) {
+    StringRef knownSafeName = arg->getValue();
+    if (!knownSafeName.consume_front("_ZTV"))
+      error("--lto-known-safe-vtables=: expected symbol to start with _ZTV, "
+            "but got " +
+            knownSafeName);
+    vtableSymbolsWithNoRTTI.remove(knownSafeName);
   }
+
+  ctx.ltoAllVtablesHaveTypeInfos = vtableSymbolsWithNoRTTI.empty();
+  // Check for unmatched RTTI symbols
+  for (StringRef s : vtableSymbolsWithNoRTTI) {
+    message(
+        "--lto-validate-all-vtables-have-type-infos: RTTI missing for vtable "
+        "_ZTV" +
+        s + ", --lto-whole-program-visibility disabled");
+  }
+}
+
+static CGProfileSortKind getCGProfileSortKind(opt::InputArgList &args) {
+  StringRef s = args.getLastArgValue(OPT_call_graph_profile_sort, "hfsort");
+  if (s == "hfsort")
+    return CGProfileSortKind::Hfsort;
+  if (s == "cdsort")
+    return CGProfileSortKind::Cdsort;
   if (s != "none")
-    error("unknown --compress-debug-sections value: " + s);
-  return DebugCompressionType::None;
+    error("unknown --call-graph-profile-sort= value: " + s);
+  return CGProfileSortKind::None;
+}
+
+static DebugCompressionType getCompressionType(StringRef s, StringRef option) {
+  DebugCompressionType type = StringSwitch<DebugCompressionType>(s)
+                                  .Case("zlib", DebugCompressionType::Zlib)
+                                  .Case("zstd", DebugCompressionType::Zstd)
+                                  .Default(DebugCompressionType::None);
+  if (type == DebugCompressionType::None) {
+    if (s != "none")
+      error("unknown " + option + " value: " + s);
+  } else if (const char *reason = compression::getReasonIfUnsupported(
+                 compression::formatFor(type))) {
+    error(option + ": " + reason);
+  }
+  return type;
 }
 
 static StringRef getAliasSpelling(opt::Arg *arg) {
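The validation helper above keys vtables and type infos by their Itanium-mangled suffix: _ZTV<type> and _ZTI<type> share the same <type> string, so stripping the four-character prefix yields a common lookup key. A hedged standalone illustration of just that matching step (the function name is hypothetical; only llvm/ADT utilities are used):

#include "llvm/ADT/ArrayRef.h"
#include "llvm/ADT/DenseSet.h"
#include "llvm/ADT/SmallVector.h"
#include "llvm/ADT/StringRef.h"

// Returns the stripped names of vtables that lack a matching type info.
llvm::SmallVector<llvm::StringRef>
findVtablesWithoutRTTI(llvm::ArrayRef<llvm::StringRef> symbolNames) {
  llvm::DenseSet<llvm::StringRef> typeInfos;
  llvm::SmallVector<llvm::StringRef> vtables;
  for (llvm::StringRef name : symbolNames) {
    if (name.consume_front("_ZTI"))
      typeInfos.insert(name); // "_ZTI1A" -> "1A"
    else if (name.consume_front("_ZTV"))
      vtables.push_back(name); // "_ZTV1A" -> "1A"
  }
  llvm::SmallVector<llvm::StringRef> missing;
  for (llvm::StringRef v : vtables)
    if (!typeInfos.contains(v))
      missing.push_back(v);
  return missing;
}

For example, given {"_ZTV1A", "_ZTI1A", "_ZTV6Native"} this returns {"6Native"}, mirroring the diagnostic emitted for _ZTV6Native in the tests below.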
@@ -1078,10 +1145,13 @@ static void readConfigs(opt::InputArgList &args) {
     else if (arg->getOption().matches(OPT_Bsymbolic))
       config->bsymbolic = BsymbolicKind::All;
   }
+  config->callGraphProfileSort = getCGProfileSortKind(args);
   config->checkSections =
       args.hasFlag(OPT_check_sections, OPT_no_check_sections, true);
   config->chroot = args.getLastArgValue(OPT_chroot);
-  config->compressDebugSections = getCompressDebugSections(args);
+  config->compressDebugSections = getCompressionType(
+      args.getLastArgValue(OPT_compress_debug_sections, "none"),
+      "--compress-debug-sections");
   config->cref = args.hasArg(OPT_cref);
   config->optimizeBBJumps =
       args.hasFlag(OPT_optimize_bb_jumps, OPT_no_optimize_bb_jumps, false);
@@ -1096,8 +1166,6 @@ static void readConfigs(opt::InputArgList &args) {
       args.hasFlag(OPT_eh_frame_hdr, OPT_no_eh_frame_hdr, false);
   config->emitLLVM = args.hasArg(OPT_plugin_opt_emit_llvm, false);
   config->emitRelocs = args.hasArg(OPT_emit_relocs);
-
-  config->callGraphProfileSort = args.hasFlag(
-      OPT_call_graph_profile_sort, OPT_no_call_graph_profile_sort, true);
   config->enableNewDtags =
       args.hasFlag(OPT_enable_new_dtags, OPT_disable_new_dtags, true);
   config->entry = args.getLastArgValue(OPT_entry);
@@ -1138,6 +1206,9 @@ static void readConfigs(opt::InputArgList &args) {
   config->ltoWholeProgramVisibility =
       args.hasFlag(OPT_lto_whole_program_visibility,
                    OPT_no_lto_whole_program_visibility, false);
+  config->ltoValidateAllVtablesHaveTypeInfos =
+      args.hasFlag(OPT_lto_validate_all_vtables_have_type_infos,
+                   OPT_no_lto_validate_all_vtables_have_type_infos, false);
   config->ltoo = args::getInteger(args, OPT_lto_O, 2);
   config->ltoObjPath = args.getLastArgValue(OPT_lto_obj_path_eq);
   config->ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1);
@@ -1465,7 +1536,7 @@ static void readConfigs(opt::InputArgList &args) {
       config->symbolOrderingFile = getSymbolOrderingFile(*buffer);
       // Also need to disable CallGraphProfileSort to prevent
       // LLD order symbols with CGProfile
-      config->callGraphProfileSort = false;
+      config->callGraphProfileSort = CGProfileSortKind::None;
     }
   }
 
@@ -2666,6 +2737,10 @@ void LinkerDriver::link(opt::InputArgList &args) {
                       config->ltoEmitAsm ||
                       !config->thinLTOModulesToCompile.empty();
 
+  // Handle --lto-validate-all-vtables-have-type-infos.
+  if (config->ltoValidateAllVtablesHaveTypeInfos)
+    invokeELFT(ltoValidateAllVtablesHaveTypeInfos, args);
+
   // Do link-time optimization if given files are LLVM bitcode files.
   // This compiles bitcode files into real object files.
   //
@@ -2849,7 +2924,7 @@ void LinkerDriver::link(opt::InputArgList &args) {
   }
 
   // Read the callgraph now that we know what was gced or icfed
-  if (config->callGraphProfileSort) {
+  if (config->callGraphProfileSort != CGProfileSortKind::None) {
     if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file))
       if (std::optional<MemoryBufferRef> buffer = readFile(arg->getValue()))
         readCallGraph(*buffer);
diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp
index b80f1f48f768aa8210377d8c26b8bf2654467481..e8c0e9778c5fdf671e4e25ee95e9bd9e00f39393 100644
--- a/lld/ELF/LTO.cpp
+++ b/lld/ELF/LTO.cpp
@@ -153,6 +153,9 @@ static lto::Config createConfig() {
   c.DwoDir = std::string(config->dwoDir);
 
   c.HasWholeProgramVisibility = config->ltoWholeProgramVisibility;
+  c.ValidateAllVtablesHaveTypeInfos =
+      config->ltoValidateAllVtablesHaveTypeInfos;
+  c.AllVtablesHaveTypeInfos = ctx.ltoAllVtablesHaveTypeInfos;
   c.AlwaysEmitRegularLTOObj = !config->ltoObjPath.empty();
 
   for (const llvm::StringRef &name : config->thinLTOModulesToCompile)
diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td
index b6a6ef64d017a4778094326ed2c31e77ca3e9fcb..c91111006942c53dc3bed5143ed96c84b0116efc 100644
--- a/lld/ELF/Options.td
+++ b/lld/ELF/Options.td
@@ -111,9 +111,12 @@ defm as_needed: B<"as-needed",
 defm call_graph_ordering_file: Eq<"call-graph-ordering-file",
   "Layout sections to optimize the given callgraph">;
 
-defm call_graph_profile_sort: BB<"call-graph-profile-sort",
-  "Reorder sections with call graph profile (default)",
-  "Do not reorder sections with call graph profile">;
+def call_graph_profile_sort: JJ<"call-graph-profile-sort=">,
+  HelpText<"Reorder input sections with call graph profile using the specified algorithm (default: hfsort)">,
+  MetaVarName<"[none,hfsort,cdsort]">,
+  Values<"none,hfsort,cdsort">;
+def : FF<"no-call-graph-profile-sort">, Alias<call_graph_profile_sort>,
+  AliasArgs<["none"]>, Flags<[HelpHidden]>;
 
 // --chroot doesn't have a help text because it is an internal option.
 def chroot: Separate<["--"], "chroot">;
@@ -569,9 +572,14 @@ def lto_cs_profile_file: JJ<"lto-cs-profile-file=">,
 defm lto_pgo_warn_mismatch: BB<"lto-pgo-warn-mismatch",
   "turn on warnings about profile cfg mismatch (default)",
   "turn off warnings about profile cfg mismatch">;
+defm lto_known_safe_vtables : EEq<"lto-known-safe-vtables",
+  "When --lto-validate-all-vtables-have-type-infos is enabled, skip validation on these vtables (_ZTV symbols)">;
 def lto_obj_path_eq: JJ<"lto-obj-path=">;
 def lto_sample_profile: JJ<"lto-sample-profile=">,
   HelpText<"Sample profile file path">;
+defm lto_validate_all_vtables_have_type_infos: BB<"lto-validate-all-vtables-have-type-infos",
+  "Validate that all vtables have type infos for LTO link",
+  "Do not validate that all vtables have type infos for LTO link">;
 defm lto_whole_program_visibility: BB<"lto-whole-program-visibility",
   "Asserts that the LTO link has whole program visibility",
   "Asserts that the LTO link does not have whole program visibility">;
diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1
index edeb7c4bfe37ca6c876fc598da008358c0a0af1d..35ee7ff0447debcc9e58d28260ac00fdef842bf2 100644
--- a/lld/docs/ld.lld.1
+++ b/lld/docs/ld.lld.1
@@ -115,6 +115,19 @@ is not intended to be cryptographically secure.
 .It Fl -build-id
 Synonym for
 .Fl -build-id Ns = Ns Cm fast .
+.It Fl -call-graph-profile-sort Ns = Ns Ar algorithm
+.Ar algorithm
+may be:
+.Pp
+.Bl -tag -width 2n -compact
+.It Cm none
+Ignore call graph profile.
+.It Cm hfsort
+Use hfsort (default).
+.It Cm cdsort
+Use cdsort.
+.El
+.Pp
 .It Fl -color-diagnostics Ns = Ns Ar value
 Use colors in diagnostics.
 .Ar value
diff --git a/lld/test/ELF/cgprofile-obj.s b/lld/test/ELF/cgprofile-obj.s
index f56f3bcbf0c3c5e92c11e83b8c692f0cb17450e9..0848adc5e4279a7edbb8fdc3730f104ed711819b 100644
--- a/lld/test/ELF/cgprofile-obj.s
+++ b/lld/test/ELF/cgprofile-obj.s
@@ -3,8 +3,11 @@
 # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o
 # RUN: ld.lld -e A %t.o -o %t
 # RUN: llvm-nm --no-sort %t | FileCheck %s
-# RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t
+# RUN: ld.lld --call-graph-profile-sort=none -e A %t.o -o %t
 # RUN: llvm-nm --no-sort %t | FileCheck %s --check-prefix=NO-CG
+## --no-call-graph-profile-sort is an alias for --call-graph-profile-sort=none.
+# RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t1
+# RUN: cmp %t %t1
 
 .section .text.D,"ax",@progbits
 D:
diff --git a/lld/test/ELF/cgprofile-txt.s b/lld/test/ELF/cgprofile-txt.s
index 99cbfa574532523a842e8fa539598c08b2e61ae1..c9194bbbc43cbe0284091ef63b66ebdedc4e5813 100644
--- a/lld/test/ELF/cgprofile-txt.s
+++ b/lld/test/ELF/cgprofile-txt.s
@@ -24,8 +24,19 @@
 # RUN: echo "TooManyPreds8 TooManyPreds 10" >> %t.call_graph
 # RUN: echo "TooManyPreds9 TooManyPreds 10" >> %t.call_graph
 # RUN: echo "TooManyPreds10 TooManyPreds 11" >> %t.call_graph
-# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=hfsort -o %t2
 # RUN: llvm-readobj --symbols %t2 | FileCheck %s
+## --call-graph-profile-sort=hfsort is the default.
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2b
+# RUN: cmp %t2 %t2b
+
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2
+# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CDSORT
+
+# RUN: not ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=sort \
+# RUN:   -o /dev/null 2>&1 | FileCheck %s --check-prefix=UNKNOWN
+
+# UNKNOWN: error: unknown --call-graph-profile-sort= value: sort
 
 .section .text.D,"ax",@progbits
 D:
@@ -159,6 +170,31 @@ TooManyPreds10:
 # CHECK: Name: _init2
 # CHECK-NEXT: Value: 0x201141
 
+# CDSORT: Name: D
+# CDSORT-NEXT: Value: 0x201123
+# CDSORT: Name: TooManyPreds
+# CDSORT-NEXT: Value: 0x20112F
+# CDSORT: Name: TooManyPreds10
+# CDSORT-NEXT: Value: 0x20112E
+# CDSORT: Name: C
+# CDSORT-NEXT: Value: 0x201122
+# CDSORT: Name: B
+# CDSORT-NEXT: Value: 0x201121
+# CDSORT: Name: A
+# CDSORT-NEXT: Value: 0x201120
+# CDSORT: Name: TS
+# CDSORT-NEXT: Value: 0x20113D
+# CDSORT: Name: PP
+# CDSORT-NEXT: Value: 0x20113C
+# CDSORT: Name: QC
+# CDSORT-NEXT: Value: 0x20113E
+# CDSORT: Name: GB
+# CDSORT-NEXT: Value: 0x20113F
+# CDSORT: Name: _init
+# CDSORT-NEXT: Value: 0x201140
+# CDSORT: Name: _init2
+# CDSORT-NEXT: Value: 0x201141
+
 # NOSORT: Name: D
 # NOSORT-NEXT: Value: 0x201120
 # NOSORT: Name: TooManyPreds
diff --git a/lld/test/ELF/cgprofile-txt2.s b/lld/test/ELF/cgprofile-txt2.s
index 91961db39c3a883fc948c3b609e2e8b95a1f4e4c..b59b6eeb292fabff00e32148498208b799d3cf46 100644
--- a/lld/test/ELF/cgprofile-txt2.s
+++ b/lld/test/ELF/cgprofile-txt2.s
@@ -5,17 +5,28 @@
 # RUN: echo "B C 50" >> %t.call_graph
 # RUN: echo "C D 40" >> %t.call_graph
 # RUN: echo "D B 10" >> %t.call_graph
-# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2
-# RUN: llvm-readobj --symbols %t2 | FileCheck %s
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=hfsort -o %t2
+# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKC3
+# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2
+# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKCDS
 
-# CHECK: Name: A
-# CHECK-NEXT: Value: 0x201123
-# CHECK: Name: B
-# CHECK-NEXT: Value: 0x201120
-# CHECK: Name: C
-# CHECK-NEXT: Value: 0x201121
-# CHECK: Name: D
-# CHECK-NEXT: Value: 0x201122
+# CHECKC3: Name: A
+# CHECKC3-NEXT: Value: 0x201123
+# CHECKC3: Name: B
+# CHECKC3-NEXT: Value: 0x201120
+# CHECKC3: Name: C
+# CHECKC3-NEXT: Value: 0x201121
+# CHECKC3: Name: D
+# CHECKC3-NEXT: Value: 0x201122
+
+# CHECKCDS: Name: A
+# CHECKCDS-NEXT: Value: 0x201120
+# CHECKCDS: Name: B
+# CHECKCDS-NEXT: Value: 0x201121
+# CHECKCDS: Name: C
+# CHECKCDS-NEXT: Value: 0x201122
+# CHECKCDS: Name: D
+# CHECKCDS-NEXT: Value: 0x201123
 
 .section .text.A,"ax",@progbits
 .globl A
diff --git a/lld/test/ELF/compress-sections-err.s b/lld/test/ELF/compress-sections-err.s
new file mode 100644
index 0000000000000000000000000000000000000000..09780380708319c99a848e46f8b682301bc64ce2
--- /dev/null
+++ b/lld/test/ELF/compress-sections-err.s
@@ -0,0 +1,12 @@
+# REQUIRES: x86
+# UNSUPPORTED: zlib
+
+# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o
+# RUN: ld.lld %t.o --compress-debug-sections=zlib --compress-debug-sections=none -o /dev/null 2>&1 | count 0
+# RUN: not ld.lld %t.o --compress-debug-sections=zlib -o /dev/null 2>&1 | \
+# RUN:   FileCheck %s --implicit-check-not=error:
+
+# CHECK: error: --compress-debug-sections: LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time
+
+.globl _start
+_start:
diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll
new file mode 100644
index 0000000000000000000000000000000000000000..fb357831d6f21a97f34d9a4bf09e70818669bbc4
--- /dev/null
+++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll
@@ -0,0 +1,26 @@
+; REQUIRES: x86
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { ptr }
+%struct.Native = type { %struct.A }
+
+@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI6Native, ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] }
+@_ZTS6Native = linkonce_odr constant [8 x i8] c"6Native\00"
+@_ZTI6Native = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS6Native, ptr @_ZTI1A }
+
+; Base type A does not need to emit a vtable if it's never instantiated. However, RTTI still gets generated
+@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00"
+@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A }
+
+
+define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 {
+  ret i32 1;
+}
+
+define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+attributes #0 = { noinline optnone }
diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll
new file mode 100644
index 0000000000000000000000000000000000000000..4533504c601803158a2ecbc550163c32fc21620a
--- /dev/null
+++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll
@@ -0,0 +1,19 @@
+; REQUIRES: x86
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { ptr }
+%struct.Native = type { %struct.A }
+
+@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] }
+
+define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 {
+  ret i32 1;
+}
+
+define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+attributes #0 = { noinline optnone }
diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll
new file mode 100644
index 0000000000000000000000000000000000000000..43df8366aa2ae0c68e9a5531a0661f90e897ae2e
--- /dev/null
+++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll
@@ -0,0 +1,68 @@
+;; Source code:
+;; cat > a.h <<'eof'
+;; struct A { virtual int foo(); };
+;; int bar(A *a);
+;; eof
+;; cat > b.cc <<'eof'
+;; #include "a.h"
+;; struct B : A { int foo() { return 2; } };
+;; int baz() { B b; return bar(&b); }
+;; eof
+;; clang++ -flto=thin b.cc -c
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.B = type { %struct.A }
+%struct.A = type { ptr }
+
+@_ZTV1B = linkonce_odr dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B3fooEv] }, !type !0, !type !1, !type !2, !type !3
+@_ZTS1B = linkonce_odr dso_local constant [3 x i8] c"1B\00"
+@_ZTI1A = external constant ptr
+@_ZTI1B = linkonce_odr dso_local constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A }
+@_ZTV1A = external unnamed_addr constant { [3 x ptr] }
+
+define dso_local noundef i32 @_Z3bazv() #0 {
+entry:
+  %b = alloca %struct.B
+  call void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %b)
+  %call = call noundef i32 @_Z3barP1A(ptr noundef %b)
+  ret i32 %call
+}
+
+define linkonce_odr dso_local void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 {
+entry:
+  %this.addr = alloca ptr
+  store ptr %this, ptr %this.addr
+  %this1 = load ptr, ptr %this.addr
+  call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this1)
+  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1
+  ret void
+}
+
+declare i32 @_Z3barP1A(ptr noundef)
+
+define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 {
+entry:
+  %this.addr = alloca ptr
+  store ptr %this, ptr %this.addr
+  %this1 = load ptr, ptr %this.addr
+  store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1
+  ret void
+}
+
+define linkonce_odr i32 @_ZN1B3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 {
+entry:
+  %this.addr = alloca ptr
+  store ptr %this, ptr %this.addr
+  %this1 = load ptr, ptr %this.addr
+  ret i32 2
+}
+
+;; Make sure we don't inline or otherwise optimize out the direct calls.
+attributes #0 = { noinline optnone }
+
+!0 = !{i64 16, !"_ZTS1A"}
+!1 = !{i64 16, !"_ZTSM1AFivE.virtual"}
+!2 = !{i64 16, !"_ZTS1B"}
+!3 = !{i64 16, !"_ZTSM1BFivE.virtual"}
diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll
new file mode 100644
index 0000000000000000000000000000000000000000..6cc55df82e2f2814b1717a0ad09c55a81030ed95
--- /dev/null
+++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll
@@ -0,0 +1,16 @@
+; REQUIRES: x86
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@_ZTV1B = external unnamed_addr constant { [4 x ptr] }
+
+define linkonce_odr void @_ZN1BC2Ev(ptr %this) #0 {
+  %this.addr = alloca ptr, align 8
+  store ptr %this, ptr %this.addr, align 8
+  %this1 = load ptr, ptr %this.addr, align 8
+  store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1, align 8
+  ret void
+}
+
+attributes #0 = { noinline optnone }
diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll
new file mode 100644
index 0000000000000000000000000000000000000000..d6ac53f9fb936b0d1eb4f86549242288613dcf26
--- /dev/null
+++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll
@@ -0,0 +1,263 @@
+; REQUIRES: x86
+
+;; Common artifacts
+; RUN: opt --thinlto-bc -o %t1.o %s
+; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s
+; RUN: cp %s %t1_regular.ll
+; RUN: echo '!llvm.module.flags = !{!12, !13}' >> %t1_regular.ll
+; RUN: echo '!12 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll
+; RUN: echo '!13 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll
+; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll
+
+; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos.ll -o %t2.bc
+; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o
+; RUN: ld.lld %t2.o -o %t2.so -shared
+
+; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll -o %t2_nortti.bc
+; RUN: llc -relocation-model=pic -filetype=obj %t2_nortti.bc -o %t2_nortti.o
+; RUN: ld.lld %t2_nortti.o -o %t2_nortti.so -shared
+
+; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_undef.ll -o %t2_undef.bc
+; RUN: llc -relocation-model=pic -filetype=obj %t2_undef.bc -o %t2_undef.o
+; RUN: ld.lld %t2_undef.o -o %t2_undef.so -shared
+
+;; With --lto-whole-program-visibility, we assume no native types can interfere
+;; and thus proceed with devirtualization even in the presence of native types
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi
+
+;; With --lto-validate-all-vtables-have-type-infos, the linker checks for the presence of vtables
+;; and RTTI in native files and blocks devirtualization to be conservative on correctness
+;; for these types.
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2.o -o %t4_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2.o -o %t4_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2.o -o %t4_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
+; RUN: llvm-dis %t4_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR
+
+;; DSOs behave similarly
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2.so -o %t5_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2.so -o %t5_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2.so -o %t5_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
+; RUN: llvm-dis %t5_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR
+
+; VALIDATE-NOT: single-impl:
+; VALIDATE: single-impl: devirtualized a call to _ZN1D1mEi
+; VALIDATE-NOT: single-impl:
+
+;; When vtables without type infos are detected in native files, we have a hole in our knowledge so
+;; --lto-validate-all-vtables-have-type-infos conservatively disables --lto-whole-program-visibility
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2_nortti.o -o %t6_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t6_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t6_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI
+; RUN: llvm-dis %t6_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR
+
+;; DSOs behave similarly
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2_nortti.so -o %t7_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2_nortti.so -o %t7_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2_nortti.so -o %t7_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI
+; RUN: llvm-dis %t7_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR
+
+; NO-RTTI-DAG: --lto-validate-all-vtables-have-type-infos: RTTI missing for vtable _ZTV6Native, --lto-whole-program-visibility disabled
+; NO-RTTI-DAG: single-impl: devirtualized a call to _ZN1D1mEi
+
+;; --lto-known-safe-vtables=* can be used to specifically allow types to participate in WPD
+;; even if they don't have corresponding RTTI
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2_nortti.o -o %t8_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t8_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t8_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t8_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Only check for definitions of vtable symbols; just having a reference does not allow a type to
+;; be derived from
+
+;; Index based WPD
+; RUN: ld.lld %t1.o %t2_undef.o -o %t9_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o %t2_undef.o -o %t9_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o %t2_undef.o -o %t9_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t9_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { ptr }
+%struct.B = type { %struct.A }
+%struct.C = type { %struct.A }
+%struct.D = type { ptr }
+
+@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5
+@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8
+@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11
+
+@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00"
+@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A }
+
+@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00"
+@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A }
+
+@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00"
+@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A }
+
+@_ZTS1D = internal constant [3 x i8] c"1D\00"
+@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D }
+
+;; Prevent the vtables from being dead code eliminated.
+@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ]
+
+; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start
+define i32 @_start(ptr %obj, ptr %obj2, i32 %a) {
+entry:
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr ptr, ptr %vtable, i32 1
+  %fptr1 = load ptr, ptr %fptrptr, align 8
+
+  ;; Check that the call was devirtualized.
+  ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi
+  ;; --lto-whole-program-visibility disabled so no devirtualization
+  ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1
+  ; CHECK-NO-RTTI-IR: %call = tail call i32 %fptr1
+  %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a)
+
+  %fptr22 = load ptr, ptr %vtable, align 8
+
+  ;; We still have to call it as virtual.
+  ; CHECK-IR: %call2 = tail call i32 %fptr22
+  ; CHECK-VALIDATE-IR: %call2 = tail call i32 %fptr22
+  ; CHECK-NO-RTTI-IR: %call2 = tail call i32 %fptr22
+  %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call)
+
+  %vtable2 = load ptr, ptr %obj2
+  %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10)
+  call void @llvm.assume(i1 %p2)
+
+  %fptr33 = load ptr, ptr %vtable2, align 8
+
+  ;; Check that the call was devirtualized.
+  ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi
+  ;; Types not present in native files can still be devirtualized
+  ; CHECK-VALIDATE-IR: %call3 = tail call i32 @_ZN1D1mEi
+  ;; --lto-whole-program-visibility disabled but being local this
+  ;; has VCallVisibilityTranslationUnit visibility so it's still devirtualized
+  ; CHECK-NO-RTTI-IR: %call3 = tail call i32 @_ZN1D1mEi
+  %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2)
+
+  ret i32 %call3
+}
+; CHECK-COMMON-IR-LABEL: ret i32
+; CHECK-COMMON-IR-LABEL: }
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.assume(i1)
+
+define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+;; Make sure we don't inline or otherwise optimize out the direct calls.
+attributes #0 = { noinline optnone }
+
+!0 = !{i64 16, !"_ZTS1A"}
+!1 = !{i64 16, !"_ZTSM1AFviE.virtual"}
+!2 = !{i64 24, !"_ZTSM1AFviE.virtual"}
+!3 = !{i64 16, !"_ZTS1B"}
+!4 = !{i64 16, !"_ZTSM1BFviE.virtual"}
+!5 = !{i64 24, !"_ZTSM1BFviE.virtual"}
+!6 = !{i64 16, !"_ZTS1C"}
+!7 = !{i64 16, !"_ZTSM1CFviE.virtual"}
+!8 = !{i64 24, !"_ZTSM1CFviE.virtual"}
+!9 = !{i64 16, !10}
+!10 = distinct !{}
+!11 = !{i64 2}
diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll
new file mode 100644
index 0000000000000000000000000000000000000000..15040b8707aede995aea588638eb7c7c3eafafaf
--- /dev/null
+++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll
@@ -0,0 +1,183 @@
+; REQUIRES: x86
+
+; RUN: rm -rf %t.dir
+; RUN: split-file %s %t.dir
+; RUN: cd %t.dir
+
+;; Common artifacts
+; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1.o ThinLTO.ll
+; RUN: opt -module-summary -o %t2.o RegularLTO.ll
+
+;; --lto-whole-program-visibility when there's split ThinLTO and a RegularLTO with summary optimizes
+;; using the combined index.
+; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR
+; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR
+
+;; --lto-validate-all-vtables-have-type-infos when there's split ThinLTO and a RegularLTO with summary behaves the same
+;; as everything is present in the combined index.
+; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR
+; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR
+
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi
+
+;--- ThinLTO.ll
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { ptr }
+%struct.B = type { %struct.A }
+%struct.C = type { %struct.A }
+%struct.D = type { ptr }
+
+@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5
+@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8
+@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11
+
+@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00"
+@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A }
+
+@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00"
+@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A }
+
+@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00"
+@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A }
+
+@_ZTS1D = internal constant [3 x i8] c"1D\00"
+@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D }
+
+;; Prevent the vtables from being dead code eliminated.
+@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ], section "llvm.metadata"
+
+; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start
+define i32 @_start(ptr %obj, ptr %obj2, i32 %a) {
+  ;; Call function built with RegularLTO
+  %RegularLTOResult = call i32 @RegularLTO(ptr %obj, i32 %a)
+
+  ;; ThinLTO code starts here
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A")
+  call void @llvm.assume(i1 %p)
+  %fptrptr = getelementptr ptr, ptr %vtable, i32 1
+  %fptr1 = load ptr, ptr %fptrptr, align 8
+
+  ;; Check that the call was devirtualized.
+  ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi
+  %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a)
+
+  %fptr22 = load ptr, ptr %vtable, align 8
+
+  ;; Check that the call was not devirtualized.
+  ; CHECK-IR: %call2 = tail call i32 %fptr22
+  %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call)
+
+  %vtable2 = load ptr, ptr %obj2
+  %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10)
+  call void @llvm.assume(i1 %p2)
+
+  %fptr33 = load ptr, ptr %vtable2, align 8
+
+  ;; Check that the call was devirtualized.
+  ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi
+  %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2)
+
+  ret i32 %call3
+}
+; CHECK-COMMON-IR-LABEL: ret i32
+; CHECK-COMMON-IR-LABEL: }
+
+declare i32 @RegularLTO(ptr)
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.assume(i1)
+
+define linkonce_odr i32 @_ZN1A1fEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+;; Make sure we don't inline or otherwise optimize out the direct calls.
+attributes #0 = { noinline optnone }
+
+!0 = !{i64 16, !"_ZTS1A"}
+!1 = !{i64 16, !"_ZTSM1AFviE.virtual"}
+!2 = !{i64 24, !"_ZTSM1AFviE.virtual"}
+!3 = !{i64 16, !"_ZTS1B"}
+!4 = !{i64 16, !"_ZTSM1BFviE.virtual"}
+!5 = !{i64 24, !"_ZTSM1BFviE.virtual"}
+!6 = !{i64 16, !"_ZTS1C"}
+!7 = !{i64 16, !"_ZTSM1CFviE.virtual"}
+!8 = !{i64 24, !"_ZTSM1CFviE.virtual"}
+!9 = !{i64 16, !10}
+!10 = distinct !{}
+!11 = !{i64 2}
+
+;--- RegularLTO.ll
+; REQUIRES: x86
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+%struct.A = type { ptr }
+%struct.Native = type { %struct.A }
+
+@_ZTV7Regular = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI7Regular, ptr @_ZN7Regular1fEi, ptr @_ZN1A1nEi] } , !type !0, !type !1, !type !2, !type !3, !type !4, !type !5
+@_ZTS7Regular = linkonce_odr constant [9 x i8] c"7Regular\00"
+@_ZTI7Regular = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS7Regular, ptr @_ZTI1A }
+
+; Base type A does not need to emit a vtable if it's never instantiated. However, RTTI still gets generated
+@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00"
+@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A }
+
+;; Prevent the vtables from being dead code eliminated.
+@llvm.used = appending global [1 x ptr] [ ptr @_ZTV7Regular ], section "llvm.metadata"
+
+; CHECK-COMMON-REGULAR-IR-LABEL: define dso_local i32 @RegularLTO
+define i32 @RegularLTO(ptr %obj, i32 %a) #0 {
+entry:
+  %vtable = load ptr, ptr %obj
+  %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A")
+  call void @llvm.assume(i1 %p)
+  %fptr1 = load ptr, ptr %vtable, align 8
+
+  ;; Check that the call was not devirtualized.
+  ; CHECK-REGULAR-IR: %call = tail call i32 %fptr1
+  %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a)
+
+  ret i32 %call
+}
+; CHECK-COMMON-REGULAR-IR-LABEL: ret i32
+; CHECK-COMMON-REGULAR-IR-LABEL: }
+
+declare i1 @llvm.type.test(ptr, metadata)
+declare void @llvm.assume(i1)
+
+define linkonce_odr i32 @_ZN7Regular1fEi(ptr %this, i32 %a) #0 {
+  ret i32 1;
+}
+
+define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 {
+  ret i32 0;
+}
+
+attributes #0 = { noinline optnone }
+!llvm.module.flags = !{!6, !7}
+
+!0 = !{i64 16, !"_ZTS1A"}
+!1 = !{i64 16, !"_ZTSM1AFviE.virtual"}
+!2 = !{i64 24, !"_ZTSM1AFviE.virtual"}
+!3 = !{i64 16, !"_ZTS7Regular"}
+!4 = !{i64 16, !"_ZTSM7RegularFviE.virtual"}
+!5 = !{i64 24, !"_ZTSM7RegularFviE.virtual"}
+!6 = !{i32 1, !"ThinLTO", i32 0}
+!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1}
diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll
new file mode 100644
index 0000000000000000000000000000000000000000..30bd75606f7d2d0aeb4bfeb2e82f289941101d0a
--- /dev/null
+++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll
@@ -0,0 +1,136 @@
+; REQUIRES: x86
+
+;; Common artifacts
+; RUN: opt --thinlto-bc -o %t1.o %s
+; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s
+; RUN: cp %s %t1_regular.ll
+; RUN: echo '!llvm.module.flags = !{!6, !7}' >> %t1_regular.ll
+; RUN: echo '!6 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll
+; RUN: echo '!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll
+; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll
+
+;; With --lto-whole-program-visibility, we assume no native types can interfere
+;; and thus proceed with devirtualization even in the presence of native types
+
+;; Index based WPD
+; RUN: ld.lld %t1.o -o %t3_index -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Hybrid WPD
+; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+;; Regular LTO WPD
+; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK
+; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR
+
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi
+; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi
+
+;; With --lto-whole-program-visibility and --lto-validate-all-vtables-have-type-infos
+;; we rely on resolutions on the typename symbol to inform us of what's outside the summary.
+;; Without the typename symbol in the LTO unit (e.g. RTTI disabled) this causes
+;; conservative disablement of WPD on these types unless it's local
+
+;; Index based WPD
+; RUN: ld.lld %t1.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \
+; RUN:   -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE
2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +; VALIDATE-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !2 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN1D1mEi] }, !type !3, !vcall_visibility !5 + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ] + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + ;; No resolution for _ZTS1A means we don't devirtualize + ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; We still have to call it as virtual. + ; CHECK-IR: %call3 = tail call i32 %fptr22 + ; CHECK-VALIDATE-IR: %call3 = tail call i32 %fptr22 + %call3 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !4) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. 
+ ; CHECK-IR: %call4 = tail call i32 @_ZN1D1mEi + ;; Being local this has VCallVisibilityTranslationUnit + ;; visibility so it's still devirtualized + ; CHECK-VALIDATE-IR: %call4 = tail call i32 @_ZN1D1mEi + %call4 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call3) + ret i32 %call4 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTS1B"} +!2 = !{i64 16, !"_ZTS1C"} +!3 = !{i64 16, !4} +!4 = distinct !{} +!5 = !{i64 2} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll new file mode 100644 index 0000000000000000000000000000000000000000..4ef048d6b6c601b9bf174c24f3c8f4372814d0bc --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll @@ -0,0 +1,130 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!2, !3}' >> %t1_regular.ll +; RUN: echo '!2 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!3 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_ref.ll -o %t2.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o + +;; Native objects can contain only a reference to the base type infos if the base declaration has no key functions. +;; Because of that, --lto-validate-all-vtables-have-type-infos needs to query for the type info symbol inside native files rather than the +;; type name symbol that's used as the key in !type metadata to correctly stop devirtualization on the native type. + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +; CHECK-NOT: single-impl: devirtualized a call to _ZN1A3fooEv + +;; Source code: +;; cat > a.h <<'eof' +;; struct A { virtual int foo(); }; +;; int bar(A *a); +;; eof +;; cat > main.cc <<'eof' +;; #include "a.h" +;; +;; int A::foo() { return 1; } +;; int bar(A *a) { return a->foo(); } +;; +;; extern int baz(); +;; int main() { +;; A a; +;; int i = bar(&a); +;; int j = baz(); +;; return i + j; +;; } +;; eof +;; clang++ -fwhole-program-vtables -fno-split-lto-unit -flto=thin main.cc -c + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { %struct.Abase } +%struct.Abase = type { ptr } + +@_ZTV1A = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A3fooEv] }, align 8, !type !0, !type !1 +@_ZTS1A = dso_local constant [3 x i8] c"1A\00", align 1 +@_ZTI1A = dso_local constant { ptr, ptr } { ptr null, ptr @_ZTS1A }, align 8 + +define dso_local noundef i32 @_ZN1A3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + ret i32 1 +} + +; CHECK-IR: define dso_local noundef i32 @_Z3barP1A +define dso_local noundef i32 @_Z3barP1A(ptr noundef %a) #0 { +entry: + %a.addr = alloca ptr + store ptr %a, ptr %a.addr + %0 = load ptr, ptr %a.addr + %vtable = load ptr, ptr %0 + %1 = call i1 @llvm.public.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %1) + %vfn = getelementptr inbounds ptr, ptr %vtable, i64 0 + %fptr = load ptr, ptr %vfn + ;; Check that the call was not devirtualized. + ; CHECK-IR: %call = call noundef i32 %fptr + %call = call noundef i32 %fptr(ptr noundef nonnull align 8 dereferenceable(8) %0) + ret i32 %call +} +; CHECK-IR: ret i32 +; CHECK-IR: } + +declare i1 @llvm.public.type.test(ptr, metadata) +declare void @llvm.assume(i1 noundef) + +define dso_local noundef i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %a = alloca %struct.A, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 0, ptr %retval, align 4 + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %a) + %call = call noundef i32 @_Z3barP1A(ptr noundef %a) + store i32 %call, ptr %i, align 4 + %call1 = call noundef i32 @_Z3bazv() + store i32 %call1, ptr %j, align 4 + %0 = load i32, ptr %i, align 4 + %1 = load i32, ptr %j, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +declare noundef i32 @_Z3bazv() + +;; Make sure we don't inline or otherwise optimize out the direct calls. 
+attributes #0 = { noinline optnone }
+
+!0 = !{i64 16, !"_ZTS1A"}
+!1 = !{i64 16, !"_ZTSM1AFivE.virtual"}
diff --git a/llvm/include/llvm/ADT/SetVector.h b/llvm/include/llvm/ADT/SetVector.h
index 37509e28f89100789720f326593077d204d805e9..2eabe578a479cf052c8199cf6d9ce32ef5f1d3f7 100644
--- a/llvm/include/llvm/ADT/SetVector.h
+++ b/llvm/include/llvm/ADT/SetVector.h
@@ -35,9 +35,30 @@ namespace llvm {
 /// This adapter class provides a way to keep a set of things that also has the
 /// property of a deterministic iteration order. The order of iteration is the
 /// order of insertion.
+///
+/// The key and value types are derived from the Set and Vector types
+/// respectively. This allows the vector-type operations and set-type operations
+/// to have different types. In particular, this is useful when storing pointers
+/// as "Foo *" values but looking them up as "const Foo *" keys.
+///
+/// No constraint is placed on the key and value types, although it is assumed
+/// that value_type can be converted into key_type for insertion. Users must be
+/// aware of any loss of information in this conversion. For example, setting
+/// value_type to float and key_type to int can produce very surprising results,
+/// but it is not explicitly disallowed.
+///
+/// The parameter N specifies the "small" size of the container, which is the
+/// number of elements up to which a linear scan over the Vector will be used
+/// when searching for elements instead of checking Set, due to it being better
+/// for performance. A value of 0 means that this mode of operation is not used,
+/// and is the default value.
 template <typename T, typename Vector = std::vector<T>,
-          typename Set = DenseSet<T>>
+          typename Set = DenseSet<T>, unsigned N = 0>
 class SetVector {
+  // Much like in SmallPtrSet, this value should not be too high to prevent
+  // excessively long linear scans from occurring.
+  static_assert(N <= 32, "Small size should be less than or equal to 32!");
+
 public:
   using value_type = T;
   using key_type = T;
@@ -139,6 +160,17 @@ public:
   /// Insert a new element into the SetVector.
   /// \returns true if the element was inserted into the SetVector.
   bool insert(const value_type &X) {
+    if constexpr (canBeSmall())
+      if (isSmall()) {
+        if (llvm::find(vector_, X) == vector_.end()) {
+          vector_.push_back(X);
+          if (vector_.size() > N)
+            makeBig();
+          return true;
+        }
+        return false;
+      }
+
     bool result = set_.insert(X).second;
     if (result)
       vector_.push_back(X);
@@ -149,12 +181,21 @@ public:
   template <typename It> void insert(It Start, It End) {
     for (; Start != End; ++Start)
-      if (set_.insert(*Start).second)
-        vector_.push_back(*Start);
+      insert(*Start);
   }
 
   /// Remove an item from the set vector.
   bool remove(const value_type& X) {
+    if constexpr (canBeSmall())
+      if (isSmall()) {
+        typename vector_type::iterator I = find(vector_, X);
+        if (I != vector_.end()) {
+          vector_.erase(I);
+          return true;
+        }
+        return false;
+      }
+
     if (set_.erase(X)) {
       typename vector_type::iterator I = find(vector_, X);
       assert(I != vector_.end() && "Corrupted SetVector instances!");
@@ -169,6 +210,10 @@ public:
   /// element erased. This is the end of the SetVector if the last element is
   /// erased.
   iterator erase(const_iterator I) {
+    if constexpr (canBeSmall())
+      if (isSmall())
+        return vector_.erase(I);
+
     const key_type &V = *I;
     assert(set_.count(V) && "Corrupted SetVector instances!");
     set_.erase(V);
@@ -190,8 +235,15 @@ public:
   /// \returns true if any element is removed.
   template <typename UnaryPredicate> bool remove_if(UnaryPredicate P) {
-    typename vector_type::iterator I =
-        llvm::remove_if(vector_, TestAndEraseFromSet<UnaryPredicate>(P, set_));
+    typename vector_type::iterator I = [this, P] {
+      if constexpr (canBeSmall())
+        if (isSmall())
+          return llvm::remove_if(vector_, P);
+
+      return llvm::remove_if(vector_,
+                             TestAndEraseFromSet<UnaryPredicate>(P, set_));
+    }();
+
     if (I == vector_.end())
       return false;
     vector_.erase(I, vector_.end());
@@ -200,12 +252,20 @@ public:
   /// Check if the SetVector contains the given key.
   bool contains(const key_type &key) const {
+    if constexpr (canBeSmall())
+      if (isSmall())
+        return is_contained(vector_, key);
+
     return set_.find(key) != set_.end();
   }
 
   /// Count the number of elements of a given key in the SetVector.
   /// \returns 0 if the element is not in the SetVector, 1 if it is.
   size_type count(const key_type &key) const {
+    if constexpr (canBeSmall())
+      if (isSmall())
+        return is_contained(vector_, key);
+
     return set_.count(key);
   }
@@ -261,7 +321,7 @@ public:
       remove(*SI);
   }
 
-  void swap(SetVector<T, Vector, Set> &RHS) {
+  void swap(SetVector<T, Vector, Set, N> &RHS) {
     set_.swap(RHS.set_);
     vector_.swap(RHS.vector_);
   }
@@ -290,6 +350,16 @@ private:
     }
   };
 
+  [[nodiscard]] static constexpr bool canBeSmall() { return N != 0; }
+
+  [[nodiscard]] bool isSmall() const { return set_.empty(); }
+
+  void makeBig() {
+    if constexpr (canBeSmall())
+      for (const auto &entry : vector_)
+        set_.insert(entry);
+  }
+
   set_type set_;         ///< The set.
   vector_type vector_;   ///< The vector.
 };
@@ -297,8 +367,7 @@ private:
 /// A SetVector that performs no allocations if smaller than
 /// a certain size.
 template <typename T, unsigned N>
-class SmallSetVector
-    : public SetVector<T, SmallVector<T, N>, SmallDenseSet<T, N>> {
+class SmallSetVector : public SetVector<T, SmallVector<T, N>, DenseSet<T>, N> {
 public:
   SmallSetVector() = default;
@@ -314,9 +383,9 @@
 namespace std {
 
 /// Implement std::swap in terms of SetVector swap.
-template<typename T, typename V, typename S>
-inline void
-swap(llvm::SetVector<T, V, S> &LHS, llvm::SetVector<T, V, S> &RHS) {
+template <typename T, typename V, typename S, unsigned N>
+inline void swap(llvm::SetVector<T, V, S, N> &LHS,
+                 llvm::SetVector<T, V, S, N> &RHS) {
   LHS.swap(RHS);
 }
diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h
index 7a746592c9fcd79897a98a35f2f45687f2f2bc7a..cb825926982040a0285b9ea6636bcdc8ad855f89 100644
--- a/llvm/include/llvm/LTO/Config.h
+++ b/llvm/include/llvm/LTO/Config.h
@@ -79,6 +79,12 @@ struct Config {
   /// link.
   bool HasWholeProgramVisibility = false;
 
+  /// We're validating that all native vtables have corresponding type infos.
+  bool ValidateAllVtablesHaveTypeInfos = false;
+  /// If all native vtables have corresponding type infos, allow
+  /// usage of RTTI to block devirtualization on types used in native files.
+  bool AllVtablesHaveTypeInfos = false;
+
   /// Always emit a Regular LTO object even when it is empty because no Regular
   /// LTO modules were linked. This option is useful for some build system which
   /// want to know a priori all possible output files.
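A short usage sketch of the small-size mode added to SetVector above (illustrative only; the function name and values are made up, not part of the patch). With N > 0, the container is "small" while the backing set is empty, so lookups scan the vector linearly; the first insertion that grows the vector past N triggers makeBig(), which populates the set lazily.

#include "llvm/ADT/SetVector.h"
#include <cassert>

void smallSetVectorSketch() {
  // N = 4: membership is checked by scanning the vector while the container
  // holds at most four elements; afterwards the DenseSet takes over.
  llvm::SmallSetVector<int, 4> SV;
  for (int V : {3, 1, 4, 1, 5}) // the duplicate 1 is rejected in small mode
    SV.insert(V);
  assert(SV.size() == 4 && SV.contains(4));
  SV.insert(9); // fifth distinct element: makeBig() populates the set here
  bool Removed = SV.remove(9); // big mode: erases from both set and vector
  assert(Removed && !SV.contains(9));
}

Note the design choice visible in isSmall(): instead of a separate mode flag, an empty set_ means "still small", so no extra state is carried once the container grows.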
diff --git a/llvm/include/llvm/TableGen/DirectiveEmitter.h b/llvm/include/llvm/TableGen/DirectiveEmitter.h
index e85c13f4b7cce27ae67cb47058eb52d53151e81f..4bca4b13d729ab6445cd4349962a22f357a2310e 100644
--- a/llvm/include/llvm/TableGen/DirectiveEmitter.h
+++ b/llvm/include/llvm/TableGen/DirectiveEmitter.h
@@ -1,8 +1,13 @@
 #ifndef LLVM_TABLEGEN_DIRECTIVEEMITTER_H
 #define LLVM_TABLEGEN_DIRECTIVEEMITTER_H
 
+#include "llvm/ADT/STLExtras.h"
 #include "llvm/ADT/StringExtras.h"
+#include "llvm/ADT/StringRef.h"
 #include "llvm/TableGen/Record.h"
+#include <algorithm>
+#include <string>
+#include <vector>
 
 namespace llvm {
diff --git a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
index a2296a064213641e2e3eb706eae864a51b50189f..4932157a7a3dc54f57f652d8a2e6fbfed0c7615d 100644
--- a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
+++ b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h
@@ -244,10 +244,18 @@ void updatePublicTypeTestCalls(Module &M,
                                bool WholeProgramVisibilityEnabledInLTO);
 void updateVCallVisibilityInModule(
     Module &M, bool WholeProgramVisibilityEnabledInLTO,
-    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols);
+    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols,
+    bool ValidateAllVtablesHaveTypeInfos,
+    function_ref<bool(StringRef)> IsVisibleToRegularObj);
 void updateVCallVisibilityInIndex(
     ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO,
-    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols);
+    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols,
+    const DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols);
+
+void getVisibleToRegularObjVtableGUIDs(
+    ModuleSummaryIndex &Index,
+    DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols,
+    function_ref<bool(StringRef)> IsVisibleToRegularObj);
 
 /// Perform index-based whole program devirtualization on the \p Summary
 /// index. Any devirtualized targets used by a type test in another module
diff --git a/llvm/include/llvm/Transforms/Utils/CodeLayout.h b/llvm/include/llvm/Transforms/Utils/CodeLayout.h
index e8106e474332199a9e49a19b04fe0d91725a90ca..f5127cff24af0dfd3901d19706db6f36656adcb8 100644
--- a/llvm/include/llvm/Transforms/Utils/CodeLayout.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeLayout.h
@@ -14,14 +14,21 @@
 #ifndef LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
 #define LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
 
+#include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
+#include <cstdint>
 #include <vector>
 
-namespace llvm {
+namespace llvm::codelayout {
 
 using EdgeT = std::pair<uint64_t, uint64_t>;
-using EdgeCountT = std::pair<EdgeT, uint64_t>;
+
+struct EdgeCount {
+  uint64_t src;
+  uint64_t dst;
+  uint64_t count;
+};
 
 /// Find a layout of nodes (basic blocks) of a given CFG optimizing jump
 /// locality and thus processor I-cache utilization. This is achieved via
@@ -34,25 +41,55 @@ using EdgeCountT = std::pair<EdgeT, uint64_t>;
 /// \p EdgeCounts: The execution counts of every edge (jump) in the profile. The
 /// map also defines the edges in CFG and should include 0-count edges.
 /// \returns The best block order found.
-std::vector<uint64_t>
-applyExtTspLayout(const std::vector<uint64_t> &NodeSizes,
-                  const std::vector<uint64_t> &NodeCounts,
-                  const std::vector<EdgeCountT> &EdgeCounts);
+std::vector<uint64_t> computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
+                                          ArrayRef<uint64_t> NodeCounts,
+                                          ArrayRef<EdgeCount> EdgeCounts);
 
 /// Estimate the "quality" of a given node order in CFG. The higher the score,
 /// the better the order is. The score is designed to reflect the locality of
 /// the given order, which is anti-correlated with the number of I-cache misses
 /// in a typical execution of the function.
-double calcExtTspScore(const std::vector<uint64_t> &Order,
-                       const std::vector<uint64_t> &NodeSizes,
-                       const std::vector<uint64_t> &NodeCounts,
-                       const std::vector<EdgeCountT> &EdgeCounts);
+double calcExtTspScore(ArrayRef<uint64_t> Order, ArrayRef<uint64_t> NodeSizes,
+                       ArrayRef<uint64_t> NodeCounts,
+                       ArrayRef<EdgeCount> EdgeCounts);
 
 /// Estimate the "quality" of the current node order in CFG.
-double calcExtTspScore(const std::vector<uint64_t> &NodeSizes,
-                       const std::vector<uint64_t> &NodeCounts,
-                       const std::vector<EdgeCountT> &EdgeCounts);
+double calcExtTspScore(ArrayRef<uint64_t> NodeSizes,
+                       ArrayRef<uint64_t> NodeCounts,
+                       ArrayRef<EdgeCount> EdgeCounts);
+
+/// Algorithm-specific params for Cache-Directed Sort. The values are tuned for
+/// the best performance of large-scale front-end bound binaries.
+struct CDSortConfig {
+  /// The size of the cache.
+  unsigned CacheEntries = 16;
+  /// The size of a line in the cache.
+  unsigned CacheSize = 2048;
+  /// The power exponent for the distance-based locality.
+  double DistancePower = 0.25;
+  /// The scale factor for the frequency-based locality.
+  double FrequencyScale = 0.25;
+};
+
+/// Apply a Cache-Directed Sort for functions represented by a call graph.
+/// The placement optimizes call locality by co-locating frequently executed
+/// functions.
+/// \p FuncSizes: The sizes of the nodes (in bytes).
+/// \p FuncCounts: The execution counts of the nodes in the profile.
+/// \p CallCounts: The execution counts of every edge (call) in the profile. The
+/// map also defines the edges in the call graph and should include 0-count
+/// edges.
+/// \p CallOffsets: The offsets of the calls from their source nodes.
+/// \returns The best function order found.
+std::vector<uint64_t> computeCacheDirectedLayout(
+    ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts,
+    ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets);
+
+/// Apply a Cache-Directed Sort with a custom config.
+std::vector<uint64_t> computeCacheDirectedLayout(
+    const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes,
+    ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts,
+    ArrayRef<uint64_t> CallOffsets);
 
-} // end namespace llvm
+} // namespace llvm::codelayout
 
 #endif // LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
index 7bbc347a8cf88c3b3dca4705fb833bae9ec28e68..b6fbc65d83b800266bb711d42b58a917e4f397a5 100644
--- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp
+++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp
@@ -3502,7 +3502,7 @@ void MachineBlockPlacement::applyExtTsp() {
 
   auto BlockSizes = std::vector<uint64_t>(F->size());
   auto BlockCounts = std::vector<uint64_t>(F->size());
-  std::vector<EdgeCountT> JumpCounts;
+  std::vector<EdgeCount> JumpCounts;
 
   for (MachineBasicBlock &MBB : *F) {
     // Getting the block frequency.
     BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB);
@@ -3521,8 +3521,8 @@
     for (MachineBasicBlock *Succ : MBB.successors()) {
       auto EP = MBPI->getEdgeProbability(&MBB, Succ);
       BlockFrequency JumpFreq = BlockFreq * EP;
-      auto Jump = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]);
-      JumpCounts.push_back(std::make_pair(Jump, JumpFreq.getFrequency()));
+      JumpCounts.push_back(
+          {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()});
     }
   }
 
@@ -3535,7 +3535,7 @@
                     calcExtTspScore(BlockSizes, BlockCounts, JumpCounts)));
 
   // Run the layout algorithm.
-  auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
+  auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts);
   std::vector<const MachineBasicBlock *> NewBlockOrder;
   NewBlockOrder.reserve(F->size());
   for (uint64_t Node : NewOrder) {
diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp
index 1cd48adac3f0a11b263c289bccb771ec03fc828c..0e5eeb6ff978e8c987c110fc6631119ad3bc4c7a 100644
--- a/llvm/lib/LTO/LTO.cpp
+++ b/llvm/lib/LTO/LTO.cpp
@@ -1134,13 +1134,27 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) {
     }
   }
 
+  bool WholeProgramVisibilityEnabledInLTO =
+      Conf.HasWholeProgramVisibility &&
+      // If validation is enabled, upgrade visibility only when all vtables
+      // have typeinfos.
+      (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos);
+
+  // This returns true when the name is local or not defined. Locals are
+  // expected to be handled separately.
+  auto IsVisibleToRegularObj = [&](StringRef name) {
+    auto It = GlobalResolutions.find(name);
+    return (It == GlobalResolutions.end() || It->second.VisibleOutsideSummary);
+  };
+
   // If allowed, upgrade public vcall visibility metadata to linkage unit
   // visibility before whole program devirtualization in the optimizer.
-  updateVCallVisibilityInModule(*RegularLTO.CombinedModule,
-                                Conf.HasWholeProgramVisibility,
-                                DynamicExportSymbols);
+  updateVCallVisibilityInModule(
+      *RegularLTO.CombinedModule, WholeProgramVisibilityEnabledInLTO,
+      DynamicExportSymbols, Conf.ValidateAllVtablesHaveTypeInfos,
+      IsVisibleToRegularObj);
   updatePublicTypeTestCalls(*RegularLTO.CombinedModule,
-                            Conf.HasWholeProgramVisibility);
+                            WholeProgramVisibilityEnabledInLTO);
 
   if (Conf.PreOptModuleHook &&
       !Conf.PreOptModuleHook(0, *RegularLTO.CombinedModule))
@@ -1521,13 +1535,38 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache,
 
   std::set<GlobalValue::GUID> ExportedGUIDs;
 
-  if (hasWholeProgramVisibility(Conf.HasWholeProgramVisibility))
+  bool WholeProgramVisibilityEnabledInLTO =
+      Conf.HasWholeProgramVisibility &&
+      // If validation is enabled, upgrade visibility only when all vtables
+      // have typeinfos.
+      (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos);
+  if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
     ThinLTO.CombinedIndex.setWithWholeProgramVisibility();
+
+  // If we're validating, get the vtable symbols that should not be
+  // upgraded because they correspond to typeIDs outside of index-based
+  // WPD info.
+  DenseSet<GlobalValue::GUID> VisibleToRegularObjSymbols;
+  if (WholeProgramVisibilityEnabledInLTO &&
+      Conf.ValidateAllVtablesHaveTypeInfos) {
+    // This returns true when the name is local or not defined. Locals are
+    // expected to be handled separately.
+    auto IsVisibleToRegularObj = [&](StringRef name) {
+      auto It = GlobalResolutions.find(name);
+      return (It == GlobalResolutions.end() ||
+              It->second.VisibleOutsideSummary);
+    };
+
+    getVisibleToRegularObjVtableGUIDs(ThinLTO.CombinedIndex,
+                                      VisibleToRegularObjSymbols,
+                                      IsVisibleToRegularObj);
+  }
+
   // If allowed, upgrade public vcall visibility to linkage unit visibility in
   // the summaries before whole program devirtualization below.
-  updateVCallVisibilityInIndex(ThinLTO.CombinedIndex,
-                               Conf.HasWholeProgramVisibility,
-                               DynamicExportSymbols);
+  updateVCallVisibilityInIndex(
+      ThinLTO.CombinedIndex, WholeProgramVisibilityEnabledInLTO,
+      DynamicExportSymbols, VisibleToRegularObjSymbols);
 
   // Perform index-based WPD. This will return immediately if there are
   // no index entries in the typeIdMetadata map (e.g.
if we are instead diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index ae7b7e4b548124ad88f433253fba3a6b79c26911..d7aed2fbc2a1cae090b96beba9cd62f066106472 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -604,11 +604,14 @@ bool LTOCodeGenerator::optimize() { // pipeline run below. updatePublicTypeTestCalls(*MergedModule, /* WholeProgramVisibilityEnabledInLTO */ false); - updateVCallVisibilityInModule(*MergedModule, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a - // TBD new interface. - /* DynamicExportSymbols */ {}); + updateVCallVisibilityInModule( + *MergedModule, + /* WholeProgramVisibilityEnabledInLTO */ false, + // FIXME: These need linker information via a + // TBD new interface. + /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); // We always run the verifier once on the merged module, the `DisableVerify` // parameter only applies to subsequent verify. diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 5b137a8f8cb344a227439526c16335a1206beb90..0d2e66008f1f6c676d5f013149c5036cfe0963c1 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -1053,11 +1053,14 @@ void ThinLTOCodeGenerator::run() { // via the internal option. Must be done before WPD below. if (hasWholeProgramVisibility(/* WholeProgramVisibilityEnabledInLTO */ false)) Index->setWithWholeProgramVisibility(); + + // FIXME: This needs linker information via a TBD new interface updateVCallVisibilityInIndex(*Index, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a // TBD new interface. - /* DynamicExportSymbols */ {}); + /*DynamicExportSymbols=*/{}, + /*VisibleToRegularObjSymbols=*/{}); // Perform index-based WPD. This will return immediately if there are // no index entries in the typeIdMetadata map (e.g. 
if we are instead
diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp
index 1d5f130737ee5783ef88840e2af0bcc7078731a3..2f9ac86e1f07bfac01a94095c2a0cb88d2a5deac 100644
--- a/llvm/lib/TableGen/Main.cpp
+++ b/llvm/lib/TableGen/Main.cpp
@@ -15,15 +15,25 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/TableGen/Main.h"
+#include "TGLexer.h"
 #include "TGParser.h"
+#include "llvm/ADT/StringRef.h"
+#include "llvm/ADT/Twine.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/ErrorOr.h"
 #include "llvm/Support/FileSystem.h"
 #include "llvm/Support/MemoryBuffer.h"
+#include "llvm/Support/SMLoc.h"
+#include "llvm/Support/SourceMgr.h"
 #include "llvm/Support/ToolOutputFile.h"
+#include "llvm/Support/raw_ostream.h"
 #include "llvm/TableGen/Error.h"
 #include "llvm/TableGen/Record.h"
-#include
+#include
+#include
 #include
+#include
+#include
 
 using namespace llvm;
 
 static cl::opt<std::string>
diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
index 487a0a4a97f7f8e439ebda6299f96f92607ccb8d..f60cd1c2b2eca6953d4725b1bf1a1b92678b1a82 100644
--- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
+++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp
@@ -780,12 +780,52 @@ bool hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) {
          !DisableWholeProgramVisibility;
 }
 
+static bool
+typeIDVisibleToRegularObj(StringRef TypeID,
+                          function_ref<bool(StringRef)> IsVisibleToRegularObj) {
+  // TypeID for member function pointer type is an internal construct
+  // and won't exist in IsVisibleToRegularObj. The full TypeID
+  // will be present and participate in invalidation.
+  if (TypeID.ends_with(".virtual"))
+    return false;
+
+  // TypeIDs that don't start with the Itanium mangling prefix (_ZTS) belong
+  // to non-externally visible types, which cannot interact with
+  // external native files. See CodeGenModule::CreateMetadataIdentifierImpl.
+  if (!TypeID.consume_front("_ZTS"))
+    return false;
+
+  // TypeID is keyed off the type name symbol (_ZTS). However, the native
+  // object may not contain this symbol if it does not contain a key
+  // function for the base type and thus only contains a reference to the
+  // type info (_ZTI). To catch this case we query using the type info
+  // symbol corresponding to the TypeID.
+  std::string typeInfo = ("_ZTI" + TypeID).str();
+  return IsVisibleToRegularObj(typeInfo);
+}
+
+static bool
+skipUpdateDueToValidation(GlobalVariable &GV,
+                          function_ref<bool(StringRef)> IsVisibleToRegularObj) {
+  SmallVector<MDNode *, 2> Types;
+  GV.getMetadata(LLVMContext::MD_type, Types);
+
+  for (auto Type : Types)
+    if (auto *TypeID = dyn_cast<MDString>(Type->getOperand(1).get()))
+      return typeIDVisibleToRegularObj(TypeID->getString(),
+                                       IsVisibleToRegularObj);
+
+  return false;
+}
+
 /// If whole program visibility asserted, then upgrade all public vcall
 /// visibility metadata on vtable definitions to linkage unit visibility in
 /// Module IR (for regular or hybrid LTO).
void updateVCallVisibilityInModule(
     Module &M, bool WholeProgramVisibilityEnabledInLTO,
-    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols) {
+    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols,
+    bool ValidateAllVtablesHaveTypeInfos,
+    function_ref<bool(StringRef)> IsVisibleToRegularObj) {
   if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
     return;
   for (GlobalVariable &GV : M.globals()) {
@@ -796,7 +836,13 @@ void updateVCallVisibilityInModule(
         GV.getVCallVisibility() == GlobalObject::VCallVisibilityPublic &&
         // Don't upgrade the visibility for symbols exported to the dynamic
         // linker, as we have no information on their eventual use.
-        !DynamicExportSymbols.count(GV.getGUID()))
+        !DynamicExportSymbols.count(GV.getGUID()) &&
+        // With validation enabled, we want to exclude symbols visible to
+        // regular objects. Local symbols will be in this group due to the
+        // current implementation but those with VCallVisibilityTranslationUnit
+        // will have already been marked in clang so are unaffected.
+        !(ValidateAllVtablesHaveTypeInfos &&
+          skipUpdateDueToValidation(GV, IsVisibleToRegularObj)))
       GV.setVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit);
   }
 }
@@ -828,12 +874,26 @@ void updatePublicTypeTestCalls(Module &M,
   }
 }
 
+/// Based on the typeID string, get all associated vtable GUIDs that are
+/// visible to regular objects.
+void getVisibleToRegularObjVtableGUIDs(
+    ModuleSummaryIndex &Index,
+    DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols,
+    function_ref<bool(StringRef)> IsVisibleToRegularObj) {
+  for (const auto &typeID : Index.typeIdCompatibleVtableMap()) {
+    if (typeIDVisibleToRegularObj(typeID.first, IsVisibleToRegularObj))
+      for (const TypeIdOffsetVtableInfo &P : typeID.second)
+        VisibleToRegularObjSymbols.insert(P.VTableVI.getGUID());
+  }
+}
+
 /// If whole program visibility asserted, then upgrade all public vcall
 /// visibility metadata on vtable definition summaries to linkage unit
 /// visibility in Module summary index (for ThinLTO).
 void updateVCallVisibilityInIndex(
     ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO,
-    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols) {
+    const DenseSet<GlobalValue::GUID> &DynamicExportSymbols,
+    const DenseSet<GlobalValue::GUID> &VisibleToRegularObjSymbols) {
   if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO))
     return;
   for (auto &P : Index) {
@@ -846,6 +906,12 @@ void updateVCallVisibilityInIndex(
     if (!GVar ||
         GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic)
       continue;
+    // With validation enabled, we want to exclude symbols visible to regular
+    // objects. Local symbols will be in this group due to the current
+    // implementation but those with VCallVisibilityTranslationUnit will have
+    // already been marked in clang so are unaffected.
+    if (VisibleToRegularObjSymbols.count(P.first))
+      continue;
     GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit);
   }
 }
@@ -1032,8 +1098,8 @@ bool DevirtModule::tryFindVirtualCallTargets(
 }
 
 bool DevirtIndex::tryFindVirtualCallTargets(
-    std::vector<ValueInfo> &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo,
-    uint64_t ByteOffset) {
+    std::vector<ValueInfo> &TargetsForSlot,
+    const TypeIdCompatibleVtableInfo TIdInfo, uint64_t ByteOffset) {
   for (const TypeIdOffsetVtableInfo &P : TIdInfo) {
     // Find a representative copy of the vtable initializer.
    // We can have multiple available_externally, linkonce_odr and weak_odr
diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp
index 9eb3aff3ffe8f8e87708d434dd2519b6dfab9324..f4a820918ee8bb72bbe6affde7efd9ce0a99efde 100644
--- a/llvm/lib/Transforms/Utils/CodeLayout.cpp
+++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp
@@ -6,7 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// ExtTSP - layout of basic blocks with i-cache optimization.
+// The file implements "cache-aware" layout algorithms of basic blocks and
+// functions in a binary.
 //
 // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
 // optimizing jump locality and thus processor I-cache utilization. This is
@@ -41,10 +42,14 @@
 
 #include "llvm/Transforms/Utils/CodeLayout.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 
 #include <cmath>
+#include <set>
 
 using namespace llvm;
+using namespace llvm::codelayout;
+
 #define DEBUG_TYPE "code-layout"
 
 cl::opt<bool> EnableExtTspBlockPlacement(
@@ -57,8 +62,8 @@ cl::opt<bool> ApplyExtTspWithoutProfile(
     cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
     cl::init(true), cl::Hidden);
 
-// Algorithm-specific params. The values are tuned for the best performance
-// of large-scale front-end bound binaries.
+// Algorithm-specific params for Ext-TSP. The values are tuned for the best
+// performance of large-scale front-end bound binaries.
 static cl::opt<double> ForwardWeightCond(
     "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1),
     cl::desc("The weight of conditional forward jumps for ExtTSP value"));
@@ -69,11 +74,11 @@ static cl::opt<double> ForwardWeightUncond(
 
 static cl::opt<double> BackwardWeightCond(
     "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),
-    cl::desc("The weight of conditonal backward jumps for ExtTSP value"));
+    cl::desc("The weight of conditional backward jumps for ExtTSP value"));
 
 static cl::opt<double> BackwardWeightUncond(
     "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
-    cl::desc("The weight of unconditonal backward jumps for ExtTSP value"));
+    cl::desc("The weight of unconditional backward jumps for ExtTSP value"));
 
 static cl::opt<double> FallthroughWeightCond(
     "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),
@@ -109,6 +114,21 @@ static cl::opt<bool> EnableChainSplitAlongJumps(
     "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true),
     cl::desc("The maximum size of a chain to apply splitting"));
 
+// Algorithm-specific options for CDS.
+static cl::opt<unsigned> CacheEntries("cds-cache-entries", cl::ReallyHidden,
+                                      cl::desc("The size of the cache"));
+
+static cl::opt<unsigned> CacheSize("cds-cache-size", cl::ReallyHidden,
+                                   cl::desc("The size of a line in the cache"));
+
+static cl::opt<double> DistancePower(
+    "cds-distance-power", cl::ReallyHidden,
+    cl::desc("The power exponent for the distance-based locality"));
+
+static cl::opt<double> FrequencyScale(
+    "cds-frequency-scale", cl::ReallyHidden,
+    cl::desc("The scale factor for the frequency-based locality"));
+
 namespace {
 
 // Epsilon for comparison of doubles.
@@ -149,29 +169,30 @@ double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
 
 /// A type of merging two chains, X and Y. The former chain is split into
 /// X1 and X2 and then concatenated with Y in the order specified by the type.
-enum class MergeTypeTy : int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y }; +enum class MergeTypeT : int { X_Y, Y_X, X1_Y_X2, Y_X2_X1, X2_X1_Y }; /// The gain of merging two chains, that is, the Ext-TSP score of the merge -/// together with the corresponfiding merge 'type' and 'offset'. -class MergeGainTy { -public: - explicit MergeGainTy() = default; - explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType) +/// together with the corresponding merge 'type' and 'offset'. +struct MergeGainT { + explicit MergeGainT() = default; + explicit MergeGainT(double Score, size_t MergeOffset, MergeTypeT MergeType) : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {} double score() const { return Score; } size_t mergeOffset() const { return MergeOffset; } - MergeTypeTy mergeType() const { return MergeType; } + MergeTypeT mergeType() const { return MergeType; } + + void setMergeType(MergeTypeT Ty) { MergeType = Ty; } // Returns 'true' iff Other is preferred over this. - bool operator<(const MergeGainTy &Other) const { + bool operator<(const MergeGainT &Other) const { return (Other.Score > EPS && Other.Score > Score + EPS); } // Update the current gain if Other is preferred over this. - void updateIfLessThan(const MergeGainTy &Other) { + void updateIfLessThan(const MergeGainT &Other) { if (*this < Other) *this = Other; } @@ -179,114 +200,110 @@ public: private: double Score{-1.0}; size_t MergeOffset{0}; - MergeTypeTy MergeType{MergeTypeTy::X_Y}; + MergeTypeT MergeType{MergeTypeT::X_Y}; }; -class Jump; -class Chain; -class ChainEdge; +struct JumpT; +struct ChainT; +struct ChainEdge; -/// A node in the graph, typically corresponding to a basic block in CFG. -class Block { -public: - Block(const Block &) = delete; - Block(Block &&) = default; - Block &operator=(const Block &) = delete; - Block &operator=(Block &&) = default; +/// A node in the graph, typically corresponding to a basic block in the CFG or +/// a function in the call graph. +struct NodeT { + NodeT(const NodeT &) = delete; + NodeT(NodeT &&) = default; + NodeT &operator=(const NodeT &) = delete; + NodeT &operator=(NodeT &&) = default; - // The original index of the block in CFG. + explicit NodeT(size_t Index, uint64_t Size, uint64_t EC) + : Index(Index), Size(Size), ExecutionCount(EC) {} + + bool isEntry() const { return Index == 0; } + + // The total execution count of outgoing jumps. + uint64_t outCount() const; + + // The total execution count of incoming jumps. + uint64_t inCount() const; + + // The original index of the node in graph. size_t Index{0}; - // The index of the block in the current chain. + // The index of the node in the current chain. size_t CurIndex{0}; - // Size of the block in the binary. + // The size of the node in the binary. uint64_t Size{0}; - // Execution count of the block in the profile data. + // The execution count of the node in the profile data. uint64_t ExecutionCount{0}; - // Current chain of the node. - Chain *CurChain{nullptr}; - // An offset of the block in the current chain. + // The current chain of the node. + ChainT *CurChain{nullptr}; + // The offset of the node in the current chain. mutable uint64_t EstimatedAddr{0}; - // Forced successor of the block in CFG. - Block *ForcedSucc{nullptr}; - // Forced predecessor of the block in CFG. - Block *ForcedPred{nullptr}; - // Outgoing jumps from the block. - std::vector OutJumps; - // Incoming jumps to the block. 
- std::vector InJumps; - -public: - explicit Block(size_t Index, uint64_t Size, uint64_t EC) - : Index(Index), Size(Size), ExecutionCount(EC) {} - bool isEntry() const { return Index == 0; } + // Forced successor of the node in the graph. + NodeT *ForcedSucc{nullptr}; + // Forced predecessor of the node in the graph. + NodeT *ForcedPred{nullptr}; + // Outgoing jumps from the node. + std::vector OutJumps; + // Incoming jumps to the node. + std::vector InJumps; }; -/// An arc in the graph, typically corresponding to a jump between two blocks. -class Jump { -public: - Jump(const Jump &) = delete; - Jump(Jump &&) = default; - Jump &operator=(const Jump &) = delete; - Jump &operator=(Jump &&) = default; - - // Source block of the jump. - Block *Source; - // Target block of the jump. - Block *Target; +/// An arc in the graph, typically corresponding to a jump between two nodes. +struct JumpT { + JumpT(const JumpT &) = delete; + JumpT(JumpT &&) = default; + JumpT &operator=(const JumpT &) = delete; + JumpT &operator=(JumpT &&) = default; + + explicit JumpT(NodeT *Source, NodeT *Target, uint64_t ExecutionCount) + : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {} + + // Source node of the jump. + NodeT *Source; + // Target node of the jump. + NodeT *Target; // Execution count of the arc in the profile data. uint64_t ExecutionCount{0}; // Whether the jump corresponds to a conditional branch. bool IsConditional{false}; - -public: - explicit Jump(Block *Source, Block *Target, uint64_t ExecutionCount) - : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {} + // The offset of the jump from the source node. + uint64_t Offset{0}; }; -/// A chain (ordered sequence) of blocks. -class Chain { -public: - Chain(const Chain &) = delete; - Chain(Chain &&) = default; - Chain &operator=(const Chain &) = delete; - Chain &operator=(Chain &&) = default; +/// A chain (ordered sequence) of nodes in the graph. 
+struct ChainT { + ChainT(const ChainT &) = delete; + ChainT(ChainT &&) = default; + ChainT &operator=(const ChainT &) = delete; + ChainT &operator=(ChainT &&) = default; + + explicit ChainT(uint64_t Id, NodeT *Node) + : Id(Id), ExecutionCount(Node->ExecutionCount), Size(Node->Size), + Nodes(1, Node) {} - explicit Chain(uint64_t Id, Block *Block) - : Id(Id), Score(0), Blocks(1, Block) {} + size_t numBlocks() const { return Nodes.size(); } - uint64_t id() const { return Id; } + double density() const { return static_cast(ExecutionCount) / Size; } - bool isEntry() const { return Blocks[0]->Index == 0; } + bool isEntry() const { return Nodes[0]->Index == 0; } bool isCold() const { - for (auto *Block : Blocks) { - if (Block->ExecutionCount > 0) + for (NodeT *Node : Nodes) { + if (Node->ExecutionCount > 0) return false; } return true; } - double score() const { return Score; } - - void setScore(double NewScore) { Score = NewScore; } - - const std::vector &blocks() const { return Blocks; } - - size_t numBlocks() const { return Blocks.size(); } - - const std::vector> &edges() const { - return Edges; - } - - ChainEdge *getEdge(Chain *Other) const { - for (auto It : Edges) { - if (It.first == Other) - return It.second; + ChainEdge *getEdge(ChainT *Other) const { + for (const auto &[Chain, ChainEdge] : Edges) { + if (Chain == Other) + return ChainEdge; } return nullptr; } - void removeEdge(Chain *Other) { + void removeEdge(ChainT *Other) { auto It = Edges.begin(); while (It != Edges.end()) { if (It->first == Other) { @@ -297,63 +314,68 @@ public: } } - void addEdge(Chain *Other, ChainEdge *Edge) { + void addEdge(ChainT *Other, ChainEdge *Edge) { Edges.push_back(std::make_pair(Other, Edge)); } - void merge(Chain *Other, const std::vector &MergedBlocks) { - Blocks = MergedBlocks; - // Update the block's chains - for (size_t Idx = 0; Idx < Blocks.size(); Idx++) { - Blocks[Idx]->CurChain = this; - Blocks[Idx]->CurIndex = Idx; + void merge(ChainT *Other, std::vector MergedBlocks) { + Nodes = std::move(MergedBlocks); + // Update the chain's data. + ExecutionCount += Other->ExecutionCount; + Size += Other->Size; + Id = Nodes[0]->Index; + // Update the node's data. + for (size_t Idx = 0; Idx < Nodes.size(); Idx++) { + Nodes[Idx]->CurChain = this; + Nodes[Idx]->CurIndex = Idx; } } - void mergeEdges(Chain *Other); + void mergeEdges(ChainT *Other); void clear() { - Blocks.clear(); - Blocks.shrink_to_fit(); + Nodes.clear(); + Nodes.shrink_to_fit(); Edges.clear(); Edges.shrink_to_fit(); } -private: // Unique chain identifier. uint64_t Id; // Cached ext-tsp score for the chain. - double Score; - // Blocks of the chain. - std::vector Blocks; + double Score{0}; + // The total execution count of the chain. + uint64_t ExecutionCount{0}; + // The total size of the chain. + uint64_t Size{0}; + // Nodes of the chain. + std::vector Nodes; // Adjacent chains and corresponding edges (lists of jumps). - std::vector> Edges; + std::vector> Edges; }; -/// An edge in CFG representing jumps between two chains. -/// When blocks are merged into chains, the edges are combined too so that -/// there is always at most one edge between a pair of chains -class ChainEdge { -public: +/// An edge in the graph representing jumps between two chains. +/// When nodes are merged into chains, the edges are combined too so that +/// there is always at most one edge between a pair of chains. 
+struct ChainEdge { ChainEdge(const ChainEdge &) = delete; ChainEdge(ChainEdge &&) = default; ChainEdge &operator=(const ChainEdge &) = delete; - ChainEdge &operator=(ChainEdge &&) = default; + ChainEdge &operator=(ChainEdge &&) = delete; - explicit ChainEdge(Jump *Jump) + explicit ChainEdge(JumpT *Jump) : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain), Jumps(1, Jump) {} - const std::vector &jumps() const { return Jumps; } + ChainT *srcChain() const { return SrcChain; } - void changeEndpoint(Chain *From, Chain *To) { - if (From == SrcChain) - SrcChain = To; - if (From == DstChain) - DstChain = To; - } + ChainT *dstChain() const { return DstChain; } + + bool isSelfEdge() const { return SrcChain == DstChain; } + + const std::vector &jumps() const { return Jumps; } - void appendJump(Jump *Jump) { Jumps.push_back(Jump); } + void appendJump(JumpT *Jump) { Jumps.push_back(Jump); } void moveJumps(ChainEdge *Other) { Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end()); @@ -361,15 +383,22 @@ public: Other->Jumps.shrink_to_fit(); } - bool hasCachedMergeGain(Chain *Src, Chain *Dst) const { + void changeEndpoint(ChainT *From, ChainT *To) { + if (From == SrcChain) + SrcChain = To; + if (From == DstChain) + DstChain = To; + } + + bool hasCachedMergeGain(ChainT *Src, ChainT *Dst) const { return Src == SrcChain ? CacheValidForward : CacheValidBackward; } - MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst) const { + MergeGainT getCachedMergeGain(ChainT *Src, ChainT *Dst) const { return Src == SrcChain ? CachedGainForward : CachedGainBackward; } - void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) { + void setCachedMergeGain(ChainT *Src, ChainT *Dst, MergeGainT MergeGain) { if (Src == SrcChain) { CachedGainForward = MergeGain; CacheValidForward = true; @@ -384,57 +413,74 @@ public: CacheValidBackward = false; } + void setMergeGain(MergeGainT Gain) { CachedGain = Gain; } + + MergeGainT getMergeGain() const { return CachedGain; } + + double gain() const { return CachedGain.score(); } + private: // Source chain. - Chain *SrcChain{nullptr}; + ChainT *SrcChain{nullptr}; // Destination chain. - Chain *DstChain{nullptr}; - // Original jumps in the binary with correspinding execution counts. - std::vector Jumps; - // Cached ext-tsp value for merging the pair of chains. - // Since the gain of merging (Src, Dst) and (Dst, Src) might be different, - // we store both values here. - MergeGainTy CachedGainForward; - MergeGainTy CachedGainBackward; + ChainT *DstChain{nullptr}; + // Original jumps in the binary with corresponding execution counts. + std::vector Jumps; + // Cached gain value for merging the pair of chains. + MergeGainT CachedGain; + + // Cached gain values for merging the pair of chains. Since the gain of + // merging (Src, Dst) and (Dst, Src) might be different, we store both values + // here and a flag indicating which of the options results in a higher gain. + // Cached gain values. + MergeGainT CachedGainForward; + MergeGainT CachedGainBackward; // Whether the cached value must be recomputed. 
bool CacheValidForward{false}; bool CacheValidBackward{false}; }; -void Chain::mergeEdges(Chain *Other) { - assert(this != Other && "cannot merge a chain with itself"); +uint64_t NodeT::outCount() const { + uint64_t Count = 0; + for (JumpT *Jump : OutJumps) + Count += Jump->ExecutionCount; + return Count; +} - // Update edges adjacent to chain Other - for (auto EdgeIt : Other->Edges) { - Chain *DstChain = EdgeIt.first; - ChainEdge *DstEdge = EdgeIt.second; - Chain *TargetChain = DstChain == Other ? this : DstChain; +uint64_t NodeT::inCount() const { + uint64_t Count = 0; + for (JumpT *Jump : InJumps) + Count += Jump->ExecutionCount; + return Count; +} + +void ChainT::mergeEdges(ChainT *Other) { + // Update edges adjacent to chain Other. + for (const auto &[DstChain, DstEdge] : Other->Edges) { + ChainT *TargetChain = DstChain == Other ? this : DstChain; ChainEdge *CurEdge = getEdge(TargetChain); if (CurEdge == nullptr) { DstEdge->changeEndpoint(Other, this); this->addEdge(TargetChain, DstEdge); - if (DstChain != this && DstChain != Other) { + if (DstChain != this && DstChain != Other) DstChain->addEdge(this, DstEdge); - } } else { CurEdge->moveJumps(DstEdge); } - // Cleanup leftover edge - if (DstChain != Other) { + // Cleanup leftover edge. + if (DstChain != Other) DstChain->removeEdge(Other); - } } } -using BlockIter = std::vector::const_iterator; +using NodeIter = std::vector::const_iterator; -/// A wrapper around three chains of blocks; it is used to avoid extra +/// A wrapper around three chains of nodes; it is used to avoid extra /// instantiation of the vectors. -class MergedChain { -public: - MergedChain(BlockIter Begin1, BlockIter End1, BlockIter Begin2 = BlockIter(), - BlockIter End2 = BlockIter(), BlockIter Begin3 = BlockIter(), - BlockIter End3 = BlockIter()) +struct MergedChain { + MergedChain(NodeIter Begin1, NodeIter End1, NodeIter Begin2 = NodeIter(), + NodeIter End2 = NodeIter(), NodeIter Begin3 = NodeIter(), + NodeIter End3 = NodeIter()) : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3), End3(End3) {} @@ -447,8 +493,8 @@ public: Func(*It); } - std::vector getBlocks() const { - std::vector Result; + std::vector getNodes() const { + std::vector Result; Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) + std::distance(Begin3, End3)); Result.insert(Result.end(), Begin1, End1); @@ -457,165 +503,189 @@ public: return Result; } - const Block *getFirstBlock() const { return *Begin1; } + const NodeT *getFirstNode() const { return *Begin1; } private: - BlockIter Begin1; - BlockIter End1; - BlockIter Begin2; - BlockIter End2; - BlockIter Begin3; - BlockIter End3; + NodeIter Begin1; + NodeIter End1; + NodeIter Begin2; + NodeIter End2; + NodeIter Begin3; + NodeIter End3; }; +/// Merge two chains of nodes respecting a given 'type' and 'offset'. +/// +/// If MergeType == 0, then the result is a concatenation of two chains. +/// Otherwise, the first chain is cut into two sub-chains at the offset, +/// and merged using all possible ways of concatenating three chains. +MergedChain mergeNodes(const std::vector &X, + const std::vector &Y, size_t MergeOffset, + MergeTypeT MergeType) { + // Split the first chain, X, into X1 and X2. + NodeIter BeginX1 = X.begin(); + NodeIter EndX1 = X.begin() + MergeOffset; + NodeIter BeginX2 = X.begin() + MergeOffset; + NodeIter EndX2 = X.end(); + NodeIter BeginY = Y.begin(); + NodeIter EndY = Y.end(); + + // Construct a new chain from the three existing ones. 
+ switch (MergeType) { + case MergeTypeT::X_Y: + return MergedChain(BeginX1, EndX2, BeginY, EndY); + case MergeTypeT::Y_X: + return MergedChain(BeginY, EndY, BeginX1, EndX2); + case MergeTypeT::X1_Y_X2: + return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2); + case MergeTypeT::Y_X2_X1: + return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1); + case MergeTypeT::X2_X1_Y: + return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY); + } + llvm_unreachable("unexpected chain merge type"); +} + /// The implementation of the ExtTSP algorithm. class ExtTSPImpl { - using EdgeT = std::pair; - using EdgeCountMap = std::vector>; - public: - ExtTSPImpl(size_t NumNodes, const std::vector &NodeSizes, - const std::vector &NodeCounts, - const EdgeCountMap &EdgeCounts) - : NumNodes(NumNodes) { + ExtTSPImpl(ArrayRef NodeSizes, ArrayRef NodeCounts, + ArrayRef EdgeCounts) + : NumNodes(NodeSizes.size()) { initialize(NodeSizes, NodeCounts, EdgeCounts); } - /// Run the algorithm and return an optimized ordering of blocks. - void run(std::vector &Result) { - // Pass 1: Merge blocks with their mutually forced successors + /// Run the algorithm and return an optimized ordering of nodes. + std::vector run() { + // Pass 1: Merge nodes with their mutually forced successors mergeForcedPairs(); // Pass 2: Merge pairs of chains while improving the ExtTSP objective mergeChainPairs(); - // Pass 3: Merge cold blocks to reduce code size + // Pass 3: Merge cold nodes to reduce code size mergeColdChains(); - // Collect blocks from all chains - concatChains(Result); + // Collect nodes from all chains + return concatChains(); } private: /// Initialize the algorithm's data structures. - void initialize(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const EdgeCountMap &EdgeCounts) { - // Initialize blocks - AllBlocks.reserve(NumNodes); - for (uint64_t Node = 0; Node < NumNodes; Node++) { - uint64_t Size = std::max(NodeSizes[Node], 1ULL); - uint64_t ExecutionCount = NodeCounts[Node]; - // The execution count of the entry block is set to at least 1 - if (Node == 0 && ExecutionCount == 0) + void initialize(const ArrayRef &NodeSizes, + const ArrayRef &NodeCounts, + const ArrayRef &EdgeCounts) { + // Initialize nodes + AllNodes.reserve(NumNodes); + for (uint64_t Idx = 0; Idx < NumNodes; Idx++) { + uint64_t Size = std::max(NodeSizes[Idx], 1ULL); + uint64_t ExecutionCount = NodeCounts[Idx]; + // The execution count of the entry node is set to at least one. + if (Idx == 0 && ExecutionCount == 0) ExecutionCount = 1; - AllBlocks.emplace_back(Node, Size, ExecutionCount); + AllNodes.emplace_back(Idx, Size, ExecutionCount); } - // Initialize jumps between blocks + // Initialize jumps between nodes SuccNodes.resize(NumNodes); PredNodes.resize(NumNodes); std::vector OutDegree(NumNodes, 0); AllJumps.reserve(EdgeCounts.size()); - for (auto It : EdgeCounts) { - auto Pred = It.first.first; - auto Succ = It.first.second; - OutDegree[Pred]++; - // Ignore self-edges - if (Pred == Succ) + for (auto Edge : EdgeCounts) { + ++OutDegree[Edge.src]; + // Ignore self-edges. 
+ if (Edge.src == Edge.dst) continue; - SuccNodes[Pred].push_back(Succ); - PredNodes[Succ].push_back(Pred); - auto ExecutionCount = It.second; - if (ExecutionCount > 0) { - auto &Block = AllBlocks[Pred]; - auto &SuccBlock = AllBlocks[Succ]; - AllJumps.emplace_back(&Block, &SuccBlock, ExecutionCount); - SuccBlock.InJumps.push_back(&AllJumps.back()); - Block.OutJumps.push_back(&AllJumps.back()); + SuccNodes[Edge.src].push_back(Edge.dst); + PredNodes[Edge.dst].push_back(Edge.src); + if (Edge.count > 0) { + NodeT &PredNode = AllNodes[Edge.src]; + NodeT &SuccNode = AllNodes[Edge.dst]; + AllJumps.emplace_back(&PredNode, &SuccNode, Edge.count); + SuccNode.InJumps.push_back(&AllJumps.back()); + PredNode.OutJumps.push_back(&AllJumps.back()); } } - for (auto &Jump : AllJumps) { + for (JumpT &Jump : AllJumps) { assert(OutDegree[Jump.Source->Index] > 0); Jump.IsConditional = OutDegree[Jump.Source->Index] > 1; } - // Initialize chains + // Initialize chains. AllChains.reserve(NumNodes); HotChains.reserve(NumNodes); - for (Block &Block : AllBlocks) { - AllChains.emplace_back(Block.Index, &Block); - Block.CurChain = &AllChains.back(); - if (Block.ExecutionCount > 0) { + for (NodeT &Node : AllNodes) { + AllChains.emplace_back(Node.Index, &Node); + Node.CurChain = &AllChains.back(); + if (Node.ExecutionCount > 0) HotChains.push_back(&AllChains.back()); - } } - // Initialize chain edges + // Initialize chain edges. AllEdges.reserve(AllJumps.size()); - for (Block &Block : AllBlocks) { - for (auto &Jump : Block.OutJumps) { - auto SuccBlock = Jump->Target; - ChainEdge *CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain); - // this edge is already present in the graph + for (NodeT &PredNode : AllNodes) { + for (JumpT *Jump : PredNode.OutJumps) { + NodeT *SuccNode = Jump->Target; + ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain); + // this edge is already present in the graph. if (CurEdge != nullptr) { - assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr); + assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr); CurEdge->appendJump(Jump); continue; } - // this is a new edge + // this is a new edge. AllEdges.emplace_back(Jump); - Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back()); - SuccBlock->CurChain->addEdge(Block.CurChain, &AllEdges.back()); + PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back()); + SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back()); } } } - /// For a pair of blocks, A and B, block B is the forced successor of A, + /// For a pair of nodes, A and B, node B is the forced successor of A, /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps - /// to B are from A. Such blocks should be adjacent in the optimal ordering; - /// the method finds and merges such pairs of blocks. + /// to B are from A. Such nodes should be adjacent in the optimal ordering; + /// the method finds and merges such pairs of nodes. void mergeForcedPairs() { - // Find fallthroughs based on edge weights - for (auto &Block : AllBlocks) { - if (SuccNodes[Block.Index].size() == 1 && - PredNodes[SuccNodes[Block.Index][0]].size() == 1 && - SuccNodes[Block.Index][0] != 0) { - size_t SuccIndex = SuccNodes[Block.Index][0]; - Block.ForcedSucc = &AllBlocks[SuccIndex]; - AllBlocks[SuccIndex].ForcedPred = &Block; + // Find fallthroughs based on edge weights. 
+ for (NodeT &Node : AllNodes) { + if (SuccNodes[Node.Index].size() == 1 && + PredNodes[SuccNodes[Node.Index][0]].size() == 1 && + SuccNodes[Node.Index][0] != 0) { + size_t SuccIndex = SuccNodes[Node.Index][0]; + Node.ForcedSucc = &AllNodes[SuccIndex]; + AllNodes[SuccIndex].ForcedPred = &Node; } } // There might be 'cycles' in the forced dependencies, since profile // data isn't 100% accurate. Typically this is observed in loops, when the // loop edges are the hottest successors for the basic blocks of the loop. - // Break the cycles by choosing the block with the smallest index as the + // Break the cycles by choosing the node with the smallest index as the // head. This helps to keep the original order of the loops, which likely // have already been rotated in the optimized manner. - for (auto &Block : AllBlocks) { - if (Block.ForcedSucc == nullptr || Block.ForcedPred == nullptr) + for (NodeT &Node : AllNodes) { + if (Node.ForcedSucc == nullptr || Node.ForcedPred == nullptr) continue; - auto SuccBlock = Block.ForcedSucc; - while (SuccBlock != nullptr && SuccBlock != &Block) { - SuccBlock = SuccBlock->ForcedSucc; + NodeT *SuccNode = Node.ForcedSucc; + while (SuccNode != nullptr && SuccNode != &Node) { + SuccNode = SuccNode->ForcedSucc; } - if (SuccBlock == nullptr) + if (SuccNode == nullptr) continue; - // Break the cycle - AllBlocks[Block.ForcedPred->Index].ForcedSucc = nullptr; - Block.ForcedPred = nullptr; + // Break the cycle. + AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr; + Node.ForcedPred = nullptr; } - // Merge blocks with their fallthrough successors - for (auto &Block : AllBlocks) { - if (Block.ForcedPred == nullptr && Block.ForcedSucc != nullptr) { - auto CurBlock = &Block; + // Merge nodes with their fallthrough successors. + for (NodeT &Node : AllNodes) { + if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) { + const NodeT *CurBlock = &Node; while (CurBlock->ForcedSucc != nullptr) { - const auto NextBlock = CurBlock->ForcedSucc; - mergeChains(Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y); + const NodeT *NextBlock = CurBlock->ForcedSucc; + mergeChains(Node.CurChain, NextBlock->CurChain, 0, MergeTypeT::X_Y); CurBlock = NextBlock; } } @@ -624,35 +694,32 @@ private: /// Merge pairs of chains while improving the ExtTSP objective. void mergeChainPairs() { - /// Deterministically compare pairs of chains - auto compareChainPairs = [](const Chain *A1, const Chain *B1, - const Chain *A2, const Chain *B2) { + /// Deterministically compare pairs of chains. + auto compareChainPairs = [](const ChainT *A1, const ChainT *B1, + const ChainT *A2, const ChainT *B2) { if (A1 != A2) - return A1->id() < A2->id(); - return B1->id() < B2->id(); + return A1->Id < A2->Id; + return B1->Id < B2->Id; }; while (HotChains.size() > 1) { - Chain *BestChainPred = nullptr; - Chain *BestChainSucc = nullptr; - auto BestGain = MergeGainTy(); - // Iterate over all pairs of chains - for (Chain *ChainPred : HotChains) { - // Get candidates for merging with the current chain - for (auto EdgeIter : ChainPred->edges()) { - Chain *ChainSucc = EdgeIter.first; - class ChainEdge *ChainEdge = EdgeIter.second; - // Ignore loop edges + ChainT *BestChainPred = nullptr; + ChainT *BestChainSucc = nullptr; + MergeGainT BestGain; + // Iterate over all pairs of chains. + for (ChainT *ChainPred : HotChains) { + // Get candidates for merging with the current chain. + for (const auto &[ChainSucc, Edge] : ChainPred->Edges) { + // Ignore loop edges. 
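+          // Each outer iteration rescans all chain pairs and applies only the
+          // single best merge; this is the quadratic but precise counterpart
+          // of the priority-queue scheme used by CDSortImpl further below.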
          if (ChainPred == ChainSucc)
            continue;

-          // Stop early if the combined chain violates the maximum allowed size
+          // Stop early if the combined chain violates the maximum allowed size.
          if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize)
            continue;

-          // Compute the gain of merging the two chains
-          MergeGainTy CurGain =
-              getBestMergeGain(ChainPred, ChainSucc, ChainEdge);
+          // Compute the gain of merging the two chains.
+          MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge);
          if (CurGain.score() <= EPS)
            continue;
@@ -667,53 +734,53 @@ private:
        }
      }

-      // Stop merging when there is no improvement
+      // Stop merging when there is no improvement.
      if (BestGain.score() <= EPS)
        break;

-      // Merge the best pair of chains
+      // Merge the best pair of chains.
      mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(),
                  BestGain.mergeType());
    }
  }

-  /// Merge remaining blocks into chains w/o taking jump counts into
-  /// consideration. This allows to maintain the original block order in the
-  /// absense of profile data
+  /// Merge remaining nodes into chains w/o taking jump counts into
+  /// consideration. This allows maintaining the original node order in the
+  /// absence of profile data.
  void mergeColdChains() {
    for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) {
      // Iterating in reverse order to make sure original fallthrough jumps are
      // merged first; this might be beneficial for code size.
      size_t NumSuccs = SuccNodes[SrcBB].size();
      for (size_t Idx = 0; Idx < NumSuccs; Idx++) {
-        auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
-        auto SrcChain = AllBlocks[SrcBB].CurChain;
-        auto DstChain = AllBlocks[DstBB].CurChain;
+        size_t DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1];
+        ChainT *SrcChain = AllNodes[SrcBB].CurChain;
+        ChainT *DstChain = AllNodes[DstBB].CurChain;
        if (SrcChain != DstChain && !DstChain->isEntry() &&
-            SrcChain->blocks().back()->Index == SrcBB &&
-            DstChain->blocks().front()->Index == DstBB &&
+            SrcChain->Nodes.back()->Index == SrcBB &&
+            DstChain->Nodes.front()->Index == DstBB &&
            SrcChain->isCold() == DstChain->isCold()) {
-          mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y);
+          mergeChains(SrcChain, DstChain, 0, MergeTypeT::X_Y);
        }
      }
    }
  }

-  /// Compute the Ext-TSP score for a given block order and a list of jumps.
+  /// Compute the Ext-TSP score for a given node order and a list of jumps.
  double extTSPScore(const MergedChain &MergedBlocks,
-                     const std::vector<Jump *> &Jumps) const {
+                     const std::vector<JumpT *> &Jumps) const {
    if (Jumps.empty())
      return 0.0;
    uint64_t CurAddr = 0;
-    MergedBlocks.forEach([&](const Block *BB) {
-      BB->EstimatedAddr = CurAddr;
-      CurAddr += BB->Size;
+    MergedBlocks.forEach([&](const NodeT *Node) {
+      Node->EstimatedAddr = CurAddr;
+      CurAddr += Node->Size;
    });

    double Score = 0;
-    for (auto &Jump : Jumps) {
-      const Block *SrcBlock = Jump->Source;
-      const Block *DstBlock = Jump->Target;
+    for (JumpT *Jump : Jumps) {
+      const NodeT *SrcBlock = Jump->Source;
+      const NodeT *DstBlock = Jump->Target;
      Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size,
                             DstBlock->EstimatedAddr, Jump->ExecutionCount,
                             Jump->IsConditional);
@@ -727,13 +794,13 @@ private:
  /// computes the one having the largest increase in ExtTSP objective. The
  /// result is a pair with the first element being the gain and the second
  /// element being the corresponding merging type.
-  MergeGainTy getBestMergeGain(Chain *ChainPred, Chain *ChainSucc,
-                               ChainEdge *Edge) const {
+  MergeGainT getBestMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
+                              ChainEdge *Edge) const {
    if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) {
      return Edge->getCachedMergeGain(ChainPred, ChainSucc);
    }

-    // Precompute jumps between ChainPred and ChainSucc
+    // Precompute jumps between ChainPred and ChainSucc.
    auto Jumps = Edge->jumps();
    ChainEdge *EdgePP = ChainPred->getEdge(ChainPred);
    if (EdgePP != nullptr) {
@@ -741,60 +808,60 @@
    }
    assert(!Jumps.empty() && "trying to merge chains w/o jumps");

-    // The object holds the best currently chosen gain of merging the two chains
-    MergeGainTy Gain = MergeGainTy();
+    // This object holds the best chosen gain of merging two chains.
+    MergeGainT Gain = MergeGainT();

    /// Given a merge offset and a list of merge types, try to merge two chains
-    /// and update Gain with a better alternative
+    /// and update Gain with a better alternative.
    auto tryChainMerging = [&](size_t Offset,
-                               const std::vector<MergeTypeTy> &MergeTypes) {
-      // Skip merging corresponding to concatenation w/o splitting
-      if (Offset == 0 || Offset == ChainPred->blocks().size())
+                               const std::vector<MergeTypeT> &MergeTypes) {
+      // Skip merging corresponding to concatenation w/o splitting.
+      if (Offset == 0 || Offset == ChainPred->Nodes.size())
        return;
-      // Skip merging if it breaks Forced successors
-      auto BB = ChainPred->blocks()[Offset - 1];
-      if (BB->ForcedSucc != nullptr)
+      // Skip merging if it breaks Forced successors.
+      NodeT *Node = ChainPred->Nodes[Offset - 1];
+      if (Node->ForcedSucc != nullptr)
        return;
      // Apply the merge, compute the corresponding gain, and update the best
-      // value, if the merge is beneficial
-      for (const auto &MergeType : MergeTypes) {
+      // value, if the merge is beneficial.
+      for (const MergeTypeT &MergeType : MergeTypes) {
        Gain.updateIfLessThan(
            computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType));
      }
    };

-    // Try to concatenate two chains w/o splitting
+    // Try to concatenate two chains w/o splitting.
    Gain.updateIfLessThan(
-        computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y));
+        computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y));

    if (EnableChainSplitAlongJumps) {
-      // Attach (a part of) ChainPred before the first block of ChainSucc
-      for (auto &Jump : ChainSucc->blocks().front()->InJumps) {
-        const auto SrcBlock = Jump->Source;
+      // Attach (a part of) ChainPred before the first node of ChainSucc.
+      for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) {
+        const NodeT *SrcBlock = Jump->Source;
        if (SrcBlock->CurChain != ChainPred)
          continue;
        size_t Offset = SrcBlock->CurIndex + 1;
-        tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::X2_X1_Y});
+        tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y});
      }

-      // Attach (a part of) ChainPred after the last block of ChainSucc
-      for (auto &Jump : ChainSucc->blocks().back()->OutJumps) {
-        const auto DstBlock = Jump->Source;
+      // Attach (a part of) ChainPred after the last node of ChainSucc.
+      for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) {
+        const NodeT *DstBlock = Jump->Target;
        if (DstBlock->CurChain != ChainPred)
          continue;
        size_t Offset = DstBlock->CurIndex;
-        tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1});
+        tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1});
      }
    }

-    // Try to break ChainPred in various ways and concatenate with ChainSucc
-    if (ChainPred->blocks().size() <= ChainSplitThreshold) {
-      for (size_t Offset = 1; Offset < ChainPred->blocks().size(); Offset++) {
+    // Try to break ChainPred in various ways and concatenate with ChainSucc.
+    if (ChainPred->Nodes.size() <= ChainSplitThreshold) {
+      for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) {
        // Try to split the chain in different ways. In practice, applying
        // X2_Y_X1 merging is almost never provides benefits; thus, we exclude
-        // it from consideration to reduce the search space
-        tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1,
-                                 MergeTypeTy::X2_X1_Y});
+        // it from consideration to reduce the search space.
+        tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1,
+                                 MergeTypeT::X2_X1_Y});
      }
    }
    Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain);
@@ -805,127 +872,471 @@ private:
  /// merge 'type' and 'offset'.
  ///
  /// The two chains are not modified in the method.
-  MergeGainTy computeMergeGain(const Chain *ChainPred, const Chain *ChainSucc,
-                               const std::vector<Jump *> &Jumps,
-                               size_t MergeOffset,
-                               MergeTypeTy MergeType) const {
-    auto MergedBlocks = mergeBlocks(ChainPred->blocks(), ChainSucc->blocks(),
-                                    MergeOffset, MergeType);
-
-    // Do not allow a merge that does not preserve the original entry block
-    if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
-        !MergedBlocks.getFirstBlock()->isEntry())
-      return MergeGainTy();
+  MergeGainT computeMergeGain(const ChainT *ChainPred, const ChainT *ChainSucc,
+                              const std::vector<JumpT *> &Jumps,
+                              size_t MergeOffset, MergeTypeT MergeType) const {
+    auto MergedBlocks =
+        mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);

-    // The gain for the new chain
-    auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->score();
-    return MergeGainTy(NewGainScore, MergeOffset, MergeType);
-  }
+    // Do not allow a merge that does not preserve the original entry point.
+    if ((ChainPred->isEntry() || ChainSucc->isEntry()) &&
+        !MergedBlocks.getFirstNode()->isEntry())
+      return MergeGainT();

-  /// Merge two chains of blocks respecting a given merge 'type' and 'offset'.
-  ///
-  /// If MergeType == 0, then the result is a concatenation of two chains.
-  /// Otherwise, the first chain is cut into two sub-chains at the offset,
-  /// and merged using all possible ways of concatenating three chains.
-  MergedChain mergeBlocks(const std::vector<Block *> &X,
-                          const std::vector<Block *> &Y, size_t MergeOffset,
-                          MergeTypeTy MergeType) const {
-    // Split the first chain, X, into X1 and X2
-    BlockIter BeginX1 = X.begin();
-    BlockIter EndX1 = X.begin() + MergeOffset;
-    BlockIter BeginX2 = X.begin() + MergeOffset;
-    BlockIter EndX2 = X.end();
-    BlockIter BeginY = Y.begin();
-    BlockIter EndY = Y.end();
-
-    // Construct a new chain from the three existing ones
-    switch (MergeType) {
-    case MergeTypeTy::X_Y:
-      return MergedChain(BeginX1, EndX2, BeginY, EndY);
-    case MergeTypeTy::X1_Y_X2:
-      return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2);
-    case MergeTypeTy::Y_X2_X1:
-      return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1);
-    case MergeTypeTy::X2_X1_Y:
-      return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY);
-    }
-    llvm_unreachable("unexpected chain merge type");
+    // The gain for the new chain.
+    auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->Score;
+    return MergeGainT(NewGainScore, MergeOffset, MergeType);
  }

  /// Merge chain From into chain Into, update the list of active chains,
  /// adjacency information, and the corresponding cached values.
-  void mergeChains(Chain *Into, Chain *From, size_t MergeOffset,
-                   MergeTypeTy MergeType) {
+  void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
+                   MergeTypeT MergeType) {
    assert(Into != From && "a chain cannot be merged with itself");

-    // Merge the blocks
-    MergedChain MergedBlocks =
-        mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType);
-    Into->merge(From, MergedBlocks.getBlocks());
+    // Merge the nodes.
+    MergedChain MergedNodes =
+        mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
+    Into->merge(From, MergedNodes.getNodes());
+
+    // Merge the edges.
    Into->mergeEdges(From);
    From->clear();

-    // Update cached ext-tsp score for the new chain
+    // Update cached ext-tsp score for the new chain.
    ChainEdge *SelfEdge = Into->getEdge(Into);
    if (SelfEdge != nullptr) {
-      MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end());
-      Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps()));
+      MergedNodes = MergedChain(Into->Nodes.begin(), Into->Nodes.end());
+      Into->Score = extTSPScore(MergedNodes, SelfEdge->jumps());
    }

-    // Remove chain From from the list of active chains
+    // Remove the chain from the list of active chains.
    llvm::erase_value(HotChains, From);

-    // Invalidate caches
-    for (auto EdgeIter : Into->edges()) {
-      EdgeIter.second->invalidateCache();
-    }
+    // Invalidate caches.
+    for (auto EdgeIt : Into->Edges)
+      EdgeIt.second->invalidateCache();
  }

-  /// Concatenate all chains into a final order of blocks.
-  void concatChains(std::vector<uint64_t> &Order) {
-    // Collect chains and calculate some stats for their sorting
-    std::vector<Chain *> SortedChains;
-    DenseMap<const Chain *, double> ChainDensity;
-    for (auto &Chain : AllChains) {
-      if (!Chain.blocks().empty()) {
+  /// Concatenate all chains into the final order.
+  std::vector<uint64_t> concatChains() {
+    // Collect chains and calculate density stats for their sorting.
+    std::vector<ChainT *> SortedChains;
+    DenseMap<const ChainT *, double> ChainDensity;
+    for (ChainT &Chain : AllChains) {
+      if (!Chain.Nodes.empty()) {
        SortedChains.push_back(&Chain);
-        // Using doubles to avoid overflow of ExecutionCount
+        // Using doubles to avoid overflow of ExecutionCounts.
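+        // (Chain density is total samples over total size; denser, i.e.
+        // hotter-per-byte, chains are emitted first when the order is built.)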
        double Size = 0;
        double ExecutionCount = 0;
-        for (auto *Block : Chain.blocks()) {
-          Size += static_cast<double>(Block->Size);
-          ExecutionCount += static_cast<double>(Block->ExecutionCount);
+        for (NodeT *Node : Chain.Nodes) {
+          Size += static_cast<double>(Node->Size);
+          ExecutionCount += static_cast<double>(Node->ExecutionCount);
        }
        assert(Size > 0 && "a chain of zero size");
        ChainDensity[&Chain] = ExecutionCount / Size;
      }
    }

-    // Sorting chains by density in the decreasing order
-    std::stable_sort(SortedChains.begin(), SortedChains.end(),
-                     [&](const Chain *C1, const Chain *C2) {
-                       // Make sure the original entry block is at the
-                       // beginning of the order
-                       if (C1->isEntry() != C2->isEntry()) {
-                         return C1->isEntry();
-                       }
-
-                       const double D1 = ChainDensity[C1];
-                       const double D2 = ChainDensity[C2];
-                       // Compare by density and break ties by chain identifiers
-                       return (D1 != D2) ? (D1 > D2) : (C1->id() < C2->id());
-                     });
-
-    // Collect the blocks in the order specified by their chains
+    // Sort chains by density in decreasing order.
+    std::sort(SortedChains.begin(), SortedChains.end(),
+              [&](const ChainT *L, const ChainT *R) {
+                // Place the entry point at the beginning of the order.
+                if (L->isEntry() != R->isEntry())
+                  return L->isEntry();
+
+                const double DL = ChainDensity[L];
+                const double DR = ChainDensity[R];
+                // Compare by density and break ties by chain identifiers.
+                return std::make_tuple(-DL, L->Id) <
+                       std::make_tuple(-DR, R->Id);
+              });
+
+    // Collect the nodes in the order specified by their chains.
+    std::vector<uint64_t> Order;
    Order.reserve(NumNodes);
-    for (Chain *Chain : SortedChains) {
-      for (Block *Block : Chain->blocks()) {
-        Order.push_back(Block->Index);
+    for (const ChainT *Chain : SortedChains)
+      for (NodeT *Node : Chain->Nodes)
+        Order.push_back(Node->Index);
+    return Order;
+  }
+
+private:
+  /// The number of nodes in the graph.
+  const size_t NumNodes;
+
+  /// Successors of each node.
+  std::vector<std::vector<uint64_t>> SuccNodes;
+
+  /// Predecessors of each node.
+  std::vector<std::vector<uint64_t>> PredNodes;
+
+  /// All nodes (basic blocks) in the graph.
+  std::vector<NodeT> AllNodes;
+
+  /// All jumps between the nodes.
+  std::vector<JumpT> AllJumps;
+
+  /// All chains of nodes.
+  std::vector<ChainT> AllChains;
+
+  /// All edges between the chains.
+  std::vector<ChainEdge> AllEdges;
+
+  /// Active chains. The vector gets updated at runtime when chains are merged.
+  std::vector<ChainT *> HotChains;
+};
+
+/// The implementation of the Cache-Directed Sort (CDS) algorithm for ordering
+/// functions represented by a call graph.
+class CDSortImpl {
+public:
+  CDSortImpl(const CDSortConfig &Config, ArrayRef<uint64_t> NodeSizes,
+             ArrayRef<uint64_t> NodeCounts, ArrayRef<EdgeCount> EdgeCounts,
+             ArrayRef<uint64_t> EdgeOffsets)
+      : Config(Config), NumNodes(NodeSizes.size()) {
+    initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);
+  }
+
+  /// Run the algorithm and return an ordered set of function clusters.
+  std::vector<uint64_t> run() {
+    // Merge pairs of chains while improving the objective.
+    mergeChainPairs();
+
+    LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"
+                      << " of chains from " << NumNodes << " to "
+                      << HotChains.size() << "\n");
+
+    // Collect nodes from all the chains.
+    return concatChains();
+  }
+
+private:
+  /// Initialize the algorithm's data structures.
+  void initialize(const ArrayRef<uint64_t> &NodeSizes,
+                  const ArrayRef<uint64_t> &NodeCounts,
+                  const ArrayRef<EdgeCount> &EdgeCounts,
+                  const ArrayRef<uint64_t> &EdgeOffsets) {
+    // Initialize nodes.
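+    // (Unlike ExtTSPImpl::initialize, this also accumulates TotalSamples and
+    // TotalSize, which feed the locality models below.)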
+    AllNodes.reserve(NumNodes);
+    for (uint64_t Node = 0; Node < NumNodes; Node++) {
+      uint64_t Size = std::max<uint64_t>(NodeSizes[Node], 1ULL);
+      uint64_t ExecutionCount = NodeCounts[Node];
+      AllNodes.emplace_back(Node, Size, ExecutionCount);
+      TotalSamples += ExecutionCount;
+      if (ExecutionCount > 0)
+        TotalSize += Size;
+    }
+
+    // Initialize jumps between the nodes.
+    SuccNodes.resize(NumNodes);
+    PredNodes.resize(NumNodes);
+    AllJumps.reserve(EdgeCounts.size());
+    for (size_t I = 0; I < EdgeCounts.size(); I++) {
+      auto [Pred, Succ, Count] = EdgeCounts[I];
+      // Ignore recursive calls.
+      if (Pred == Succ)
+        continue;
+
+      SuccNodes[Pred].push_back(Succ);
+      PredNodes[Succ].push_back(Pred);
+      if (Count > 0) {
+        NodeT &PredNode = AllNodes[Pred];
+        NodeT &SuccNode = AllNodes[Succ];
+        AllJumps.emplace_back(&PredNode, &SuccNode, Count);
+        AllJumps.back().Offset = EdgeOffsets[I];
+        SuccNode.InJumps.push_back(&AllJumps.back());
+        PredNode.OutJumps.push_back(&AllJumps.back());
+      }
+    }
+
+    // Initialize chains.
+    AllChains.reserve(NumNodes);
+    HotChains.reserve(NumNodes);
+    for (NodeT &Node : AllNodes) {
+      // Adjust execution counts.
+      Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount());
+      Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount());
+      // Create chain.
+      AllChains.emplace_back(Node.Index, &Node);
+      Node.CurChain = &AllChains.back();
+      if (Node.ExecutionCount > 0)
+        HotChains.push_back(&AllChains.back());
+    }
+
+    // Initialize chain edges.
+    AllEdges.reserve(AllJumps.size());
+    for (NodeT &PredNode : AllNodes) {
+      for (JumpT *Jump : PredNode.OutJumps) {
+        NodeT *SuccNode = Jump->Target;
+        ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain);
+        // this edge is already present in the graph.
+        if (CurEdge != nullptr) {
+          assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr);
+          CurEdge->appendJump(Jump);
+          continue;
+        }
+        // this is a new edge.
+        AllEdges.emplace_back(Jump);
+        PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back());
+        SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back());
+      }
+    }
+  }
+
+  /// Merge pairs of chains while there is an improvement in the objective.
+  void mergeChainPairs() {
+    // Create a priority queue containing all edges ordered by the merge gain.
+    auto GainComparator = [](ChainEdge *L, ChainEdge *R) {
+      return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) <
+             std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id);
+    };
+    std::set<ChainEdge *, decltype(GainComparator)> Queue(GainComparator);
+
+    // Insert the edges into the queue.
+    for (ChainT *ChainPred : HotChains) {
+      for (const auto &[Chain, Edge] : ChainPred->Edges) {
+        // Ignore self-edges.
+        if (Edge->isSelfEdge())
+          continue;
+        // Ignore already processed edges.
+        if (Edge->gain() != -1.0)
+          continue;
+
+        // Compute the gain of merging the two chains.
+        MergeGainT Gain = getBestMergeGain(Edge);
+        Edge->setMergeGain(Gain);
+
+        if (Edge->gain() > EPS)
+          Queue.insert(Edge);
+      }
+    }
+
+    // Merge the chains while the gain of merging is positive.
+    while (!Queue.empty()) {
+      // Extract the best (top) edge for merging.
+      ChainEdge *BestEdge = *Queue.begin();
+      Queue.erase(Queue.begin());
+      // Ignore self-edges.
+      if (BestEdge->isSelfEdge())
+        continue;
+      // Ignore edges with non-positive gains.
+      if (BestEdge->gain() <= EPS)
+        continue;
+
+      ChainT *BestSrcChain = BestEdge->srcChain();
+      ChainT *BestDstChain = BestEdge->dstChain();
+
+      // Remove outdated edges from the queue.
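+      // (Their cached gains reference the chains about to be merged and would
+      // go stale; surviving edges are recomputed and reinserted below.)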
+      for (const auto &[Chain, ChainEdge] : BestSrcChain->Edges)
+        Queue.erase(ChainEdge);
+      for (const auto &[Chain, ChainEdge] : BestDstChain->Edges)
+        Queue.erase(ChainEdge);
+
+      // Merge the best pair of chains.
+      MergeGainT BestGain = BestEdge->getMergeGain();
+      mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(),
+                  BestGain.mergeType());
+
+      // Insert newly created edges into the queue.
+      for (const auto &[Chain, Edge] : BestSrcChain->Edges) {
+        // Ignore loop edges.
+        if (Edge->isSelfEdge())
+          continue;
+
+        // Compute the gain of merging the two chains.
+        MergeGainT Gain = getBestMergeGain(Edge);
+        Edge->setMergeGain(Gain);
+
+        if (Edge->gain() > EPS)
+          Queue.insert(Edge);
+      }
+    }
+  }
+
+  /// Compute the gain of merging two chains.
+  ///
+  /// The function considers all possible ways of merging two chains and
+  /// computes the one having the largest increase in the objective. The
+  /// result holds the gain together with the corresponding merge type.
+  MergeGainT getBestMergeGain(ChainEdge *Edge) const {
+    // Precompute jumps between ChainPred and ChainSucc.
+    auto Jumps = Edge->jumps();
+    assert(!Jumps.empty() && "trying to merge chains w/o jumps");
+    ChainT *SrcChain = Edge->srcChain();
+    ChainT *DstChain = Edge->dstChain();
+
+    // This object holds the best currently chosen gain of merging two chains.
+    MergeGainT Gain = MergeGainT();
+
+    /// Given a list of merge types, try to merge two chains and update Gain
+    /// with a better alternative.
+    auto tryChainMerging = [&](const std::vector<MergeTypeT> &MergeTypes) {
+      // Apply the merge, compute the corresponding gain, and update the best
+      // value, if the merge is beneficial.
+      for (const MergeTypeT &MergeType : MergeTypes) {
+        MergeGainT NewGain =
+            computeMergeGain(SrcChain, DstChain, Jumps, MergeType);
+
+        // When forward and backward gains are the same, prioritize merging that
+        // preserves the original order of the functions in the binary.
+        if (std::abs(Gain.score() - NewGain.score()) < EPS) {
+          if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) ||
+              (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) {
+            Gain = NewGain;
+          }
+        } else if (NewGain.score() > Gain.score() + EPS) {
+          Gain = NewGain;
+        }
+      }
+    };
+
+    // Try to concatenate two chains w/o splitting.
+    tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X});
+
+    return Gain;
+  }
+
+  /// Compute the score gain of merging two chains, respecting a given type.
+  ///
+  /// The two chains are not modified in the method.
+  MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc,
+                              const std::vector<JumpT *> &Jumps,
+                              MergeTypeT MergeType) const {
+    // This doesn't depend on the ordering of the nodes.
+    double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc);
+
+    // Merge offset is always 0, as the chains are not split.
+    size_t MergeOffset = 0;
+    auto MergedBlocks =
+        mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType);
+    double DistGain = distBasedLocalityGain(MergedBlocks, Jumps);
+
+    double GainScore = DistGain + Config.FrequencyScale * FreqGain;
+    // Scale the result to increase the importance of merging short chains.
+    if (GainScore >= 0.0)
+      GainScore /= std::min(ChainPred->Size, ChainSucc->Size);
+
+    return MergeGainT(GainScore, MergeOffset, MergeType);
+  }
+
+  /// Compute the change of the frequency locality after merging the chains.
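+  ///
+  /// The model below treats a chain as cache-resident in proportion to its
+  /// share of samples: with P = density * CacheSize / TotalSamples, the
+  /// expected miss probability is (1 - P) ^ CacheEntries.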
+  double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const {
+    auto missProbability = [&](double ChainDensity) {
+      double PageSamples = ChainDensity * Config.CacheSize;
+      if (PageSamples >= TotalSamples)
+        return 0.0;
+      double P = PageSamples / TotalSamples;
+      return pow(1.0 - P, static_cast<double>(Config.CacheEntries));
+    };
+
+    // Cache misses on the chains before merging.
+    double CurScore =
+        ChainPred->ExecutionCount * missProbability(ChainPred->density()) +
+        ChainSucc->ExecutionCount * missProbability(ChainSucc->density());
+
+    // Cache misses on the merged chain.
+    double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount;
+    double MergedSize = ChainPred->Size + ChainSucc->Size;
+    double MergedDensity = static_cast<double>(MergedCounts) / MergedSize;
+    double NewScore = MergedCounts * missProbability(MergedDensity);
+
+    return CurScore - NewScore;
+  }
+
+  /// Compute the distance locality for a jump / call.
+  double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const {
+    uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr;
+    double D = Dist == 0 ? 0.1 : static_cast<double>(Dist);
+    return static_cast<double>(Count) * std::pow(D, -Config.DistancePower);
+  }
+
+  /// Compute the change of the distance locality after merging the chains.
+  double distBasedLocalityGain(const MergedChain &MergedBlocks,
+                               const std::vector<JumpT *> &Jumps) const {
+    if (Jumps.empty())
+      return 0.0;
+    uint64_t CurAddr = 0;
+    MergedBlocks.forEach([&](const NodeT *Node) {
+      Node->EstimatedAddr = CurAddr;
+      CurAddr += Node->Size;
+    });
+
+    double CurScore = 0;
+    double NewScore = 0;
+    for (const JumpT *Arc : Jumps) {
+      uint64_t SrcAddr = Arc->Source->EstimatedAddr + Arc->Offset;
+      uint64_t DstAddr = Arc->Target->EstimatedAddr;
+      NewScore += distScore(SrcAddr, DstAddr, Arc->ExecutionCount);
+      CurScore += distScore(0, TotalSize, Arc->ExecutionCount);
+    }
+    return NewScore - CurScore;
+  }
+
+  /// Merge chain From into chain Into, update the list of active chains,
+  /// adjacency information, and the corresponding cached values.
+  void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset,
+                   MergeTypeT MergeType) {
+    assert(Into != From && "a chain cannot be merged with itself");
+
+    // Merge the nodes.
+    MergedChain MergedNodes =
+        mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType);
+    Into->merge(From, MergedNodes.getNodes());
+
+    // Merge the edges.
+    Into->mergeEdges(From);
+    From->clear();
+
+    // Remove the chain from the list of active chains.
+    llvm::erase_value(HotChains, From);
+  }
+
+  /// Concatenate all chains into the final order.
+  std::vector<uint64_t> concatChains() {
+    // Collect chains and calculate density stats for their sorting.
+    std::vector<ChainT *> SortedChains;
+    DenseMap<const ChainT *, double> ChainDensity;
+    for (ChainT &Chain : AllChains) {
+      if (!Chain.Nodes.empty()) {
+        SortedChains.push_back(&Chain);
+        // Using doubles to avoid overflow of ExecutionCounts.
+        double Size = 0;
+        double ExecutionCount = 0;
+        for (NodeT *Node : Chain.Nodes) {
+          Size += static_cast<double>(Node->Size);
+          ExecutionCount += static_cast<double>(Node->ExecutionCount);
+        }
+        assert(Size > 0 && "a chain of zero size");
+        ChainDensity[&Chain] = ExecutionCount / Size;
+      }
+    }
+
+    // Sort chains by density in decreasing order.
+    std::sort(SortedChains.begin(), SortedChains.end(),
+              [&](const ChainT *L, const ChainT *R) {
+                const double DL = ChainDensity[L];
+                const double DR = ChainDensity[R];
+                // Compare by density and break ties by chain identifiers.
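+                // (Densities are negated so the ascending tuple comparison
+                // yields a descending sort by density.)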
+                return std::make_tuple(-DL, L->Id) <
+                       std::make_tuple(-DR, R->Id);
+              });
+
+    // Collect the nodes in the order specified by their chains.
+    std::vector<uint64_t> Order;
+    Order.reserve(NumNodes);
+    for (const ChainT *Chain : SortedChains)
+      for (NodeT *Node : Chain->Nodes)
+        Order.push_back(Node->Index);
+    return Order;
+  }
+
 private:
+  /// Config for the algorithm.
+  const CDSortConfig Config;
+
   /// The number of nodes in the graph.
   const size_t NumNodes;
@@ -935,80 +1346,108 @@ private:
   /// Predecessors of each node.
   std::vector<std::vector<uint64_t>> PredNodes;

-  /// All basic blocks.
-  std::vector<Block> AllBlocks;
+  /// All nodes (functions) in the graph.
+  std::vector<NodeT> AllNodes;

-  /// All jumps between blocks.
-  std::vector<Jump> AllJumps;
+  /// All jumps (function calls) between the nodes.
+  std::vector<JumpT> AllJumps;

-  /// All chains of basic blocks.
-  std::vector<Chain> AllChains;
+  /// All chains of nodes.
+  std::vector<ChainT> AllChains;

-  /// All edges between chains.
+  /// All edges between the chains.
   std::vector<ChainEdge> AllEdges;

   /// Active chains. The vector gets updated at runtime when chains are merged.
-  std::vector<Chain *> HotChains;
+  std::vector<ChainT *> HotChains;
+
+  /// The total number of samples in the graph.
+  uint64_t TotalSamples{0};
+
+  /// The total size of the nodes in the graph.
+  uint64_t TotalSize{0};
 };

 } // end of anonymous namespace

-std::vector<uint64_t> llvm::applyExtTspLayout(
-    const std::vector<uint64_t> &NodeSizes,
-    const std::vector<uint64_t> &NodeCounts,
-    const std::vector<std::pair<std::pair<uint64_t, uint64_t>, uint64_t>> &EdgeCounts) {
-  size_t NumNodes = NodeSizes.size();
-
+std::vector<uint64_t>
+codelayout::computeExtTspLayout(ArrayRef<uint64_t> NodeSizes,
+                                ArrayRef<uint64_t> NodeCounts,
+                                ArrayRef<EdgeCount> EdgeCounts) {
   // Verify correctness of the input data.
   assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input");
-  assert(NumNodes > 2 && "Incorrect input");
+  assert(NodeSizes.size() > 2 && "Incorrect input");

   // Apply the reordering algorithm.
-  auto Alg = ExtTSPImpl(NumNodes, NodeSizes, NodeCounts, EdgeCounts);
-  std::vector<uint64_t> Result;
-  Alg.run(Result);
+  ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts);
+  std::vector<uint64_t> Result = Alg.run();

   // Verify correctness of the output.
   assert(Result.front() == 0 && "Original entry point is not preserved");
-  assert(Result.size() == NumNodes && "Incorrect size of reordered layout");
+  assert(Result.size() == NodeSizes.size() && "Incorrect size of layout");
   return Result;
 }

-double llvm::calcExtTspScore(
-    const std::vector<uint64_t> &Order, const std::vector<uint64_t> &NodeSizes,
-    const std::vector<uint64_t> &NodeCounts,
-    const std::vector<std::pair<std::pair<uint64_t, uint64_t>, uint64_t>> &EdgeCounts) {
-  // Estimate addresses of the blocks in memory
+double codelayout::calcExtTspScore(ArrayRef<uint64_t> Order,
+                                   ArrayRef<uint64_t> NodeSizes,
+                                   ArrayRef<uint64_t> NodeCounts,
+                                   ArrayRef<EdgeCount> EdgeCounts) {
+  // Estimate addresses of the blocks in memory.
   std::vector<uint64_t> Addr(NodeSizes.size(), 0);
   for (size_t Idx = 1; Idx < Order.size(); Idx++) {
     Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]];
   }
   std::vector<uint64_t> OutDegree(NodeSizes.size(), 0);
-  for (auto It : EdgeCounts) {
-    auto Pred = It.first.first;
-    OutDegree[Pred]++;
-  }
+  for (auto Edge : EdgeCounts)
+    ++OutDegree[Edge.src];

-  // Increase the score for each jump
+  // Increase the score for each jump.
   double Score = 0;
-  for (auto It : EdgeCounts) {
-    auto Pred = It.first.first;
-    auto Succ = It.first.second;
-    uint64_t Count = It.second;
-    bool IsConditional = OutDegree[Pred] > 1;
-    Score += ::extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count,
-                           IsConditional);
+  for (auto Edge : EdgeCounts) {
+    bool IsConditional = OutDegree[Edge.src] > 1;
+    Score += ::extTSPScore(Addr[Edge.src], NodeSizes[Edge.src], Addr[Edge.dst],
+                           Edge.count, IsConditional);
   }
   return Score;
 }

-double llvm::calcExtTspScore(
-    const std::vector<uint64_t> &NodeSizes,
-    const std::vector<uint64_t> &NodeCounts,
-    const std::vector<std::pair<std::pair<uint64_t, uint64_t>, uint64_t>> &EdgeCounts) {
+double codelayout::calcExtTspScore(ArrayRef<uint64_t> NodeSizes,
+                                   ArrayRef<uint64_t> NodeCounts,
+                                   ArrayRef<EdgeCount> EdgeCounts) {
   std::vector<uint64_t> Order(NodeSizes.size());
   for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
     Order[Idx] = Idx;
   }
   return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts);
 }
+
+std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
+    const CDSortConfig &Config, ArrayRef<uint64_t> FuncSizes,
+    ArrayRef<uint64_t> FuncCounts, ArrayRef<EdgeCount> CallCounts,
+    ArrayRef<uint64_t> CallOffsets) {
+  // Verify correctness of the input data.
+  assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input");
+
+  // Apply the reordering algorithm.
+  CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets);
+  std::vector<uint64_t> Result = Alg.run();
+  assert(Result.size() == FuncSizes.size() && "Incorrect size of layout");
+  return Result;
+}
+
+std::vector<uint64_t> codelayout::computeCacheDirectedLayout(
+    ArrayRef<uint64_t> FuncSizes, ArrayRef<uint64_t> FuncCounts,
+    ArrayRef<EdgeCount> CallCounts, ArrayRef<uint64_t> CallOffsets) {
+  CDSortConfig Config;
+  // Populate the config from the command-line options.
+  if (CacheEntries.getNumOccurrences() > 0)
+    Config.CacheEntries = CacheEntries;
+  if (CacheSize.getNumOccurrences() > 0)
+    Config.CacheSize = CacheSize;
+  if (DistancePower.getNumOccurrences() > 0)
+    Config.DistancePower = DistancePower;
+  if (FrequencyScale.getNumOccurrences() > 0)
+    Config.FrequencyScale = FrequencyScale;
+  return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts,
+                                    CallOffsets);
+}
diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp
index 40632b43e73bfe4e4f81a8f19f7244f28ce5eb27..f0459f47605463c498d4de745de06507bea8e2fc 100644
--- a/llvm/tools/opt/opt.cpp
+++ b/llvm/tools/opt/opt.cpp
@@ -582,9 +582,14 @@ int main(int argc, char **argv) {
   // the facility for updating public visibility to linkage unit visibility when
   // specified by an internal option. This is normally done during LTO which is
   // not performed via opt.
-  updateVCallVisibilityInModule(*M,
-                                /* WholeProgramVisibilityEnabledInLTO */ false,
-                                /* DynamicExportSymbols */ {});
+  updateVCallVisibilityInModule(
+      *M,
+      /*WholeProgramVisibilityEnabledInLTO=*/false,
+      // FIXME: These need linker information via a
+      // TBD new interface.
+      /*DynamicExportSymbols=*/{},
+      /*ValidateAllVtablesHaveTypeInfos=*/false,
+      /*IsVisibleToRegularObj=*/[](StringRef) { return true; });

   // Figure out what stream we are supposed to write to...
std::unique_ptr Out; diff --git a/llvm/utils/TableGen/AsmWriterInst.cpp b/llvm/utils/TableGen/AsmWriterInst.cpp index 4a78108d6f4a15253b27b686fd87ec30676674c3..c9558593e142cde9eb6b800f5e63fce1810f0ac6 100644 --- a/llvm/utils/TableGen/AsmWriterInst.cpp +++ b/llvm/utils/TableGen/AsmWriterInst.cpp @@ -12,7 +12,6 @@ #include "AsmWriterInst.h" #include "CodeGenInstruction.h" -#include "CodeGenTarget.h" #include "llvm/ADT/StringExtras.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" diff --git a/llvm/utils/TableGen/AsmWriterInst.h b/llvm/utils/TableGen/AsmWriterInst.h index fe2b934e266f18b1df504b2522d5e6456f33a89d..9c93e82b611b6b5603b024dd1d1002b2318f52ff 100644 --- a/llvm/utils/TableGen/AsmWriterInst.h +++ b/llvm/utils/TableGen/AsmWriterInst.h @@ -21,7 +21,6 @@ namespace llvm { class CodeGenInstruction; - class Record; struct AsmWriterOperand { enum OpType { diff --git a/llvm/utils/TableGen/CTagsEmitter.cpp b/llvm/utils/TableGen/CTagsEmitter.cpp index fe62d6a9b67f2d9f80f17207220575e623be3527..b4ffbfa2012cb8f14dd4fc879b4a2f131b3a9950 100644 --- a/llvm/utils/TableGen/CTagsEmitter.cpp +++ b/llvm/utils/TableGen/CTagsEmitter.cpp @@ -17,7 +17,6 @@ #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include -#include #include using namespace llvm; diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp index dc4fd589eaa83fd0e44b689bb6ec7f2768294f9e..65befa0473fc89f570ee7ba1ede7d7285bdfe6a5 100644 --- a/llvm/utils/TableGen/CodeEmitterGen.cpp +++ b/llvm/utils/TableGen/CodeEmitterGen.cpp @@ -12,10 +12,10 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenHwModes.h" #include "CodeGenInstruction.h" #include "CodeGenTarget.h" -#include "SubtargetFeatureInfo.h" -#include "Types.h" +#include "InfoByHwMode.h" #include "VarLenCodeEmitterGen.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp index dd04778e2dbe7903047a0b06566159706b9ae787..2713e7a1a8ed5b663567ad67eddc7c1350ab2802 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp @@ -13,6 +13,7 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" +#include "CodeGenRegisters.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" diff --git a/llvm/utils/TableGen/CodeGenHwModes.h b/llvm/utils/TableGen/CodeGenHwModes.h index 55507cbca37dddd8fe84f200e27465ec18d39275..335e918bfe73815caa79664b787ced1b720e4bf9 100644 --- a/llvm/utils/TableGen/CodeGenHwModes.h +++ b/llvm/utils/TableGen/CodeGenHwModes.h @@ -12,9 +12,11 @@ #define LLVM_UTILS_TABLEGEN_CODEGENHWMODES_H #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include #include #include +#include #include // HwModeId -> list of predicates (definition) diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp index 02695942f5c12017adc5203ee8c8e163601eae21..fd375735dfd2ad739bda1c9ba05c14873620ecc4 100644 --- a/llvm/utils/TableGen/CodeGenMapTable.cpp +++ b/llvm/utils/TableGen/CodeGenMapTable.cpp @@ -78,6 +78,7 @@ #include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" using namespace llvm; typedef std::map > InstrRelMapTy; diff --git a/llvm/utils/TableGen/CodeGenRegisters.h b/llvm/utils/TableGen/CodeGenRegisters.h index 
765425ed68cbdafb5de5db268a9eab1246651a3c..7638816811e8f0b22259c0358830ee690de092d6 100644 --- a/llvm/utils/TableGen/CodeGenRegisters.h +++ b/llvm/utils/TableGen/CodeGenRegisters.h @@ -14,6 +14,7 @@ #ifndef LLVM_UTILS_TABLEGEN_CODEGENREGISTERS_H #define LLVM_UTILS_TABLEGEN_CODEGENREGISTERS_H +#include "CodeGenHwModes.h" #include "InfoByHwMode.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" @@ -32,8 +33,11 @@ #include #include #include +#include #include #include +#include +#include #include #include #include @@ -41,7 +45,6 @@ namespace llvm { class CodeGenRegBank; - template class SetVector; /// Used to encode a step in a register lane mask transformation. /// Mask the bits specified in Mask, then rotate them Rol bits to the left diff --git a/llvm/utils/TableGen/CodeGenSchedule.h b/llvm/utils/TableGen/CodeGenSchedule.h index bbf5381ad086b3f40072f2e673742b5b759ed477..76ef1e43953078e77add8458838ad3e5e5875ef7 100644 --- a/llvm/utils/TableGen/CodeGenSchedule.h +++ b/llvm/utils/TableGen/CodeGenSchedule.h @@ -15,10 +15,17 @@ #define LLVM_UTILS_TABLEGEN_CODEGENSCHEDULE_H #include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/SetTheory.h" +#include +#include +#include +#include namespace llvm { diff --git a/llvm/utils/TableGen/CompressInstEmitter.cpp b/llvm/utils/TableGen/CompressInstEmitter.cpp index a18d6a6b8854f41dc900b11753cb84c3b75d1e61..1f9c39c08e677a88bae4fde26eac7cf5de0ce20b 100644 --- a/llvm/utils/TableGen/CompressInstEmitter.cpp +++ b/llvm/utils/TableGen/CompressInstEmitter.cpp @@ -65,6 +65,7 @@ //===----------------------------------------------------------------------===// #include "CodeGenInstruction.h" +#include "CodeGenRegisters.h" #include "CodeGenTarget.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/utils/TableGen/DAGISelEmitter.cpp b/llvm/utils/TableGen/DAGISelEmitter.cpp index d012a0172a8fd245a87ad97d81dffd467afe86a4..70738c7adca8d61b2288fa49bfa6ed0aa037752a 100644 --- a/llvm/utils/TableGen/DAGISelEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelEmitter.cpp @@ -12,6 +12,7 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" +#include "CodeGenTarget.h" #include "DAGISelMatcher.h" #include "llvm/Support/Debug.h" #include "llvm/TableGen/Record.h" diff --git a/llvm/utils/TableGen/DAGISelMatcher.cpp b/llvm/utils/TableGen/DAGISelMatcher.cpp index e436a931a9f556ca578550c388b2478f32262435..c08c6a9a30a290f09ab425e711f110e75b90bdf5 100644 --- a/llvm/utils/TableGen/DAGISelMatcher.cpp +++ b/llvm/utils/TableGen/DAGISelMatcher.cpp @@ -8,6 +8,7 @@ #include "DAGISelMatcher.h" #include "CodeGenDAGPatterns.h" +#include "CodeGenRegisters.h" #include "CodeGenTarget.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Record.h" diff --git a/llvm/utils/TableGen/DAGISelMatcher.h b/llvm/utils/TableGen/DAGISelMatcher.h index 77280acaf4cac3622c24393d4a64f5623479df29..c9094f5675e63334ccc708b5a2af7be9ddb4c2f2 100644 --- a/llvm/utils/TableGen/DAGISelMatcher.h +++ b/llvm/utils/TableGen/DAGISelMatcher.h @@ -14,6 +14,11 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "llvm/Support/MachineValueType.h" +#include +#include +#include +#include +#include namespace llvm { struct CodeGenRegister; diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp index 
777e75dcd92911e5da04e71f00bf460b4bf30203..2b876c2f7496923e3f030b968a2af5b710af2662 100644 --- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -11,7 +11,10 @@ //===----------------------------------------------------------------------===// #include "CodeGenDAGPatterns.h" +#include "CodeGenRegisters.h" +#include "CodeGenTarget.h" #include "DAGISelMatcher.h" +#include "SDNodeProperties.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringMap.h" diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp index 44bff4c67ab31c0172c9b3a684c5c36c4e893725..03f7bc4ff51914c1aea179b3b9905deba88a4e2c 100644 --- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp @@ -9,7 +9,10 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" #include "CodeGenRegisters.h" +#include "CodeGenTarget.h" #include "DAGISelMatcher.h" +#include "InfoByHwMode.h" +#include "SDNodeProperties.h" #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/StringMap.h" #include "llvm/TableGen/Error.h" diff --git a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp index 4273bd69b87d3d138b85ce1d0dbb138da4379aaa..764b86c97dbf831d942ef8816c10273faad7a422 100644 --- a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp @@ -12,6 +12,7 @@ #include "DAGISelMatcher.h" #include "CodeGenDAGPatterns.h" +#include "SDNodeProperties.h" #include "llvm/ADT/StringSet.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/utils/TableGen/DFAEmitter.h b/llvm/utils/TableGen/DFAEmitter.h index 44e5d97d544ffe4716480932df92caf2a0913be6..c831a65a73cdf64a1cd0c2dd2cb5aeeb06a235b0 100644 --- a/llvm/utils/TableGen/DFAEmitter.h +++ b/llvm/utils/TableGen/DFAEmitter.h @@ -21,6 +21,8 @@ #include "llvm/ADT/UniqueVector.h" #include #include +#include +#include namespace llvm { diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp index 44c1df3e9ac4693e1b45d383d44b889ffbef47f3..c9cd5b0d7ec6e6df9b4377452637ed03f6ec0b21 100644 --- a/llvm/utils/TableGen/DXILEmitter.cpp +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/StringSet.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/DXILOperationCommon.h" -#include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" using namespace llvm; diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 8f816744370c046e5775d5bbd5f31480b8d919fc..eabc158ab91eda124b2528460ea79aa7ee5fd00a 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenHwModes.h" #include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "InfoByHwMode.h" diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp index 0a88f67be168ac0e00f43e1d0df05d16d9795cad..ef501f86f291587d082944153fc2d71273669c78 100644 --- a/llvm/utils/TableGen/FastISelEmitter.cpp +++ b/llvm/utils/TableGen/FastISelEmitter.cpp @@ -18,6 +18,9 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" +#include "CodeGenRegisters.h" +#include "CodeGenTarget.h" +#include "InfoByHwMode.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" #include 
"llvm/TableGen/Error.h" diff --git a/llvm/utils/TableGen/GICombinerEmitter.cpp b/llvm/utils/TableGen/GICombinerEmitter.cpp index 2ae313081a6f161b47b248d13de87ce595223d3a..927fb81dc74baf559eb37b0dc5b07d78970b1b6c 100644 --- a/llvm/utils/TableGen/GICombinerEmitter.cpp +++ b/llvm/utils/TableGen/GICombinerEmitter.cpp @@ -15,6 +15,9 @@ #include "GlobalISel/CodeExpander.h" #include "GlobalISel/CodeExpansions.h" #include "GlobalISel/GIMatchDag.h" +#include "GlobalISel/GIMatchDagEdge.h" +#include "GlobalISel/GIMatchDagInstr.h" +#include "GlobalISel/GIMatchDagOperands.h" #include "GlobalISel/GIMatchDagPredicate.h" #include "GlobalISel/GIMatchTree.h" #include "llvm/ADT/SmallSet.h" diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index c79c79948a80d42d23ee9166d2f2ddaed9ede433..360d42f3978aced6de6e7657a23d88dc0a865e82 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -31,6 +31,10 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" +#include "CodeGenIntrinsics.h" +#include "CodeGenRegisters.h" +#include "CodeGenTarget.h" +#include "InfoByHwMode.h" #include "SubtargetFeatureInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CodeGenCoverage.h" diff --git a/llvm/utils/TableGen/InfoByHwMode.cpp b/llvm/utils/TableGen/InfoByHwMode.cpp index 73c4fbf0a5eb5024695e8ec3d8995b27dd2d02a0..5140c5a0d20f0b20eb60e6a5cff4089b972077d4 100644 --- a/llvm/utils/TableGen/InfoByHwMode.cpp +++ b/llvm/utils/TableGen/InfoByHwMode.cpp @@ -17,7 +17,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" - +#include "llvm/TableGen/Record.h" #include using namespace llvm; diff --git a/llvm/utils/TableGen/InfoByHwMode.h b/llvm/utils/TableGen/InfoByHwMode.h index 44927d0bf0df9a2bfc7b8907da202a128c494fc9..6cfd6e8bb49373ee1524a1402cadb756d2053519 100644 --- a/llvm/utils/TableGen/InfoByHwMode.h +++ b/llvm/utils/TableGen/InfoByHwMode.h @@ -16,10 +16,16 @@ #include "CodeGenHwModes.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/MachineValueType.h" - +#include +#include #include #include +#include +#include namespace llvm { diff --git a/llvm/utils/TableGen/PredicateExpander.cpp b/llvm/utils/TableGen/PredicateExpander.cpp index b129401461b51e9eb974ce1194ea2487ff585263..8f96d3307ded8beccd3bad1f973eae18889f978b 100644 --- a/llvm/utils/TableGen/PredicateExpander.cpp +++ b/llvm/utils/TableGen/PredicateExpander.cpp @@ -12,6 +12,7 @@ #include "PredicateExpander.h" #include "CodeGenSchedule.h" // Definition of STIPredicateFunction. 
+#include "llvm/TableGen/Record.h" namespace llvm { diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp index e6689b211a7d3a0abaae84d085cabae3dfef843a..01f2f7864d8dd3371ac3d338f76f34d963c3ab8a 100644 --- a/llvm/utils/TableGen/RegisterBankEmitter.cpp +++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp @@ -11,15 +11,15 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenRegisters.h" +#include "CodeGenTarget.h" +#include "InfoByHwMode.h" #include "llvm/ADT/BitVector.h" #include "llvm/Support/Debug.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" -#include "CodeGenRegisters.h" -#include "CodeGenTarget.h" - #define DEBUG_TYPE "register-bank-emitter" using namespace llvm; diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 113cebf8a08e9c0019b498a8b1965b6f966a647b..5715dc1deb3062e72177e05e7dacf5692020b341 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -12,8 +12,10 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenHwModes.h" #include "CodeGenRegisters.h" #include "CodeGenTarget.h" +#include "InfoByHwMode.h" #include "SequenceToOffsetTable.h" #include "Types.h" #include "llvm/ADT/ArrayRef.h" diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 8afe6d37d0e0d5a5f145159cb3ef6859bf44d4f5..ec26e1c41f85495a50e1a22a9c3923e51ecbae90 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenHwModes.h" #include "CodeGenSchedule.h" #include "CodeGenTarget.h" #include "PredicateExpander.h" diff --git a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp index 2a63fc490380fb3f002ae40cc68b7ab28d21d5c5..1abcf485f8564dde37a60612f4eecfecd6f63f29 100644 --- a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp +++ b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp @@ -11,7 +11,6 @@ #include "llvm/Config/llvm-config.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" -#include using namespace llvm; diff --git a/llvm/utils/TableGen/SubtargetFeatureInfo.h b/llvm/utils/TableGen/SubtargetFeatureInfo.h index 8c8a4487934cd7856ee58038b9a8fc759ba067bb..e6a3f82d9bb83f79da6f314d0fa9bb132972b126 100644 --- a/llvm/utils/TableGen/SubtargetFeatureInfo.h +++ b/llvm/utils/TableGen/SubtargetFeatureInfo.h @@ -9,9 +9,11 @@ #ifndef LLVM_UTIL_TABLEGEN_SUBTARGETFEATUREINFO_H #define LLVM_UTIL_TABLEGEN_SUBTARGETFEATUREINFO_H +#include "llvm/ADT/StringRef.h" #include "llvm/TableGen/Record.h" #include #include +#include #include namespace llvm { diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp index 746e2dd1db16a67646fa8f05f435af078365dcbe..4117ed5a3f6a97641b32584eaf1cec5afbeff1d0 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/TableGen.cpp @@ -13,9 +13,13 @@ #include "TableGenBackends.h" // Declares all backends. 
#include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Main.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/SetTheory.h" +#include +#include +#include using namespace llvm; diff --git a/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp b/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp index 2c1acd8d910c4ab1f0c617aef37feb37ba39131e..85da547d04c1336db74d346bc53fae103bdcdccd 100644 --- a/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp +++ b/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp @@ -58,6 +58,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" using namespace llvm; diff --git a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp b/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp index 1384330ee8a12ae08b640726e58678eb42079f99..b42ffa2aec1a3616b69e3b0e60aa42bbda307c1c 100644 --- a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp @@ -15,6 +15,7 @@ #include "CodeGenTarget.h" #include "X86RecognizableInstr.h" #include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" using namespace llvm; diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp index 5b3f11848de6c781eb5bebf0fff5a56898cdbb5a..7bc17e6f2b64d2140baa13847634b4ae6b025855 100644 --- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "X86RecognizableInstr.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" using namespace llvm; diff --git a/llvm/utils/TableGen/X86MnemonicTables.cpp b/llvm/utils/TableGen/X86MnemonicTables.cpp index f405e051e3559060a1e93640417c01a49093bdc7..b5405488de0e5b7d3784722631b085886cf8fb50 100644 --- a/llvm/utils/TableGen/X86MnemonicTables.cpp +++ b/llvm/utils/TableGen/X86MnemonicTables.cpp @@ -14,7 +14,7 @@ #include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "X86RecognizableInstr.h" -#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" using namespace llvm; diff --git a/llvm/utils/TableGen/X86ModRMFilters.h b/llvm/utils/TableGen/X86ModRMFilters.h index e2d0907b4f8b5bea9c49db7c3244a25427b6f508..d2169a8e879b6fc82671f0373afae5dc361ad8a6 100644 --- a/llvm/utils/TableGen/X86ModRMFilters.h +++ b/llvm/utils/TableGen/X86ModRMFilters.h @@ -17,7 +17,7 @@ #ifndef LLVM_UTILS_TABLEGEN_X86MODRMFILTERS_H #define LLVM_UTILS_TABLEGEN_X86MODRMFILTERS_H -#include "llvm/Support/DataTypes.h" +#include namespace llvm { diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h index ea56a9d7d994ed7fe7ae6545d220f6ec7fac3897..f389ff01670c091209366e858342d43a504a08ae 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.h +++ b/llvm/utils/TableGen/X86RecognizableInstr.h @@ -17,8 +17,10 @@ #define LLVM_UTILS_TABLEGEN_X86RECOGNIZABLEINSTR_H #include "CodeGenInstruction.h" -#include "llvm/Support/DataTypes.h" #include "llvm/Support/X86DisassemblerDecoderCommon.h" +#include +#include +#include struct InstructionSpecifier; diff --git a/mlir/include/mlir/Support/LLVM.h 
b/mlir/include/mlir/Support/LLVM.h
index 7acf97b5a2531e5002233ffe93243fe82446e30d..48d569b0ca756279ca862c56220e1f9fcfa929e9 100644
--- a/mlir/include/mlir/Support/LLVM.h
+++ b/mlir/include/mlir/Support/LLVM.h
@@ -61,7 +61,7 @@ class MutableArrayRef;
 template <typename T>
 using Optional = std::optional<T>;
 template <typename... PT> class PointerUnion;
-template <typename T, typename Vector, typename Set>
+template <typename T, typename Vector, typename Set, unsigned N>
 class SetVector;
 template <typename T, unsigned N>
 class SmallPtrSet;
@@ -123,8 +123,8 @@
 using DenseMap = llvm::DenseMap<KeyT, ValueT, KeyInfoT, BucketT>;
 template <typename ValueT, typename ValueInfoT = DenseMapInfo<ValueT>>
 using DenseSet = llvm::DenseSet<ValueT, ValueInfoT>;
 template <typename T, typename Vector = llvm::SmallVector<T, 0>,
-          typename Set = DenseSet<T>>
-using SetVector = llvm::SetVector<T, Vector, Set>;
+          typename Set = DenseSet<T>, unsigned N = 0>
+using SetVector = llvm::SetVector<T, Vector, Set, N>;
 template <typename AllocatorTy = llvm::MallocAllocator>
 using StringSet = llvm::StringSet<AllocatorTy>;
 using llvm::MutableArrayRef;
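A usage note, not part of the patch: to make the renamed code-layout entry
point concrete, below is a minimal caller sketch. It assumes the EdgeCount
aggregate ({src, dst, count}) lives in llvm::codelayout next to the entry
points, as the qualified calls in the hunks above suggest, and uses the same
header path that this patch adds to BOLT's ReorderFunctions.cpp.

// Sketch: computing an ExtTSP layout for a tiny three-node CFG.
#include "llvm/Transforms/Utils/CodeLayout.h"

#include <cstdint>
#include <vector>

using namespace llvm;

std::vector<uint64_t> exampleExtTspLayout() {
  // Node 0 is the entry; sizes are in bytes, counts are execution counts.
  std::vector<uint64_t> NodeSizes = {16, 32, 8};
  std::vector<uint64_t> NodeCounts = {100, 100, 40};
  // Profiled jumps: 0->1 taken 60 times, 0->2 taken 40, 2->1 taken 40.
  std::vector<codelayout::EdgeCount> EdgeCounts = {
      {0, 1, 60}, {0, 2, 40}, {2, 1, 40}};
  // Returns a permutation of {0, 1, 2}; the entry node stays first, and the
  // hot jump 2->1 is likely to become a fallthrough in the new order.
  return codelayout::computeExtTspLayout(NodeSizes, NodeCounts, EdgeCounts);
}

The CDSort entry point, computeCacheDirectedLayout, has the same shape but
additionally takes per-call offsets (CallOffsets), so its distance model can
measure from the call site rather than from the start of the caller.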