From 6dd77e631124d79713ba387c44083dc086fe4d7d Mon Sep 17 00:00:00 2001
From: spupyrev
Date: Tue, 13 Jun 2023 10:08:00 -0700
Subject: [PATCH 01/10] A new code layout algorithm for function reordering [1/3]

We are bringing a new algorithm for function layout (reordering) based on the
call graph (extracted from profile data). The algorithm is an improvement on
top of a known heuristic, C^3. It tries to co-locate hot functions that are
frequently executed together in the resulting ordering. Unlike C^3, it
explores a larger search space and has an objective closely tied to the
performance of instruction and i-TLB caches. Hence, the name CDS =
Cache-Directed Sort. The algorithm can be used at the linking or post-linking
(e.g., BOLT) stage.

This diff modifies the existing data structures to facilitate the
implementation (down the stack). This is a no-op change.

Reviewed By: hoy

Differential Revision: https://reviews.llvm.org/D152833
---
 llvm/lib/Transforms/Utils/CodeLayout.cpp | 768 ++++++++++++-----------
 1 file changed, 398 insertions(+), 370 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp
index 9eb3aff3ffe8..059e20a34104 100644
--- a/llvm/lib/Transforms/Utils/CodeLayout.cpp
+++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp
@@ -6,7 +6,8 @@
 //
 //===----------------------------------------------------------------------===//
 //
-// ExtTSP - layout of basic blocks with i-cache optimization.
+// The file implements "cache-aware" layout algorithms of basic blocks and
+// functions in a binary.
 //
 // The algorithm tries to find a layout of nodes (basic blocks) of a given CFG
 // optimizing jump locality and thus processor I-cache utilization. This is
@@ -41,6 +42,7 @@
 
 #include "llvm/Transforms/Utils/CodeLayout.h"
 #include "llvm/Support/CommandLine.h"
+#include "llvm/Support/Debug.h"
 
 #include 
 
@@ -69,11 +71,11 @@ static cl::opt ForwardWeightUncond(
 
 static cl::opt BackwardWeightCond(
     "ext-tsp-backward-weight-cond", cl::ReallyHidden, cl::init(0.1),
-    cl::desc("The weight of conditonal backward jumps for ExtTSP value"));
+    cl::desc("The weight of conditional backward jumps for ExtTSP value"));
 
 static cl::opt BackwardWeightUncond(
     "ext-tsp-backward-weight-uncond", cl::ReallyHidden, cl::init(0.1),
-    cl::desc("The weight of unconditonal backward jumps for ExtTSP value"));
+    cl::desc("The weight of unconditional backward jumps for ExtTSP value"));
 
 static cl::opt FallthroughWeightCond(
     "ext-tsp-fallthrough-weight-cond", cl::ReallyHidden, cl::init(1.0),
@@ -149,29 +151,30 @@ double extTSPScore(uint64_t SrcAddr, uint64_t SrcSize, uint64_t DstAddr,
 
 /// A type of merging two chains, X and Y. The former chain is split into
 /// X1 and X2 and then concatenated with Y in the order specified by the type.
-enum class MergeTypeTy : int { X_Y, X1_Y_X2, Y_X2_X1, X2_X1_Y };
+enum class MergeTypeT : int { X_Y, Y_X, X1_Y_X2, Y_X2_X1, X2_X1_Y };
 
 /// The gain of merging two chains, that is, the Ext-TSP score of the merge
-/// together with the corresponfiding merge 'type' and 'offset'.
-class MergeGainTy {
-public:
-  explicit MergeGainTy() = default;
-  explicit MergeGainTy(double Score, size_t MergeOffset, MergeTypeTy MergeType)
+/// together with the corresponding merge 'type' and 'offset'.
+struct MergeGainT { + explicit MergeGainT() = default; + explicit MergeGainT(double Score, size_t MergeOffset, MergeTypeT MergeType) : Score(Score), MergeOffset(MergeOffset), MergeType(MergeType) {} double score() const { return Score; } size_t mergeOffset() const { return MergeOffset; } - MergeTypeTy mergeType() const { return MergeType; } + MergeTypeT mergeType() const { return MergeType; } + + void setMergeType(MergeTypeT Ty) { MergeType = Ty; } // Returns 'true' iff Other is preferred over this. - bool operator<(const MergeGainTy &Other) const { + bool operator<(const MergeGainT &Other) const { return (Other.Score > EPS && Other.Score > Score + EPS); } // Update the current gain if Other is preferred over this. - void updateIfLessThan(const MergeGainTy &Other) { + void updateIfLessThan(const MergeGainT &Other) { if (*this < Other) *this = Other; } @@ -179,106 +182,102 @@ public: private: double Score{-1.0}; size_t MergeOffset{0}; - MergeTypeTy MergeType{MergeTypeTy::X_Y}; + MergeTypeT MergeType{MergeTypeT::X_Y}; }; -class Jump; -class Chain; -class ChainEdge; +struct JumpT; +struct ChainT; +struct ChainEdge; -/// A node in the graph, typically corresponding to a basic block in CFG. -class Block { -public: - Block(const Block &) = delete; - Block(Block &&) = default; - Block &operator=(const Block &) = delete; - Block &operator=(Block &&) = default; +/// A node in the graph, typically corresponding to a basic block in the CFG or +/// a function in the call graph. +struct NodeT { + NodeT(const NodeT &) = delete; + NodeT(NodeT &&) = default; + NodeT &operator=(const NodeT &) = delete; + NodeT &operator=(NodeT &&) = default; + + explicit NodeT(size_t Index, uint64_t Size, uint64_t EC) + : Index(Index), Size(Size), ExecutionCount(EC) {} + + bool isEntry() const { return Index == 0; } + + // The total execution count of outgoing jumps. + uint64_t outCount() const; + + // The total execution count of incoming jumps. + uint64_t inCount() const; - // The original index of the block in CFG. + // The original index of the node in graph. size_t Index{0}; - // The index of the block in the current chain. + // The index of the node in the current chain. size_t CurIndex{0}; - // Size of the block in the binary. + // The size of the node in the binary. uint64_t Size{0}; - // Execution count of the block in the profile data. + // The execution count of the node in the profile data. uint64_t ExecutionCount{0}; - // Current chain of the node. - Chain *CurChain{nullptr}; - // An offset of the block in the current chain. + // The current chain of the node. + ChainT *CurChain{nullptr}; + // The offset of the node in the current chain. mutable uint64_t EstimatedAddr{0}; - // Forced successor of the block in CFG. - Block *ForcedSucc{nullptr}; - // Forced predecessor of the block in CFG. - Block *ForcedPred{nullptr}; - // Outgoing jumps from the block. - std::vector OutJumps; - // Incoming jumps to the block. - std::vector InJumps; - -public: - explicit Block(size_t Index, uint64_t Size, uint64_t EC) - : Index(Index), Size(Size), ExecutionCount(EC) {} - bool isEntry() const { return Index == 0; } + // Forced successor of the node in the graph. + NodeT *ForcedSucc{nullptr}; + // Forced predecessor of the node in the graph. + NodeT *ForcedPred{nullptr}; + // Outgoing jumps from the node. + std::vector OutJumps; + // Incoming jumps to the node. + std::vector InJumps; }; -/// An arc in the graph, typically corresponding to a jump between two blocks. 
-class Jump { -public: - Jump(const Jump &) = delete; - Jump(Jump &&) = default; - Jump &operator=(const Jump &) = delete; - Jump &operator=(Jump &&) = default; - - // Source block of the jump. - Block *Source; - // Target block of the jump. - Block *Target; +/// An arc in the graph, typically corresponding to a jump between two nodes. +struct JumpT { + JumpT(const JumpT &) = delete; + JumpT(JumpT &&) = default; + JumpT &operator=(const JumpT &) = delete; + JumpT &operator=(JumpT &&) = default; + + explicit JumpT(NodeT *Source, NodeT *Target, uint64_t ExecutionCount) + : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {} + + // Source node of the jump. + NodeT *Source; + // Target node of the jump. + NodeT *Target; // Execution count of the arc in the profile data. uint64_t ExecutionCount{0}; // Whether the jump corresponds to a conditional branch. bool IsConditional{false}; - -public: - explicit Jump(Block *Source, Block *Target, uint64_t ExecutionCount) - : Source(Source), Target(Target), ExecutionCount(ExecutionCount) {} + // The offset of the jump from the source node. + uint64_t Offset{0}; }; -/// A chain (ordered sequence) of blocks. -class Chain { -public: - Chain(const Chain &) = delete; - Chain(Chain &&) = default; - Chain &operator=(const Chain &) = delete; - Chain &operator=(Chain &&) = default; +/// A chain (ordered sequence) of nodes in the graph. +struct ChainT { + ChainT(const ChainT &) = delete; + ChainT(ChainT &&) = default; + ChainT &operator=(const ChainT &) = delete; + ChainT &operator=(ChainT &&) = default; + + explicit ChainT(uint64_t Id, NodeT *Node) + : Id(Id), ExecutionCount(Node->ExecutionCount), Size(Node->Size), + Nodes(1, Node) {} - explicit Chain(uint64_t Id, Block *Block) - : Id(Id), Score(0), Blocks(1, Block) {} + size_t numBlocks() const { return Nodes.size(); } - uint64_t id() const { return Id; } + double density() const { return static_cast(ExecutionCount) / Size; } - bool isEntry() const { return Blocks[0]->Index == 0; } + bool isEntry() const { return Nodes[0]->Index == 0; } bool isCold() const { - for (auto *Block : Blocks) { - if (Block->ExecutionCount > 0) + for (NodeT *Node : Nodes) { + if (Node->ExecutionCount > 0) return false; } return true; } - double score() const { return Score; } - - void setScore(double NewScore) { Score = NewScore; } - - const std::vector &blocks() const { return Blocks; } - - size_t numBlocks() const { return Blocks.size(); } - - const std::vector> &edges() const { - return Edges; - } - - ChainEdge *getEdge(Chain *Other) const { + ChainEdge *getEdge(ChainT *Other) const { for (auto It : Edges) { if (It.first == Other) return It.second; @@ -286,7 +285,7 @@ public: return nullptr; } - void removeEdge(Chain *Other) { + void removeEdge(ChainT *Other) { auto It = Edges.begin(); while (It != Edges.end()) { if (It->first == Other) { @@ -297,63 +296,68 @@ public: } } - void addEdge(Chain *Other, ChainEdge *Edge) { + void addEdge(ChainT *Other, ChainEdge *Edge) { Edges.push_back(std::make_pair(Other, Edge)); } - void merge(Chain *Other, const std::vector &MergedBlocks) { - Blocks = MergedBlocks; - // Update the block's chains - for (size_t Idx = 0; Idx < Blocks.size(); Idx++) { - Blocks[Idx]->CurChain = this; - Blocks[Idx]->CurIndex = Idx; + void merge(ChainT *Other, const std::vector &MergedBlocks) { + Nodes = MergedBlocks; + // Update the chain's data + ExecutionCount += Other->ExecutionCount; + Size += Other->Size; + Id = Nodes[0]->Index; + // Update the node's data + for (size_t Idx = 0; Idx < Nodes.size(); 
Idx++) { + Nodes[Idx]->CurChain = this; + Nodes[Idx]->CurIndex = Idx; } } - void mergeEdges(Chain *Other); + void mergeEdges(ChainT *Other); void clear() { - Blocks.clear(); - Blocks.shrink_to_fit(); + Nodes.clear(); + Nodes.shrink_to_fit(); Edges.clear(); Edges.shrink_to_fit(); } -private: // Unique chain identifier. uint64_t Id; // Cached ext-tsp score for the chain. - double Score; - // Blocks of the chain. - std::vector Blocks; + double Score{0}; + // The total execution count of the chain. + uint64_t ExecutionCount{0}; + // The total size of the chain. + uint64_t Size{0}; + // Nodes of the chain. + std::vector Nodes; // Adjacent chains and corresponding edges (lists of jumps). - std::vector> Edges; + std::vector> Edges; }; -/// An edge in CFG representing jumps between two chains. -/// When blocks are merged into chains, the edges are combined too so that +/// An edge in the graph representing jumps between two chains. +/// When nodes are merged into chains, the edges are combined too so that /// there is always at most one edge between a pair of chains -class ChainEdge { -public: +struct ChainEdge { ChainEdge(const ChainEdge &) = delete; ChainEdge(ChainEdge &&) = default; ChainEdge &operator=(const ChainEdge &) = delete; - ChainEdge &operator=(ChainEdge &&) = default; + ChainEdge &operator=(ChainEdge &&) = delete; - explicit ChainEdge(Jump *Jump) + explicit ChainEdge(JumpT *Jump) : SrcChain(Jump->Source->CurChain), DstChain(Jump->Target->CurChain), Jumps(1, Jump) {} - const std::vector &jumps() const { return Jumps; } + ChainT *srcChain() const { return SrcChain; } - void changeEndpoint(Chain *From, Chain *To) { - if (From == SrcChain) - SrcChain = To; - if (From == DstChain) - DstChain = To; - } + ChainT *dstChain() const { return DstChain; } + + bool isSelfEdge() const { return SrcChain == DstChain; } - void appendJump(Jump *Jump) { Jumps.push_back(Jump); } + const std::vector &jumps() const { return Jumps; } + + void appendJump(JumpT *Jump) { Jumps.push_back(Jump); } void moveJumps(ChainEdge *Other) { Jumps.insert(Jumps.end(), Other->Jumps.begin(), Other->Jumps.end()); @@ -361,15 +365,22 @@ public: Other->Jumps.shrink_to_fit(); } - bool hasCachedMergeGain(Chain *Src, Chain *Dst) const { + void changeEndpoint(ChainT *From, ChainT *To) { + if (From == SrcChain) + SrcChain = To; + if (From == DstChain) + DstChain = To; + } + + bool hasCachedMergeGain(ChainT *Src, ChainT *Dst) const { return Src == SrcChain ? CacheValidForward : CacheValidBackward; } - MergeGainTy getCachedMergeGain(Chain *Src, Chain *Dst) const { + MergeGainT getCachedMergeGain(ChainT *Src, ChainT *Dst) const { return Src == SrcChain ? CachedGainForward : CachedGainBackward; } - void setCachedMergeGain(Chain *Src, Chain *Dst, MergeGainTy MergeGain) { + void setCachedMergeGain(ChainT *Src, ChainT *Dst, MergeGainT MergeGain) { if (Src == SrcChain) { CachedGainForward = MergeGain; CacheValidForward = true; @@ -384,31 +395,55 @@ public: CacheValidBackward = false; } + void setMergeGain(MergeGainT Gain) { CachedGain = Gain; } + + MergeGainT getMergeGain() const { return CachedGain; } + + double gain() const { return CachedGain.score(); } + private: // Source chain. - Chain *SrcChain{nullptr}; + ChainT *SrcChain{nullptr}; // Destination chain. - Chain *DstChain{nullptr}; - // Original jumps in the binary with correspinding execution counts. - std::vector Jumps; - // Cached ext-tsp value for merging the pair of chains. 
- // Since the gain of merging (Src, Dst) and (Dst, Src) might be different, - // we store both values here. - MergeGainTy CachedGainForward; - MergeGainTy CachedGainBackward; + ChainT *DstChain{nullptr}; + // Original jumps in the binary with corresponding execution counts. + std::vector Jumps; + // Cached gain value for merging the pair of chains. + MergeGainT CachedGain; + + // Cached gain values for merging the pair of chains. Since the gain of + // merging (Src, Dst) and (Dst, Src) might be different, we store both values + // here and a flag indicating which of the options results in a higher gain. + // Cached gain values. + MergeGainT CachedGainForward; + MergeGainT CachedGainBackward; // Whether the cached value must be recomputed. bool CacheValidForward{false}; bool CacheValidBackward{false}; }; -void Chain::mergeEdges(Chain *Other) { - assert(this != Other && "cannot merge a chain with itself"); +uint64_t NodeT::outCount() const { + uint64_t Count = 0; + for (JumpT *Jump : OutJumps) { + Count += Jump->ExecutionCount; + } + return Count; +} +uint64_t NodeT::inCount() const { + uint64_t Count = 0; + for (JumpT *Jump : InJumps) { + Count += Jump->ExecutionCount; + } + return Count; +} + +void ChainT::mergeEdges(ChainT *Other) { // Update edges adjacent to chain Other for (auto EdgeIt : Other->Edges) { - Chain *DstChain = EdgeIt.first; + ChainT *DstChain = EdgeIt.first; ChainEdge *DstEdge = EdgeIt.second; - Chain *TargetChain = DstChain == Other ? this : DstChain; + ChainT *TargetChain = DstChain == Other ? this : DstChain; ChainEdge *CurEdge = getEdge(TargetChain); if (CurEdge == nullptr) { DstEdge->changeEndpoint(Other, this); @@ -426,15 +461,14 @@ void Chain::mergeEdges(Chain *Other) { } } -using BlockIter = std::vector::const_iterator; +using NodeIter = std::vector::const_iterator; -/// A wrapper around three chains of blocks; it is used to avoid extra +/// A wrapper around three chains of nodes; it is used to avoid extra /// instantiation of the vectors. -class MergedChain { -public: - MergedChain(BlockIter Begin1, BlockIter End1, BlockIter Begin2 = BlockIter(), - BlockIter End2 = BlockIter(), BlockIter Begin3 = BlockIter(), - BlockIter End3 = BlockIter()) +struct MergedChain { + MergedChain(NodeIter Begin1, NodeIter End1, NodeIter Begin2 = NodeIter(), + NodeIter End2 = NodeIter(), NodeIter Begin3 = NodeIter(), + NodeIter End3 = NodeIter()) : Begin1(Begin1), End1(End1), Begin2(Begin2), End2(End2), Begin3(Begin3), End3(End3) {} @@ -447,8 +481,8 @@ public: Func(*It); } - std::vector getBlocks() const { - std::vector Result; + std::vector getNodes() const { + std::vector Result; Result.reserve(std::distance(Begin1, End1) + std::distance(Begin2, End2) + std::distance(Begin3, End3)); Result.insert(Result.end(), Begin1, End1); @@ -457,42 +491,71 @@ public: return Result; } - const Block *getFirstBlock() const { return *Begin1; } + const NodeT *getFirstNode() const { return *Begin1; } private: - BlockIter Begin1; - BlockIter End1; - BlockIter Begin2; - BlockIter End2; - BlockIter Begin3; - BlockIter End3; + NodeIter Begin1; + NodeIter End1; + NodeIter Begin2; + NodeIter End2; + NodeIter Begin3; + NodeIter End3; }; +/// Merge two chains of nodes respecting a given 'type' and 'offset'. +/// +/// If MergeType == 0, then the result is a concatenation of two chains. +/// Otherwise, the first chain is cut into two sub-chains at the offset, +/// and merged using all possible ways of concatenating three chains. 
+MergedChain mergeNodes(const std::vector &X, + const std::vector &Y, size_t MergeOffset, + MergeTypeT MergeType) { + // Split the first chain, X, into X1 and X2 + NodeIter BeginX1 = X.begin(); + NodeIter EndX1 = X.begin() + MergeOffset; + NodeIter BeginX2 = X.begin() + MergeOffset; + NodeIter EndX2 = X.end(); + NodeIter BeginY = Y.begin(); + NodeIter EndY = Y.end(); + + // Construct a new chain from the three existing ones + switch (MergeType) { + case MergeTypeT::X_Y: + return MergedChain(BeginX1, EndX2, BeginY, EndY); + case MergeTypeT::Y_X: + return MergedChain(BeginY, EndY, BeginX1, EndX2); + case MergeTypeT::X1_Y_X2: + return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2); + case MergeTypeT::Y_X2_X1: + return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1); + case MergeTypeT::X2_X1_Y: + return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY); + } + llvm_unreachable("unexpected chain merge type"); +} + /// The implementation of the ExtTSP algorithm. class ExtTSPImpl { - using EdgeT = std::pair; - using EdgeCountMap = std::vector>; - public: - ExtTSPImpl(size_t NumNodes, const std::vector &NodeSizes, + ExtTSPImpl(const std::vector &NodeSizes, const std::vector &NodeCounts, - const EdgeCountMap &EdgeCounts) - : NumNodes(NumNodes) { + const std::vector &EdgeCounts) + : NumNodes(NodeSizes.size()) { initialize(NodeSizes, NodeCounts, EdgeCounts); } - /// Run the algorithm and return an optimized ordering of blocks. + /// Run the algorithm and return an optimized ordering of nodes. void run(std::vector &Result) { - // Pass 1: Merge blocks with their mutually forced successors + // Pass 1: Merge nodes with their mutually forced successors mergeForcedPairs(); // Pass 2: Merge pairs of chains while improving the ExtTSP objective mergeChainPairs(); - // Pass 3: Merge cold blocks to reduce code size + // Pass 3: Merge cold nodes to reduce code size mergeColdChains(); - // Collect blocks from all chains + // Collect nodes from all chains concatChains(Result); } @@ -500,26 +563,26 @@ private: /// Initialize the algorithm's data structures. 
void initialize(const std::vector &NodeSizes, const std::vector &NodeCounts, - const EdgeCountMap &EdgeCounts) { - // Initialize blocks - AllBlocks.reserve(NumNodes); - for (uint64_t Node = 0; Node < NumNodes; Node++) { - uint64_t Size = std::max(NodeSizes[Node], 1ULL); - uint64_t ExecutionCount = NodeCounts[Node]; - // The execution count of the entry block is set to at least 1 - if (Node == 0 && ExecutionCount == 0) + const std::vector &EdgeCounts) { + // Initialize nodes + AllNodes.reserve(NumNodes); + for (uint64_t Idx = 0; Idx < NumNodes; Idx++) { + uint64_t Size = std::max(NodeSizes[Idx], 1ULL); + uint64_t ExecutionCount = NodeCounts[Idx]; + // The execution count of the entry node is set to at least one + if (Idx == 0 && ExecutionCount == 0) ExecutionCount = 1; - AllBlocks.emplace_back(Node, Size, ExecutionCount); + AllNodes.emplace_back(Idx, Size, ExecutionCount); } - // Initialize jumps between blocks + // Initialize jumps between nodes SuccNodes.resize(NumNodes); PredNodes.resize(NumNodes); std::vector OutDegree(NumNodes, 0); AllJumps.reserve(EdgeCounts.size()); for (auto It : EdgeCounts) { - auto Pred = It.first.first; - auto Succ = It.first.second; + uint64_t Pred = It.first.first; + uint64_t Succ = It.first.second; OutDegree[Pred]++; // Ignore self-edges if (Pred == Succ) @@ -527,16 +590,16 @@ private: SuccNodes[Pred].push_back(Succ); PredNodes[Succ].push_back(Pred); - auto ExecutionCount = It.second; + uint64_t ExecutionCount = It.second; if (ExecutionCount > 0) { - auto &Block = AllBlocks[Pred]; - auto &SuccBlock = AllBlocks[Succ]; - AllJumps.emplace_back(&Block, &SuccBlock, ExecutionCount); - SuccBlock.InJumps.push_back(&AllJumps.back()); - Block.OutJumps.push_back(&AllJumps.back()); + NodeT &PredNode = AllNodes[Pred]; + NodeT &SuccNode = AllNodes[Succ]; + AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount); + SuccNode.InJumps.push_back(&AllJumps.back()); + PredNode.OutJumps.push_back(&AllJumps.back()); } } - for (auto &Jump : AllJumps) { + for (JumpT &Jump : AllJumps) { assert(OutDegree[Jump.Source->Index] > 0); Jump.IsConditional = OutDegree[Jump.Source->Index] > 1; } @@ -544,78 +607,78 @@ private: // Initialize chains AllChains.reserve(NumNodes); HotChains.reserve(NumNodes); - for (Block &Block : AllBlocks) { - AllChains.emplace_back(Block.Index, &Block); - Block.CurChain = &AllChains.back(); - if (Block.ExecutionCount > 0) { + for (NodeT &Node : AllNodes) { + AllChains.emplace_back(Node.Index, &Node); + Node.CurChain = &AllChains.back(); + if (Node.ExecutionCount > 0) { HotChains.push_back(&AllChains.back()); } } // Initialize chain edges AllEdges.reserve(AllJumps.size()); - for (Block &Block : AllBlocks) { - for (auto &Jump : Block.OutJumps) { - auto SuccBlock = Jump->Target; - ChainEdge *CurEdge = Block.CurChain->getEdge(SuccBlock->CurChain); + for (NodeT &PredNode : AllNodes) { + for (JumpT *Jump : PredNode.OutJumps) { + NodeT *SuccNode = Jump->Target; + ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain); // this edge is already present in the graph if (CurEdge != nullptr) { - assert(SuccBlock->CurChain->getEdge(Block.CurChain) != nullptr); + assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr); CurEdge->appendJump(Jump); continue; } // this is a new edge AllEdges.emplace_back(Jump); - Block.CurChain->addEdge(SuccBlock->CurChain, &AllEdges.back()); - SuccBlock->CurChain->addEdge(Block.CurChain, &AllEdges.back()); + PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back()); + SuccNode->CurChain->addEdge(PredNode.CurChain, 
&AllEdges.back()); } } } - /// For a pair of blocks, A and B, block B is the forced successor of A, + /// For a pair of nodes, A and B, node B is the forced successor of A, /// if (i) all jumps (based on profile) from A goes to B and (ii) all jumps - /// to B are from A. Such blocks should be adjacent in the optimal ordering; - /// the method finds and merges such pairs of blocks. + /// to B are from A. Such nodes should be adjacent in the optimal ordering; + /// the method finds and merges such pairs of nodes. void mergeForcedPairs() { // Find fallthroughs based on edge weights - for (auto &Block : AllBlocks) { - if (SuccNodes[Block.Index].size() == 1 && - PredNodes[SuccNodes[Block.Index][0]].size() == 1 && - SuccNodes[Block.Index][0] != 0) { - size_t SuccIndex = SuccNodes[Block.Index][0]; - Block.ForcedSucc = &AllBlocks[SuccIndex]; - AllBlocks[SuccIndex].ForcedPred = &Block; + for (NodeT &Node : AllNodes) { + if (SuccNodes[Node.Index].size() == 1 && + PredNodes[SuccNodes[Node.Index][0]].size() == 1 && + SuccNodes[Node.Index][0] != 0) { + size_t SuccIndex = SuccNodes[Node.Index][0]; + Node.ForcedSucc = &AllNodes[SuccIndex]; + AllNodes[SuccIndex].ForcedPred = &Node; } } // There might be 'cycles' in the forced dependencies, since profile // data isn't 100% accurate. Typically this is observed in loops, when the // loop edges are the hottest successors for the basic blocks of the loop. - // Break the cycles by choosing the block with the smallest index as the + // Break the cycles by choosing the node with the smallest index as the // head. This helps to keep the original order of the loops, which likely // have already been rotated in the optimized manner. - for (auto &Block : AllBlocks) { - if (Block.ForcedSucc == nullptr || Block.ForcedPred == nullptr) + for (NodeT &Node : AllNodes) { + if (Node.ForcedSucc == nullptr || Node.ForcedPred == nullptr) continue; - auto SuccBlock = Block.ForcedSucc; - while (SuccBlock != nullptr && SuccBlock != &Block) { - SuccBlock = SuccBlock->ForcedSucc; + NodeT *SuccNode = Node.ForcedSucc; + while (SuccNode != nullptr && SuccNode != &Node) { + SuccNode = SuccNode->ForcedSucc; } - if (SuccBlock == nullptr) + if (SuccNode == nullptr) continue; // Break the cycle - AllBlocks[Block.ForcedPred->Index].ForcedSucc = nullptr; - Block.ForcedPred = nullptr; + AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr; + Node.ForcedPred = nullptr; } - // Merge blocks with their fallthrough successors - for (auto &Block : AllBlocks) { - if (Block.ForcedPred == nullptr && Block.ForcedSucc != nullptr) { - auto CurBlock = &Block; + // Merge nodes with their fallthrough successors + for (NodeT &Node : AllNodes) { + if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) { + const NodeT *CurBlock = &Node; while (CurBlock->ForcedSucc != nullptr) { - const auto NextBlock = CurBlock->ForcedSucc; - mergeChains(Block.CurChain, NextBlock->CurChain, 0, MergeTypeTy::X_Y); + const NodeT *NextBlock = CurBlock->ForcedSucc; + mergeChains(Node.CurChain, NextBlock->CurChain, 0, MergeTypeT::X_Y); CurBlock = NextBlock; } } @@ -625,23 +688,23 @@ private: /// Merge pairs of chains while improving the ExtTSP objective. 
void mergeChainPairs() { /// Deterministically compare pairs of chains - auto compareChainPairs = [](const Chain *A1, const Chain *B1, - const Chain *A2, const Chain *B2) { + auto compareChainPairs = [](const ChainT *A1, const ChainT *B1, + const ChainT *A2, const ChainT *B2) { if (A1 != A2) - return A1->id() < A2->id(); - return B1->id() < B2->id(); + return A1->Id < A2->Id; + return B1->Id < B2->Id; }; while (HotChains.size() > 1) { - Chain *BestChainPred = nullptr; - Chain *BestChainSucc = nullptr; - auto BestGain = MergeGainTy(); + ChainT *BestChainPred = nullptr; + ChainT *BestChainSucc = nullptr; + MergeGainT BestGain; // Iterate over all pairs of chains - for (Chain *ChainPred : HotChains) { + for (ChainT *ChainPred : HotChains) { // Get candidates for merging with the current chain - for (auto EdgeIter : ChainPred->edges()) { - Chain *ChainSucc = EdgeIter.first; - class ChainEdge *ChainEdge = EdgeIter.second; + for (auto EdgeIt : ChainPred->Edges) { + ChainT *ChainSucc = EdgeIt.first; + ChainEdge *Edge = EdgeIt.second; // Ignore loop edges if (ChainPred == ChainSucc) continue; @@ -651,8 +714,7 @@ private: continue; // Compute the gain of merging the two chains - MergeGainTy CurGain = - getBestMergeGain(ChainPred, ChainSucc, ChainEdge); + MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge); if (CurGain.score() <= EPS) continue; @@ -677,43 +739,43 @@ private: } } - /// Merge remaining blocks into chains w/o taking jump counts into - /// consideration. This allows to maintain the original block order in the - /// absense of profile data + /// Merge remaining nodes into chains w/o taking jump counts into + /// consideration. This allows to maintain the original node order in the + /// absence of profile data void mergeColdChains() { for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) { // Iterating in reverse order to make sure original fallthrough jumps are // merged first; this might be beneficial for code size. size_t NumSuccs = SuccNodes[SrcBB].size(); for (size_t Idx = 0; Idx < NumSuccs; Idx++) { - auto DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1]; - auto SrcChain = AllBlocks[SrcBB].CurChain; - auto DstChain = AllBlocks[DstBB].CurChain; + size_t DstBB = SuccNodes[SrcBB][NumSuccs - Idx - 1]; + ChainT *SrcChain = AllNodes[SrcBB].CurChain; + ChainT *DstChain = AllNodes[DstBB].CurChain; if (SrcChain != DstChain && !DstChain->isEntry() && - SrcChain->blocks().back()->Index == SrcBB && - DstChain->blocks().front()->Index == DstBB && + SrcChain->Nodes.back()->Index == SrcBB && + DstChain->Nodes.front()->Index == DstBB && SrcChain->isCold() == DstChain->isCold()) { - mergeChains(SrcChain, DstChain, 0, MergeTypeTy::X_Y); + mergeChains(SrcChain, DstChain, 0, MergeTypeT::X_Y); } } } } - /// Compute the Ext-TSP score for a given block order and a list of jumps. + /// Compute the Ext-TSP score for a given node order and a list of jumps. 
double extTSPScore(const MergedChain &MergedBlocks, - const std::vector &Jumps) const { + const std::vector &Jumps) const { if (Jumps.empty()) return 0.0; uint64_t CurAddr = 0; - MergedBlocks.forEach([&](const Block *BB) { - BB->EstimatedAddr = CurAddr; - CurAddr += BB->Size; + MergedBlocks.forEach([&](const NodeT *Node) { + Node->EstimatedAddr = CurAddr; + CurAddr += Node->Size; }); double Score = 0; - for (auto &Jump : Jumps) { - const Block *SrcBlock = Jump->Source; - const Block *DstBlock = Jump->Target; + for (JumpT *Jump : Jumps) { + const NodeT *SrcBlock = Jump->Source; + const NodeT *DstBlock = Jump->Target; Score += ::extTSPScore(SrcBlock->EstimatedAddr, SrcBlock->Size, DstBlock->EstimatedAddr, Jump->ExecutionCount, Jump->IsConditional); @@ -727,8 +789,8 @@ private: /// computes the one having the largest increase in ExtTSP objective. The /// result is a pair with the first element being the gain and the second /// element being the corresponding merging type. - MergeGainTy getBestMergeGain(Chain *ChainPred, Chain *ChainSucc, - ChainEdge *Edge) const { + MergeGainT getBestMergeGain(ChainT *ChainPred, ChainT *ChainSucc, + ChainEdge *Edge) const { if (Edge->hasCachedMergeGain(ChainPred, ChainSucc)) { return Edge->getCachedMergeGain(ChainPred, ChainSucc); } @@ -742,22 +804,22 @@ private: assert(!Jumps.empty() && "trying to merge chains w/o jumps"); // The object holds the best currently chosen gain of merging the two chains - MergeGainTy Gain = MergeGainTy(); + MergeGainT Gain = MergeGainT(); /// Given a merge offset and a list of merge types, try to merge two chains /// and update Gain with a better alternative auto tryChainMerging = [&](size_t Offset, - const std::vector &MergeTypes) { + const std::vector &MergeTypes) { // Skip merging corresponding to concatenation w/o splitting - if (Offset == 0 || Offset == ChainPred->blocks().size()) + if (Offset == 0 || Offset == ChainPred->Nodes.size()) return; // Skip merging if it breaks Forced successors - auto BB = ChainPred->blocks()[Offset - 1]; - if (BB->ForcedSucc != nullptr) + NodeT *Node = ChainPred->Nodes[Offset - 1]; + if (Node->ForcedSucc != nullptr) return; // Apply the merge, compute the corresponding gain, and update the best // value, if the merge is beneficial - for (const auto &MergeType : MergeTypes) { + for (const MergeTypeT &MergeType : MergeTypes) { Gain.updateIfLessThan( computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType)); } @@ -765,36 +827,36 @@ private: // Try to concatenate two chains w/o splitting Gain.updateIfLessThan( - computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeTy::X_Y)); + computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y)); if (EnableChainSplitAlongJumps) { - // Attach (a part of) ChainPred before the first block of ChainSucc - for (auto &Jump : ChainSucc->blocks().front()->InJumps) { - const auto SrcBlock = Jump->Source; + // Attach (a part of) ChainPred before the first node of ChainSucc + for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) { + const NodeT *SrcBlock = Jump->Source; if (SrcBlock->CurChain != ChainPred) continue; size_t Offset = SrcBlock->CurIndex + 1; - tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::X2_X1_Y}); + tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y}); } - // Attach (a part of) ChainPred after the last block of ChainSucc - for (auto &Jump : ChainSucc->blocks().back()->OutJumps) { - const auto DstBlock = Jump->Source; + // Attach (a part of) ChainPred after the last node of ChainSucc + for (JumpT *Jump : 
ChainSucc->Nodes.back()->OutJumps) { + const NodeT *DstBlock = Jump->Source; if (DstBlock->CurChain != ChainPred) continue; size_t Offset = DstBlock->CurIndex; - tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1}); + tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1}); } } // Try to break ChainPred in various ways and concatenate with ChainSucc - if (ChainPred->blocks().size() <= ChainSplitThreshold) { - for (size_t Offset = 1; Offset < ChainPred->blocks().size(); Offset++) { + if (ChainPred->Nodes.size() <= ChainSplitThreshold) { + for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) { // Try to split the chain in different ways. In practice, applying // X2_Y_X1 merging is almost never provides benefits; thus, we exclude // it from consideration to reduce the search space - tryChainMerging(Offset, {MergeTypeTy::X1_Y_X2, MergeTypeTy::Y_X2_X1, - MergeTypeTy::X2_X1_Y}); + tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1, + MergeTypeT::X2_X1_Y}); } } Edge->setCachedMergeGain(ChainPred, ChainSucc, Gain); @@ -805,96 +867,66 @@ private: /// merge 'type' and 'offset'. /// /// The two chains are not modified in the method. - MergeGainTy computeMergeGain(const Chain *ChainPred, const Chain *ChainSucc, - const std::vector &Jumps, - size_t MergeOffset, - MergeTypeTy MergeType) const { - auto MergedBlocks = mergeBlocks(ChainPred->blocks(), ChainSucc->blocks(), - MergeOffset, MergeType); - - // Do not allow a merge that does not preserve the original entry block + MergeGainT computeMergeGain(const ChainT *ChainPred, const ChainT *ChainSucc, + const std::vector &Jumps, + size_t MergeOffset, MergeTypeT MergeType) const { + auto MergedBlocks = + mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType); + + // Do not allow a merge that does not preserve the original entry point if ((ChainPred->isEntry() || ChainSucc->isEntry()) && - !MergedBlocks.getFirstBlock()->isEntry()) - return MergeGainTy(); + !MergedBlocks.getFirstNode()->isEntry()) + return MergeGainT(); // The gain for the new chain - auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->score(); - return MergeGainTy(NewGainScore, MergeOffset, MergeType); - } - - /// Merge two chains of blocks respecting a given merge 'type' and 'offset'. - /// - /// If MergeType == 0, then the result is a concatenation of two chains. - /// Otherwise, the first chain is cut into two sub-chains at the offset, - /// and merged using all possible ways of concatenating three chains. 
- MergedChain mergeBlocks(const std::vector &X, - const std::vector &Y, size_t MergeOffset, - MergeTypeTy MergeType) const { - // Split the first chain, X, into X1 and X2 - BlockIter BeginX1 = X.begin(); - BlockIter EndX1 = X.begin() + MergeOffset; - BlockIter BeginX2 = X.begin() + MergeOffset; - BlockIter EndX2 = X.end(); - BlockIter BeginY = Y.begin(); - BlockIter EndY = Y.end(); - - // Construct a new chain from the three existing ones - switch (MergeType) { - case MergeTypeTy::X_Y: - return MergedChain(BeginX1, EndX2, BeginY, EndY); - case MergeTypeTy::X1_Y_X2: - return MergedChain(BeginX1, EndX1, BeginY, EndY, BeginX2, EndX2); - case MergeTypeTy::Y_X2_X1: - return MergedChain(BeginY, EndY, BeginX2, EndX2, BeginX1, EndX1); - case MergeTypeTy::X2_X1_Y: - return MergedChain(BeginX2, EndX2, BeginX1, EndX1, BeginY, EndY); - } - llvm_unreachable("unexpected chain merge type"); + auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->Score; + return MergeGainT(NewGainScore, MergeOffset, MergeType); } /// Merge chain From into chain Into, update the list of active chains, /// adjacency information, and the corresponding cached values. - void mergeChains(Chain *Into, Chain *From, size_t MergeOffset, - MergeTypeTy MergeType) { + void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset, + MergeTypeT MergeType) { assert(Into != From && "a chain cannot be merged with itself"); - // Merge the blocks - MergedChain MergedBlocks = - mergeBlocks(Into->blocks(), From->blocks(), MergeOffset, MergeType); - Into->merge(From, MergedBlocks.getBlocks()); + // Merge the nodes + MergedChain MergedNodes = + mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType); + Into->merge(From, MergedNodes.getNodes()); + + // Merge the edges Into->mergeEdges(From); From->clear(); // Update cached ext-tsp score for the new chain ChainEdge *SelfEdge = Into->getEdge(Into); if (SelfEdge != nullptr) { - MergedBlocks = MergedChain(Into->blocks().begin(), Into->blocks().end()); - Into->setScore(extTSPScore(MergedBlocks, SelfEdge->jumps())); + MergedNodes = MergedChain(Into->Nodes.begin(), Into->Nodes.end()); + Into->Score = extTSPScore(MergedNodes, SelfEdge->jumps()); } - // Remove chain From from the list of active chains + // Remove the chain from the list of active chains llvm::erase_value(HotChains, From); // Invalidate caches - for (auto EdgeIter : Into->edges()) { - EdgeIter.second->invalidateCache(); - } + for (auto EdgeIt : Into->Edges) + EdgeIt.second->invalidateCache(); } - /// Concatenate all chains into a final order of blocks. + /// Concatenate all chains into the final order. 
void concatChains(std::vector &Order) { - // Collect chains and calculate some stats for their sorting - std::vector SortedChains; - DenseMap ChainDensity; - for (auto &Chain : AllChains) { - if (!Chain.blocks().empty()) { + // Collect chains and calculate density stats for their sorting + std::vector SortedChains; + DenseMap ChainDensity; + for (ChainT &Chain : AllChains) { + if (!Chain.Nodes.empty()) { SortedChains.push_back(&Chain); - // Using doubles to avoid overflow of ExecutionCount + // Using doubles to avoid overflow of ExecutionCounts double Size = 0; double ExecutionCount = 0; - for (auto *Block : Chain.blocks()) { - Size += static_cast(Block->Size); - ExecutionCount += static_cast(Block->ExecutionCount); + for (NodeT *Node : Chain.Nodes) { + Size += static_cast(Node->Size); + ExecutionCount += static_cast(Node->ExecutionCount); } assert(Size > 0 && "a chain of zero size"); ChainDensity[&Chain] = ExecutionCount / Size; @@ -903,24 +935,23 @@ private: // Sorting chains by density in the decreasing order std::stable_sort(SortedChains.begin(), SortedChains.end(), - [&](const Chain *C1, const Chain *C2) { - // Make sure the original entry block is at the + [&](const ChainT *L, const ChainT *R) { + // Make sure the original entry point is at the // beginning of the order - if (C1->isEntry() != C2->isEntry()) { - return C1->isEntry(); - } + if (L->isEntry() != R->isEntry()) + return L->isEntry(); - const double D1 = ChainDensity[C1]; - const double D2 = ChainDensity[C2]; + const double DL = ChainDensity[L]; + const double DR = ChainDensity[R]; // Compare by density and break ties by chain identifiers - return (D1 != D2) ? (D1 > D2) : (C1->id() < C2->id()); + return (DL != DR) ? (DL > DR) : (L->Id < R->Id); }); - // Collect the blocks in the order specified by their chains + // Collect the nodes in the order specified by their chains Order.reserve(NumNodes); - for (Chain *Chain : SortedChains) { - for (Block *Block : Chain->blocks()) { - Order.push_back(Block->Index); + for (const ChainT *Chain : SortedChains) { + for (NodeT *Node : Chain->Nodes) { + Order.push_back(Node->Index); } } } @@ -935,49 +966,47 @@ private: /// Predecessors of each node. std::vector> PredNodes; - /// All basic blocks. - std::vector AllBlocks; + /// All nodes (basic blocks) in the graph. + std::vector AllNodes; - /// All jumps between blocks. - std::vector AllJumps; + /// All jumps between the nodes. + std::vector AllJumps; - /// All chains of basic blocks. - std::vector AllChains; + /// All chains of nodes. + std::vector AllChains; - /// All edges between chains. + /// All edges between the chains. std::vector AllEdges; /// Active chains. The vector gets updated at runtime when chains are merged. - std::vector HotChains; + std::vector HotChains; }; } // end of anonymous namespace -std::vector llvm::applyExtTspLayout( - const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector> &EdgeCounts) { - size_t NumNodes = NodeSizes.size(); - - // Verify correctness of the input data. +std::vector +llvm::applyExtTspLayout(const std::vector &NodeSizes, + const std::vector &NodeCounts, + const std::vector &EdgeCounts) { + // Verify correctness of the input data assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input"); - assert(NumNodes > 2 && "Incorrect input"); + assert(NodeSizes.size() > 2 && "Incorrect input"); - // Apply the reordering algorithm. 
-  auto Alg = ExtTSPImpl(NumNodes, NodeSizes, NodeCounts, EdgeCounts);
+  // Apply the reordering algorithm
+  ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts);
   std::vector Result;
   Alg.run(Result);
 
-  // Verify correctness of the output.
+  // Verify correctness of the output
   assert(Result.front() == 0 && "Original entry point is not preserved");
-  assert(Result.size() == NumNodes && "Incorrect size of reordered layout");
+  assert(Result.size() == NodeSizes.size() && "Incorrect size of layout");
   return Result;
 }
 
-double llvm::calcExtTspScore(
-    const std::vector &Order, const std::vector &NodeSizes,
-    const std::vector &NodeCounts,
-    const std::vector> &EdgeCounts) {
+double llvm::calcExtTspScore(const std::vector &Order,
+                             const std::vector &NodeSizes,
+                             const std::vector &NodeCounts,
+                             const std::vector &EdgeCounts) {
   // Estimate addresses of the blocks in memory
   std::vector Addr(NodeSizes.size(), 0);
   for (size_t Idx = 1; Idx < Order.size(); Idx++) {
@@ -985,15 +1014,15 @@ double llvm::calcExtTspScore(
   }
   std::vector OutDegree(NodeSizes.size(), 0);
   for (auto It : EdgeCounts) {
-    auto Pred = It.first.first;
+    uint64_t Pred = It.first.first;
     OutDegree[Pred]++;
   }
 
   // Increase the score for each jump
   double Score = 0;
   for (auto It : EdgeCounts) {
-    auto Pred = It.first.first;
-    auto Succ = It.first.second;
+    uint64_t Pred = It.first.first;
+    uint64_t Succ = It.first.second;
     uint64_t Count = It.second;
     bool IsConditional = OutDegree[Pred] > 1;
     Score += ::extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count,
@@ -1002,10 +1031,9 @@ double llvm::calcExtTspScore(
   return Score;
 }
 
-double llvm::calcExtTspScore(
-    const std::vector &NodeSizes,
-    const std::vector &NodeCounts,
-    const std::vector> &EdgeCounts) {
+double llvm::calcExtTspScore(const std::vector &NodeSizes,
+                             const std::vector &NodeCounts,
+                             const std::vector &EdgeCounts) {
   std::vector Order(NodeSizes.size());
   for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) {
     Order[Idx] = Idx;
-- 
Gitee

From fb6b984c1e63db705b8d14103c00c80459a92c24 Mon Sep 17 00:00:00 2001
From: spupyrev
Date: Tue, 13 Jun 2023 10:08:00 -0700
Subject: [PATCH 02/10] [Backport] A new code layout algorithm for function reordering [2/3]

We are bringing a new algorithm for function layout (reordering) based on the
call graph (extracted from profile data). The algorithm is an improvement on
top of a known heuristic, C^3. It tries to co-locate hot functions that are
frequently executed together in the resulting ordering. Unlike C^3, it
explores a larger search space and has an objective closely tied to the
performance of instruction and i-TLB caches. Hence, the name CDS =
Cache-Directed Sort. The algorithm can be used at the linking or post-linking
(e.g., BOLT) stage.

The algorithm shares some similarities with C^3 and an approach for basic
block reordering (ext-tsp). It works with chains (ordered lists) of functions.
Initially all chains are isolated functions. On every iteration, we pick a
pair of chains whose merging yields the biggest increase in the objective,
which is a weighted combination of frequency-based and distance-based
locality. That is, we try to co-locate hot functions together (so they can
share the cache lines) and functions frequently executed together. The merging
process stops when there is only one chain left, or when merging does not
improve the objective. In the latter case, the remaining chains are sorted by
density in decreasing order.
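For illustration, here is a minimal, self-contained C++ sketch of the greedy
merging loop described above. The Chain struct, the greedyMerge helper, and
the Gain callback are simplified placeholders, not the actual data structures
or objective used in CodeLayout.cpp.

  #include <algorithm>
  #include <cstdint>
  #include <functional>
  #include <vector>

  struct Chain {
    std::vector<uint64_t> Funcs; // Function indices, in layout order.
    uint64_t Samples = 0;        // Total execution count of the chain.
    uint64_t Size = 0;           // Total size of the chain in bytes.
    double density() const { return Size ? double(Samples) / Size : 0.0; }
  };

  // Gain(A, B) is assumed to return the increase in the objective (a weighted
  // combination of frequency-based and distance-based locality) obtained by
  // placing chain B immediately after chain A.
  using GainFunc = std::function<double(const Chain &, const Chain &)>;

  std::vector<Chain> greedyMerge(std::vector<Chain> Chains,
                                 const GainFunc &Gain) {
    while (Chains.size() > 1) {
      // Pick the pair of chains whose merging yields the biggest gain.
      double BestGain = 0.0;
      size_t BestA = 0, BestB = 0;
      for (size_t A = 0; A < Chains.size(); A++) {
        for (size_t B = 0; B < Chains.size(); B++) {
          if (A == B)
            continue;
          double G = Gain(Chains[A], Chains[B]);
          if (G > BestGain) {
            BestGain = G;
            BestA = A;
            BestB = B;
          }
        }
      }
      // Stop when no merge improves the objective.
      if (BestGain <= 0.0)
        break;
      // Concatenate chain BestB after chain BestA and discard BestB.
      Chain &A = Chains[BestA];
      Chain &B = Chains[BestB];
      A.Funcs.insert(A.Funcs.end(), B.Funcs.begin(), B.Funcs.end());
      A.Samples += B.Samples;
      A.Size += B.Size;
      Chains.erase(Chains.begin() + BestB);
    }
    // The remaining chains are emitted in decreasing density order.
    std::stable_sort(Chains.begin(), Chains.end(),
                     [](const Chain &L, const Chain &R) {
                       return L.density() > R.density();
                     });
    return Chains;
  }

The actual implementation below additionally caches merge gains on chain edges
and evaluates the gain with a model of the instruction cache and i-TLB rather
than a simple density heuristic; the sketch only shows the overall greedy
structure.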
**Complexity**
We regularly apply the algorithm to large data-center binaries containing 10K+
(hot) functions, and the algorithm takes only a few seconds. For some extreme
cases with 100K-1M nodes, the runtime is within minutes.

**Perf-impact**
We extensively tested the implementation on a benchmark of isolated binaries
and prod services. The impact is measurable for "larger" binaries that are
front-end bound: the CPU time improvement (on top of C^3) is in the range of
[0% .. 1%], which is a result of a reduced i-TLB miss rate (by up to 20%) and
i-cache miss rate (up to 5%).

Reviewed By: rahmanl

Differential Revision: https://reviews.llvm.org/D152834
---
 .../llvm/Transforms/Utils/CodeLayout.h   |  34 +
 llvm/lib/Transforms/Utils/CodeLayout.cpp | 599 +++++++++++++++---
 2 files changed, 547 insertions(+), 86 deletions(-)

diff --git a/llvm/include/llvm/Transforms/Utils/CodeLayout.h b/llvm/include/llvm/Transforms/Utils/CodeLayout.h
index e8106e474332..11a829b601ce 100644
--- a/llvm/include/llvm/Transforms/Utils/CodeLayout.h
+++ b/llvm/include/llvm/Transforms/Utils/CodeLayout.h
@@ -53,6 +53,40 @@ double calcExtTspScore(const std::vector &NodeSizes,
                        const std::vector &NodeCounts,
                        const std::vector &EdgeCounts);
 
+/// Algorithm-specific params for Cache-Directed Sort. The values are tuned for
+/// the best performance of large-scale front-end bound binaries.
+struct CDSortConfig {
+  /// The size of the cache.
+  unsigned CacheEntries = 16;
+  /// The size of a line in the cache.
+  unsigned CacheSize = 2048;
+  /// The power exponent for the distance-based locality.
+  double DistancePower = 0.25;
+  /// The scale factor for the frequency-based locality.
+  double FrequencyScale = 0.25;
+};
+
+/// Apply a Cache-Directed Sort for functions represented by a call graph.
+/// The placement is done by optimizing the call locality by co-locating
+/// frequently executed functions.
+/// \p FuncSizes: The sizes of the nodes (in bytes).
+/// \p FuncCounts: The execution counts of the nodes in the profile.
+/// \p CallCounts: The execution counts of every edge (jump) in the profile. The
+/// map also defines the edges in CFG and should include 0-count edges.
+/// \p CallOffsets: The offsets of the calls from their source nodes.
+/// \returns The best function order found.
+std::vector applyCDSLayout(const std::vector &FuncSizes,
+                           const std::vector &FuncCounts,
+                           const std::vector &CallCounts,
+                           const std::vector &CallOffsets);
+
+/// Apply a Cache-Directed Sort with a custom config.
+std::vector applyCDSLayout(const CDSortConfig &Config,
+                           const std::vector &FuncSizes,
+                           const std::vector &FuncCounts,
+                           const std::vector &CallCounts,
+                           const std::vector &CallOffsets);
+
 } // end namespace llvm
 
 #endif // LLVM_TRANSFORMS_UTILS_CODELAYOUT_H
diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp
index 059e20a34104..8ffde74ac650 100644
--- a/llvm/lib/Transforms/Utils/CodeLayout.cpp
+++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp
@@ -45,6 +45,7 @@
 #include "llvm/Support/Debug.h"
 
 #include 
+#include 
 
 using namespace llvm;
 #define DEBUG_TYPE "code-layout"
@@ -59,8 +60,8 @@ cl::opt ApplyExtTspWithoutProfile(
     cl::desc("Whether to apply ext-tsp placement for instances w/o profile"),
     cl::init(true), cl::Hidden);
 
-// Algorithm-specific params. The values are tuned for the best performance
-// of large-scale front-end bound binaries.
+// Algorithm-specific params for Ext-TSP. The values are tuned for the best
+// performance of large-scale front-end bound binaries.
static cl::opt ForwardWeightCond( "ext-tsp-forward-weight-cond", cl::ReallyHidden, cl::init(0.1), cl::desc("The weight of conditional forward jumps for ExtTSP value")); @@ -111,6 +112,21 @@ static cl::opt EnableChainSplitAlongJumps( "ext-tsp-enable-chain-split-along-jumps", cl::ReallyHidden, cl::init(true), cl::desc("The maximum size of a chain to apply splitting")); +// Algorithm-specific options for CDS. +static cl::opt CacheEntries("cds-cache-entries", cl::ReallyHidden, + cl::desc("The size of the cache")); + +static cl::opt CacheSize("cds-cache-size", cl::ReallyHidden, + cl::desc("The size of a line in the cache")); + +static cl::opt DistancePower( + "cds-distance-power", cl::ReallyHidden, + cl::desc("The power exponent for the distance-based locality")); + +static cl::opt FrequencyScale( + "cds-frequency-scale", cl::ReallyHidden, + cl::desc("The scale factor for the frequency-based locality")); + namespace { // Epsilon for comparison of doubles. @@ -278,9 +294,9 @@ struct ChainT { } ChainEdge *getEdge(ChainT *Other) const { - for (auto It : Edges) { - if (It.first == Other) - return It.second; + for (const auto &[Chain, ChainEdge] : Edges) { + if (Chain == Other) + return ChainEdge; } return nullptr; } @@ -302,11 +318,11 @@ struct ChainT { void merge(ChainT *Other, const std::vector &MergedBlocks) { Nodes = MergedBlocks; - // Update the chain's data + // Update the chain's data. ExecutionCount += Other->ExecutionCount; Size += Other->Size; Id = Nodes[0]->Index; - // Update the node's data + // Update the node's data. for (size_t Idx = 0; Idx < Nodes.size(); Idx++) { Nodes[Idx]->CurChain = this; Nodes[Idx]->CurIndex = Idx; @@ -338,7 +354,7 @@ struct ChainT { /// An edge in the graph representing jumps between two chains. /// When nodes are merged into chains, the edges are combined too so that -/// there is always at most one edge between a pair of chains +/// there is always at most one edge between a pair of chains. struct ChainEdge { ChainEdge(const ChainEdge &) = delete; ChainEdge(ChainEdge &&) = default; @@ -424,40 +440,34 @@ private: uint64_t NodeT::outCount() const { uint64_t Count = 0; - for (JumpT *Jump : OutJumps) { + for (JumpT *Jump : OutJumps) Count += Jump->ExecutionCount; - } return Count; } uint64_t NodeT::inCount() const { uint64_t Count = 0; - for (JumpT *Jump : InJumps) { + for (JumpT *Jump : InJumps) Count += Jump->ExecutionCount; - } return Count; } void ChainT::mergeEdges(ChainT *Other) { - // Update edges adjacent to chain Other - for (auto EdgeIt : Other->Edges) { - ChainT *DstChain = EdgeIt.first; - ChainEdge *DstEdge = EdgeIt.second; + // Update edges adjacent to chain Other. + for (const auto &[DstChain, DstEdge] : Other->Edges) { ChainT *TargetChain = DstChain == Other ? this : DstChain; ChainEdge *CurEdge = getEdge(TargetChain); if (CurEdge == nullptr) { DstEdge->changeEndpoint(Other, this); this->addEdge(TargetChain, DstEdge); - if (DstChain != this && DstChain != Other) { + if (DstChain != this && DstChain != Other) DstChain->addEdge(this, DstEdge); - } } else { CurEdge->moveJumps(DstEdge); } - // Cleanup leftover edge - if (DstChain != Other) { + // Cleanup leftover edge. + if (DstChain != Other) DstChain->removeEdge(Other); - } } } @@ -510,7 +520,7 @@ private: MergedChain mergeNodes(const std::vector &X, const std::vector &Y, size_t MergeOffset, MergeTypeT MergeType) { - // Split the first chain, X, into X1 and X2 + // Split the first chain, X, into X1 and X2. 
NodeIter BeginX1 = X.begin(); NodeIter EndX1 = X.begin() + MergeOffset; NodeIter BeginX2 = X.begin() + MergeOffset; @@ -518,7 +528,7 @@ MergedChain mergeNodes(const std::vector &X, NodeIter BeginY = Y.begin(); NodeIter EndY = Y.end(); - // Construct a new chain from the three existing ones + // Construct a new chain from the three existing ones. switch (MergeType) { case MergeTypeT::X_Y: return MergedChain(BeginX1, EndX2, BeginY, EndY); @@ -569,7 +579,7 @@ private: for (uint64_t Idx = 0; Idx < NumNodes; Idx++) { uint64_t Size = std::max(NodeSizes[Idx], 1ULL); uint64_t ExecutionCount = NodeCounts[Idx]; - // The execution count of the entry node is set to at least one + // The execution count of the entry node is set to at least one. if (Idx == 0 && ExecutionCount == 0) ExecutionCount = 1; AllNodes.emplace_back(Idx, Size, ExecutionCount); @@ -584,7 +594,7 @@ private: uint64_t Pred = It.first.first; uint64_t Succ = It.first.second; OutDegree[Pred]++; - // Ignore self-edges + // Ignore self-edges. if (Pred == Succ) continue; @@ -604,30 +614,29 @@ private: Jump.IsConditional = OutDegree[Jump.Source->Index] > 1; } - // Initialize chains + // Initialize chains. AllChains.reserve(NumNodes); HotChains.reserve(NumNodes); for (NodeT &Node : AllNodes) { AllChains.emplace_back(Node.Index, &Node); Node.CurChain = &AllChains.back(); - if (Node.ExecutionCount > 0) { + if (Node.ExecutionCount > 0) HotChains.push_back(&AllChains.back()); - } } - // Initialize chain edges + // Initialize chain edges. AllEdges.reserve(AllJumps.size()); for (NodeT &PredNode : AllNodes) { for (JumpT *Jump : PredNode.OutJumps) { NodeT *SuccNode = Jump->Target; ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain); - // this edge is already present in the graph + // this edge is already present in the graph. if (CurEdge != nullptr) { assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr); CurEdge->appendJump(Jump); continue; } - // this is a new edge + // this is a new edge. AllEdges.emplace_back(Jump); PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back()); SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back()); @@ -640,7 +649,7 @@ private: /// to B are from A. Such nodes should be adjacent in the optimal ordering; /// the method finds and merges such pairs of nodes. void mergeForcedPairs() { - // Find fallthroughs based on edge weights + // Find fallthroughs based on edge weights. for (NodeT &Node : AllNodes) { if (SuccNodes[Node.Index].size() == 1 && PredNodes[SuccNodes[Node.Index][0]].size() == 1 && @@ -667,12 +676,12 @@ private: } if (SuccNode == nullptr) continue; - // Break the cycle + // Break the cycle. AllNodes[Node.ForcedPred->Index].ForcedSucc = nullptr; Node.ForcedPred = nullptr; } - // Merge nodes with their fallthrough successors + // Merge nodes with their fallthrough successors. for (NodeT &Node : AllNodes) { if (Node.ForcedPred == nullptr && Node.ForcedSucc != nullptr) { const NodeT *CurBlock = &Node; @@ -687,7 +696,7 @@ private: /// Merge pairs of chains while improving the ExtTSP objective. void mergeChainPairs() { - /// Deterministically compare pairs of chains + /// Deterministically compare pairs of chains. auto compareChainPairs = [](const ChainT *A1, const ChainT *B1, const ChainT *A2, const ChainT *B2) { if (A1 != A2) @@ -699,21 +708,19 @@ private: ChainT *BestChainPred = nullptr; ChainT *BestChainSucc = nullptr; MergeGainT BestGain; - // Iterate over all pairs of chains + // Iterate over all pairs of chains. 
for (ChainT *ChainPred : HotChains) { - // Get candidates for merging with the current chain - for (auto EdgeIt : ChainPred->Edges) { - ChainT *ChainSucc = EdgeIt.first; - ChainEdge *Edge = EdgeIt.second; - // Ignore loop edges + // Get candidates for merging with the current chain. + for (const auto &[ChainSucc, Edge] : ChainPred->Edges) { + // Ignore loop edges. if (ChainPred == ChainSucc) continue; - // Stop early if the combined chain violates the maximum allowed size + // Stop early if the combined chain violates the maximum allowed size. if (ChainPred->numBlocks() + ChainSucc->numBlocks() >= MaxChainSize) continue; - // Compute the gain of merging the two chains + // Compute the gain of merging the two chains. MergeGainT CurGain = getBestMergeGain(ChainPred, ChainSucc, Edge); if (CurGain.score() <= EPS) continue; @@ -729,11 +736,11 @@ private: } } - // Stop merging when there is no improvement + // Stop merging when there is no improvement. if (BestGain.score() <= EPS) break; - // Merge the best pair of chains + // Merge the best pair of chains. mergeChains(BestChainPred, BestChainSucc, BestGain.mergeOffset(), BestGain.mergeType()); } @@ -741,7 +748,7 @@ private: /// Merge remaining nodes into chains w/o taking jump counts into /// consideration. This allows to maintain the original node order in the - /// absence of profile data + /// absence of profile data. void mergeColdChains() { for (size_t SrcBB = 0; SrcBB < NumNodes; SrcBB++) { // Iterating in reverse order to make sure original fallthrough jumps are @@ -795,7 +802,7 @@ private: return Edge->getCachedMergeGain(ChainPred, ChainSucc); } - // Precompute jumps between ChainPred and ChainSucc + // Precompute jumps between ChainPred and ChainSucc. auto Jumps = Edge->jumps(); ChainEdge *EdgePP = ChainPred->getEdge(ChainPred); if (EdgePP != nullptr) { @@ -803,34 +810,34 @@ private: } assert(!Jumps.empty() && "trying to merge chains w/o jumps"); - // The object holds the best currently chosen gain of merging the two chains + // This object holds the best chosen gain of merging two chains. MergeGainT Gain = MergeGainT(); /// Given a merge offset and a list of merge types, try to merge two chains - /// and update Gain with a better alternative + /// and update Gain with a better alternative. auto tryChainMerging = [&](size_t Offset, const std::vector &MergeTypes) { - // Skip merging corresponding to concatenation w/o splitting + // Skip merging corresponding to concatenation w/o splitting. if (Offset == 0 || Offset == ChainPred->Nodes.size()) return; - // Skip merging if it breaks Forced successors + // Skip merging if it breaks Forced successors. NodeT *Node = ChainPred->Nodes[Offset - 1]; if (Node->ForcedSucc != nullptr) return; // Apply the merge, compute the corresponding gain, and update the best - // value, if the merge is beneficial + // value, if the merge is beneficial. for (const MergeTypeT &MergeType : MergeTypes) { Gain.updateIfLessThan( computeMergeGain(ChainPred, ChainSucc, Jumps, Offset, MergeType)); } }; - // Try to concatenate two chains w/o splitting + // Try to concatenate two chains w/o splitting. Gain.updateIfLessThan( computeMergeGain(ChainPred, ChainSucc, Jumps, 0, MergeTypeT::X_Y)); if (EnableChainSplitAlongJumps) { - // Attach (a part of) ChainPred before the first node of ChainSucc + // Attach (a part of) ChainPred before the first node of ChainSucc. 
for (JumpT *Jump : ChainSucc->Nodes.front()->InJumps) { const NodeT *SrcBlock = Jump->Source; if (SrcBlock->CurChain != ChainPred) @@ -839,7 +846,7 @@ private: tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::X2_X1_Y}); } - // Attach (a part of) ChainPred after the last node of ChainSucc + // Attach (a part of) ChainPred after the last node of ChainSucc. for (JumpT *Jump : ChainSucc->Nodes.back()->OutJumps) { const NodeT *DstBlock = Jump->Source; if (DstBlock->CurChain != ChainPred) @@ -849,12 +856,12 @@ private: } } - // Try to break ChainPred in various ways and concatenate with ChainSucc + // Try to break ChainPred in various ways and concatenate with ChainSucc. if (ChainPred->Nodes.size() <= ChainSplitThreshold) { for (size_t Offset = 1; Offset < ChainPred->Nodes.size(); Offset++) { // Try to split the chain in different ways. In practice, applying // X2_Y_X1 merging is almost never provides benefits; thus, we exclude - // it from consideration to reduce the search space + // it from consideration to reduce the search space. tryChainMerging(Offset, {MergeTypeT::X1_Y_X2, MergeTypeT::Y_X2_X1, MergeTypeT::X2_X1_Y}); } @@ -873,12 +880,12 @@ private: auto MergedBlocks = mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType); - // Do not allow a merge that does not preserve the original entry point + // Do not allow a merge that does not preserve the original entry point. if ((ChainPred->isEntry() || ChainSucc->isEntry()) && !MergedBlocks.getFirstNode()->isEntry()) return MergeGainT(); - // The gain for the new chain + // The gain for the new chain. auto NewGainScore = extTSPScore(MergedBlocks, Jumps) - ChainPred->Score; return MergeGainT(NewGainScore, MergeOffset, MergeType); } @@ -889,39 +896,39 @@ private: MergeTypeT MergeType) { assert(Into != From && "a chain cannot be merged with itself"); - // Merge the nodes + // Merge the nodes. MergedChain MergedNodes = mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType); Into->merge(From, MergedNodes.getNodes()); - // Merge the edges + // Merge the edges. Into->mergeEdges(From); From->clear(); - // Update cached ext-tsp score for the new chain + // Update cached ext-tsp score for the new chain. ChainEdge *SelfEdge = Into->getEdge(Into); if (SelfEdge != nullptr) { MergedNodes = MergedChain(Into->Nodes.begin(), Into->Nodes.end()); Into->Score = extTSPScore(MergedNodes, SelfEdge->jumps()); } - // Remove the chain from the list of active chains + // Remove the chain from the list of active chains. llvm::erase_value(HotChains, From); - // Invalidate caches + // Invalidate caches. for (auto EdgeIt : Into->Edges) EdgeIt.second->invalidateCache(); } /// Concatenate all chains into the final order. void concatChains(std::vector &Order) { - // Collect chains and calculate density stats for their sorting + // Collect chains and calculate density stats for their sorting. std::vector SortedChains; DenseMap ChainDensity; for (ChainT &Chain : AllChains) { if (!Chain.Nodes.empty()) { SortedChains.push_back(&Chain); - // Using doubles to avoid overflow of ExecutionCounts + // Using doubles to avoid overflow of ExecutionCounts. 
 double Size = 0;
 double ExecutionCount = 0;
 for (NodeT *Node : Chain.Nodes) {
@@ -933,21 +940,22 @@ private:
 }
 }
- // Sorting chains by density in the decreasing order
- std::stable_sort(SortedChains.begin(), SortedChains.end(),
- [&](const ChainT *L, const ChainT *R) {
- // Make sure the original entry point is at the
- // beginning of the order
- if (L->isEntry() != R->isEntry())
- return L->isEntry();
-
- const double DL = ChainDensity[L];
- const double DR = ChainDensity[R];
- // Compare by density and break ties by chain identifiers
- return (DL != DR) ? (DL > DR) : (L->Id < R->Id);
- });
-
- // Collect the nodes in the order specified by their chains
+ // Sorting chains by density in the decreasing order.
+ std::sort(SortedChains.begin(), SortedChains.end(),
+ [&](const ChainT *L, const ChainT *R) {
+ // Place the entry point at the beginning of the order.
+ if (L->isEntry() != R->isEntry())
+ return L->isEntry();
+
+ const double DL = ChainDensity[L];
+ const double DR = ChainDensity[R];
+ // Compare by density and break ties by chain identifiers.
+ return (DL != DR) ? (DL > DR) : (L->Id < R->Id);
+ });
+
+ // Collect the nodes in the order specified by their chains.
 Order.reserve(NumNodes);
 for (const ChainT *Chain : SortedChains) {
 for (NodeT *Node : Chain->Nodes) {
@@ -982,22 +990,404 @@ private:
 std::vector HotChains;
 };
+/// The implementation of the Cache-Directed Sort (CDS) algorithm for ordering
+/// functions represented by a call graph.
+class CDSortImpl {
+public:
+ CDSortImpl(const CDSortConfig &Config, const std::vector &NodeSizes,
+ const std::vector &NodeCounts,
+ const std::vector &EdgeCounts,
+ const std::vector &EdgeOffsets)
+ : Config(Config), NumNodes(NodeSizes.size()) {
+ initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets);
+ }
+
+ /// Run the algorithm and return an ordered set of function clusters.
+ void run(std::vector &Result) {
+ // Merge pairs of chains while improving the objective.
+ mergeChainPairs();
+
+ LLVM_DEBUG(dbgs() << "Cache-directed function sorting reduced the number"
+ << " of chains from " << NumNodes << " to "
+ << HotChains.size() << "\n");
+
+ // Collect nodes from all the chains.
+ concatChains(Result);
+ }
+
+private:
+ /// Initialize the algorithm's data structures.
+ void initialize(const std::vector &NodeSizes,
+ const std::vector &NodeCounts,
+ const std::vector &EdgeCounts,
+ const std::vector &EdgeOffsets) {
+ // Initialize nodes.
+ AllNodes.reserve(NumNodes);
+ for (uint64_t Node = 0; Node < NumNodes; Node++) {
+ uint64_t Size = std::max(NodeSizes[Node], 1ULL);
+ uint64_t ExecutionCount = NodeCounts[Node];
+ AllNodes.emplace_back(Node, Size, ExecutionCount);
+ TotalSamples += ExecutionCount;
+ if (ExecutionCount > 0)
+ TotalSize += Size;
+ }
+
+ // Initialize jumps between the nodes.
+ SuccNodes.resize(NumNodes);
+ PredNodes.resize(NumNodes);
+ AllJumps.reserve(EdgeCounts.size());
+ for (size_t I = 0; I < EdgeCounts.size(); I++) {
+ auto It = EdgeCounts[I];
+ uint64_t Pred = It.first.first;
+ uint64_t Succ = It.first.second;
+ // Ignore recursive calls.
+ if (Pred == Succ) + continue; + + SuccNodes[Pred].push_back(Succ); + PredNodes[Succ].push_back(Pred); + uint64_t ExecutionCount = It.second; + if (ExecutionCount > 0) { + NodeT &PredNode = AllNodes[Pred]; + NodeT &SuccNode = AllNodes[Succ]; + AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount); + AllJumps.back().Offset = EdgeOffsets[I]; + SuccNode.InJumps.push_back(&AllJumps.back()); + PredNode.OutJumps.push_back(&AllJumps.back()); + } + } + + // Initialize chains. + AllChains.reserve(NumNodes); + HotChains.reserve(NumNodes); + for (NodeT &Node : AllNodes) { + // Adjust execution counts. + Node.ExecutionCount = std::max(Node.ExecutionCount, Node.inCount()); + Node.ExecutionCount = std::max(Node.ExecutionCount, Node.outCount()); + // Create chain. + AllChains.emplace_back(Node.Index, &Node); + Node.CurChain = &AllChains.back(); + if (Node.ExecutionCount > 0) + HotChains.push_back(&AllChains.back()); + } + + // Initialize chain edges. + AllEdges.reserve(AllJumps.size()); + for (NodeT &PredNode : AllNodes) { + for (JumpT *Jump : PredNode.OutJumps) { + NodeT *SuccNode = Jump->Target; + ChainEdge *CurEdge = PredNode.CurChain->getEdge(SuccNode->CurChain); + // this edge is already present in the graph. + if (CurEdge != nullptr) { + assert(SuccNode->CurChain->getEdge(PredNode.CurChain) != nullptr); + CurEdge->appendJump(Jump); + continue; + } + // this is a new edge. + AllEdges.emplace_back(Jump); + PredNode.CurChain->addEdge(SuccNode->CurChain, &AllEdges.back()); + SuccNode->CurChain->addEdge(PredNode.CurChain, &AllEdges.back()); + } + } + } + + /// Merge pairs of chains while there is an improvement in the objective. + void mergeChainPairs() { + // Create a priority queue containing all edges ordered by the merge gain. + auto GainComparator = [](ChainEdge *L, ChainEdge *R) { + return std::make_tuple(-L->gain(), L->srcChain()->Id, L->dstChain()->Id) < + std::make_tuple(-R->gain(), R->srcChain()->Id, R->dstChain()->Id); + }; + std::set Queue(GainComparator); + + // Insert the edges into the queue. + for (ChainT *ChainPred : HotChains) { + for (const auto &[Chain, Edge] : ChainPred->Edges) { + // Ignore self-edges. + if (Edge->isSelfEdge()) + continue; + // Ignore already processed edges. + if (Edge->gain() != -1.0) + continue; + + // Compute the gain of merging the two chains. + MergeGainT Gain = getBestMergeGain(Edge); + Edge->setMergeGain(Gain); + + if (Edge->gain() > EPS) + Queue.insert(Edge); + } + } + + // Merge the chains while the gain of merging is positive. + while (!Queue.empty()) { + // Extract the best (top) edge for merging. + ChainEdge *BestEdge = *Queue.begin(); + Queue.erase(Queue.begin()); + // Ignore self-edges. + if (BestEdge->isSelfEdge()) + continue; + // Ignore edges with non-positive gains. + if (BestEdge->gain() <= EPS) + continue; + + ChainT *BestSrcChain = BestEdge->srcChain(); + ChainT *BestDstChain = BestEdge->dstChain(); + + // Remove outdated edges from the queue. + for (const auto &[Chain, ChainEdge] : BestSrcChain->Edges) + Queue.erase(ChainEdge); + for (const auto &[Chain, ChainEdge] : BestDstChain->Edges) + Queue.erase(ChainEdge); + + // Merge the best pair of chains. + MergeGainT BestGain = BestEdge->getMergeGain(); + mergeChains(BestSrcChain, BestDstChain, BestGain.mergeOffset(), + BestGain.mergeType()); + + // Insert newly created edges into the queue. + for (const auto &[Chain, Edge] : BestSrcChain->Edges) { + // Ignore loop edges. + if (Edge->isSelfEdge()) + continue; + + // Compute the gain of merging the two chains. 
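// The queue above relies on a standard idiom: an ordered std::set keyed by
// (-gain, chain ids) behaves as a priority queue whose entries can be updated,
// provided an element's key never changes while it sits in the set. The
// self-contained sketch below shows the erase, modify, re-insert update used
// here, with a simplified edge type standing in for ChainEdge.
#include <set>
#include <tuple>

struct SketchEdge {
  double Gain = -1.0;
  int SrcId = 0;
  int DstId = 0;
};

struct ByGainThenIds {
  bool operator()(const SketchEdge *L, const SketchEdge *R) const {
    return std::make_tuple(-L->Gain, L->SrcId, L->DstId) <
           std::make_tuple(-R->Gain, R->SrcId, R->DstId);
  }
};

using SketchQueue = std::set<SketchEdge *, ByGainThenIds>;

// *Queue.begin() is always the entry with the largest gain; to change a gain,
// take the entry out first so the set's ordering invariant is never violated.
void updateGain(SketchQueue &Queue, SketchEdge *E, double NewGain) {
  Queue.erase(E);
  E->Gain = NewGain;
  Queue.insert(E);
}
// In the pass itself, each affected edge's gain is recomputed below and the
// edge is re-inserted only when the gain stays positive.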
+ MergeGainT Gain = getBestMergeGain(Edge); + Edge->setMergeGain(Gain); + + if (Edge->gain() > EPS) + Queue.insert(Edge); + } + } + } + + /// Compute the gain of merging two chains. + /// + /// The function considers all possible ways of merging two chains and + /// computes the one having the largest increase in ExtTSP objective. The + /// result is a pair with the first element being the gain and the second + /// element being the corresponding merging type. + MergeGainT getBestMergeGain(ChainEdge *Edge) const { + // Precompute jumps between ChainPred and ChainSucc. + auto Jumps = Edge->jumps(); + assert(!Jumps.empty() && "trying to merge chains w/o jumps"); + ChainT *SrcChain = Edge->srcChain(); + ChainT *DstChain = Edge->dstChain(); + + // This object holds the best currently chosen gain of merging two chains. + MergeGainT Gain = MergeGainT(); + + /// Given a list of merge types, try to merge two chains and update Gain + /// with a better alternative. + auto tryChainMerging = [&](const std::vector &MergeTypes) { + // Apply the merge, compute the corresponding gain, and update the best + // value, if the merge is beneficial. + for (const MergeTypeT &MergeType : MergeTypes) { + MergeGainT NewGain = + computeMergeGain(SrcChain, DstChain, Jumps, MergeType); + + // When forward and backward gains are the same, prioritize merging that + // preserves the original order of the functions in the binary. + if (std::abs(Gain.score() - NewGain.score()) < EPS) { + if ((MergeType == MergeTypeT::X_Y && SrcChain->Id < DstChain->Id) || + (MergeType == MergeTypeT::Y_X && SrcChain->Id > DstChain->Id)) { + Gain = NewGain; + } + } else if (NewGain.score() > Gain.score() + EPS) { + Gain = NewGain; + } + } + }; + + // Try to concatenate two chains w/o splitting. + tryChainMerging({MergeTypeT::X_Y, MergeTypeT::Y_X}); + + return Gain; + } + + /// Compute the score gain of merging two chains, respecting a given type. + /// + /// The two chains are not modified in the method. + MergeGainT computeMergeGain(ChainT *ChainPred, ChainT *ChainSucc, + const std::vector &Jumps, + MergeTypeT MergeType) const { + // This doesn't depend on the ordering of the nodes + double FreqGain = freqBasedLocalityGain(ChainPred, ChainSucc); + + // Merge offset is always 0, as the chains are not split. + size_t MergeOffset = 0; + auto MergedBlocks = + mergeNodes(ChainPred->Nodes, ChainSucc->Nodes, MergeOffset, MergeType); + double DistGain = distBasedLocalityGain(MergedBlocks, Jumps); + + double GainScore = DistGain + Config.FrequencyScale * FreqGain; + // Scale the result to increase the importance of merging short chains. + if (GainScore >= 0.0) + GainScore /= std::min(ChainPred->Size, ChainSucc->Size); + + return MergeGainT(GainScore, MergeOffset, MergeType); + } + + /// Compute the change of the frequency locality after merging the chains. + double freqBasedLocalityGain(ChainT *ChainPred, ChainT *ChainSucc) const { + auto missProbability = [&](double ChainDensity) { + double PageSamples = ChainDensity * Config.CacheSize; + if (PageSamples >= TotalSamples) + return 0.0; + double P = PageSamples / TotalSamples; + return pow(1.0 - P, static_cast(Config.CacheEntries)); + }; + + // Cache misses on the chains before merging. 
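// A worked example of missProbability() above, using purely illustrative
// numbers rather than the CDSortConfig defaults: take TotalSamples = 1000,
// Config.CacheSize = 100 and Config.CacheEntries = 16, and a chain whose
// density is 2.0 samples per byte. Then PageSamples = 2.0 * 100 = 200,
// P = 200 / 1000 = 0.2, and the estimated miss probability is
// (1 - 0.2)^16, roughly 0.028; a chain twice as dense gives P = 0.4 and about
// 0.6^16, roughly 0.0003, so hot and compact chains are modeled as almost
// never missing. The distance-based component distScore() defined further
// below behaves analogously: with a hypothetical DistancePower of 0.25, one
// hundred calls whose source and target end up 4096 bytes apart contribute
// 100 * 4096^-0.25 = 12.5, while the same calls at a distance of 16 bytes
// contribute 100 * 16^-0.25 = 50. The code below evaluates the miss
// probability for the two chains before and after the prospective merge.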
+ double CurScore = + ChainPred->ExecutionCount * missProbability(ChainPred->density()) + + ChainSucc->ExecutionCount * missProbability(ChainSucc->density()); + + // Cache misses on the merged chain + double MergedCounts = ChainPred->ExecutionCount + ChainSucc->ExecutionCount; + double MergedSize = ChainPred->Size + ChainSucc->Size; + double MergedDensity = static_cast(MergedCounts) / MergedSize; + double NewScore = MergedCounts * missProbability(MergedDensity); + + return CurScore - NewScore; + } + + /// Compute the distance locality for a jump / call. + double distScore(uint64_t SrcAddr, uint64_t DstAddr, uint64_t Count) const { + uint64_t Dist = SrcAddr <= DstAddr ? DstAddr - SrcAddr : SrcAddr - DstAddr; + double D = Dist == 0 ? 0.1 : static_cast(Dist); + return static_cast(Count) * std::pow(D, -Config.DistancePower); + } + + /// Compute the change of the distance locality after merging the chains. + double distBasedLocalityGain(const MergedChain &MergedBlocks, + const std::vector &Jumps) const { + if (Jumps.empty()) + return 0.0; + uint64_t CurAddr = 0; + MergedBlocks.forEach([&](const NodeT *Node) { + Node->EstimatedAddr = CurAddr; + CurAddr += Node->Size; + }); + + double CurScore = 0; + double NewScore = 0; + for (const JumpT *Arc : Jumps) { + uint64_t SrcAddr = Arc->Source->EstimatedAddr + Arc->Offset; + uint64_t DstAddr = Arc->Target->EstimatedAddr; + NewScore += distScore(SrcAddr, DstAddr, Arc->ExecutionCount); + CurScore += distScore(0, TotalSize, Arc->ExecutionCount); + } + return NewScore - CurScore; + } + + /// Merge chain From into chain Into, update the list of active chains, + /// adjacency information, and the corresponding cached values. + void mergeChains(ChainT *Into, ChainT *From, size_t MergeOffset, + MergeTypeT MergeType) { + assert(Into != From && "a chain cannot be merged with itself"); + + // Merge the nodes. + MergedChain MergedNodes = + mergeNodes(Into->Nodes, From->Nodes, MergeOffset, MergeType); + Into->merge(From, MergedNodes.getNodes()); + + // Merge the edges. + Into->mergeEdges(From); + From->clear(); + + // Remove the chain from the list of active chains. + llvm::erase_value(HotChains, From); + } + + /// Concatenate all chains into the final order. + void concatChains(std::vector &Order) { + // Collect chains and calculate density stats for their sorting. + std::vector SortedChains; + DenseMap ChainDensity; + for (ChainT &Chain : AllChains) { + if (!Chain.Nodes.empty()) { + SortedChains.push_back(&Chain); + // Using doubles to avoid overflow of ExecutionCounts. + double Size = 0; + double ExecutionCount = 0; + for (NodeT *Node : Chain.Nodes) { + Size += static_cast(Node->Size); + ExecutionCount += static_cast(Node->ExecutionCount); + } + assert(Size > 0 && "a chain of zero size"); + ChainDensity[&Chain] = ExecutionCount / Size; + } + } + + // Sort chains by density in the decreasing order. + std::sort(SortedChains.begin(), SortedChains.end(), + [&](const ChainT *L, const ChainT *R) { + const double DL = ChainDensity[L]; + const double DR = ChainDensity[R]; + // Compare by density and break ties by chain identifiers. + return std::make_tuple(-DL, L->Id) < + std::make_tuple(-DR, R->Id); + }); + + // Collect the nodes in the order specified by their chains. + Order.reserve(NumNodes); + for (const ChainT *Chain : SortedChains) + for (NodeT *Node : Chain->Nodes) + Order.push_back(Node->Index); + } + +private: + /// Config for the algorithm. + const CDSortConfig Config; + + /// The number of nodes in the graph. 
+ const size_t NumNodes; + + /// Successors of each node. + std::vector> SuccNodes; + + /// Predecessors of each node. + std::vector> PredNodes; + + /// All nodes (functions) in the graph. + std::vector AllNodes; + + /// All jumps (function calls) between the nodes. + std::vector AllJumps; + + /// All chains of nodes. + std::vector AllChains; + + /// All edges between the chains. + std::vector AllEdges; + + /// Active chains. The vector gets updated at runtime when chains are merged. + std::vector HotChains; + + /// The total number of samples in the graph. + uint64_t TotalSamples{0}; + + /// The total size of the nodes in the graph. + uint64_t TotalSize{0}; +}; + } // end of anonymous namespace std::vector llvm::applyExtTspLayout(const std::vector &NodeSizes, const std::vector &NodeCounts, const std::vector &EdgeCounts) { - // Verify correctness of the input data + // Verify correctness of the input data. assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input"); assert(NodeSizes.size() > 2 && "Incorrect input"); - // Apply the reordering algorithm + // Apply the reordering algorithm. ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts); std::vector Result; Alg.run(Result); - // Verify correctness of the output + // Verify correctness of the output. assert(Result.front() == 0 && "Original entry point is not preserved"); assert(Result.size() == NodeSizes.size() && "Incorrect size of layout"); return Result; @@ -1007,7 +1397,7 @@ double llvm::calcExtTspScore(const std::vector &Order, const std::vector &NodeSizes, const std::vector &NodeCounts, const std::vector &EdgeCounts) { - // Estimate addresses of the blocks in memory + // Estimate addresses of the blocks in memory. std::vector Addr(NodeSizes.size(), 0); for (size_t Idx = 1; Idx < Order.size(); Idx++) { Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]]; @@ -1018,7 +1408,7 @@ double llvm::calcExtTspScore(const std::vector &Order, OutDegree[Pred]++; } - // Increase the score for each jump + // Increase the score for each jump. double Score = 0; for (auto It : EdgeCounts) { uint64_t Pred = It.first.first; @@ -1040,3 +1430,40 @@ double llvm::calcExtTspScore(const std::vector &NodeSizes, } return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts); } + +std::vector +llvm::applyCDSLayout(const CDSortConfig &Config, + const std::vector &FuncSizes, + const std::vector &FuncCounts, + const std::vector &CallCounts, + const std::vector &CallOffsets) { + // Verify correctness of the input data. + assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input"); + + // Apply the reordering algorithm. + CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); + std::vector Result; + Alg.run(Result); + + // Verify correctness of the output. + assert(Result.size() == FuncSizes.size() && "Incorrect size of layout"); + return Result; +} + +std::vector +llvm::applyCDSLayout(const std::vector &FuncSizes, + const std::vector &FuncCounts, + const std::vector &CallCounts, + const std::vector &CallOffsets) { + CDSortConfig Config; + // Populate the config from the command-line options. 
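// For reference, a caller that is not driven by these command-line flags can
// populate the config directly and invoke the Config-taking overload declared
// above. The sketch below is illustrative only: the sizes, counts and offsets
// are made-up numbers, the config values are not the built-in defaults, and it
// assumes the declarations from CodeLayout.h (the config type and the
// ((caller, callee), count) edge representation in the llvm namespace),
// matching how the initialization code above reads them.
#include "llvm/Transforms/Utils/CodeLayout.h"
#include <cstdint>
#include <utility>
#include <vector>

std::vector<uint64_t> exampleCDSOrder() {
  // Three functions: sizes in bytes and sample counts from a profile.
  std::vector<uint64_t> FuncSizes = {64, 128, 32};
  std::vector<uint64_t> FuncCounts = {100, 100, 1};
  // A single call edge 0 -> 1 executed 100 times, 16 bytes into function 0.
  std::vector<std::pair<std::pair<uint64_t, uint64_t>, uint64_t>> CallCounts = {
      {{0, 1}, 100}};
  std::vector<uint64_t> CallOffsets = {16};

  llvm::CDSortConfig Config;
  Config.CacheEntries = 16;
  Config.CacheSize = 2048;
  return llvm::applyCDSLayout(Config, FuncSizes, FuncCounts, CallCounts,
                              CallOffsets);
}
// The code below only overrides config fields whose command-line options were
// explicitly set before delegating to that same overload.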
+ if (CacheEntries.getNumOccurrences() > 0) + Config.CacheEntries = CacheEntries; + if (CacheSize.getNumOccurrences() > 0) + Config.CacheSize = CacheSize; + if (DistancePower.getNumOccurrences() > 0) + Config.DistancePower = DistancePower; + if (FrequencyScale.getNumOccurrences() > 0) + Config.FrequencyScale = FrequencyScale; + return applyCDSLayout(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); +} -- Gitee From 898ec602b8acccab04cb7a70c2c02ca431ba40b7 Mon Sep 17 00:00:00 2001 From: spupyrev Date: Tue, 13 Jun 2023 10:08:00 -0700 Subject: [PATCH 03/10] [Backport][BOLT] A new code layout algorithm for function reordering [3b/3] This is a new algorithm for function layout (reordering) based on the call graph extracted from a profile data; see diffs down the stack for more details. This layout is very similar to the existing hfsort+, but perhaps a little better on some benchmarks. The goals of the change is as follows: (i) rename and replace hfsort+ with a newer (hopefully better) implementation. I'd prefer to keep both algs together for some time to simplify evaluation and transition, but do want to remove hfsort+ once we're confident that there are no regressions. (ii) unify the implementation of code layout algorithms across LLVM. Currently Passes/HfsortPlus.cpp and Utils/CodeLayout.cpp share many implementation-specific details; this diff unifies the code. Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D153039 --- bolt/include/bolt/Passes/ReorderFunctions.h | 1 + bolt/lib/Passes/ReorderFunctions.cpp | 79 ++++++++++++++------- 2 files changed, 53 insertions(+), 27 deletions(-) diff --git a/bolt/include/bolt/Passes/ReorderFunctions.h b/bolt/include/bolt/Passes/ReorderFunctions.h index 52156a600791..27094bee771a 100644 --- a/bolt/include/bolt/Passes/ReorderFunctions.h +++ b/bolt/include/bolt/Passes/ReorderFunctions.h @@ -32,6 +32,7 @@ public: RT_EXEC_COUNT, RT_HFSORT, RT_HFSORT_PLUS, + RT_CDS, RT_PETTIS_HANSEN, RT_RANDOM, RT_USER diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp index 998e0eab66fa..b17aefda5ddc 100644 --- a/bolt/lib/Passes/ReorderFunctions.cpp +++ b/bolt/lib/Passes/ReorderFunctions.cpp @@ -13,6 +13,7 @@ #include "bolt/Passes/ReorderFunctions.h" #include "bolt/Passes/HFSort.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Utils/CodeLayout.h" #include #define DEBUG_TYPE "hfsort" @@ -27,33 +28,27 @@ extern cl::opt RandomSeed; extern size_t padFunction(const bolt::BinaryFunction &Function); -cl::opt -ReorderFunctions("reorder-functions", - cl::desc("reorder and cluster functions (works only with relocations)"), - cl::init(bolt::ReorderFunctions::RT_NONE), - cl::values(clEnumValN(bolt::ReorderFunctions::RT_NONE, - "none", - "do not reorder functions"), - clEnumValN(bolt::ReorderFunctions::RT_EXEC_COUNT, - "exec-count", - "order by execution count"), - clEnumValN(bolt::ReorderFunctions::RT_HFSORT, - "hfsort", - "use hfsort algorithm"), - clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, - "hfsort+", - "use hfsort+ algorithm"), - clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN, - "pettis-hansen", - "use Pettis-Hansen algorithm"), - clEnumValN(bolt::ReorderFunctions::RT_RANDOM, - "random", - "reorder functions randomly"), - clEnumValN(bolt::ReorderFunctions::RT_USER, - "user", - "use function order specified by -function-order")), - cl::ZeroOrMore, - cl::cat(BoltOptCategory)); +cl::opt ReorderFunctions( + "reorder-functions", + cl::desc("reorder and cluster functions (works only with 
relocations)"), + cl::init(bolt::ReorderFunctions::RT_NONE), + cl::values(clEnumValN(bolt::ReorderFunctions::RT_NONE, "none", + "do not reorder functions"), + clEnumValN(bolt::ReorderFunctions::RT_EXEC_COUNT, "exec-count", + "order by execution count"), + clEnumValN(bolt::ReorderFunctions::RT_HFSORT, "hfsort", + "use hfsort algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_HFSORT_PLUS, "hfsort+", + "use hfsort+ algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_CDS, "cds", + "use cache-directed sort"), + clEnumValN(bolt::ReorderFunctions::RT_PETTIS_HANSEN, + "pettis-hansen", "use Pettis-Hansen algorithm"), + clEnumValN(bolt::ReorderFunctions::RT_RANDOM, "random", + "reorder functions randomly"), + clEnumValN(bolt::ReorderFunctions::RT_USER, "user", + "use function order specified by -function-order")), + cl::ZeroOrMore, cl::cat(BoltOptCategory)); static cl::opt ReorderFunctionsUseHotSize( "reorder-functions-use-hot-size", @@ -323,6 +318,36 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) { case RT_HFSORT_PLUS: Clusters = hfsortPlus(Cg); break; + case RT_CDS: { + // It is required that the sum of incoming arc weights is not greater + // than the number of samples for every function. Ensuring the call graph + // obeys the property before running the algorithm. + Cg.adjustArcWeights(); + + // Initialize CFG nodes and their data + std::vector FuncSizes; + std::vector FuncCounts; + using JumpT = std::pair; + std::vector> CallCounts; + std::vector CallOffsets; + for (NodeId F = 0; F < Cg.numNodes(); ++F) { + FuncSizes.push_back(Cg.size(F)); + FuncCounts.push_back(Cg.samples(F)); + for (NodeId Succ : Cg.successors(F)) { + const Arc &Arc = *Cg.findArc(F, Succ); + auto It = std::make_pair(F, Succ); + CallCounts.push_back(std::make_pair(It, Arc.weight())); + CallOffsets.push_back(uint64_t(Arc.avgCallOffset())); + } + } + + // Run the layout algorithm. + std::vector Result = + applyCDSLayout(FuncSizes, FuncCounts, CallCounts, CallOffsets); + + // Create a single cluster from the computed order of hot functions. + Clusters.emplace_back(Cluster(Result, Cg)); + } break; case RT_PETTIS_HANSEN: Clusters = pettisAndHansen(Cg); break; -- Gitee From efb5085e4b4995ebe08a7d4002c914895db61f24 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 6 Jul 2023 10:31:45 -0700 Subject: [PATCH 04/10] [ELF] Use compression::getReasonIfUnsupported for zlib/zstd unavailable error The error message now matches llvm-objcopy --compress-debug-sections=[zlib|zstd]. 
--- lld/ELF/Driver.cpp | 30 ++++++++++++++-------------- lld/test/ELF/compress-sections-err.s | 12 +++++++++++ 2 files changed, 27 insertions(+), 15 deletions(-) create mode 100644 lld/test/ELF/compress-sections-err.s diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 7e2a72acf8f6..d94a066211db 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -974,21 +974,19 @@ template static void readCallGraphsFromObjectFiles() { } } -static DebugCompressionType getCompressDebugSections(opt::InputArgList &args) { - StringRef s = args.getLastArgValue(OPT_compress_debug_sections, "none"); - if (s == "zlib") { - if (!compression::zlib::isAvailable()) - error("--compress-debug-sections: zlib is not available"); - return DebugCompressionType::Zlib; - } - if (s == "zstd") { - if (!compression::zstd::isAvailable()) - error("--compress-debug-sections: zstd is not available"); - return DebugCompressionType::Zstd; +static DebugCompressionType getCompressionType(StringRef s, StringRef option) { + DebugCompressionType type = StringSwitch(s) + .Case("zlib", DebugCompressionType::Zlib) + .Case("zstd", DebugCompressionType::Zstd) + .Default(DebugCompressionType::None); + if (type == DebugCompressionType::None) { + if (s != "none") + error("unknown " + option + " value: " + s); + } else if (const char *reason = compression::getReasonIfUnsupported( + compression::formatFor(type))) { + error(option + ": " + reason); } - if (s != "none") - error("unknown --compress-debug-sections value: " + s); - return DebugCompressionType::None; + return type; } static StringRef getAliasSpelling(opt::Arg *arg) { @@ -1081,7 +1079,9 @@ static void readConfigs(opt::InputArgList &args) { config->checkSections = args.hasFlag(OPT_check_sections, OPT_no_check_sections, true); config->chroot = args.getLastArgValue(OPT_chroot); - config->compressDebugSections = getCompressDebugSections(args); + config->compressDebugSections = getCompressionType( + args.getLastArgValue(OPT_compress_debug_sections, "none"), + "--compress-debug-sections"); config->cref = args.hasArg(OPT_cref); config->optimizeBBJumps = args.hasFlag(OPT_optimize_bb_jumps, OPT_no_optimize_bb_jumps, false); diff --git a/lld/test/ELF/compress-sections-err.s b/lld/test/ELF/compress-sections-err.s new file mode 100644 index 000000000000..097803807083 --- /dev/null +++ b/lld/test/ELF/compress-sections-err.s @@ -0,0 +1,12 @@ +# REQUIRES: x86 +# UNSUPPORTED: zlib + +# RUN: llvm-mc -filetype=obj -triple=x86_64 %s -o %t.o +# RUN: ld.lld %t.o --compress-debug-sections=zlib --compress-debug-sections=none -o /dev/null 2>&1 | count 0 +# RUN: not ld.lld %t.o --compress-debug-sections=zlib -o /dev/null 2>&1 | \ +# RUN: FileCheck %s --implicit-check-not=error: + +# CHECK: error: --compress-debug-sections: LLVM was not built with LLVM_ENABLE_ZLIB or did not find zlib at build time + +.globl _start +_start: -- Gitee From a2234bdf150a093b35bbd85e160d260c5c115c3c Mon Sep 17 00:00:00 2001 From: modimo Date: Thu, 13 Jul 2023 19:02:52 -0700 Subject: [PATCH 05/10] [Backport][WPD][LLD] Add option to validate RTTI is enabled on all native types and prevent devirtualization on types with native RTTI Discussion about this approach: https://discourse.llvm.org/t/rfc-safer-whole-program-class-hierarchy-analysis/65144/18 When enabling WPD in an environment where native binaries are present, types we want to optimize can be derived from inside these native files and devirtualizing them can lead to correctness issues. 
RTTI can be used as a way to determine all such types in native files and exclude them from WPD providing a safe checked way to enable WPD. The approach is: 1. In the linker, identify if RTTI is available for all native types. If not, under `--lto-validate-all-vtables-have-type-infos` `--lto-whole-program-visibility` is automatically disabled. This is done by examining all .symtab symbols in object files and .dynsym symbols in DSOs for vtable (_ZTV) and typeinfo (_ZTI) symbols and ensuring there's always a match for every vtable symbol. 2. During thinlink, if `--lto-validate-all-vtables-have-type-infos` is set and RTTI is available for all native types, identify all typename (_ZTS) symbols via their corresponding typeinfo (_ZTI) symbols that are used natively or outside of our summary and exclude them from WPD. Testing: ninja check-all large Meta service that uses boost, glog and libstdc++.so runs successfully with WPD via --lto-whole-program-visibility. Previously, native types in boost caused incorrect devirtualization that led to crashes. Reviewed By: MaskRay, tejohnson Differential Revision: https://reviews.llvm.org/D155659 --- lld/ELF/Config.h | 4 + lld/ELF/Driver.cpp | 65 +++++ lld/ELF/LTO.cpp | 3 + lld/ELF/Options.td | 5 + .../devirt_validate_vtable_typeinfos.ll | 26 ++ ...evirt_validate_vtable_typeinfos_no_rtti.ll | 19 ++ .../devirt_validate_vtable_typeinfos_ref.ll | 68 +++++ .../devirt_validate_vtable_typeinfos_undef.ll | 16 ++ .../lto/devirt_validate_vtable_typeinfos.ll | 263 ++++++++++++++++++ ...irt_validate_vtable_typeinfos_mixed_lto.ll | 183 ++++++++++++ ...evirt_validate_vtable_typeinfos_no_rtti.ll | 136 +++++++++ .../devirt_validate_vtable_typeinfos_ref.ll | 130 +++++++++ llvm/include/llvm/LTO/Config.h | 6 + .../llvm/Transforms/IPO/WholeProgramDevirt.h | 12 +- llvm/lib/LTO/LTO.cpp | 55 +++- llvm/lib/LTO/LTOCodeGenerator.cpp | 13 +- llvm/lib/LTO/ThinLTOCodeGenerator.cpp | 9 +- .../lib/Transforms/IPO/WholeProgramDevirt.cpp | 76 ++++- llvm/tools/opt/opt.cpp | 11 +- 19 files changed, 1074 insertions(+), 26 deletions(-) create mode 100644 lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll create mode 100644 lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll create mode 100644 lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll create mode 100644 lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll create mode 100644 lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll create mode 100644 lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll create mode 100644 lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll create mode 100644 lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 706f17b764c8..4ba7bc12d038 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -225,6 +225,7 @@ struct Config { bool ltoDebugPassManager; bool ltoEmitAsm; bool ltoUniqueBasicBlockSectionNames; + bool ltoValidateAllVtablesHaveTypeInfos; bool ltoWholeProgramVisibility; bool mergeArmExidx; bool mipsN32Abi = false; @@ -441,6 +442,9 @@ struct Ctx { std::atomic hasTlsIe{false}; // True if we need to reserve two .got entries for local-dynamic TLS model. std::atomic needsTlsLd{false}; + // True if all native vtable symbols have corresponding type info symbols + // during LTO. 
+ bool ltoAllVtablesHaveTypeInfos; void reset(); }; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index d94a066211db..d59389728622 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -104,6 +104,7 @@ void Ctx::reset() { backwardReferences.clear(); hasSympart.store(false, std::memory_order_relaxed); needsTlsLd.store(false, std::memory_order_relaxed); + ltoAllVtablesHaveTypeInfos = false; } bool elf::link(ArrayRef args, llvm::raw_ostream &stdoutOS, @@ -974,6 +975,63 @@ template static void readCallGraphsFromObjectFiles() { } } +template +static void ltoValidateAllVtablesHaveTypeInfos(opt::InputArgList &args) { + DenseSet typeInfoSymbols; + SmallSetVector vtableSymbols; + auto processVtableAndTypeInfoSymbols = [&](StringRef name) { + if (name.consume_front("_ZTI")) + typeInfoSymbols.insert(name); + else if (name.consume_front("_ZTV")) + vtableSymbols.insert(name); + }; + + // Examine all native symbol tables. + for (ELFFileBase *f : ctx.objectFiles) { + using Elf_Sym = typename ELFT::Sym; + for (const Elf_Sym &s : f->template getGlobalELFSyms()) { + if (s.st_shndx != SHN_UNDEF) { + StringRef name = check(s.getName(f->getStringTable())); + processVtableAndTypeInfoSymbols(name); + } + } + } + + for (SharedFile *f : ctx.sharedFiles) { + using Elf_Sym = typename ELFT::Sym; + for (const Elf_Sym &s : f->template getELFSyms()) { + if (s.st_shndx != SHN_UNDEF) { + StringRef name = check(s.getName(f->getStringTable())); + processVtableAndTypeInfoSymbols(name); + } + } + } + + SmallSetVector vtableSymbolsWithNoRTTI; + for (StringRef s : vtableSymbols) + if (!typeInfoSymbols.count(s)) + vtableSymbolsWithNoRTTI.insert(s); + + // Remove known safe symbols. + for (auto *arg : args.filtered(OPT_lto_known_safe_vtables)) { + StringRef knownSafeName = arg->getValue(); + if (!knownSafeName.consume_front("_ZTV")) + error("--lto-known-safe-vtables=: expected symbol to start with _ZTV, " + "but got " + + knownSafeName); + vtableSymbolsWithNoRTTI.remove(knownSafeName); + } + + ctx.ltoAllVtablesHaveTypeInfos = vtableSymbolsWithNoRTTI.empty(); + // Check for unmatched RTTI symbols + for (StringRef s : vtableSymbolsWithNoRTTI) { + message( + "--lto-validate-all-vtables-have-type-infos: RTTI missing for vtable " + "_ZTV" + + s + ", --lto-whole-program-visibility disabled"); + } +} + static DebugCompressionType getCompressionType(StringRef s, StringRef option) { DebugCompressionType type = StringSwitch(s) .Case("zlib", DebugCompressionType::Zlib) @@ -1138,6 +1196,9 @@ static void readConfigs(opt::InputArgList &args) { config->ltoWholeProgramVisibility = args.hasFlag(OPT_lto_whole_program_visibility, OPT_no_lto_whole_program_visibility, false); + config->ltoValidateAllVtablesHaveTypeInfos = + args.hasFlag(OPT_lto_validate_all_vtables_have_type_infos, + OPT_no_lto_validate_all_vtables_have_type_infos, false); config->ltoo = args::getInteger(args, OPT_lto_O, 2); config->ltoObjPath = args.getLastArgValue(OPT_lto_obj_path_eq); config->ltoPartitions = args::getInteger(args, OPT_lto_partitions, 1); @@ -2666,6 +2727,10 @@ void LinkerDriver::link(opt::InputArgList &args) { config->ltoEmitAsm || !config->thinLTOModulesToCompile.empty(); + // Handle --lto-validate-all-vtables-have-type-infos. + if (config->ltoValidateAllVtablesHaveTypeInfos) + invokeELFT(ltoValidateAllVtablesHaveTypeInfos, args); + // Do link-time optimization if given files are LLVM bitcode files. // This compiles bitcode files into real object files. 
// diff --git a/lld/ELF/LTO.cpp b/lld/ELF/LTO.cpp index b80f1f48f768..e8c0e9778c5f 100644 --- a/lld/ELF/LTO.cpp +++ b/lld/ELF/LTO.cpp @@ -153,6 +153,9 @@ static lto::Config createConfig() { c.DwoDir = std::string(config->dwoDir); c.HasWholeProgramVisibility = config->ltoWholeProgramVisibility; + c.ValidateAllVtablesHaveTypeInfos = + config->ltoValidateAllVtablesHaveTypeInfos; + c.AllVtablesHaveTypeInfos = ctx.ltoAllVtablesHaveTypeInfos; c.AlwaysEmitRegularLTOObj = !config->ltoObjPath.empty(); for (const llvm::StringRef &name : config->thinLTOModulesToCompile) diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td index b6a6ef64d017..e4daea072924 100644 --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -569,9 +569,14 @@ def lto_cs_profile_file: JJ<"lto-cs-profile-file=">, defm lto_pgo_warn_mismatch: BB<"lto-pgo-warn-mismatch", "turn on warnings about profile cfg mismatch (default)>", "turn off warnings about profile cfg mismatch">; +defm lto_known_safe_vtables : EEq<"lto-known-safe-vtables", + "When --lto-validate-all-vtables-have-type-infos is enabled, skip validation on these vtables (_ZTV symbols)">; def lto_obj_path_eq: JJ<"lto-obj-path=">; def lto_sample_profile: JJ<"lto-sample-profile=">, HelpText<"Sample profile file path">; +defm lto_validate_all_vtables_have_type_infos: BB<"lto-validate-all-vtables-have-type-infos", + "Validate that all vtables have type infos for LTO link", + "Do not validate that all vtables have type infos for LTO link">; defm lto_whole_program_visibility: BB<"lto-whole-program-visibility", "Asserts that the LTO link has whole program visibility", "Asserts that the LTO link does not have whole program visibility">; diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll new file mode 100644 index 000000000000..fb357831d6f2 --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos.ll @@ -0,0 +1,26 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI6Native, ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] } +@_ZTS6Native = linkonce_odr constant [8 x i8] c"6Native\00" +@_ZTI6Native = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS6Native, ptr @_ZTI1A } + +; Base type A does not need to emit a vtable if it's never instantiated. 
However, RTTI still gets generated +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + + +define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll new file mode 100644 index 000000000000..4533504c6018 --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll @@ -0,0 +1,19 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV6Native = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1A1nEi, ptr @_ZN6Native1fEi] } + +define linkonce_odr i32 @_ZN6Native1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll new file mode 100644 index 000000000000..43df8366aa2a --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_ref.ll @@ -0,0 +1,68 @@ +;; Source code: +;; cat > a.h <<'eof' +;; struct A { virtual int foo(); }; +;; int bar(A *a); +;; eof +;; cat > b.cc <<'eof' +;; #include "a.h" +;; struct B : A { int foo() { return 2; } }; +;; int baz() { B b; return bar(&b); } +;; eof +;; clang++ -flto=thin b.cc -c + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.B = type { %struct.A } +%struct.A = type { ptr } + +@_ZTV1B = linkonce_odr dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B3fooEv] }, !type !0, !type !1, !type !2, !type !3 +@_ZTS1B = linkonce_odr dso_local constant [3 x i8] c"1B\00" +@_ZTI1A = external constant ptr +@_ZTI1B = linkonce_odr dso_local constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } +@_ZTV1A = external unnamed_addr constant { [3 x ptr] } + +define dso_local noundef i32 @_Z3bazv() #0 { +entry: + %b = alloca %struct.B + call void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %b) + %call = call noundef i32 @_Z3barP1A(ptr noundef %b) + ret i32 %call +} + +define linkonce_odr dso_local void @_ZN1BC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this1) + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1 + ret void +} + +declare i32 @_Z3barP1A(ptr noundef) + +define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1 + ret void +} + +define linkonce_odr i32 @_ZN1B3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 { +entry: + %this.addr = alloca 
ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + ret i32 2 +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFivE.virtual"} +!2 = !{i64 16, !"_ZTS1B"} +!3 = !{i64 16, !"_ZTSM1BFivE.virtual"} diff --git a/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll new file mode 100644 index 000000000000..6cc55df82e2f --- /dev/null +++ b/lld/test/ELF/lto/Inputs/devirt_validate_vtable_typeinfos_undef.ll @@ -0,0 +1,16 @@ +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +@_ZTV1B = external unnamed_addr constant { [4 x ptr] } + +define linkonce_odr void @_ZN1BC2Ev(ptr %this) #0 { + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV1B, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +attributes #0 = { noinline optnone } diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll new file mode 100644 index 000000000000..d6ac53f9fb93 --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos.ll @@ -0,0 +1,263 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!12, !13}' >> %t1_regular.ll +; RUN: echo '!12 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!13 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos.ll -o %t2.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o +; RUN: ld.lld %t2.o -o %t2.so -shared + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_no_rtti.ll -o %t2_nortti.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2_nortti.bc -o %t2_nortti.o +; RUN: ld.lld %t2_nortti.o -o %t2_nortti.so -shared + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_undef.ll -o %t2_undef.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2_undef.bc -o %t2_undef.o +; RUN: ld.lld %t2_undef.o -o %t2_undef.so -shared + +;; With --lto-whole-program-visibility, we assume no native types can interfere +;; and thus proceed with devirtualization even in the presence of native types + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; With --lto-validate-all-vtables-have-type-infos, the linker checks for the presence of vtables +;; and RTTI in native files and blocks devirtualization to be conservative on correctness +;; for these types. + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t4_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t4_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t4_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t4_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; DSOs behave similarly + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.so -o %t5_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.so -o %t5_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.so -o %t5_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t5_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +; VALIDATE-NOT: single-impl: +; VALIDATE: single-impl: devirtualized a call to _ZN1D1mEi +; VALIDATE-NOT: single-impl: + +;; When vtables without type infos are detected in native files, we have a hole in our knowledge so +;; --lto-validate-all-vtables-have-type-infos conservatively disables --lto-whole-program-visibility + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.o -o %t6_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t6_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t6_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t6_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; DSOs behave similarly + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.so -o %t7_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.so -o %t7_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.so -o %t7_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=NO-RTTI +; RUN: llvm-dis %t7_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-NO-RTTI-IR + +; NO-RTTI-DAG: --lto-validate-all-vtables-have-type-infos: RTTI missing for vtable _ZTV6Native, --lto-whole-program-visibility disabled +; NO-RTTI-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; --lto-known-safe-vtables=* can be used to specifically allow types to participate in WPD +;; even if they don't have corresponding RTTI + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_nortti.o -o %t8_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_nortti.o -o %t8_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_nortti.o -o %t8_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: --lto-known-safe-vtables=_ZTV6Native -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t8_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Only check for definitions of vtables symbols, just having a reference does not allow a type to +;; be derived from + +;; Index based WPD +; RUN: ld.lld %t1.o %t2_undef.o -o %t9_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2_undef.o -o %t9_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2_undef.o -o %t9_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t9_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11 + +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00" +@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } + +@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00" +@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A } + +@_ZTS1D = internal constant [3 x i8] c"1D\00" +@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ] + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + ;; --lto-whole-program-visibility disabled so no devirtualization + ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1 + ; CHECK-NO-RTTI-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; We still have to call it as virtual. + ; CHECK-IR: %call2 = tail call i32 %fptr22 + ; CHECK-VALIDATE-IR: %call2 = tail call i32 %fptr22 + ; CHECK-NO-RTTI-IR: %call2 = tail call i32 %fptr22 + %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. 
+ ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi + ;; Types not present in native files can still be devirtualized + ; CHECK-VALIDATE-IR: %call3 = tail call i32 @_ZN1D1mEi + ;; --lto-whole-program-visibility disabled but being local this + ;; has VCallVisibilityTranslationUnit visibility so it's still devirtualized + ; CHECK-NO-RTTI-IR: %call3 = tail call i32 @_ZN1D1mEi + %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2) + + ret i32 %call3 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS1B"} +!4 = !{i64 16, !"_ZTSM1BFviE.virtual"} +!5 = !{i64 24, !"_ZTSM1BFviE.virtual"} +!6 = !{i64 16, !"_ZTS1C"} +!7 = !{i64 16, !"_ZTSM1CFviE.virtual"} +!8 = !{i64 24, !"_ZTSM1CFviE.virtual"} +!9 = !{i64 16, !10} +!10 = distinct !{} +!11 = !{i64 2} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll new file mode 100644 index 000000000000..15040b8707ae --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_mixed_lto.ll @@ -0,0 +1,183 @@ +; REQUIRES: x86 + +; RUN: rm -rf %t.dir +; RUN: split-file %s %t.dir +; RUN: cd %t.dir + +;; Common artifacts +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1.o ThinLTO.ll +; RUN: opt -module-summary -o %t2.o RegularLTO.ll + +;; --lto-whole-program-visibility when there's split ThinLTO and a RegularLTO with summary optimizes +;; using the combined index. +; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR +; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR + +;; --lto-validate-all-vtables-have-type-infos when there's split ThinLTO and a RegularLTO with summary behaves the same +;; as everything is present in the combined index. +; RUN: ld.lld %t1.o %t2.o -o %t3 -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR,CHECK-COMMON-IR +; RUN: llvm-dis %t3.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-REGULAR-IR,CHECK-COMMON-REGULAR-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi + +;--- ThinLTO.ll +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1B, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI1C, ptr @_ZN1A1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1, !type !2, !type !6, !type !7, !type !8 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1D, ptr @_ZN1D1mEi] }, !type !9, !vcall_visibility !11 + +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +@_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00" +@_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1B, ptr @_ZTI1A } + +@_ZTS1C = linkonce_odr constant [3 x i8] c"1C\00" +@_ZTI1C = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS1C, ptr @_ZTI1A } + +@_ZTS1D = internal constant [3 x i8] c"1D\00" +@_ZTI1D = internal constant { ptr, ptr } { ptr null, ptr @_ZTS1D } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ], section "llvm.metadata" + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { + ;; Call function built with RegularLTO + %RegularLTOResult = call i32 @RegularLTO(ptr %obj, i32 %a) + + ;; ThinLTO code starts here + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; Check that the call was not devirtualized. + ; CHECK-IR: %call2 = tail call i32 %fptr22 + %call2 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !10) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call3 = tail call i32 @_ZN1D1mEi + %call3 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call2) + + ret i32 %call3 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i32 @RegularLTO(ptr) +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1A1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. 
+attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS1B"} +!4 = !{i64 16, !"_ZTSM1BFviE.virtual"} +!5 = !{i64 24, !"_ZTSM1BFviE.virtual"} +!6 = !{i64 16, !"_ZTS1C"} +!7 = !{i64 16, !"_ZTSM1CFviE.virtual"} +!8 = !{i64 24, !"_ZTSM1CFviE.virtual"} +!9 = !{i64 16, !10} +!10 = distinct !{} +!11 = !{i64 2} + +;--- RegularLTO.ll +; REQUIRES: x86 + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.Native = type { %struct.A } + +@_ZTV7Regular = linkonce_odr unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI7Regular, ptr @_ZN7Regular1fEi, ptr @_ZN1A1nEi] } , !type !0, !type !1, !type !2, !type !3, !type !4, !type !5 +@_ZTS7Regular = linkonce_odr constant [9 x i8] c"7Regular\00" +@_ZTI7Regular = linkonce_odr constant { ptr, ptr, ptr } { ptr null, ptr @_ZTS7Regular, ptr @_ZTI1A } + +; Base type A does not need to emit a vtable if it's never instantiated. However, RTTI still gets generated +@_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" +@_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr null, ptr @_ZTS1A } + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [1 x ptr] [ ptr @_ZTV7Regular ], section "llvm.metadata" + +; CHECK-COMMON-REGULAR-IR-LABEL: define dso_local i32 @RegularLTO +define i32 @RegularLTO(ptr %obj, i32 %a) #0 { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptr1 = load ptr, ptr %vtable, align 8 + + ;; Check that the call was not devirtualized. + ; CHECK-REGULAR-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + ret i32 %call +} +; CHECK-COMMON-REGULAR-IR-LABEL: ret i32 +; CHECK-COMMON-REGULAR-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN7Regular1fEi(ptr %this, i32 %a) #0 { + ret i32 1; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +attributes #0 = { noinline optnone } +!llvm.module.flags = !{!6, !7} + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFviE.virtual"} +!2 = !{i64 24, !"_ZTSM1AFviE.virtual"} +!3 = !{i64 16, !"_ZTS7Regular"} +!4 = !{i64 16, !"_ZTSM7RegularFviE.virtual"} +!5 = !{i64 24, !"_ZTSM7RegularFviE.virtual"} +!6 = !{i32 1, !"ThinLTO", i32 0} +!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll new file mode 100644 index 000000000000..30bd75606f7d --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_no_rtti.ll @@ -0,0 +1,136 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!6, !7}' >> %t1_regular.ll +; RUN: echo '!6 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!7 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +;; With --lto-whole-program-visibility, we assume no native types can interfere +;; and thus proceed with devirtualization even in the presence of native types + +;; Index based WPD +; RUN: ld.lld %t1.o -o %t3_index -save-temps 
--lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=REMARK +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-IR + +; REMARK-DAG: single-impl: devirtualized a call to _ZN1A1nEi +; REMARK-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +;; With --lto-whole-program-visibility and --lto-validate-all-vtables-have-type-infos +;; we rely on resolutions on the typename symbol to inform us of what's outside the summary. +;; Without the typename symbol in the LTO unit (e.g. RTTI disabled) this causes +;; conservative disablement of WPD on these types unless it's local + +;; Index based WPD +; RUN: ld.lld %t1.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s --check-prefix=VALIDATE +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-COMMON-IR-LABEL,CHECK-VALIDATE-IR + +; VALIDATE-DAG: single-impl: devirtualized a call to _ZN1D1mEi + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { ptr } +%struct.B = type { %struct.A } +%struct.C = type { %struct.A } +%struct.D = type { ptr } + +@_ZTV1B = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1B1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !1 +@_ZTV1C = linkonce_odr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr null, ptr @_ZN1C1fEi, ptr @_ZN1A1nEi] }, !type !0, !type !2 +@_ZTV1D = internal constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr null, ptr @_ZN1D1mEi] }, !type !3, !vcall_visibility !5 + +;; Prevent the vtables from being dead code eliminated. +@llvm.used = appending global [3 x ptr] [ ptr @_ZTV1B, ptr @_ZTV1C, ptr @_ZTV1D ] + +; CHECK-COMMON-IR-LABEL: define dso_local i32 @_start +define i32 @_start(ptr %obj, ptr %obj2, i32 %a) { +entry: + %vtable = load ptr, ptr %obj + %p = call i1 @llvm.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %p) + %fptrptr = getelementptr ptr, ptr %vtable, i32 1 + %fptr1 = load ptr, ptr %fptrptr, align 8 + + ;; Check that the call was devirtualized. 
+ ; CHECK-IR: %call = tail call i32 @_ZN1A1nEi + ;; No resolution for _ZTS1A means we don't devirtualize + ; CHECK-VALIDATE-IR: %call = tail call i32 %fptr1 + %call = tail call i32 %fptr1(ptr nonnull %obj, i32 %a) + + %fptr22 = load ptr, ptr %vtable, align 8 + + ;; We still have to call it as virtual. + ; CHECK-IR: %call3 = tail call i32 %fptr22 + ; CHECK-VALIDATE-IR: %call3 = tail call i32 %fptr22 + %call3 = tail call i32 %fptr22(ptr nonnull %obj, i32 %call) + + %vtable2 = load ptr, ptr %obj2 + %p2 = call i1 @llvm.type.test(ptr %vtable2, metadata !4) + call void @llvm.assume(i1 %p2) + + %fptr33 = load ptr, ptr %vtable2, align 8 + + ;; Check that the call was devirtualized. + ; CHECK-IR: %call4 = tail call i32 @_ZN1D1mEi + ;; Being local this has VCallVisibilityTranslationUnit + ;; visibility so it's still devirtualized + ; CHECK-VALIDATE-IR: %call4 = tail call i32 @_ZN1D1mEi + %call4 = tail call i32 %fptr33(ptr nonnull %obj2, i32 %call3) + ret i32 %call4 +} +; CHECK-COMMON-IR-LABEL: ret i32 +; CHECK-COMMON-IR-LABEL: } + +declare i1 @llvm.type.test(ptr, metadata) +declare void @llvm.assume(i1) + +define linkonce_odr i32 @_ZN1B1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1A1nEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define linkonce_odr i32 @_ZN1C1fEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +define internal i32 @_ZN1D1mEi(ptr %this, i32 %a) #0 { + ret i32 0; +} + +;; Make sure we don't inline or otherwise optimize out the direct calls. +attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTS1B"} +!2 = !{i64 16, !"_ZTS1C"} +!3 = !{i64 16, !4} +!4 = distinct !{} +!5 = !{i64 2} diff --git a/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll new file mode 100644 index 000000000000..4ef048d6b6c6 --- /dev/null +++ b/lld/test/ELF/lto/devirt_validate_vtable_typeinfos_ref.ll @@ -0,0 +1,130 @@ +; REQUIRES: x86 + +;; Common artifacts +; RUN: opt --thinlto-bc -o %t1.o %s +; RUN: opt --thinlto-bc --thinlto-split-lto-unit -o %t1_hybrid.o %s +; RUN: cp %s %t1_regular.ll +; RUN: echo '!llvm.module.flags = !{!2, !3}' >> %t1_regular.ll +; RUN: echo '!2 = !{i32 1, !"ThinLTO", i32 0}' >> %t1_regular.ll +; RUN: echo '!3 = !{i32 1, !"EnableSplitLTOUnit", i32 1}' >> %t1_regular.ll +; RUN: opt -module-summary -o %t1_regular.o %t1_regular.ll + +; RUN: llvm-as %S/Inputs/devirt_validate_vtable_typeinfos_ref.ll -o %t2.bc +; RUN: llc -relocation-model=pic -filetype=obj %t2.bc -o %t2.o + +;; Native objects can contain only a reference to the base type infos if the base declaration has no key functions. +;; Because of that, --lto-validate-all-vtables-have-type-infos needs to query for the type info symbol inside native files rather than the +;; type name symbol that's used as the key in !type metadata to correctly stop devirtualization on the native type. + +;; Index based WPD +; RUN: ld.lld %t1.o %t2.o -o %t3_index -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t1.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Hybrid WPD +; RUN: ld.lld %t1_hybrid.o %t2.o -o %t3_hybrid -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 
2>&1 | FileCheck %s +; RUN: llvm-dis %t1_hybrid.o.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +;; Regular LTO WPD +; RUN: ld.lld %t1_regular.o %t2.o -o %t3_regular -save-temps --lto-whole-program-visibility --lto-validate-all-vtables-have-type-infos \ +; RUN: -mllvm -pass-remarks=. 2>&1 | FileCheck %s +; RUN: llvm-dis %t3_regular.0.4.opt.bc -o - | FileCheck %s --check-prefixes=CHECK-IR + +; CHECK-NOT: single-impl: devirtualized a call to _ZN1A3fooEv + +;; Source code: +;; cat > a.h <<'eof' +;; struct A { virtual int foo(); }; +;; int bar(A *a); +;; eof +;; cat > main.cc <<'eof' +;; #include "a.h" +;; +;; int A::foo() { return 1; } +;; int bar(A *a) { return a->foo(); } +;; +;; extern int baz(); +;; int main() { +;; A a; +;; int i = bar(&a); +;; int j = baz(); +;; return i + j; +;; } +;; eof +;; clang++ -fwhole-program-vtables -fno-split-lto-unit -flto=thin main.cc -c + +target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" +target triple = "x86_64-unknown-linux-gnu" + +%struct.A = type { %struct.Abase } +%struct.Abase = type { ptr } + +@_ZTV1A = dso_local unnamed_addr constant { [3 x ptr] } { [3 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A3fooEv] }, align 8, !type !0, !type !1 +@_ZTS1A = dso_local constant [3 x i8] c"1A\00", align 1 +@_ZTI1A = dso_local constant { ptr, ptr } { ptr null, ptr @_ZTS1A }, align 8 + +define dso_local noundef i32 @_ZN1A3fooEv(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr + store ptr %this, ptr %this.addr + %this1 = load ptr, ptr %this.addr + ret i32 1 +} + +; CHECK-IR: define dso_local noundef i32 @_Z3barP1A +define dso_local noundef i32 @_Z3barP1A(ptr noundef %a) #0 { +entry: + %a.addr = alloca ptr + store ptr %a, ptr %a.addr + %0 = load ptr, ptr %a.addr + %vtable = load ptr, ptr %0 + %1 = call i1 @llvm.public.type.test(ptr %vtable, metadata !"_ZTS1A") + call void @llvm.assume(i1 %1) + %vfn = getelementptr inbounds ptr, ptr %vtable, i64 0 + %fptr = load ptr, ptr %vfn + ;; Check that the call was not devirtualized. + ; CHECK-IR: %call = call noundef i32 %fptr + %call = call noundef i32 %fptr(ptr noundef nonnull align 8 dereferenceable(8) %0) + ret i32 %call +} +; CHECK-IR: ret i32 +; CHECK-IR: } + +declare i1 @llvm.public.type.test(ptr, metadata) +declare void @llvm.assume(i1 noundef) + +define dso_local noundef i32 @main() #0 { +entry: + %retval = alloca i32, align 4 + %a = alloca %struct.A, align 8 + %i = alloca i32, align 4 + %j = alloca i32, align 4 + store i32 0, ptr %retval, align 4 + call void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %a) + %call = call noundef i32 @_Z3barP1A(ptr noundef %a) + store i32 %call, ptr %i, align 4 + %call1 = call noundef i32 @_Z3bazv() + store i32 %call1, ptr %j, align 4 + %0 = load i32, ptr %i, align 4 + %1 = load i32, ptr %j, align 4 + %add = add nsw i32 %0, %1 + ret i32 %add +} + +define linkonce_odr dso_local void @_ZN1AC2Ev(ptr noundef nonnull align 8 dereferenceable(8) %this) #0 align 2 { +entry: + %this.addr = alloca ptr, align 8 + store ptr %this, ptr %this.addr, align 8 + %this1 = load ptr, ptr %this.addr, align 8 + store ptr getelementptr inbounds ({ [3 x ptr] }, ptr @_ZTV1A, i32 0, inrange i32 0, i32 2), ptr %this1, align 8 + ret void +} + +declare noundef i32 @_Z3bazv() + +;; Make sure we don't inline or otherwise optimize out the direct calls. 
+attributes #0 = { noinline optnone } + +!0 = !{i64 16, !"_ZTS1A"} +!1 = !{i64 16, !"_ZTSM1AFivE.virtual"} diff --git a/llvm/include/llvm/LTO/Config.h b/llvm/include/llvm/LTO/Config.h index 7a746592c9fc..cb8259269820 100644 --- a/llvm/include/llvm/LTO/Config.h +++ b/llvm/include/llvm/LTO/Config.h @@ -79,6 +79,12 @@ struct Config { /// link. bool HasWholeProgramVisibility = false; + /// We're validating that all native vtables have corresponding type infos. + bool ValidateAllVtablesHaveTypeInfos = false; + /// If all native vtables have corresponding type infos, allow + /// usage of RTTI to block devirtualization on types used in native files. + bool AllVtablesHaveTypeInfos = false; + /// Always emit a Regular LTO object even when it is empty because no Regular /// LTO modules were linked. This option is useful for some build system which /// want to know a priori all possible output files. diff --git a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h index a2296a064213..4932157a7a3d 100644 --- a/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h +++ b/llvm/include/llvm/Transforms/IPO/WholeProgramDevirt.h @@ -244,10 +244,18 @@ void updatePublicTypeTestCalls(Module &M, bool WholeProgramVisibilityEnabledInLTO); void updateVCallVisibilityInModule( Module &M, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols); + const DenseSet &DynamicExportSymbols, + bool ValidateAllVtablesHaveTypeInfos, + function_ref IsVisibleToRegularObj); void updateVCallVisibilityInIndex( ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols); + const DenseSet &DynamicExportSymbols, + const DenseSet &VisibleToRegularObjSymbols); + +void getVisibleToRegularObjVtableGUIDs( + ModuleSummaryIndex &Index, + DenseSet &VisibleToRegularObjSymbols, + function_ref IsVisibleToRegularObj); /// Perform index-based whole program devirtualization on the \p Summary /// index. Any devirtualized targets used by a type test in another module diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 1cd48adac3f0..0e5eeb6ff978 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1134,13 +1134,27 @@ Error LTO::runRegularLTO(AddStreamFn AddStream) { } } + bool WholeProgramVisibilityEnabledInLTO = + Conf.HasWholeProgramVisibility && + // If validation is enabled, upgrade visibility only when all vtables + // have typeinfos. + (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos); + + // This returns true when the name is local or not defined. Locals are + // expected to be handled separately. + auto IsVisibleToRegularObj = [&](StringRef name) { + auto It = GlobalResolutions.find(name); + return (It == GlobalResolutions.end() || It->second.VisibleOutsideSummary); + }; + // If allowed, upgrade public vcall visibility metadata to linkage unit // visibility before whole program devirtualization in the optimizer. 
- updateVCallVisibilityInModule(*RegularLTO.CombinedModule, - Conf.HasWholeProgramVisibility, - DynamicExportSymbols); + updateVCallVisibilityInModule( + *RegularLTO.CombinedModule, WholeProgramVisibilityEnabledInLTO, + DynamicExportSymbols, Conf.ValidateAllVtablesHaveTypeInfos, + IsVisibleToRegularObj); updatePublicTypeTestCalls(*RegularLTO.CombinedModule, - Conf.HasWholeProgramVisibility); + WholeProgramVisibilityEnabledInLTO); if (Conf.PreOptModuleHook && !Conf.PreOptModuleHook(0, *RegularLTO.CombinedModule)) @@ -1521,13 +1535,38 @@ Error LTO::runThinLTO(AddStreamFn AddStream, FileCache Cache, std::set ExportedGUIDs; - if (hasWholeProgramVisibility(Conf.HasWholeProgramVisibility)) + bool WholeProgramVisibilityEnabledInLTO = + Conf.HasWholeProgramVisibility && + // If validation is enabled, upgrade visibility only when all vtables + // have typeinfos. + (!Conf.ValidateAllVtablesHaveTypeInfos || Conf.AllVtablesHaveTypeInfos); + if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) ThinLTO.CombinedIndex.setWithWholeProgramVisibility(); + + // If we're validating, get the vtable symbols that should not be + // upgraded because they correspond to typeIDs outside of index-based + // WPD info. + DenseSet VisibleToRegularObjSymbols; + if (WholeProgramVisibilityEnabledInLTO && + Conf.ValidateAllVtablesHaveTypeInfos) { + // This returns true when the name is local or not defined. Locals are + // expected to be handled separately. + auto IsVisibleToRegularObj = [&](StringRef name) { + auto It = GlobalResolutions.find(name); + return (It == GlobalResolutions.end() || + It->second.VisibleOutsideSummary); + }; + + getVisibleToRegularObjVtableGUIDs(ThinLTO.CombinedIndex, + VisibleToRegularObjSymbols, + IsVisibleToRegularObj); + } + // If allowed, upgrade public vcall visibility to linkage unit visibility in // the summaries before whole program devirtualization below. - updateVCallVisibilityInIndex(ThinLTO.CombinedIndex, - Conf.HasWholeProgramVisibility, - DynamicExportSymbols); + updateVCallVisibilityInIndex( + ThinLTO.CombinedIndex, WholeProgramVisibilityEnabledInLTO, + DynamicExportSymbols, VisibleToRegularObjSymbols); // Perform index-based WPD. This will return immediately if there are // no index entries in the typeIdMetadata map (e.g. if we are instead diff --git a/llvm/lib/LTO/LTOCodeGenerator.cpp b/llvm/lib/LTO/LTOCodeGenerator.cpp index ae7b7e4b5481..d7aed2fbc2a1 100644 --- a/llvm/lib/LTO/LTOCodeGenerator.cpp +++ b/llvm/lib/LTO/LTOCodeGenerator.cpp @@ -604,11 +604,14 @@ bool LTOCodeGenerator::optimize() { // pipeline run below. updatePublicTypeTestCalls(*MergedModule, /* WholeProgramVisibilityEnabledInLTO */ false); - updateVCallVisibilityInModule(*MergedModule, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a - // TBD new interface. - /* DynamicExportSymbols */ {}); + updateVCallVisibilityInModule( + *MergedModule, + /* WholeProgramVisibilityEnabledInLTO */ false, + // FIXME: These need linker information via a + // TBD new interface. + /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); // We always run the verifier once on the merged module, the `DisableVerify` // parameter only applies to subsequent verify. 
diff --git a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp index 5b137a8f8cb3..0d2e66008f1f 100644 --- a/llvm/lib/LTO/ThinLTOCodeGenerator.cpp +++ b/llvm/lib/LTO/ThinLTOCodeGenerator.cpp @@ -1053,11 +1053,14 @@ void ThinLTOCodeGenerator::run() { // via the internal option. Must be done before WPD below. if (hasWholeProgramVisibility(/* WholeProgramVisibilityEnabledInLTO */ false)) Index->setWithWholeProgramVisibility(); + + // FIXME: This needs linker information via a TBD new interface updateVCallVisibilityInIndex(*Index, - /* WholeProgramVisibilityEnabledInLTO */ false, - // FIXME: This needs linker information via a + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a // TBD new interface. - /* DynamicExportSymbols */ {}); + /*DynamicExportSymbols=*/{}, + /*VisibleToRegularObjSymbols=*/{}); // Perform index-based WPD. This will return immediately if there are // no index entries in the typeIdMetadata map (e.g. if we are instead diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 487a0a4a97f7..f60cd1c2b2ec 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -780,12 +780,52 @@ bool hasWholeProgramVisibility(bool WholeProgramVisibilityEnabledInLTO) { !DisableWholeProgramVisibility; } +static bool +typeIDVisibleToRegularObj(StringRef TypeID, + function_ref IsVisibleToRegularObj) { + // TypeID for member function pointer type is an internal construct + // and won't exist in IsVisibleToRegularObj. The full TypeID + // will be present and participate in invalidation. + if (TypeID.ends_with(".virtual")) + return false; + + // TypeID that doesn't start with Itanium mangling (_ZTS) will be + // non-externally visible types which cannot interact with + // external native files. See CodeGenModule::CreateMetadataIdentifierImpl. + if (!TypeID.consume_front("_ZTS")) + return false; + + // TypeID is keyed off the type name symbol (_ZTS). However, the native + // object may not contain this symbol if it does not contain a key + // function for the base type and thus only contains a reference to the + // type info (_ZTI). To catch this case we query using the type info + // symbol corresponding to the TypeID. + std::string typeInfo = ("_ZTI" + TypeID).str(); + return IsVisibleToRegularObj(typeInfo); +} + +static bool +skipUpdateDueToValidation(GlobalVariable &GV, + function_ref IsVisibleToRegularObj) { + SmallVector Types; + GV.getMetadata(LLVMContext::MD_type, Types); + + for (auto Type : Types) + if (auto *TypeID = dyn_cast(Type->getOperand(1).get())) + return typeIDVisibleToRegularObj(TypeID->getString(), + IsVisibleToRegularObj); + + return false; +} + /// If whole program visibility asserted, then upgrade all public vcall /// visibility metadata on vtable definitions to linkage unit visibility in /// Module IR (for regular or hybrid LTO). 
void updateVCallVisibilityInModule( Module &M, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols) { + const DenseSet &DynamicExportSymbols, + bool ValidateAllVtablesHaveTypeInfos, + function_ref IsVisibleToRegularObj) { if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) return; for (GlobalVariable &GV : M.globals()) { @@ -796,7 +836,13 @@ void updateVCallVisibilityInModule( GV.getVCallVisibility() == GlobalObject::VCallVisibilityPublic && // Don't upgrade the visibility for symbols exported to the dynamic // linker, as we have no information on their eventual use. - !DynamicExportSymbols.count(GV.getGUID())) + !DynamicExportSymbols.count(GV.getGUID()) && + // With validation enabled, we want to exclude symbols visible to + // regular objects. Local symbols will be in this group due to the + // current implementation but those with VCallVisibilityTranslationUnit + // will have already been marked in clang so are unaffected. + !(ValidateAllVtablesHaveTypeInfos && + skipUpdateDueToValidation(GV, IsVisibleToRegularObj))) GV.setVCallVisibilityMetadata(GlobalObject::VCallVisibilityLinkageUnit); } } @@ -828,12 +874,26 @@ void updatePublicTypeTestCalls(Module &M, } } +/// Based on typeID string, get all associated vtable GUIDS that are +/// visible to regular objects. +void getVisibleToRegularObjVtableGUIDs( + ModuleSummaryIndex &Index, + DenseSet &VisibleToRegularObjSymbols, + function_ref IsVisibleToRegularObj) { + for (const auto &typeID : Index.typeIdCompatibleVtableMap()) { + if (typeIDVisibleToRegularObj(typeID.first, IsVisibleToRegularObj)) + for (const TypeIdOffsetVtableInfo &P : typeID.second) + VisibleToRegularObjSymbols.insert(P.VTableVI.getGUID()); + } +} + /// If whole program visibility asserted, then upgrade all public vcall /// visibility metadata on vtable definition summaries to linkage unit /// visibility in Module summary index (for ThinLTO). void updateVCallVisibilityInIndex( ModuleSummaryIndex &Index, bool WholeProgramVisibilityEnabledInLTO, - const DenseSet &DynamicExportSymbols) { + const DenseSet &DynamicExportSymbols, + const DenseSet &VisibleToRegularObjSymbols) { if (!hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) return; for (auto &P : Index) { @@ -846,6 +906,12 @@ void updateVCallVisibilityInIndex( if (!GVar || GVar->getVCallVisibility() != GlobalObject::VCallVisibilityPublic) continue; + // With validation enabled, we want to exclude symbols visible to regular + // objects. Local symbols will be in this group due to the current + // implementation but those with VCallVisibilityTranslationUnit will have + // already been marked in clang so are unaffected. + if (VisibleToRegularObjSymbols.count(P.first)) + continue; GVar->setVCallVisibility(GlobalObject::VCallVisibilityLinkageUnit); } } @@ -1032,8 +1098,8 @@ bool DevirtModule::tryFindVirtualCallTargets( } bool DevirtIndex::tryFindVirtualCallTargets( - std::vector &TargetsForSlot, const TypeIdCompatibleVtableInfo TIdInfo, - uint64_t ByteOffset) { + std::vector &TargetsForSlot, + const TypeIdCompatibleVtableInfo TIdInfo, uint64_t ByteOffset) { for (const TypeIdOffsetVtableInfo &P : TIdInfo) { // Find a representative copy of the vtable initializer. 
// We can have multiple available_externally, linkonce_odr and weak_odr diff --git a/llvm/tools/opt/opt.cpp b/llvm/tools/opt/opt.cpp index 40632b43e73b..f0459f476054 100644 --- a/llvm/tools/opt/opt.cpp +++ b/llvm/tools/opt/opt.cpp @@ -582,9 +582,14 @@ int main(int argc, char **argv) { // the facility for updating public visibility to linkage unit visibility when // specified by an internal option. This is normally done during LTO which is // not performed via opt. - updateVCallVisibilityInModule(*M, - /* WholeProgramVisibilityEnabledInLTO */ false, - /* DynamicExportSymbols */ {}); + updateVCallVisibilityInModule( + *M, + /*WholeProgramVisibilityEnabledInLTO=*/false, + // FIXME: These need linker information via a + // TBD new interface. + /*DynamicExportSymbols=*/{}, + /*ValidateAllVtablesHaveTypeInfos=*/false, + /*IsVisibleToRegularObj=*/[](StringRef) { return true; }); // Figure out what stream we are supposed to write to... std::unique_ptr Out; -- Gitee From e15e1805721c6fbd867b766a1903756a5ae582dc Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Mon, 25 Sep 2023 09:49:40 -0700 Subject: [PATCH 06/10] [Backport][ELF] Change --call-graph-profile-sort to accept an argument Change the FF form --call-graph-profile-sort to --call-graph-profile-sort={none,hfsort}. This will be extended to support llvm/lib/Transforms/Utils/CodeLayout.cpp. --call-graph-profile-sort is not used in the wild but --no-call-graph-profile-sort is (Chromium). Make --no-call-graph-profile-sort an alias for --call-graph-profile-sort=none. Reviewed By: rahmanl Differential Revision: https://reviews.llvm.org/D159544 --- lld/ELF/Config.h | 5 ++++- lld/ELF/Driver.cpp | 16 ++++++++++++---- lld/ELF/Options.td | 9 ++++++--- lld/docs/ld.lld.1 | 11 +++++++++++ lld/test/ELF/cgprofile-obj.s | 5 ++++- lld/test/ELF/cgprofile-txt.s | 10 +++++++++- 6 files changed, 46 insertions(+), 10 deletions(-) diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index 4ba7bc12d038..cca6ebb6791a 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -57,6 +57,9 @@ enum class BsymbolicKind { None, NonWeakFunctions, Functions, All }; // For --build-id. enum class BuildIdKind { None, Fast, Md5, Sha1, Hexstring, Uuid }; +// For --call-graph-profile-sort={none,hfsort}. +enum class CGProfileSortKind { None, Hfsort }; + // For --discard-{all,locals,none}. 
enum class DiscardPolicy { Default, All, Locals, None }; @@ -193,7 +196,7 @@ struct Config { bool armJ1J2BranchEncoding = false; bool asNeeded = false; BsymbolicKind bsymbolic = BsymbolicKind::None; - bool callGraphProfileSort; + CGProfileSortKind callGraphProfileSort; bool checkSections; bool checkDynamicRelocs; llvm::DebugCompressionType compressDebugSections; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index d59389728622..1f8e48d83fc8 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1032,6 +1032,15 @@ static void ltoValidateAllVtablesHaveTypeInfos(opt::InputArgList &args) { } } +static CGProfileSortKind getCGProfileSortKind(opt::InputArgList &args) { + StringRef s = args.getLastArgValue(OPT_call_graph_profile_sort, "hfsort"); + if (s == "hfsort") + return CGProfileSortKind::Hfsort; + if (s != "none") + error("unknown --call-graph-profile-sort= value: " + s); + return CGProfileSortKind::None; +} + static DebugCompressionType getCompressionType(StringRef s, StringRef option) { DebugCompressionType type = StringSwitch(s) .Case("zlib", DebugCompressionType::Zlib) @@ -1134,6 +1143,7 @@ static void readConfigs(opt::InputArgList &args) { else if (arg->getOption().matches(OPT_Bsymbolic)) config->bsymbolic = BsymbolicKind::All; } + config->callGraphProfileSort = getCGProfileSortKind(args); config->checkSections = args.hasFlag(OPT_check_sections, OPT_no_check_sections, true); config->chroot = args.getLastArgValue(OPT_chroot); @@ -1154,8 +1164,6 @@ static void readConfigs(opt::InputArgList &args) { args.hasFlag(OPT_eh_frame_hdr, OPT_no_eh_frame_hdr, false); config->emitLLVM = args.hasArg(OPT_plugin_opt_emit_llvm, false); config->emitRelocs = args.hasArg(OPT_emit_relocs); - config->callGraphProfileSort = args.hasFlag( - OPT_call_graph_profile_sort, OPT_no_call_graph_profile_sort, true); config->enableNewDtags = args.hasFlag(OPT_enable_new_dtags, OPT_disable_new_dtags, true); config->entry = args.getLastArgValue(OPT_entry); @@ -1526,7 +1534,7 @@ static void readConfigs(opt::InputArgList &args) { config->symbolOrderingFile = getSymbolOrderingFile(*buffer); // Also need to disable CallGraphProfileSort to prevent // LLD order symbols with CGProfile - config->callGraphProfileSort = false; + config->callGraphProfileSort = CGProfileSortKind::None; } } @@ -2914,7 +2922,7 @@ void LinkerDriver::link(opt::InputArgList &args) { } // Read the callgraph now that we know what was gced or icfed - if (config->callGraphProfileSort) { + if (config->callGraphProfileSort != CGProfileSortKind::None) { if (auto *arg = args.getLastArg(OPT_call_graph_ordering_file)) if (std::optional buffer = readFile(arg->getValue())) readCallGraph(*buffer); diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td index e4daea072924..e359a12597a9 100644 --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -111,9 +111,12 @@ defm as_needed: B<"as-needed", defm call_graph_ordering_file: Eq<"call-graph-ordering-file", "Layout sections to optimize the given callgraph">; -defm call_graph_profile_sort: BB<"call-graph-profile-sort", - "Reorder sections with call graph profile (default)", - "Do not reorder sections with call graph profile">; +def call_graph_profile_sort: JJ<"call-graph-profile-sort=">, + HelpText<"Reorder input sections with call graph profile using the specified algorithm (default: hfsort)">, + MetaVarName<"[none,hfsort]">, + Values<"none,hfsort">; +def : FF<"no-call-graph-profile-sort">, Alias, AliasArgs<["none"]>, + Flags<[HelpHidden]>; // --chroot doesn't have a help text because it is an internal option. 
def chroot: Separate<["--"], "chroot">; diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1 index edeb7c4bfe37..dc64afabd7f5 100644 --- a/lld/docs/ld.lld.1 +++ b/lld/docs/ld.lld.1 @@ -115,6 +115,17 @@ is not intended to be cryptographically secure. .It Fl -build-id Synonym for .Fl -build-id Ns = Ns Cm fast . +.It Fl -call-graph-profile-sort Ns = Ns Ar algorithm +.Ar algorithm +may be: +.Pp +.Bl -tag -width 2n -compact +.It Cm none +Ignore call graph profile. +.It Cm hfsort +Use hfsort (default). +.El +.Pp .It Fl -color-diagnostics Ns = Ns Ar value Use colors in diagnostics. .Ar value diff --git a/lld/test/ELF/cgprofile-obj.s b/lld/test/ELF/cgprofile-obj.s index f56f3bcbf0c3..0848adc5e427 100644 --- a/lld/test/ELF/cgprofile-obj.s +++ b/lld/test/ELF/cgprofile-obj.s @@ -3,8 +3,11 @@ # RUN: llvm-mc -filetype=obj -triple=x86_64-unknown-linux %s -o %t.o # RUN: ld.lld -e A %t.o -o %t # RUN: llvm-nm --no-sort %t | FileCheck %s -# RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t +# RUN: ld.lld --call-graph-profile-sort=none -e A %t.o -o %t # RUN: llvm-nm --no-sort %t | FileCheck %s --check-prefix=NO-CG +## --no-call-graph-profile-sort is an alias for --call-graph-profile-sort=none. +# RUN: ld.lld --no-call-graph-profile-sort -e A %t.o -o %t1 +# RUN: cmp %t %t1 .section .text.D,"ax",@progbits D: diff --git a/lld/test/ELF/cgprofile-txt.s b/lld/test/ELF/cgprofile-txt.s index 99cbfa574532..2c0c9642a509 100644 --- a/lld/test/ELF/cgprofile-txt.s +++ b/lld/test/ELF/cgprofile-txt.s @@ -24,8 +24,16 @@ # RUN: echo "TooManyPreds8 TooManyPreds 10" >> %t.call_graph # RUN: echo "TooManyPreds9 TooManyPreds 10" >> %t.call_graph # RUN: echo "TooManyPreds10 TooManyPreds 11" >> %t.call_graph -# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=hfsort -o %t2 # RUN: llvm-readobj --symbols %t2 | FileCheck %s +## --call-graph-profile-sort=hfsort is the default. +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2b +# RUN: cmp %t2 %t2b + +# RUN: not ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=sort \ +# RUN: -o /dev/null 2>&1 | FileCheck %s --check-prefix=UNKNOWN + +# UNKNOWN: error: unknown --call-graph-profile-sort= value: sort .section .text.D,"ax",@progbits D: -- Gitee From d5327576c8a5b22521eb37c299859275f55e8dad Mon Sep 17 00:00:00 2001 From: spupyrev Date: Tue, 13 Jun 2023 10:08:00 -0700 Subject: [PATCH 07/10] [Backport][ELF] A new code layout algorithm for function reordering [3a/3] We are brining a new algorithm for function layout (reordering) based on the call graph (extracted from a profile data). The algorithm is an improvement of top of a known heuristic, C^3. It tries to co-locate hot and frequently executed together functions in the resulting ordering. Unlike C^3, it explores a larger search space and have an objective closely tied to the performance of instruction and i-TLB caches. Hence, the name CDS = Cache-Directed Sort. The algorithm can be used at the linking or post-linking (e.g., BOLT) stage. Refer to https://reviews.llvm.org/D152834 for the actual implementation of the reordering algorithm. This diff adds a linker option to replace the existing C^3 heuristic with CDS. The new behavior can be turned on by passing "--use-cache-directed-sort". 
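In the backported form carried by this diff, that corresponds to selecting the cdsort value of lld's --call-graph-profile-sort option (see the Driver.cpp change and the cgprofile-txt.s / cgprofile-txt2.s tests below); for example, a link such as ld.lld -e main main.o --call-graph-ordering-file prof.call_graph --call-graph-profile-sort=cdsort -o main (file names illustrative only) uses the new algorithm, while hfsort keeps the existing C^3 heuristic and none disables call-graph-based sorting.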
(the plan is to make it default in a next diff) **Perf-impact** clang-10 binary (built with LTO+AutoFDO/CSSPGO): wins on top of C^3 in [0.3%..0.8%] rocksDB-8 binary (built with LTO+CSSPGO): wins on top of C^3 in [0.8%..1.5%] Note that function layout affects the perf the most on older machines (with smaller instruction/iTLB caches) and when huge pages are not enabled. The impact on newer processors with huge pages enabled is likely neutral/minor. Reviewed By: MaskRay Differential Revision: https://reviews.llvm.org/D152840 --- lld/ELF/CMakeLists.txt | 1 + lld/ELF/CallGraphSort.cpp | 140 ++++++++++++++++++++++++++-------- lld/ELF/CallGraphSort.h | 2 + lld/ELF/Config.h | 4 +- lld/ELF/Driver.cpp | 2 + lld/ELF/Options.td | 2 +- lld/docs/ld.lld.1 | 2 + lld/test/ELF/cgprofile-txt.s | 28 +++++++ lld/test/ELF/cgprofile-txt2.s | 31 +++++--- 9 files changed, 166 insertions(+), 46 deletions(-) diff --git a/lld/ELF/CMakeLists.txt b/lld/ELF/CMakeLists.txt index 8e6a746d219e..3a571b8d7b78 100644 --- a/lld/ELF/CMakeLists.txt +++ b/lld/ELF/CMakeLists.txt @@ -72,6 +72,7 @@ add_lld_library(lldELF Passes Support TargetParser + TransformUtils LINK_LIBS lldCommon diff --git a/lld/ELF/CallGraphSort.cpp b/lld/ELF/CallGraphSort.cpp index ff72731b1f38..5e36964da94f 100644 --- a/lld/ELF/CallGraphSort.cpp +++ b/lld/ELF/CallGraphSort.cpp @@ -6,38 +6,21 @@ // //===----------------------------------------------------------------------===// /// -/// Implementation of Call-Chain Clustering from: Optimizing Function Placement -/// for Large-Scale Data-Center Applications -/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf -/// -/// The goal of this algorithm is to improve runtime performance of the final -/// executable by arranging code sections such that page table and i-cache -/// misses are minimized. -/// -/// Definitions: -/// * Cluster -/// * An ordered list of input sections which are laid out as a unit. At the -/// beginning of the algorithm each input section has its own cluster and -/// the weight of the cluster is the sum of the weight of all incoming -/// edges. -/// * Call-Chain Clustering (C³) Heuristic -/// * Defines when and how clusters are combined. Pick the highest weighted -/// input section then add it to its most likely predecessor if it wouldn't -/// penalize it too much. -/// * Density -/// * The weight of the cluster divided by the size of the cluster. This is a -/// proxy for the amount of execution time spent per byte of the cluster. -/// -/// It does so given a call graph profile by the following: -/// * Build a weighted call graph from the call graph profile -/// * Sort input sections by weight -/// * For each input section starting with the highest weight -/// * Find its most likely predecessor cluster -/// * Check if the combined cluster would be too large, or would have too low -/// a density. -/// * If not, then combine the clusters. -/// * Sort non-empty clusters by density +/// The file is responsible for sorting sections using LLVM call graph profile +/// data by placing frequently executed code sections together. The goal of the +/// placement is to improve the runtime performance of the final executable by +/// arranging code sections so that i-TLB misses and i-cache misses are reduced. /// +/// The algorithm first builds a call graph based on the profile data and then +/// iteratively merges "chains" (ordered lists) of input sections which will be +/// laid out as a unit. 
There are two implementations for deciding how to +/// merge a pair of chains: +/// - a simpler one, referred to as Call-Chain Clustering (C^3), that follows +/// "Optimizing Function Placement for Large-Scale Data-Center Applications" +/// https://research.fb.com/wp-content/uploads/2017/01/cgo2017-hfsort-final1.pdf +/// - a more advanced one, referred to as Cache-Directed-Sort (CDSort), which +/// typically produces layouts with higher locality, and hence, yields fewer +/// instruction cache misses on large binaries. //===----------------------------------------------------------------------===// #include "CallGraphSort.h" @@ -45,6 +28,7 @@ #include "InputSection.h" #include "Symbols.h" #include "llvm/Support/FileSystem.h" +#include "llvm/Transforms/Utils/CodeLayout.h" #include @@ -75,6 +59,33 @@ struct Cluster { Edge bestPred = {-1, 0}; }; +/// Implementation of the Call-Chain Clustering (C^3). The goal of this +/// algorithm is to improve runtime performance of the executable by arranging +/// code sections such that page table and i-cache misses are minimized. +/// +/// Definitions: +/// * Cluster +/// * An ordered list of input sections which are laid out as a unit. At the +/// beginning of the algorithm each input section has its own cluster and +/// the weight of the cluster is the sum of the weight of all incoming +/// edges. +/// * Call-Chain Clustering (C³) Heuristic +/// * Defines when and how clusters are combined. Pick the highest weighted +/// input section then add it to its most likely predecessor if it wouldn't +/// penalize it too much. +/// * Density +/// * The weight of the cluster divided by the size of the cluster. This is a +/// proxy for the amount of execution time spent per byte of the cluster. +/// +/// It does so given a call graph profile by the following: +/// * Build a weighted call graph from the call graph profile +/// * Sort input sections by weight +/// * For each input section starting with the highest weight +/// * Find its most likely predecessor cluster +/// * Check if the combined cluster would be too large, or would have too low +/// a density. +/// * If not, then combine the clusters. +/// * Sort non-empty clusters by density class CallGraphSort { public: CallGraphSort(); @@ -260,11 +271,74 @@ DenseMap CallGraphSort::run() { return orderMap; } +// Sort sections by the profile data using the Cache-Directed Sort algorithm. +// The placement is done by optimizing the locality by co-locating frequently +// executed code sections together. +DenseMap elf::computeCacheDirectedSortOrder() { + SmallVector funcSizes; + SmallVector funcCounts; + SmallVector callCounts; + SmallVector callOffsets; + SmallVector sections; + DenseMap secToTargetId; + + auto getOrCreateNode = [&](const InputSectionBase *inSec) -> size_t { + auto res = secToTargetId.try_emplace(inSec, sections.size()); + if (res.second) { + // inSec does not appear before in the graph. + sections.push_back(inSec); + assert(inSec->getSize() > 0 && "found a function with zero size"); + funcSizes.push_back(inSec->getSize()); + funcCounts.push_back(0); + } + return res.first->second; + }; + + // Create the graph. + for (std::pair &c : config->callGraphProfile) { + const InputSectionBase *fromSB = cast(c.first.first); + const InputSectionBase *toSB = cast(c.first.second); + // Ignore edges between input sections belonging to different sections. + if (fromSB->getOutputSection() != toSB->getOutputSection()) + continue; + + uint64_t weight = c.second; + // Ignore edges with zero weight. 
+ if (weight == 0) + continue; + + size_t from = getOrCreateNode(fromSB); + size_t to = getOrCreateNode(toSB); + // Ignore self-edges (recursive calls). + if (from == to) + continue; + + callCounts.push_back({from, to, weight}); + // Assume that the jump is at the middle of the input section. The profile + // data does not contain jump offsets. + callOffsets.push_back((funcSizes[from] + 1) / 2); + funcCounts[to] += weight; + } + + // Run the layout algorithm. + std::vector sortedSections = codelayout::computeCacheDirectedLayout( + funcSizes, funcCounts, callCounts, callOffsets); + + // Create the final order. + DenseMap orderMap; + int curOrder = 1; + for (uint64_t secIdx : sortedSections) + orderMap[sections[secIdx]] = curOrder++; + + return orderMap; +} + // Sort sections by the profile data provided by --callgraph-profile-file. // // This first builds a call graph based on the profile data then merges sections -// according to the C³ heuristic. All clusters are then sorted by a density -// metric to further improve locality. +// according either to the C³ or Cache-Directed-Sort ordering algorithm. DenseMap elf::computeCallGraphProfileOrder() { + if (config->callGraphProfileSort == CGProfileSortKind::Cdsort) + return computeCacheDirectedSortOrder(); return CallGraphSort().run(); } diff --git a/lld/ELF/CallGraphSort.h b/lld/ELF/CallGraphSort.h index 4997cb102c32..1b54f2b62482 100644 --- a/lld/ELF/CallGraphSort.h +++ b/lld/ELF/CallGraphSort.h @@ -14,6 +14,8 @@ namespace lld::elf { class InputSectionBase; +llvm::DenseMap computeCacheDirectedSortOrder(); + llvm::DenseMap computeCallGraphProfileOrder(); } // namespace lld::elf diff --git a/lld/ELF/Config.h b/lld/ELF/Config.h index cca6ebb6791a..90a1e312735f 100644 --- a/lld/ELF/Config.h +++ b/lld/ELF/Config.h @@ -57,8 +57,8 @@ enum class BsymbolicKind { None, NonWeakFunctions, Functions, All }; // For --build-id. enum class BuildIdKind { None, Fast, Md5, Sha1, Hexstring, Uuid }; -// For --call-graph-profile-sort={none,hfsort}. -enum class CGProfileSortKind { None, Hfsort }; +// For --call-graph-profile-sort={none,hfsort,cdsort}. +enum class CGProfileSortKind { None, Hfsort, Cdsort }; // For --discard-{all,locals,none}. 
enum class DiscardPolicy { Default, All, Locals, None }; diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index 1f8e48d83fc8..b666602a5586 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -1036,6 +1036,8 @@ static CGProfileSortKind getCGProfileSortKind(opt::InputArgList &args) { StringRef s = args.getLastArgValue(OPT_call_graph_profile_sort, "hfsort"); if (s == "hfsort") return CGProfileSortKind::Hfsort; + if (s == "cdsort") + return CGProfileSortKind::Cdsort; if (s != "none") error("unknown --call-graph-profile-sort= value: " + s); return CGProfileSortKind::None; diff --git a/lld/ELF/Options.td b/lld/ELF/Options.td index e359a12597a9..c91111006942 100644 --- a/lld/ELF/Options.td +++ b/lld/ELF/Options.td @@ -572,7 +572,7 @@ def lto_cs_profile_file: JJ<"lto-cs-profile-file=">, defm lto_pgo_warn_mismatch: BB<"lto-pgo-warn-mismatch", "turn on warnings about profile cfg mismatch (default)>", "turn off warnings about profile cfg mismatch">; -defm lto_known_safe_vtables : EEq<"lto-known-safe-vtables", +defm lto_known_safe_vtables : EEq<"lto-known-safe-vtables", "When --lto-validate-all-vtables-have-type-infos is enabled, skip validation on these vtables (_ZTV symbols)">; def lto_obj_path_eq: JJ<"lto-obj-path=">; def lto_sample_profile: JJ<"lto-sample-profile=">, diff --git a/lld/docs/ld.lld.1 b/lld/docs/ld.lld.1 index dc64afabd7f5..35ee7ff0447d 100644 --- a/lld/docs/ld.lld.1 +++ b/lld/docs/ld.lld.1 @@ -124,6 +124,8 @@ may be: Ignore call graph profile. .It Cm hfsort Use hfsort (default). +.It Cm cdsort +Use cdsort. .El .Pp .It Fl -color-diagnostics Ns = Ns Ar value diff --git a/lld/test/ELF/cgprofile-txt.s b/lld/test/ELF/cgprofile-txt.s index 2c0c9642a509..c9194bbbc43c 100644 --- a/lld/test/ELF/cgprofile-txt.s +++ b/lld/test/ELF/cgprofile-txt.s @@ -30,6 +30,9 @@ # RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2b # RUN: cmp %t2 %t2b +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CDSORT + # RUN: not ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=sort \ # RUN: -o /dev/null 2>&1 | FileCheck %s --check-prefix=UNKNOWN @@ -167,6 +170,31 @@ TooManyPreds10: # CHECK: Name: _init2 # CHECK-NEXT: Value: 0x201141 +# CDSORT: Name: D +# CDSORT-NEXT: Value: 0x201123 +# CDSORT: Name: TooManyPreds +# CDSORT-NEXT: Value: 0x20112F +# CDSORT: Name: TooManyPreds10 +# CDSORT-NEXT: Value: 0x20112E +# CDSORT: Name: C +# CDSORT-NEXT: Value: 0x201122 +# CDSORT: Name: B +# CDSORT-NEXT: Value: 0x201121 +# CDSORT: Name: A +# CDSORT-NEXT: Value: 0x201120 +# CDSORT: Name: TS +# CDSORT-NEXT: Value: 0x20113D +# CDSORT: Name: PP +# CDSORT-NEXT: Value: 0x20113C +# CDSORT: Name: QC +# CDSORT-NEXT: Value: 0x20113E +# CDSORT: Name: GB +# CDSORT-NEXT: Value: 0x20113F +# CDSORT: Name: _init +# CDSORT-NEXT: Value: 0x201140 +# CDSORT: Name: _init2 +# CDSORT-NEXT: Value: 0x201141 + # NOSORT: Name: D # NOSORT-NEXT: Value: 0x201120 # NOSORT: Name: TooManyPreds diff --git a/lld/test/ELF/cgprofile-txt2.s b/lld/test/ELF/cgprofile-txt2.s index 91961db39c3a..b59b6eeb292f 100644 --- a/lld/test/ELF/cgprofile-txt2.s +++ b/lld/test/ELF/cgprofile-txt2.s @@ -5,17 +5,28 @@ # RUN: echo "B C 50" >> %t.call_graph # RUN: echo "C D 40" >> %t.call_graph # RUN: echo "D B 10" >> %t.call_graph -# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph -o %t2 -# RUN: llvm-readobj --symbols %t2 | FileCheck %s +# RUN: ld.lld -e A %t --call-graph-ordering-file 
%t.call_graph --call-graph-profile-sort=hfsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKC3 +# RUN: ld.lld -e A %t --call-graph-ordering-file %t.call_graph --call-graph-profile-sort=cdsort -o %t2 +# RUN: llvm-readobj --symbols %t2 | FileCheck %s --check-prefix=CHECKCDS -# CHECK: Name: A -# CHECK-NEXT: Value: 0x201123 -# CHECK: Name: B -# CHECK-NEXT: Value: 0x201120 -# CHECK: Name: C -# CHECK-NEXT: Value: 0x201121 -# CHECK: Name: D -# CHECK-NEXT: Value: 0x201122 +# CHECKC3: Name: A +# CHECKC3-NEXT: Value: 0x201123 +# CHECKC3: Name: B +# CHECKC3-NEXT: Value: 0x201120 +# CHECKC3: Name: C +# CHECKC3-NEXT: Value: 0x201121 +# CHECKC3: Name: D +# CHECKC3-NEXT: Value: 0x201122 + +# CHECKCDS: Name: A +# CHECKCDS-NEXT: Value: 0x201120 +# CHECKCDS: Name: B +# CHECKCDS-NEXT: Value: 0x201121 +# CHECKCDS: Name: C +# CHECKCDS-NEXT: Value: 0x201122 +# CHECKCDS: Name: D +# CHECKCDS-NEXT: Value: 0x201123 .section .text.A,"ax",@progbits .globl A -- Gitee From 7d204af9bb389e9582596a3f889ecd380beee8b7 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Thu, 21 Sep 2023 13:13:03 -0700 Subject: [PATCH 08/10] [Backport][CodeLayout] Refactor std::vector uses, namespace, and EdgeCountT. NFC * Place types and functions in the llvm::codelayout namespace * Change EdgeCountT from pair, uint64_t> to a struct and utilize structured bindings. It is not conventional to use the "T" suffix for structure types. * Remove a redundant copy in ChainT::merge. * Change {ExtTSPImpl,CDSortImpl}::run to use return value instead of an output parameter * Rename applyCDSLayout to computeCacheDirectedLayout: (a) avoid rare abbreviation "CDS" (cache-directed sort) (b) "compute" is more conventional for the specific use case * Change the parameter types from std::vector to ArrayRef so that SmallVector arguments can be used. * Similarly, rename applyExtTspLayout to computeExtTspLayout. 
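As an illustrative sketch only (not part of this patch), the new edge representation can be filled with aggregate initialization and consumed with structured bindings; the helper below is hypothetical, assumes C++17, and mirrors the EdgeCount aggregate added to llvm/Transforms/Utils/CodeLayout.h:

  #include <cstdint>
  #include <vector>

  // Same shape as the EdgeCount struct introduced by this change.
  struct EdgeCount {
    uint64_t src;
    uint64_t dst;
    uint64_t count;
  };

  // Hypothetical caller: total weight of all non-self edges.
  uint64_t sumEdgeCounts(const std::vector<EdgeCount> &Edges) {
    uint64_t Total = 0;
    for (const auto &[Src, Dst, Count] : Edges)
      if (Src != Dst) // skip self-edges, as the layout code does
        Total += Count;
    return Total;
  }

  // Usage: Edges.push_back({/*src=*/0, /*dst=*/1, /*count=*/100});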
Reviewed By: Amir Differential Revision: https://reviews.llvm.org/D159526 --- bolt/lib/Passes/ReorderAlgorithm.cpp | 10 +- bolt/lib/Passes/ReorderFunctions.cpp | 10 +- .../llvm/Transforms/Utils/CodeLayout.h | 49 +++--- llvm/lib/CodeGen/MachineBlockPlacement.cpp | 8 +- llvm/lib/Transforms/Utils/CodeLayout.cpp | 150 ++++++++---------- 5 files changed, 106 insertions(+), 121 deletions(-) diff --git a/bolt/lib/Passes/ReorderAlgorithm.cpp b/bolt/lib/Passes/ReorderAlgorithm.cpp index b5052cdaddb1..3c3365e1d3d7 100644 --- a/bolt/lib/Passes/ReorderAlgorithm.cpp +++ b/bolt/lib/Passes/ReorderAlgorithm.cpp @@ -531,21 +531,21 @@ void ExtTSPReorderAlgorithm::reorderBasicBlocks(BinaryFunction &BF, } // Initialize CFG edges - using JumpT = std::pair; - std::vector> JumpCounts; + std::vector JumpCounts; for (BinaryBasicBlock *BB : BF.getLayout().blocks()) { auto BI = BB->branch_info_begin(); for (BinaryBasicBlock *SuccBB : BB->successors()) { assert(BI->Count != BinaryBasicBlock::COUNT_NO_PROFILE && "missing profile for a jump"); - auto It = std::make_pair(BB->getLayoutIndex(), SuccBB->getLayoutIndex()); - JumpCounts.push_back(std::make_pair(It, BI->Count)); + JumpCounts.push_back( + {BB->getLayoutIndex(), SuccBB->getLayoutIndex(), BI->Count}); ++BI; } } // Run the layout algorithm - auto Result = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts); + auto Result = + codelayout::computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts); Order.reserve(BF.getLayout().block_size()); for (uint64_t R : Result) Order.push_back(OrigOrder[R]); diff --git a/bolt/lib/Passes/ReorderFunctions.cpp b/bolt/lib/Passes/ReorderFunctions.cpp index b17aefda5ddc..f0ad54ef106f 100644 --- a/bolt/lib/Passes/ReorderFunctions.cpp +++ b/bolt/lib/Passes/ReorderFunctions.cpp @@ -327,23 +327,21 @@ void ReorderFunctions::runOnFunctions(BinaryContext &BC) { // Initialize CFG nodes and their data std::vector FuncSizes; std::vector FuncCounts; - using JumpT = std::pair; - std::vector> CallCounts; + std::vector CallCounts; std::vector CallOffsets; for (NodeId F = 0; F < Cg.numNodes(); ++F) { FuncSizes.push_back(Cg.size(F)); FuncCounts.push_back(Cg.samples(F)); for (NodeId Succ : Cg.successors(F)) { const Arc &Arc = *Cg.findArc(F, Succ); - auto It = std::make_pair(F, Succ); - CallCounts.push_back(std::make_pair(It, Arc.weight())); + CallCounts.push_back({F, Succ, uint64_t(Arc.weight())}); CallOffsets.push_back(uint64_t(Arc.avgCallOffset())); } } // Run the layout algorithm. - std::vector Result = - applyCDSLayout(FuncSizes, FuncCounts, CallCounts, CallOffsets); + std::vector Result = codelayout::computeCacheDirectedLayout( + FuncSizes, FuncCounts, CallCounts, CallOffsets); // Create a single cluster from the computed order of hot functions. Clusters.emplace_back(Cluster(Result, Cg)); diff --git a/llvm/include/llvm/Transforms/Utils/CodeLayout.h b/llvm/include/llvm/Transforms/Utils/CodeLayout.h index 11a829b601ce..f5127cff24af 100644 --- a/llvm/include/llvm/Transforms/Utils/CodeLayout.h +++ b/llvm/include/llvm/Transforms/Utils/CodeLayout.h @@ -14,14 +14,21 @@ #ifndef LLVM_TRANSFORMS_UTILS_CODELAYOUT_H #define LLVM_TRANSFORMS_UTILS_CODELAYOUT_H +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include #include -namespace llvm { +namespace llvm::codelayout { using EdgeT = std::pair; -using EdgeCountT = std::pair; + +struct EdgeCount { + uint64_t src; + uint64_t dst; + uint64_t count; +}; /// Find a layout of nodes (basic blocks) of a given CFG optimizing jump /// locality and thus processor I-cache utilization. 
This is achieved via @@ -34,24 +41,22 @@ using EdgeCountT = std::pair; /// \p EdgeCounts: The execution counts of every edge (jump) in the profile. The /// map also defines the edges in CFG and should include 0-count edges. /// \returns The best block order found. -std::vector -applyExtTspLayout(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts); +std::vector computeExtTspLayout(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts); /// Estimate the "quality" of a given node order in CFG. The higher the score, /// the better the order is. The score is designed to reflect the locality of /// the given order, which is anti-correlated with the number of I-cache misses /// in a typical execution of the function. -double calcExtTspScore(const std::vector &Order, - const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts); +double calcExtTspScore(ArrayRef Order, ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts); /// Estimate the "quality" of the current node order in CFG. -double calcExtTspScore(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts); +double calcExtTspScore(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts); /// Algorithm-specific params for Cache-Directed Sort. The values are tuned for /// the best performance of large-scale front-end bound binaries. @@ -75,18 +80,16 @@ struct CDSortConfig { /// map also defines the edges in CFG and should include 0-count edges. /// \p CallOffsets: The offsets of the calls from their source nodes. /// \returns The best function order found. -std::vector applyCDSLayout(const std::vector &FuncSizes, - const std::vector &FuncCounts, - const std::vector &CallCounts, - const std::vector &CallOffsets); +std::vector computeCacheDirectedLayout( + ArrayRef FuncSizes, ArrayRef FuncCounts, + ArrayRef CallCounts, ArrayRef CallOffsets); /// Apply a Cache-Directed Sort with a custom config. -std::vector applyCDSLayout(const CDSortConfig &Config, - const std::vector &FuncSizes, - const std::vector &FuncCounts, - const std::vector &CallCounts, - const std::vector &CallOffsets); +std::vector computeCacheDirectedLayout( + const CDSortConfig &Config, ArrayRef FuncSizes, + ArrayRef FuncCounts, ArrayRef CallCounts, + ArrayRef CallOffsets); -} // end namespace llvm +} // namespace llvm::codelayout #endif // LLVM_TRANSFORMS_UTILS_CODELAYOUT_H diff --git a/llvm/lib/CodeGen/MachineBlockPlacement.cpp b/llvm/lib/CodeGen/MachineBlockPlacement.cpp index 7bbc347a8cf8..b6fbc65d83b8 100644 --- a/llvm/lib/CodeGen/MachineBlockPlacement.cpp +++ b/llvm/lib/CodeGen/MachineBlockPlacement.cpp @@ -3502,7 +3502,7 @@ void MachineBlockPlacement::applyExtTsp() { auto BlockSizes = std::vector(F->size()); auto BlockCounts = std::vector(F->size()); - std::vector JumpCounts; + std::vector JumpCounts; for (MachineBasicBlock &MBB : *F) { // Getting the block frequency. 
BlockFrequency BlockFreq = MBFI->getBlockFreq(&MBB); @@ -3521,8 +3521,8 @@ void MachineBlockPlacement::applyExtTsp() { for (MachineBasicBlock *Succ : MBB.successors()) { auto EP = MBPI->getEdgeProbability(&MBB, Succ); BlockFrequency JumpFreq = BlockFreq * EP; - auto Jump = std::make_pair(BlockIndex[&MBB], BlockIndex[Succ]); - JumpCounts.push_back(std::make_pair(Jump, JumpFreq.getFrequency())); + JumpCounts.push_back( + {BlockIndex[&MBB], BlockIndex[Succ], JumpFreq.getFrequency()}); } } @@ -3535,7 +3535,7 @@ void MachineBlockPlacement::applyExtTsp() { calcExtTspScore(BlockSizes, BlockCounts, JumpCounts))); // Run the layout algorithm. - auto NewOrder = applyExtTspLayout(BlockSizes, BlockCounts, JumpCounts); + auto NewOrder = computeExtTspLayout(BlockSizes, BlockCounts, JumpCounts); std::vector NewBlockOrder; NewBlockOrder.reserve(F->size()); for (uint64_t Node : NewOrder) { diff --git a/llvm/lib/Transforms/Utils/CodeLayout.cpp b/llvm/lib/Transforms/Utils/CodeLayout.cpp index 8ffde74ac650..f4a820918ee8 100644 --- a/llvm/lib/Transforms/Utils/CodeLayout.cpp +++ b/llvm/lib/Transforms/Utils/CodeLayout.cpp @@ -48,6 +48,8 @@ #include using namespace llvm; +using namespace llvm::codelayout; + #define DEBUG_TYPE "code-layout" cl::opt EnableExtTspBlockPlacement( @@ -316,8 +318,8 @@ struct ChainT { Edges.push_back(std::make_pair(Other, Edge)); } - void merge(ChainT *Other, const std::vector &MergedBlocks) { - Nodes = MergedBlocks; + void merge(ChainT *Other, std::vector MergedBlocks) { + Nodes = std::move(MergedBlocks); // Update the chain's data. ExecutionCount += Other->ExecutionCount; Size += Other->Size; @@ -547,15 +549,14 @@ MergedChain mergeNodes(const std::vector &X, /// The implementation of the ExtTSP algorithm. class ExtTSPImpl { public: - ExtTSPImpl(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) + ExtTSPImpl(ArrayRef NodeSizes, ArrayRef NodeCounts, + ArrayRef EdgeCounts) : NumNodes(NodeSizes.size()) { initialize(NodeSizes, NodeCounts, EdgeCounts); } /// Run the algorithm and return an optimized ordering of nodes. - void run(std::vector &Result) { + std::vector run() { // Pass 1: Merge nodes with their mutually forced successors mergeForcedPairs(); @@ -566,14 +567,14 @@ public: mergeColdChains(); // Collect nodes from all chains - concatChains(Result); + return concatChains(); } private: /// Initialize the algorithm's data structures. - void initialize(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { + void initialize(const ArrayRef &NodeSizes, + const ArrayRef &NodeCounts, + const ArrayRef &EdgeCounts) { // Initialize nodes AllNodes.reserve(NumNodes); for (uint64_t Idx = 0; Idx < NumNodes; Idx++) { @@ -590,21 +591,18 @@ private: PredNodes.resize(NumNodes); std::vector OutDegree(NumNodes, 0); AllJumps.reserve(EdgeCounts.size()); - for (auto It : EdgeCounts) { - uint64_t Pred = It.first.first; - uint64_t Succ = It.first.second; - OutDegree[Pred]++; + for (auto Edge : EdgeCounts) { + ++OutDegree[Edge.src]; // Ignore self-edges. 
- if (Pred == Succ) + if (Edge.src == Edge.dst) continue; - SuccNodes[Pred].push_back(Succ); - PredNodes[Succ].push_back(Pred); - uint64_t ExecutionCount = It.second; - if (ExecutionCount > 0) { - NodeT &PredNode = AllNodes[Pred]; - NodeT &SuccNode = AllNodes[Succ]; - AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount); + SuccNodes[Edge.src].push_back(Edge.dst); + PredNodes[Edge.dst].push_back(Edge.src); + if (Edge.count > 0) { + NodeT &PredNode = AllNodes[Edge.src]; + NodeT &SuccNode = AllNodes[Edge.dst]; + AllJumps.emplace_back(&PredNode, &SuccNode, Edge.count); SuccNode.InJumps.push_back(&AllJumps.back()); PredNode.OutJumps.push_back(&AllJumps.back()); } @@ -921,7 +919,7 @@ private: } /// Concatenate all chains into the final order. - void concatChains(std::vector &Order) { + std::vector concatChains() { // Collect chains and calculate density stats for their sorting. std::vector SortedChains; DenseMap ChainDensity; @@ -956,12 +954,12 @@ private: }); // Collect the nodes in the order specified by their chains. + std::vector Order; Order.reserve(NumNodes); - for (const ChainT *Chain : SortedChains) { - for (NodeT *Node : Chain->Nodes) { + for (const ChainT *Chain : SortedChains) + for (NodeT *Node : Chain->Nodes) Order.push_back(Node->Index); - } - } + return Order; } private: @@ -994,16 +992,15 @@ private: /// functions represented by a call graph. class CDSortImpl { public: - CDSortImpl(const CDSortConfig &Config, const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts, - const std::vector &EdgeOffsets) + CDSortImpl(const CDSortConfig &Config, ArrayRef NodeSizes, + ArrayRef NodeCounts, ArrayRef EdgeCounts, + ArrayRef EdgeOffsets) : Config(Config), NumNodes(NodeSizes.size()) { initialize(NodeSizes, NodeCounts, EdgeCounts, EdgeOffsets); } /// Run the algorithm and return an ordered set of function clusters. - void run(std::vector &Result) { + std::vector run() { // Merge pairs of chains while improving the objective. mergeChainPairs(); @@ -1012,15 +1009,15 @@ public: << HotChains.size() << "\n"); // Collect nodes from all the chains. - concatChains(Result); + return concatChains(); } private: /// Initialize the algorithm's data structures. - void initialize(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts, - const std::vector &EdgeOffsets) { + void initialize(const ArrayRef &NodeSizes, + const ArrayRef &NodeCounts, + const ArrayRef &EdgeCounts, + const ArrayRef &EdgeOffsets) { // Initialize nodes. AllNodes.reserve(NumNodes); for (uint64_t Node = 0; Node < NumNodes; Node++) { @@ -1037,20 +1034,17 @@ private: PredNodes.resize(NumNodes); AllJumps.reserve(EdgeCounts.size()); for (size_t I = 0; I < EdgeCounts.size(); I++) { - auto It = EdgeCounts[I]; - uint64_t Pred = It.first.first; - uint64_t Succ = It.first.second; + auto [Pred, Succ, Count] = EdgeCounts[I]; // Ignore recursive calls. if (Pred == Succ) continue; SuccNodes[Pred].push_back(Succ); PredNodes[Succ].push_back(Pred); - uint64_t ExecutionCount = It.second; - if (ExecutionCount > 0) { + if (Count > 0) { NodeT &PredNode = AllNodes[Pred]; NodeT &SuccNode = AllNodes[Succ]; - AllJumps.emplace_back(&PredNode, &SuccNode, ExecutionCount); + AllJumps.emplace_back(&PredNode, &SuccNode, Count); AllJumps.back().Offset = EdgeOffsets[I]; SuccNode.InJumps.push_back(&AllJumps.back()); PredNode.OutJumps.push_back(&AllJumps.back()); @@ -1301,7 +1295,7 @@ private: } /// Concatenate all chains into the final order. 
- void concatChains(std::vector &Order) { + std::vector concatChains() { // Collect chains and calculate density stats for their sorting. std::vector SortedChains; DenseMap ChainDensity; @@ -1331,10 +1325,12 @@ private: }); // Collect the nodes in the order specified by their chains. + std::vector Order; Order.reserve(NumNodes); for (const ChainT *Chain : SortedChains) for (NodeT *Node : Chain->Nodes) Order.push_back(Node->Index); + return Order; } private: @@ -1375,17 +1371,16 @@ private: } // end of anonymous namespace std::vector -llvm::applyExtTspLayout(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { +codelayout::computeExtTspLayout(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts) { // Verify correctness of the input data. assert(NodeCounts.size() == NodeSizes.size() && "Incorrect input"); assert(NodeSizes.size() > 2 && "Incorrect input"); // Apply the reordering algorithm. ExtTSPImpl Alg(NodeSizes, NodeCounts, EdgeCounts); - std::vector Result; - Alg.run(Result); + std::vector Result = Alg.run(); // Verify correctness of the output. assert(Result.front() == 0 && "Original entry point is not preserved"); @@ -1393,37 +1388,32 @@ llvm::applyExtTspLayout(const std::vector &NodeSizes, return Result; } -double llvm::calcExtTspScore(const std::vector &Order, - const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { +double codelayout::calcExtTspScore(ArrayRef Order, + ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts) { // Estimate addresses of the blocks in memory. std::vector Addr(NodeSizes.size(), 0); for (size_t Idx = 1; Idx < Order.size(); Idx++) { Addr[Order[Idx]] = Addr[Order[Idx - 1]] + NodeSizes[Order[Idx - 1]]; } std::vector OutDegree(NodeSizes.size(), 0); - for (auto It : EdgeCounts) { - uint64_t Pred = It.first.first; - OutDegree[Pred]++; - } + for (auto Edge : EdgeCounts) + ++OutDegree[Edge.src]; // Increase the score for each jump. double Score = 0; - for (auto It : EdgeCounts) { - uint64_t Pred = It.first.first; - uint64_t Succ = It.first.second; - uint64_t Count = It.second; - bool IsConditional = OutDegree[Pred] > 1; - Score += ::extTSPScore(Addr[Pred], NodeSizes[Pred], Addr[Succ], Count, - IsConditional); + for (auto Edge : EdgeCounts) { + bool IsConditional = OutDegree[Edge.src] > 1; + Score += ::extTSPScore(Addr[Edge.src], NodeSizes[Edge.src], Addr[Edge.dst], + Edge.count, IsConditional); } return Score; } -double llvm::calcExtTspScore(const std::vector &NodeSizes, - const std::vector &NodeCounts, - const std::vector &EdgeCounts) { +double codelayout::calcExtTspScore(ArrayRef NodeSizes, + ArrayRef NodeCounts, + ArrayRef EdgeCounts) { std::vector Order(NodeSizes.size()); for (size_t Idx = 0; Idx < NodeSizes.size(); Idx++) { Order[Idx] = Idx; @@ -1431,30 +1421,23 @@ double llvm::calcExtTspScore(const std::vector &NodeSizes, return calcExtTspScore(Order, NodeSizes, NodeCounts, EdgeCounts); } -std::vector -llvm::applyCDSLayout(const CDSortConfig &Config, - const std::vector &FuncSizes, - const std::vector &FuncCounts, - const std::vector &CallCounts, - const std::vector &CallOffsets) { +std::vector codelayout::computeCacheDirectedLayout( + const CDSortConfig &Config, ArrayRef FuncSizes, + ArrayRef FuncCounts, ArrayRef CallCounts, + ArrayRef CallOffsets) { // Verify correctness of the input data. assert(FuncCounts.size() == FuncSizes.size() && "Incorrect input"); // Apply the reordering algorithm. 
CDSortImpl Alg(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); - std::vector Result; - Alg.run(Result); - - // Verify correctness of the output. + std::vector Result = Alg.run(); assert(Result.size() == FuncSizes.size() && "Incorrect size of layout"); return Result; } -std::vector -llvm::applyCDSLayout(const std::vector &FuncSizes, - const std::vector &FuncCounts, - const std::vector &CallCounts, - const std::vector &CallOffsets) { +std::vector codelayout::computeCacheDirectedLayout( + ArrayRef FuncSizes, ArrayRef FuncCounts, + ArrayRef CallCounts, ArrayRef CallOffsets) { CDSortConfig Config; // Populate the config from the command-line options. if (CacheEntries.getNumOccurrences() > 0) @@ -1465,5 +1448,6 @@ llvm::applyCDSLayout(const std::vector &FuncSizes, Config.DistancePower = DistancePower; if (FrequencyScale.getNumOccurrences() > 0) Config.FrequencyScale = FrequencyScale; - return applyCDSLayout(Config, FuncSizes, FuncCounts, CallCounts, CallOffsets); + return computeCacheDirectedLayout(Config, FuncSizes, FuncCounts, CallCounts, + CallOffsets); } -- Gitee From 7efbe1f6bbd3c61c1b417a0c364e74e43031857a Mon Sep 17 00:00:00 2001 From: Dhruv Chawla <44582521+dc03@users.noreply.github.com> Date: Wed, 7 Jun 2023 12:46:52 +0530 Subject: [PATCH 09/10] [SetVector] Improve performance for small sizes SmallSetVector has an inefficiency where it does set insertions regardless of the number of elements present within it. This contrasts with other "Small-" containers where they use linear scan up to a certain size "N", after which they switch to another strategy. This patch implements this functionality in SetVector, adding a template parameter "N" which specifies the number of elements upto which the SetVector follows the "small" strategy. Due to the use of "if constexpr", there is no "small" code emitted when N is 0 which makes this a zero overhead change for users using the default behaviour. This change also allows having SmallSetVector use DenseSet instead of SmallDenseSet by default, which helps a little with performance. The reason for implementing this functionality in SetVector instead of SmallSetVector is that it allows reusing all the code that is already there and it is just augmented with the "isSmall" checks. This change gives a good speedup (0.4%): https://llvm-compile-time-tracker.com/compare.php?from=086601eac266ec253bf313c746390ff3e5656132&to=acd0a72a4d3ee840f7b455d1b35d82b11ffdb3c0&stat=instructions%3Au Differential Revision: https://reviews.llvm.org/D152497 --- flang/lib/Lower/Bridge.cpp | 2 +- llvm/include/llvm/ADT/SetVector.h | 91 +++++++++++++++++++++++++++---- mlir/include/mlir/Support/LLVM.h | 6 +- 3 files changed, 84 insertions(+), 15 deletions(-) diff --git a/flang/lib/Lower/Bridge.cpp b/flang/lib/Lower/Bridge.cpp index c8971ea5036d..7eeda0ec3efa 100644 --- a/flang/lib/Lower/Bridge.cpp +++ b/flang/lib/Lower/Bridge.cpp @@ -174,7 +174,7 @@ private: /// Track symbols symbols processed during and after the registration /// to avoid infinite loops between type conversions and global variable /// creation. - llvm::SmallSetVector seen; + llvm::SmallSetVector seen; }; class DispatchTableConverter { diff --git a/llvm/include/llvm/ADT/SetVector.h b/llvm/include/llvm/ADT/SetVector.h index 37509e28f891..2eabe578a479 100644 --- a/llvm/include/llvm/ADT/SetVector.h +++ b/llvm/include/llvm/ADT/SetVector.h @@ -35,9 +35,30 @@ namespace llvm { /// This adapter class provides a way to keep a set of things that also has the /// property of a deterministic iteration order. 
The order of iteration is the /// order of insertion. +/// +/// The key and value types are derived from the Set and Vector types +/// respectively. This allows the vector-type operations and set-type operations +/// to have different types. In particular, this is useful when storing pointers +/// as "Foo *" values but looking them up as "const Foo *" keys. +/// +/// No constraint is placed on the key and value types, although it is assumed +/// that value_type can be converted into key_type for insertion. Users must be +/// aware of any loss of information in this conversion. For example, setting +/// value_type to float and key_type to int can produce very surprising results, +/// but it is not explicitly disallowed. +/// +/// The parameter N specifies the "small" size of the container, which is the +/// number of elements upto which a linear scan over the Vector will be used +/// when searching for elements instead of checking Set, due to it being better +/// for performance. A value of 0 means that this mode of operation is not used, +/// and is the default value. template , - typename Set = DenseSet> + typename Set = DenseSet, unsigned N = 0> class SetVector { + // Much like in SmallPtrSet, this value should not be too high to prevent + // excessively long linear scans from occuring. + static_assert(N <= 32, "Small size should be less than or equal to 32!"); + public: using value_type = T; using key_type = T; @@ -139,6 +160,17 @@ public: /// Insert a new element into the SetVector. /// \returns true if the element was inserted into the SetVector. bool insert(const value_type &X) { + if constexpr (canBeSmall()) + if (isSmall()) { + if (llvm::find(vector_, X) == vector_.end()) { + vector_.push_back(X); + if (vector_.size() > N) + makeBig(); + return true; + } + return false; + } + bool result = set_.insert(X).second; if (result) vector_.push_back(X); @@ -149,12 +181,21 @@ public: template void insert(It Start, It End) { for (; Start != End; ++Start) - if (set_.insert(*Start).second) - vector_.push_back(*Start); + insert(*Start); } /// Remove an item from the set vector. bool remove(const value_type& X) { + if constexpr (canBeSmall()) + if (isSmall()) { + typename vector_type::iterator I = find(vector_, X); + if (I != vector_.end()) { + vector_.erase(I); + return true; + } + return false; + } + if (set_.erase(X)) { typename vector_type::iterator I = find(vector_, X); assert(I != vector_.end() && "Corrupted SetVector instances!"); @@ -169,6 +210,10 @@ public: /// element erased. This is the end of the SetVector if the last element is /// erased. iterator erase(const_iterator I) { + if constexpr (canBeSmall()) + if (isSmall()) + return vector_.erase(I); + const key_type &V = *I; assert(set_.count(V) && "Corrupted SetVector instances!"); set_.erase(V); @@ -190,8 +235,15 @@ public: /// \returns true if any element is removed. template bool remove_if(UnaryPredicate P) { - typename vector_type::iterator I = - llvm::remove_if(vector_, TestAndEraseFromSet(P, set_)); + typename vector_type::iterator I = [this, P] { + if constexpr (canBeSmall()) + if (isSmall()) + return llvm::remove_if(vector_, P); + + return llvm::remove_if(vector_, + TestAndEraseFromSet(P, set_)); + }(); + if (I == vector_.end()) return false; vector_.erase(I, vector_.end()); @@ -200,12 +252,20 @@ public: /// Check if the SetVector contains the given key. 
bool contains(const key_type &key) const { + if constexpr (canBeSmall()) + if (isSmall()) + return is_contained(vector_, key); + return set_.find(key) != set_.end(); } /// Count the number of elements of a given key in the SetVector. /// \returns 0 if the element is not in the SetVector, 1 if it is. size_type count(const key_type &key) const { + if constexpr (canBeSmall()) + if (isSmall()) + return is_contained(vector_, key); + return set_.count(key); } @@ -261,7 +321,7 @@ public: remove(*SI); } - void swap(SetVector &RHS) { + void swap(SetVector &RHS) { set_.swap(RHS.set_); vector_.swap(RHS.vector_); } @@ -290,6 +350,16 @@ private: } }; + [[nodiscard]] static constexpr bool canBeSmall() { return N != 0; } + + [[nodiscard]] bool isSmall() const { return set_.empty(); } + + void makeBig() { + if constexpr (canBeSmall()) + for (const auto &entry : vector_) + set_.insert(entry); + } + set_type set_; ///< The set. vector_type vector_; ///< The vector. }; @@ -297,8 +367,7 @@ private: /// A SetVector that performs no allocations if smaller than /// a certain size. template -class SmallSetVector - : public SetVector, SmallDenseSet> { +class SmallSetVector : public SetVector, DenseSet, N> { public: SmallSetVector() = default; @@ -314,9 +383,9 @@ public: namespace std { /// Implement std::swap in terms of SetVector swap. -template -inline void -swap(llvm::SetVector &LHS, llvm::SetVector &RHS) { +template +inline void swap(llvm::SetVector &LHS, + llvm::SetVector &RHS) { LHS.swap(RHS); } diff --git a/mlir/include/mlir/Support/LLVM.h b/mlir/include/mlir/Support/LLVM.h index 7acf97b5a253..48d569b0ca75 100644 --- a/mlir/include/mlir/Support/LLVM.h +++ b/mlir/include/mlir/Support/LLVM.h @@ -61,7 +61,7 @@ class MutableArrayRef; template using Optional = std::optional; template class PointerUnion; -template +template class SetVector; template class SmallPtrSet; @@ -123,8 +123,8 @@ using DenseMap = llvm::DenseMap; template > using DenseSet = llvm::DenseSet; template , - typename Set = DenseSet> -using SetVector = llvm::SetVector; + typename Set = DenseSet, unsigned N = 0> +using SetVector = llvm::SetVector; template using StringSet = llvm::StringSet; using llvm::MutableArrayRef; -- Gitee From ce6d7e5fb1fc462e87591f91372a30359b598289 Mon Sep 17 00:00:00 2001 From: NAKAMURA Takumi Date: Fri, 17 Feb 2023 00:28:07 +0900 Subject: [PATCH 10/10] llvm-tblgen: Apply IWYU partially --- llvm/include/llvm/TableGen/DirectiveEmitter.h | 5 +++++ llvm/lib/TableGen/Main.cpp | 12 +++++++++++- llvm/utils/TableGen/AsmWriterInst.cpp | 1 - llvm/utils/TableGen/AsmWriterInst.h | 1 - llvm/utils/TableGen/CTagsEmitter.cpp | 1 - llvm/utils/TableGen/CodeEmitterGen.cpp | 4 ++-- llvm/utils/TableGen/CodeGenDAGPatterns.cpp | 1 + llvm/utils/TableGen/CodeGenHwModes.h | 2 ++ llvm/utils/TableGen/CodeGenMapTable.cpp | 1 + llvm/utils/TableGen/CodeGenRegisters.h | 5 ++++- llvm/utils/TableGen/CodeGenSchedule.h | 7 +++++++ llvm/utils/TableGen/CompressInstEmitter.cpp | 1 + llvm/utils/TableGen/DAGISelEmitter.cpp | 1 + llvm/utils/TableGen/DAGISelMatcher.cpp | 1 + llvm/utils/TableGen/DAGISelMatcher.h | 5 +++++ llvm/utils/TableGen/DAGISelMatcherEmitter.cpp | 3 +++ llvm/utils/TableGen/DAGISelMatcherGen.cpp | 3 +++ llvm/utils/TableGen/DAGISelMatcherOpt.cpp | 1 + llvm/utils/TableGen/DFAEmitter.h | 2 ++ llvm/utils/TableGen/DXILEmitter.cpp | 1 - llvm/utils/TableGen/DecoderEmitter.cpp | 1 + llvm/utils/TableGen/FastISelEmitter.cpp | 3 +++ llvm/utils/TableGen/GICombinerEmitter.cpp | 3 +++ llvm/utils/TableGen/GlobalISelEmitter.cpp | 4 ++++ 
llvm/utils/TableGen/InfoByHwMode.cpp | 2 +- llvm/utils/TableGen/InfoByHwMode.h | 8 +++++++- llvm/utils/TableGen/PredicateExpander.cpp | 1 + llvm/utils/TableGen/RegisterBankEmitter.cpp | 6 +++--- llvm/utils/TableGen/RegisterInfoEmitter.cpp | 2 ++ llvm/utils/TableGen/SubtargetEmitter.cpp | 1 + llvm/utils/TableGen/SubtargetFeatureInfo.cpp | 1 - llvm/utils/TableGen/SubtargetFeatureInfo.h | 2 ++ llvm/utils/TableGen/TableGen.cpp | 4 ++++ llvm/utils/TableGen/VarLenCodeEmitterGen.cpp | 1 + llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp | 1 + llvm/utils/TableGen/X86FoldTablesEmitter.cpp | 3 ++- llvm/utils/TableGen/X86MnemonicTables.cpp | 2 +- llvm/utils/TableGen/X86ModRMFilters.h | 2 +- llvm/utils/TableGen/X86RecognizableInstr.h | 4 +++- 39 files changed, 91 insertions(+), 18 deletions(-) diff --git a/llvm/include/llvm/TableGen/DirectiveEmitter.h b/llvm/include/llvm/TableGen/DirectiveEmitter.h index e85c13f4b7cc..4bca4b13d729 100644 --- a/llvm/include/llvm/TableGen/DirectiveEmitter.h +++ b/llvm/include/llvm/TableGen/DirectiveEmitter.h @@ -1,8 +1,13 @@ #ifndef LLVM_TABLEGEN_DIRECTIVEEMITTER_H #define LLVM_TABLEGEN_DIRECTIVEEMITTER_H +#include "llvm/ADT/STLExtras.h" #include "llvm/ADT/StringExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/TableGen/Record.h" +#include +#include +#include namespace llvm { diff --git a/llvm/lib/TableGen/Main.cpp b/llvm/lib/TableGen/Main.cpp index 1d5f130737ee..2f9ac86e1f07 100644 --- a/llvm/lib/TableGen/Main.cpp +++ b/llvm/lib/TableGen/Main.cpp @@ -15,15 +15,25 @@ //===----------------------------------------------------------------------===// #include "llvm/TableGen/Main.h" +#include "TGLexer.h" #include "TGParser.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/ADT/Twine.h" #include "llvm/Support/CommandLine.h" +#include "llvm/Support/ErrorOr.h" #include "llvm/Support/FileSystem.h" #include "llvm/Support/MemoryBuffer.h" +#include "llvm/Support/SMLoc.h" +#include "llvm/Support/SourceMgr.h" #include "llvm/Support/ToolOutputFile.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" -#include +#include +#include #include +#include +#include using namespace llvm; static cl::opt diff --git a/llvm/utils/TableGen/AsmWriterInst.cpp b/llvm/utils/TableGen/AsmWriterInst.cpp index 4a78108d6f4a..c9558593e142 100644 --- a/llvm/utils/TableGen/AsmWriterInst.cpp +++ b/llvm/utils/TableGen/AsmWriterInst.cpp @@ -12,7 +12,6 @@ #include "AsmWriterInst.h" #include "CodeGenInstruction.h" -#include "CodeGenTarget.h" #include "llvm/ADT/StringExtras.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" diff --git a/llvm/utils/TableGen/AsmWriterInst.h b/llvm/utils/TableGen/AsmWriterInst.h index fe2b934e266f..9c93e82b611b 100644 --- a/llvm/utils/TableGen/AsmWriterInst.h +++ b/llvm/utils/TableGen/AsmWriterInst.h @@ -21,7 +21,6 @@ namespace llvm { class CodeGenInstruction; - class Record; struct AsmWriterOperand { enum OpType { diff --git a/llvm/utils/TableGen/CTagsEmitter.cpp b/llvm/utils/TableGen/CTagsEmitter.cpp index fe62d6a9b67f..b4ffbfa2012c 100644 --- a/llvm/utils/TableGen/CTagsEmitter.cpp +++ b/llvm/utils/TableGen/CTagsEmitter.cpp @@ -17,7 +17,6 @@ #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include -#include #include using namespace llvm; diff --git a/llvm/utils/TableGen/CodeEmitterGen.cpp b/llvm/utils/TableGen/CodeEmitterGen.cpp index dc4fd589eaa8..65befa0473fc 100644 --- a/llvm/utils/TableGen/CodeEmitterGen.cpp +++ b/llvm/utils/TableGen/CodeEmitterGen.cpp @@ -12,10 +12,10 @@ // 
//===----------------------------------------------------------------------===// +#include "CodeGenHwModes.h" #include "CodeGenInstruction.h" #include "CodeGenTarget.h" -#include "SubtargetFeatureInfo.h" -#include "Types.h" +#include "InfoByHwMode.h" #include "VarLenCodeEmitterGen.h" #include "llvm/ADT/APInt.h" #include "llvm/ADT/ArrayRef.h" diff --git a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp index dd04778e2dbe..2713e7a1a8ed 100644 --- a/llvm/utils/TableGen/CodeGenDAGPatterns.cpp +++ b/llvm/utils/TableGen/CodeGenDAGPatterns.cpp @@ -13,6 +13,7 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" +#include "CodeGenRegisters.h" #include "llvm/ADT/DenseSet.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/STLExtras.h" diff --git a/llvm/utils/TableGen/CodeGenHwModes.h b/llvm/utils/TableGen/CodeGenHwModes.h index 55507cbca37d..335e918bfe73 100644 --- a/llvm/utils/TableGen/CodeGenHwModes.h +++ b/llvm/utils/TableGen/CodeGenHwModes.h @@ -12,9 +12,11 @@ #define LLVM_UTILS_TABLEGEN_CODEGENHWMODES_H #include "llvm/ADT/StringMap.h" +#include "llvm/ADT/StringRef.h" #include #include #include +#include #include // HwModeId -> list of predicates (definition) diff --git a/llvm/utils/TableGen/CodeGenMapTable.cpp b/llvm/utils/TableGen/CodeGenMapTable.cpp index 02695942f5c1..fd375735dfd2 100644 --- a/llvm/utils/TableGen/CodeGenMapTable.cpp +++ b/llvm/utils/TableGen/CodeGenMapTable.cpp @@ -78,6 +78,7 @@ #include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" using namespace llvm; typedef std::map > InstrRelMapTy; diff --git a/llvm/utils/TableGen/CodeGenRegisters.h b/llvm/utils/TableGen/CodeGenRegisters.h index 765425ed68cb..7638816811e8 100644 --- a/llvm/utils/TableGen/CodeGenRegisters.h +++ b/llvm/utils/TableGen/CodeGenRegisters.h @@ -14,6 +14,7 @@ #ifndef LLVM_UTILS_TABLEGEN_CODEGENREGISTERS_H #define LLVM_UTILS_TABLEGEN_CODEGENREGISTERS_H +#include "CodeGenHwModes.h" #include "InfoByHwMode.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/BitVector.h" @@ -32,8 +33,11 @@ #include #include #include +#include #include #include +#include +#include #include #include #include @@ -41,7 +45,6 @@ namespace llvm { class CodeGenRegBank; - template class SetVector; /// Used to encode a step in a register lane mask transformation. 
/// Mask the bits specified in Mask, then rotate them Rol bits to the left diff --git a/llvm/utils/TableGen/CodeGenSchedule.h b/llvm/utils/TableGen/CodeGenSchedule.h index bbf5381ad086..76ef1e439530 100644 --- a/llvm/utils/TableGen/CodeGenSchedule.h +++ b/llvm/utils/TableGen/CodeGenSchedule.h @@ -15,10 +15,17 @@ #define LLVM_UTILS_TABLEGEN_CODEGENSCHEDULE_H #include "llvm/ADT/APInt.h" +#include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" +#include "llvm/ADT/DenseSet.h" #include "llvm/ADT/STLExtras.h" +#include "llvm/ADT/StringRef.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/SetTheory.h" +#include +#include +#include +#include namespace llvm { diff --git a/llvm/utils/TableGen/CompressInstEmitter.cpp b/llvm/utils/TableGen/CompressInstEmitter.cpp index a18d6a6b8854..1f9c39c08e67 100644 --- a/llvm/utils/TableGen/CompressInstEmitter.cpp +++ b/llvm/utils/TableGen/CompressInstEmitter.cpp @@ -65,6 +65,7 @@ //===----------------------------------------------------------------------===// #include "CodeGenInstruction.h" +#include "CodeGenRegisters.h" #include "CodeGenTarget.h" #include "llvm/ADT/IndexedMap.h" #include "llvm/ADT/SmallVector.h" diff --git a/llvm/utils/TableGen/DAGISelEmitter.cpp b/llvm/utils/TableGen/DAGISelEmitter.cpp index d012a0172a8f..70738c7adca8 100644 --- a/llvm/utils/TableGen/DAGISelEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelEmitter.cpp @@ -12,6 +12,7 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" +#include "CodeGenTarget.h" #include "DAGISelMatcher.h" #include "llvm/Support/Debug.h" #include "llvm/TableGen/Record.h" diff --git a/llvm/utils/TableGen/DAGISelMatcher.cpp b/llvm/utils/TableGen/DAGISelMatcher.cpp index e436a931a9f5..c08c6a9a30a2 100644 --- a/llvm/utils/TableGen/DAGISelMatcher.cpp +++ b/llvm/utils/TableGen/DAGISelMatcher.cpp @@ -8,6 +8,7 @@ #include "DAGISelMatcher.h" #include "CodeGenDAGPatterns.h" +#include "CodeGenRegisters.h" #include "CodeGenTarget.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Record.h" diff --git a/llvm/utils/TableGen/DAGISelMatcher.h b/llvm/utils/TableGen/DAGISelMatcher.h index 77280acaf4ca..c9094f5675e6 100644 --- a/llvm/utils/TableGen/DAGISelMatcher.h +++ b/llvm/utils/TableGen/DAGISelMatcher.h @@ -14,6 +14,11 @@ #include "llvm/ADT/StringRef.h" #include "llvm/Support/Casting.h" #include "llvm/Support/MachineValueType.h" +#include +#include +#include +#include +#include namespace llvm { struct CodeGenRegister; diff --git a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp index 777e75dcd929..2b876c2f7496 100644 --- a/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherEmitter.cpp @@ -11,7 +11,10 @@ //===----------------------------------------------------------------------===// #include "CodeGenDAGPatterns.h" +#include "CodeGenRegisters.h" +#include "CodeGenTarget.h" #include "DAGISelMatcher.h" +#include "SDNodeProperties.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/MapVector.h" #include "llvm/ADT/StringMap.h" diff --git a/llvm/utils/TableGen/DAGISelMatcherGen.cpp b/llvm/utils/TableGen/DAGISelMatcherGen.cpp index 44bff4c67ab3..03f7bc4ff519 100644 --- a/llvm/utils/TableGen/DAGISelMatcherGen.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherGen.cpp @@ -9,7 +9,10 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" #include "CodeGenRegisters.h" +#include "CodeGenTarget.h" #include "DAGISelMatcher.h" +#include "InfoByHwMode.h" +#include "SDNodeProperties.h" #include "llvm/ADT/SmallVector.h" 
#include "llvm/ADT/StringMap.h" #include "llvm/TableGen/Error.h" diff --git a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp index 4273bd69b87d..764b86c97dbf 100644 --- a/llvm/utils/TableGen/DAGISelMatcherOpt.cpp +++ b/llvm/utils/TableGen/DAGISelMatcherOpt.cpp @@ -12,6 +12,7 @@ #include "DAGISelMatcher.h" #include "CodeGenDAGPatterns.h" +#include "SDNodeProperties.h" #include "llvm/ADT/StringSet.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" diff --git a/llvm/utils/TableGen/DFAEmitter.h b/llvm/utils/TableGen/DFAEmitter.h index 44e5d97d544f..c831a65a73cd 100644 --- a/llvm/utils/TableGen/DFAEmitter.h +++ b/llvm/utils/TableGen/DFAEmitter.h @@ -21,6 +21,8 @@ #include "llvm/ADT/UniqueVector.h" #include #include +#include +#include namespace llvm { diff --git a/llvm/utils/TableGen/DXILEmitter.cpp b/llvm/utils/TableGen/DXILEmitter.cpp index 44c1df3e9ac4..c9cd5b0d7ec6 100644 --- a/llvm/utils/TableGen/DXILEmitter.cpp +++ b/llvm/utils/TableGen/DXILEmitter.cpp @@ -17,7 +17,6 @@ #include "llvm/ADT/StringSet.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/DXILOperationCommon.h" -#include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" using namespace llvm; diff --git a/llvm/utils/TableGen/DecoderEmitter.cpp b/llvm/utils/TableGen/DecoderEmitter.cpp index 8f816744370c..eabc158ab91e 100644 --- a/llvm/utils/TableGen/DecoderEmitter.cpp +++ b/llvm/utils/TableGen/DecoderEmitter.cpp @@ -11,6 +11,7 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenHwModes.h" #include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "InfoByHwMode.h" diff --git a/llvm/utils/TableGen/FastISelEmitter.cpp b/llvm/utils/TableGen/FastISelEmitter.cpp index 0a88f67be168..ef501f86f291 100644 --- a/llvm/utils/TableGen/FastISelEmitter.cpp +++ b/llvm/utils/TableGen/FastISelEmitter.cpp @@ -18,6 +18,9 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" +#include "CodeGenRegisters.h" +#include "CodeGenTarget.h" +#include "InfoByHwMode.h" #include "llvm/ADT/StringSwitch.h" #include "llvm/Support/ErrorHandling.h" #include "llvm/TableGen/Error.h" diff --git a/llvm/utils/TableGen/GICombinerEmitter.cpp b/llvm/utils/TableGen/GICombinerEmitter.cpp index 2ae313081a6f..927fb81dc74b 100644 --- a/llvm/utils/TableGen/GICombinerEmitter.cpp +++ b/llvm/utils/TableGen/GICombinerEmitter.cpp @@ -15,6 +15,9 @@ #include "GlobalISel/CodeExpander.h" #include "GlobalISel/CodeExpansions.h" #include "GlobalISel/GIMatchDag.h" +#include "GlobalISel/GIMatchDagEdge.h" +#include "GlobalISel/GIMatchDagInstr.h" +#include "GlobalISel/GIMatchDagOperands.h" #include "GlobalISel/GIMatchDagPredicate.h" #include "GlobalISel/GIMatchTree.h" #include "llvm/ADT/SmallSet.h" diff --git a/llvm/utils/TableGen/GlobalISelEmitter.cpp b/llvm/utils/TableGen/GlobalISelEmitter.cpp index c79c79948a80..360d42f3978a 100644 --- a/llvm/utils/TableGen/GlobalISelEmitter.cpp +++ b/llvm/utils/TableGen/GlobalISelEmitter.cpp @@ -31,6 +31,10 @@ #include "CodeGenDAGPatterns.h" #include "CodeGenInstruction.h" +#include "CodeGenIntrinsics.h" +#include "CodeGenRegisters.h" +#include "CodeGenTarget.h" +#include "InfoByHwMode.h" #include "SubtargetFeatureInfo.h" #include "llvm/ADT/Statistic.h" #include "llvm/Support/CodeGenCoverage.h" diff --git a/llvm/utils/TableGen/InfoByHwMode.cpp b/llvm/utils/TableGen/InfoByHwMode.cpp index 73c4fbf0a5eb..5140c5a0d20f 100644 --- a/llvm/utils/TableGen/InfoByHwMode.cpp +++ b/llvm/utils/TableGen/InfoByHwMode.cpp @@ 
-17,7 +17,7 @@ #include "llvm/ADT/Twine.h" #include "llvm/Support/Debug.h" #include "llvm/Support/raw_ostream.h" - +#include "llvm/TableGen/Record.h" #include using namespace llvm; diff --git a/llvm/utils/TableGen/InfoByHwMode.h b/llvm/utils/TableGen/InfoByHwMode.h index 44927d0bf0df..6cfd6e8bb493 100644 --- a/llvm/utils/TableGen/InfoByHwMode.h +++ b/llvm/utils/TableGen/InfoByHwMode.h @@ -16,10 +16,16 @@ #include "CodeGenHwModes.h" #include "llvm/ADT/SmallSet.h" +#include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/StringRef.h" +#include "llvm/Support/Compiler.h" #include "llvm/Support/MachineValueType.h" - +#include +#include #include #include +#include +#include namespace llvm { diff --git a/llvm/utils/TableGen/PredicateExpander.cpp b/llvm/utils/TableGen/PredicateExpander.cpp index b129401461b5..8f96d3307ded 100644 --- a/llvm/utils/TableGen/PredicateExpander.cpp +++ b/llvm/utils/TableGen/PredicateExpander.cpp @@ -12,6 +12,7 @@ #include "PredicateExpander.h" #include "CodeGenSchedule.h" // Definition of STIPredicateFunction. +#include "llvm/TableGen/Record.h" namespace llvm { diff --git a/llvm/utils/TableGen/RegisterBankEmitter.cpp b/llvm/utils/TableGen/RegisterBankEmitter.cpp index e6689b211a7d..01f2f7864d8d 100644 --- a/llvm/utils/TableGen/RegisterBankEmitter.cpp +++ b/llvm/utils/TableGen/RegisterBankEmitter.cpp @@ -11,15 +11,15 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenRegisters.h" +#include "CodeGenTarget.h" +#include "InfoByHwMode.h" #include "llvm/ADT/BitVector.h" #include "llvm/Support/Debug.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" -#include "CodeGenRegisters.h" -#include "CodeGenTarget.h" - #define DEBUG_TYPE "register-bank-emitter" using namespace llvm; diff --git a/llvm/utils/TableGen/RegisterInfoEmitter.cpp b/llvm/utils/TableGen/RegisterInfoEmitter.cpp index 113cebf8a08e..5715dc1deb30 100644 --- a/llvm/utils/TableGen/RegisterInfoEmitter.cpp +++ b/llvm/utils/TableGen/RegisterInfoEmitter.cpp @@ -12,8 +12,10 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenHwModes.h" #include "CodeGenRegisters.h" #include "CodeGenTarget.h" +#include "InfoByHwMode.h" #include "SequenceToOffsetTable.h" #include "Types.h" #include "llvm/ADT/ArrayRef.h" diff --git a/llvm/utils/TableGen/SubtargetEmitter.cpp b/llvm/utils/TableGen/SubtargetEmitter.cpp index 8afe6d37d0e0..ec26e1c41f85 100644 --- a/llvm/utils/TableGen/SubtargetEmitter.cpp +++ b/llvm/utils/TableGen/SubtargetEmitter.cpp @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenHwModes.h" #include "CodeGenSchedule.h" #include "CodeGenTarget.h" #include "PredicateExpander.h" diff --git a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp index 2a63fc490380..1abcf485f856 100644 --- a/llvm/utils/TableGen/SubtargetFeatureInfo.cpp +++ b/llvm/utils/TableGen/SubtargetFeatureInfo.cpp @@ -11,7 +11,6 @@ #include "llvm/Config/llvm-config.h" #include "llvm/TableGen/Error.h" #include "llvm/TableGen/Record.h" -#include using namespace llvm; diff --git a/llvm/utils/TableGen/SubtargetFeatureInfo.h b/llvm/utils/TableGen/SubtargetFeatureInfo.h index 8c8a4487934c..e6a3f82d9bb8 100644 --- a/llvm/utils/TableGen/SubtargetFeatureInfo.h +++ b/llvm/utils/TableGen/SubtargetFeatureInfo.h @@ -9,9 +9,11 @@ #ifndef LLVM_UTIL_TABLEGEN_SUBTARGETFEATUREINFO_H #define 
LLVM_UTIL_TABLEGEN_SUBTARGETFEATUREINFO_H +#include "llvm/ADT/StringRef.h" #include "llvm/TableGen/Record.h" #include #include +#include #include namespace llvm { diff --git a/llvm/utils/TableGen/TableGen.cpp b/llvm/utils/TableGen/TableGen.cpp index 746e2dd1db16..4117ed5a3f6a 100644 --- a/llvm/utils/TableGen/TableGen.cpp +++ b/llvm/utils/TableGen/TableGen.cpp @@ -13,9 +13,13 @@ #include "TableGenBackends.h" // Declares all backends. #include "llvm/Support/CommandLine.h" #include "llvm/Support/InitLLVM.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Main.h" #include "llvm/TableGen/Record.h" #include "llvm/TableGen/SetTheory.h" +#include +#include +#include using namespace llvm; diff --git a/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp b/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp index 2c1acd8d910c..85da547d04c1 100644 --- a/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp +++ b/llvm/utils/TableGen/VarLenCodeEmitterGen.cpp @@ -58,6 +58,7 @@ #include "llvm/ADT/DenseMap.h" #include "llvm/Support/raw_ostream.h" #include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" using namespace llvm; diff --git a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp b/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp index 1384330ee8a1..b42ffa2aec1a 100644 --- a/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86EVEX2VEXTablesEmitter.cpp @@ -15,6 +15,7 @@ #include "CodeGenTarget.h" #include "X86RecognizableInstr.h" #include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" using namespace llvm; diff --git a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp index 5b3f11848de6..7bc17e6f2b64 100644 --- a/llvm/utils/TableGen/X86FoldTablesEmitter.cpp +++ b/llvm/utils/TableGen/X86FoldTablesEmitter.cpp @@ -11,10 +11,11 @@ // //===----------------------------------------------------------------------===// +#include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "X86RecognizableInstr.h" #include "llvm/Support/FormattedStream.h" -#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" using namespace llvm; diff --git a/llvm/utils/TableGen/X86MnemonicTables.cpp b/llvm/utils/TableGen/X86MnemonicTables.cpp index f405e051e355..b5405488de0e 100644 --- a/llvm/utils/TableGen/X86MnemonicTables.cpp +++ b/llvm/utils/TableGen/X86MnemonicTables.cpp @@ -14,7 +14,7 @@ #include "CodeGenInstruction.h" #include "CodeGenTarget.h" #include "X86RecognizableInstr.h" -#include "llvm/TableGen/Error.h" +#include "llvm/TableGen/Record.h" #include "llvm/TableGen/TableGenBackend.h" using namespace llvm; diff --git a/llvm/utils/TableGen/X86ModRMFilters.h b/llvm/utils/TableGen/X86ModRMFilters.h index e2d0907b4f8b..d2169a8e879b 100644 --- a/llvm/utils/TableGen/X86ModRMFilters.h +++ b/llvm/utils/TableGen/X86ModRMFilters.h @@ -17,7 +17,7 @@ #ifndef LLVM_UTILS_TABLEGEN_X86MODRMFILTERS_H #define LLVM_UTILS_TABLEGEN_X86MODRMFILTERS_H -#include "llvm/Support/DataTypes.h" +#include namespace llvm { diff --git a/llvm/utils/TableGen/X86RecognizableInstr.h b/llvm/utils/TableGen/X86RecognizableInstr.h index ea56a9d7d994..f389ff01670c 100644 --- a/llvm/utils/TableGen/X86RecognizableInstr.h +++ b/llvm/utils/TableGen/X86RecognizableInstr.h @@ -17,8 +17,10 @@ #define LLVM_UTILS_TABLEGEN_X86RECOGNIZABLEINSTR_H #include "CodeGenInstruction.h" -#include "llvm/Support/DataTypes.h" #include "llvm/Support/X86DisassemblerDecoderCommon.h" +#include +#include 
+#include struct InstructionSpecifier; -- Gitee