From 0299a113b73a244946136891bbaba23ea02dd250 Mon Sep 17 00:00:00 2001
From: Cathy Sheng
Date: Tue, 5 Aug 2025 13:41:02 -0400
Subject: [PATCH] [ISEL] Reduce emitted loads/stores when changing endianness

---
 .../Target/AArch64/AArch64ISelDAGToDAG.cpp    | 706 ++++++++++++++++++
 .../test/CodeGen/AArch64/merge-trunc-store.ll |  54 +-
 .../test/CodeGen/AArch64/reduce-load-store.ll | 285 +++++++
 3 files changed, 1027 insertions(+), 18 deletions(-)
 create mode 100644 llvm/test/CodeGen/AArch64/reduce-load-store.ll

diff --git a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
index f79d4d1934aa..691149cd6540 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelDAGToDAG.cpp
@@ -31,6 +31,11 @@ using namespace llvm;
 #define DEBUG_TYPE "aarch64-isel"
 #define PASS_NAME "AArch64 Instruction Selection"
 
+static cl::opt<bool> EnableEndiannessOpts(
+    "aarch64-endianness-opts", cl::Hidden,
+    cl::desc("Allow endianness opts to reduce load/store instructions"),
+    cl::init(false));
+
 //===--------------------------------------------------------------------===//
 /// AArch64DAGToDAGISel - AArch64 specific code to select AArch64 machine
 /// instructions for SelectionDAG operations.
@@ -427,6 +432,23 @@ public:
   bool trySelectCastFixedLengthToScalableVector(SDNode *N);
   bool trySelectCastScalableToFixedLengthVector(SDNode *N);
 
+  void tryWriteOpt(SDNode *N);
+  void SelectWrite3(SDNode *N, std::vector<StoreSDNode *>::iterator St,
+                    SDValue Addr);
+  void SelectWrite6(SDNode *N, std::vector<StoreSDNode *>::iterator St,
+                    SDValue Addr);
+  void SelectWrite7(SDNode *N, std::vector<StoreSDNode *>::iterator St,
+                    SDValue Addr);
+  void ReplaceStoreNodes(std::vector<StoreSDNode *>::iterator St, SDNode *NewSt,
+                         int NumSt);
+  bool tryReadOpt(SDNode *N);
+  void SelectRead3(SDNode *N,
+                   SmallVector<std::pair<LoadSDNode *, int64_t>> &LD);
+  void SelectRead6(SDNode *N,
+                   SmallVector<std::pair<LoadSDNode *, int64_t>> &LD);
+  void SelectRead7(SDNode *N,
+                   SmallVector<std::pair<LoadSDNode *, int64_t>> &LD);
+
   // Include the pieces autogenerated from the target description.
 #include "AArch64GenDAGISel.inc"
 
@@ -4182,6 +4204,681 @@ bool AArch64DAGToDAGISel::trySelectCastScalableToFixedLengthVector(SDNode *N) {
   return true;
 }
 
+// Returns false if the address of the load/store is an add node
+// with a constant second operand outside the range [-256, 255].
+// Returns true otherwise.
+static bool canGetLoadStoreOffset(MemSDNode *N) {
+  SDValue Addr = N->getBasePtr();
+  if (Addr.getOpcode() == ISD::ADD &&
+      Addr.getOperand(1)->getOpcode() == ISD::Constant)
+    if (cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue() < -256 ||
+        cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue() > 255)
+      return false;
+  return true;
+}
+
+// Assumes canGetLoadStoreOffset(N) returns true.
+// Returns 0 if the address of the load/store is not an add node
+// with a constant second operand.
+// Otherwise, returns the value of the constant operand.
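+// For example, for a store to address (add %p, 12) this returns 12;
+// for a store directly to %p it returns 0.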
+static int64_t getLoadStoreOffset(MemSDNode *N) {
+  SDValue Addr = N->getBasePtr();
+  if (Addr.getOpcode() == ISD::ADD &&
+      Addr.getOperand(1)->getOpcode() == ISD::Constant)
+    return cast<ConstantSDNode>(Addr.getOperand(1))->getSExtValue();
+  return 0;
+}
+
+// Returns the address (without offset) of the load/store.
+static SDValue getLoadStoreAddrWithoutOffset(MemSDNode *N) {
+  SDValue Addr = N->getBasePtr();
+  if (Addr.getOpcode() == ISD::ADD &&
+      Addr.getOperand(1)->getOpcode() == ISD::Constant)
+    return Addr.getOperand(0);
+  return Addr;
+}
+
+// For each operand of N, if it is a store node, add it to AllSt,
+// a map whose keys are the addresses stored to and whose values are
+// heap-allocated vectors of the store nodes storing to that key.
+static void
+getAllStores(SDNode *N,
+             DenseMap<SDValue, std::vector<StoreSDNode *> *> &AllSt) {
+  for (auto op = N->op_begin(); op != N->op_end(); ++op) {
+    // The operand is a store node.
+    if (op->getNode()->getOpcode() == ISD::STORE) {
+      StoreSDNode *St = cast<StoreSDNode>(op->getNode());
+      // It must store an i8.
+      if (St->getMemoryVT() == MVT::i8) {
+        // The node must have an offset in the range [-256, 255].
+        if (canGetLoadStoreOffset(St)) {
+          // Add St to AllSt.
+          SDValue Addr = getLoadStoreAddrWithoutOffset(St);
+          auto it = AllSt.find(Addr);
+          if (it == AllSt.end()) {
+            std::vector<StoreSDNode *> *Vec = new std::vector<StoreSDNode *>();
+            Vec->push_back(St);
+            AllSt.insert(
+                std::pair<SDValue, std::vector<StoreSDNode *> *>(Addr, Vec));
+          } else
+            it->second->push_back(St);
+        }
+      }
+    }
+  }
+}
+
+// Begin points into a vector of store nodes that all store to the same
+// address. If the first NumSt stores from Begin can be combined, return
+// true; otherwise return false.
+static bool canCombineStores(std::vector<StoreSDNode *>::iterator Begin,
+                             std::vector<StoreSDNode *>::iterator End,
+                             int NumSt) {
+  int64_t Offset = getLoadStoreOffset(*Begin);
+
+  // Need at least NumSt stores.
+  if (End - Begin < NumSt)
+    return false;
+
+  for (int i = 0; i < NumSt; ++i) {
+    StoreSDNode *St = Begin[i];
+    // All stores should be simple.
+    if (!St->isSimple())
+      return false;
+    // Stores must have consecutive offsets.
+    if (getLoadStoreOffset(St) != Offset + i)
+      return false;
+    // If the stored value is used elsewhere, then it can't be changed.
+    if (i != NumSt - 1 && !St->getOperand(1)->hasOneUse())
+      return false;
+  }
+
+  SDValue Stored = Begin[NumSt - 1]->getOperand(1);
+
+  // The value stored should be an i64.
+  if (Stored.getValueType() != MVT::i64)
+    return false;
+
+  SDValue Chain = Begin[NumSt - 1]->getChain();
+
+  for (int i = 0; i < NumSt - 1; ++i) {
+    SDValue Shift = Begin[i]->getOperand(1);
+    // Chains should all be the same.
+    if (Begin[i]->getChain() != Chain)
+      return false;
+    // The data stored should be shifted right.
+    if (Shift->getOpcode() != ISD::SRL)
+      return false;
+    // The shifted data should be the same as Stored.
+    if (Shift->getOperand(0) != Stored)
+      return false;
+    // It should be shifted by a constant.
+    if (Shift->getOperand(1).getOpcode() != ISD::Constant)
+      return false;
+    // It should be shifted by (NumSt-1-i)*8.
+    if (cast<ConstantSDNode>(Shift->getOperand(1))->getSExtValue() !=
+        (NumSt - 1 - i) * 8)
+      return false;
+  }
+
+  // The store nodes can be combined.
+  return true;
+}
+
+// Replace the first NumSt store nodes in St with NewSt.
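+// Each replaced store's chain output is redirected to NewSt, so the
+// TokenFactor above ends up with a single combined store in the chain.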
+void AArch64DAGToDAGISel::ReplaceStoreNodes(
+    std::vector<StoreSDNode *>::iterator St, SDNode *NewSt, int NumSt) {
+  for (NumSt -= 1; NumSt >= 0; --NumSt) {
+    ReplaceNode(St[NumSt], NewSt);
+  }
+}
+
+void AArch64DAGToDAGISel::SelectWrite3(SDNode *N,
+                                       std::vector<StoreSDNode *>::iterator St,
+                                       SDValue Addr) {
+  SDLoc dl(N);
+
+  SDValue Stored = St[2]->getOperand(1);
+
+  SDValue Subreg =
+      CurDAG->getTargetExtractSubreg(AArch64::sub_32, dl, MVT::i32, Stored);
+
+  SDNode *Rev16 =
+      CurDAG->getMachineNode(AArch64::REV16Wr, dl, MVT::i32, Subreg);
+
+  MachineMemOperand *MemOp = St[1]->getMemOperand();
+  MachineMemOperand *MMO = CurDAG->getMachineFunction().getMachineMemOperand(
+      MemOp, MemOp->getPointerInfo(), getLLTForMVT(MVT::i16));
+
+  SDValue Chain = St[1]->getChain();
+  SDValue Const =
+      CurDAG->getTargetConstant(getLoadStoreOffset(St[1]), dl, MVT::i64);
+  SDValue Ops[] = {SDValue(Rev16, 0), Addr, Const, Chain};
+
+  MachineSDNode *Strh =
+      CurDAG->getMachineNode(AArch64::STURHHi, dl, MVT::Other, Ops);
+  CurDAG->setNodeMemRefs(Strh, {MMO});
+
+  ReplaceStoreNodes(St + 1, Strh, 2);
+}
+
+void AArch64DAGToDAGISel::SelectWrite6(SDNode *N,
+                                       std::vector<StoreSDNode *>::iterator St,
+                                       SDValue Addr) {
+  SDLoc dl(N);
+
+  SDValue Stored = St[5]->getOperand(1);
+  SDValue Chain = St[5]->getChain();
+
+  SDValue LsrConst = CurDAG->getTargetConstant(32, dl, MVT::i64);
+  SDValue Const63 = CurDAG->getTargetConstant(63, dl, MVT::i64);
+  SDValue LsrOps[] = {Stored, LsrConst, Const63};
+  SDNode *Lsr = CurDAG->getMachineNode(AArch64::UBFMXri, dl, MVT::i64, LsrOps);
+
+  SDValue LsrSubreg = CurDAG->getTargetExtractSubreg(AArch64::sub_32, dl,
+                                                     MVT::i32, SDValue(Lsr, 0));
+
+  SDNode *Rev16 =
+      CurDAG->getMachineNode(AArch64::REV16Wr, dl, MVT::i32, LsrSubreg);
+
+  MachineMemOperand *StrhMemOp = St[0]->getMemOperand();
+  MachineMemOperand *StrhMMO =
+      CurDAG->getMachineFunction().getMachineMemOperand(
+          StrhMemOp, StrhMemOp->getPointerInfo(), getLLTForMVT(MVT::i16));
+
+  SDValue StrhConst =
+      CurDAG->getTargetConstant(getLoadStoreOffset(St[0]), dl, MVT::i64);
+  SDValue StrhOps[] = {SDValue(Rev16, 0), Addr, StrhConst, Chain};
+
+  MachineSDNode *Strh =
+      CurDAG->getMachineNode(AArch64::STURHHi, dl, MVT::Other, StrhOps);
+  CurDAG->setNodeMemRefs(Strh, {StrhMMO});
+
+  ReplaceStoreNodes(St, Strh, 2);
+
+  SDValue Subreg =
+      CurDAG->getTargetExtractSubreg(AArch64::sub_32, dl, MVT::i32, Stored);
+
+  SDNode *Rev = CurDAG->getMachineNode(AArch64::REVWr, dl, MVT::i32, Subreg);
+
+  MachineMemOperand *SturMemOp = St[2]->getMemOperand();
+  MachineMemOperand *SturMMO =
+      CurDAG->getMachineFunction().getMachineMemOperand(
+          SturMemOp, SturMemOp->getPointerInfo(), getLLTForMVT(MVT::i32));
+
+  SDValue SturConst =
+      CurDAG->getTargetConstant(getLoadStoreOffset(St[2]), dl, MVT::i64);
+  SDValue SturOps[] = {SDValue(Rev, 0), Addr, SturConst, Chain};
+
+  MachineSDNode *Stur =
+      CurDAG->getMachineNode(AArch64::STURWi, dl, MVT::Other, SturOps);
+  CurDAG->setNodeMemRefs(Stur, {SturMMO});
+
+  ReplaceStoreNodes(St + 2, Stur, 4);
+}
+
+void AArch64DAGToDAGISel::SelectWrite7(SDNode *N,
+                                       std::vector<StoreSDNode *>::iterator St,
+                                       SDValue Addr) {
+  SDLoc dl(N);
+
+  SDValue Stored = St[6]->getOperand(1);
+  SDValue Chain = St[6]->getChain();
+
+  SDValue Lsr1Const = CurDAG->getTargetConstant(32, dl, MVT::i64);
+  SDValue Const63 = CurDAG->getTargetConstant(63, dl, MVT::i64);
+  SDValue Lsr1Ops[] = {Stored, Lsr1Const, Const63};
+  SDNode *Lsr1 =
+      CurDAG->getMachineNode(AArch64::UBFMXri, dl, MVT::i64, Lsr1Ops);
+
+  SDValue Lsr1Subreg = CurDAG->getTargetExtractSubreg(
+      AArch64::sub_32, dl, MVT::i32,
+      SDValue(Lsr1, 0));
+
+  SDNode *Rev16 =
+      CurDAG->getMachineNode(AArch64::REV16Wr, dl, MVT::i32, Lsr1Subreg);
+
+  MachineMemOperand *StrhMemOp = St[1]->getMemOperand();
+  MachineMemOperand *StrhMMO =
+      CurDAG->getMachineFunction().getMachineMemOperand(
+          StrhMemOp, StrhMemOp->getPointerInfo(), getLLTForMVT(MVT::i16));
+
+  SDValue StrhConst =
+      CurDAG->getTargetConstant(getLoadStoreOffset(St[1]), dl, MVT::i64);
+  SDValue StrhOps[] = {SDValue(Rev16, 0), Addr, StrhConst, Chain};
+
+  MachineSDNode *Strh =
+      CurDAG->getMachineNode(AArch64::STURHHi, dl, MVT::Other, StrhOps);
+  CurDAG->setNodeMemRefs(Strh, {StrhMMO});
+
+  ReplaceStoreNodes(St + 1, Strh, 2);
+
+  SDValue Lsr2Const = CurDAG->getTargetConstant(24, dl, MVT::i64);
+  SDValue Const31 = CurDAG->getTargetConstant(31, dl, MVT::i64);
+  SDValue Lsr2Ops[] = {SDValue(Rev16, 0), Lsr2Const, Const31};
+  SDNode *Lsr2 =
+      CurDAG->getMachineNode(AArch64::UBFMWri, dl, MVT::i32, Lsr2Ops);
+
+  SDValue StrbConst =
+      CurDAG->getTargetConstant(getLoadStoreOffset(St[0]), dl, MVT::i64);
+  SDValue StrbOps[] = {SDValue(Lsr2, 0), Addr, StrbConst, Chain};
+
+  MachineSDNode *Strb =
+      CurDAG->getMachineNode(AArch64::STRBBui, dl, MVT::Other, StrbOps);
+  CurDAG->setNodeMemRefs(Strb, {St[0]->getMemOperand()});
+
+  ReplaceStoreNodes(St, Strb, 1);
+
+  SDValue Subreg =
+      CurDAG->getTargetExtractSubreg(AArch64::sub_32, dl, MVT::i32, Stored);
+  SDNode *Rev = CurDAG->getMachineNode(AArch64::REVWr, dl, MVT::i32, Subreg);
+
+  MachineMemOperand *SturMemOp = St[3]->getMemOperand();
+  MachineMemOperand *SturMMO =
+      CurDAG->getMachineFunction().getMachineMemOperand(
+          SturMemOp, SturMemOp->getPointerInfo(), getLLTForMVT(MVT::i32));
+
+  SDValue SturConst =
+      CurDAG->getTargetConstant(getLoadStoreOffset(St[3]), dl, MVT::i64);
+  SDValue SturOps[] = {SDValue(Rev, 0), Addr, SturConst, Chain};
+
+  MachineSDNode *Stur =
+      CurDAG->getMachineNode(AArch64::STURWi, dl, MVT::Other, SturOps);
+  CurDAG->setNodeMemRefs(Stur, {SturMMO});
+
+  ReplaceStoreNodes(St + 3, Stur, 4);
+}
+
+void AArch64DAGToDAGISel::tryWriteOpt(SDNode *N) {
+  assert(N->getOpcode() == ISD::TokenFactor && "Expected TokenFactor");
+
+  // AllSt maps each address stored to by a store operand of N to a
+  // heap-allocated vector holding those stores.
+  DenseMap<SDValue, std::vector<StoreSDNode *> *> AllSt;
+  getAllStores(N, AllSt);
+
+  // For each address, check if the stores can be combined.
+  for (auto it = AllSt.begin(); it != AllSt.end(); ++it) {
+    SDValue Addr = it->first;
+    std::vector<StoreSDNode *> *StVec = it->second;
+
+    // Sort the store nodes in increasing offset order.
+    std::sort(StVec->begin(), StVec->end(), [](StoreSDNode *a, StoreSDNode *b) {
+      return getLoadStoreOffset(a) < getLoadStoreOffset(b);
+    });
+
+    // For each store node, check if it can be combined.
+    for (auto St = StVec->begin(); St != StVec->end(); ++St) {
+      SDValue Stored = (*St)->getOperand(1);
+      if (Stored->getOpcode() == ISD::SRL &&
+          Stored->getOperand(1).getOpcode() == ISD::Constant) {
+        int64_t Shift =
+            cast<ConstantSDNode>(Stored->getOperand(1))->getSExtValue();
+        switch (Shift) {
+        case 48:
+          if (canCombineStores(St, StVec->end(), 7)) {
+            SelectWrite7(N, St, Addr);
+            St += 6;
+          }
+          break;
+        case 40:
+          if (canCombineStores(St, StVec->end(), 6)) {
+            SelectWrite6(N, St, Addr);
+            St += 5;
+          }
+          break;
+        case 16:
+          if (canCombineStores(St, StVec->end(), 3)) {
+            SelectWrite3(N, St, Addr);
+            St += 2;
+          }
+          break;
+        default:
+          continue;
+        }
+      }
+    }
+    delete StVec;
+  }
+
+  return;
+}
+
+// LdVec is a vector of load nodes and their shifts in increasing offset order.
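+// e.g. a 3-byte big-endian read yields shifts 16, 8, 0 for the loads
+// at offsets 0, 1, 2.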
+// If the loads can be combined, return true.
+// If the loads cannot be combined, return false.
+static bool
+canCombineLoads(const SmallVector<std::pair<LoadSDNode *, int64_t>> &LdVec,
+                int NumLd) {
+  // Need exactly NumLd loads.
+  if (LdVec.size() != (size_t)NumLd)
+    return false;
+
+  for (int i = 0; i < NumLd; ++i) {
+    int64_t Shift = LdVec[i].second;
+    // The shifts should start at (NumLd-1)*8 and decrease by 8.
+    if (NumLd - Shift / 8 != i + 1)
+      return false;
+  }
+
+  return true;
+}
+
+static bool canCombineLoad(LoadSDNode *N) {
+  // All loads should be simple.
+  if (!N->isSimple())
+    return false;
+  // All loads should be zero extended.
+  if (N->getExtensionType() != ISD::ZEXTLOAD)
+    return false;
+  // All loads should be byte loads.
+  if (N->getMemoryVT() != MVT::i8)
+    return false;
+  // Load addresses should be of a certain form.
+  if (!canGetLoadStoreOffset(N))
+    return false;
+  return true;
+}
+
+static bool getAllLoadsHelper(SDNode *N,
+                              SmallVector<std::pair<LoadSDNode *, int64_t>> &LD,
+                              MVT VT, int64_t Shift = 0) {
+  for (SDValue Op : {N->getOperand(0), N->getOperand(1)}) {
+
+    // The operands of the OR node must have one use
+    // and the correct value type to be combined.
+    if (!Op.hasOneUse() || Op.getValueType() != VT)
+      return false;
+
+    if (Op.getOpcode() == ISD::OR) {
+      // If it is an OR node, continue traversing the DAG.
+      if (!getAllLoadsHelper(Op.getNode(), LD, VT, Shift))
+        return false;
+    } else if (Op.getOpcode() == ISD::SHL) {
+      // If it is an SHL node, there are two possible cases:
+      // 1. There is a LOAD node that needs to be added to LD.
+      // 2. There is an OR node that needs to be traversed.
+
+      // The shift must be by a constant.
+      if (Op.getOperand(1).getOpcode() != ISD::Constant)
+        return false;
+      int64_t ShiftImm =
+          cast<ConstantSDNode>(Op.getOperand(1).getNode())->getSExtValue();
+      // The shift should be in range and divisible by 8.
+      if (ShiftImm < 0 || ShiftImm % 8 != 0 ||
+          (ShiftImm > 32 && VT == MVT::i32) ||
+          (ShiftImm > 64 && VT == MVT::i64))
+        return false;
+      // The data that is shifted should have one use.
+      Op = Op.getOperand(0);
+      if (!Op.hasOneUse() || Op.getValueType() != VT)
+        return false;
+      // This is to account for the case where the shifted data
+      // is extended to a different value type.
+      if (VT == MVT::i64 && ShiftImm == 32 &&
+          Op.getOpcode() == ISD::ANY_EXTEND) {
+        Op = Op.getOperand(0);
+        VT = Op.getSimpleValueType();
+        if (!Op.hasOneUse() || VT != MVT::i32)
+          return false;
+      }
+      if (Op.getOpcode() == ISD::LOAD) {
+        // The value shifted comes from a load.
+        LoadSDNode *Load = cast<LoadSDNode>(Op.getNode());
+        if (!canCombineLoad(Load))
+          return false;
+        LD.push_back(std::make_pair(Load, Shift + ShiftImm));
+      } else if (Op.getOpcode() == ISD::OR) {
+        // The value shifted comes from an OR.
+        if (!getAllLoadsHelper(Op.getNode(), LD, VT, Shift + ShiftImm))
+          return false;
+      } else {
+        return false;
+      }
+    } else if (Op.getOpcode() == ISD::LOAD) {
+      // If it is a LOAD node, add it to LD.
+      LoadSDNode *Load = cast<LoadSDNode>(Op.getNode());
+      if (!canCombineLoad(Load))
+        return false;
+      LD.push_back(std::make_pair(Load, Shift));
+    } else {
+      return false;
+    }
+  }
+  return true;
+}
+
+// All the LOAD nodes in the DAG reachable from the OR node N
+// that can be combined are added to the vector LdVec with their shift,
+// in increasing offset order.
+// Returns true if the loads can be combined, false otherwise.
+static bool getAllLoads(SDNode *N,
+                        SmallVector<std::pair<LoadSDNode *, int64_t>> &LdVec) {
+  if (!getAllLoadsHelper(N, LdVec, N->getSimpleValueType(0)))
+    return false;
+
+  if (LdVec.empty())
+    return false;
+
+  // Sort the load nodes in increasing offset order.
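+  // Offsets give the memory order; the matching shift pattern is
+  // checked separately by canCombineLoads.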
+  std::sort(LdVec.begin(), LdVec.end(),
+            [](const std::pair<LoadSDNode *, int64_t> &a,
+               const std::pair<LoadSDNode *, int64_t> &b) {
+              return getLoadStoreOffset(a.first) < getLoadStoreOffset(b.first);
+            });
+
+  SDValue Addr = getLoadStoreAddrWithoutOffset(LdVec[0].first);
+  int64_t Offset = getLoadStoreOffset(LdVec[0].first);
+  SDValue Chain = LdVec[0].first->getChain();
+
+  for (int i = LdVec.size() - 1; i >= 0; --i) {
+    LoadSDNode *Ld = LdVec[i].first;
+    // The loads should all load from the same address.
+    if (getLoadStoreAddrWithoutOffset(Ld) != Addr)
+      return false;
+    // The loads should have consecutive offsets.
+    if (getLoadStoreOffset(Ld) != Offset + i)
+      return false;
+    // The chains should all be the same.
+    if (Ld->getChain() != Chain)
+      return false;
+  }
+
+  return true;
+}
+
+void AArch64DAGToDAGISel::SelectRead3(
+    SDNode *N, SmallVector<std::pair<LoadSDNode *, int64_t>> &LD) {
+  assert(N->getOpcode() == ISD::OR && "Expected OR instruction");
+
+  if (N->getValueType(0) != MVT::i32)
+    return;
+
+  SDValue Addr = getLoadStoreAddrWithoutOffset(LD[0].first);
+  int64_t Offset = getLoadStoreOffset(LD[0].first);
+  SDValue Chain = LD[0].first->getChain();
+
+  SDLoc dl(N);
+
+  MachineMemOperand *MemOp = LD[0].first->getMemOperand();
+  MachineMemOperand *MMO = CurDAG->getMachineFunction().getMachineMemOperand(
+      MemOp, MemOp->getPointerInfo(), getLLTForMVT(MVT::i16));
+
+  SDValue Off = CurDAG->getTargetConstant(Offset, dl, MVT::i64);
+  SDValue LdrhOps[] = {Addr, Off, Chain};
+
+  MachineSDNode *Ldrh = CurDAG->getMachineNode(
+      AArch64::LDURHHi, dl, LD[0].first->getVTList(), LdrhOps);
+  CurDAG->setNodeMemRefs(Ldrh, {MMO});
+
+  SDNode *Rev16 =
+      CurDAG->getMachineNode(AArch64::REV16Wr, dl, MVT::i32, SDValue(Ldrh, 0));
+
+  SDValue Const = CurDAG->getTargetConstant(8, dl, MVT::i64);
+  SDValue Ops[] = {SDValue(LD[2].first, 0), SDValue(Rev16, 0), Const};
+
+  SDNode *Or = CurDAG->getMachineNode(AArch64::ORRWrs, dl, N->getVTList(), Ops);
+
+  ReplaceNode(N, Or);
+}
+
+void AArch64DAGToDAGISel::SelectRead6(
+    SDNode *N, SmallVector<std::pair<LoadSDNode *, int64_t>> &LD) {
+  assert(N->getOpcode() == ISD::OR && "Expected OR instruction");
+
+  if (N->getValueType(0) != MVT::i64)
+    return;
+
+  SDValue Addr = getLoadStoreAddrWithoutOffset(LD[0].first);
+  int64_t Offset = getLoadStoreOffset(LD[0].first);
+  SDValue Chain = LD[0].first->getChain();
+
+  SDLoc dl(N);
+  SDValue Const0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
+
+  MachineMemOperand *LdrhMemOp = LD[0].first->getMemOperand();
+  MachineMemOperand *LdrhMMO =
+      CurDAG->getMachineFunction().getMachineMemOperand(
+          LdrhMemOp, LdrhMemOp->getPointerInfo(), getLLTForMVT(MVT::i16));
+
+  SDValue LdrhConst = CurDAG->getTargetConstant(Offset, dl, MVT::i64);
+  SDValue LdrhOps[] = {Addr, LdrhConst, Chain};
+
+  MachineSDNode *Ldrh = CurDAG->getMachineNode(
+      AArch64::LDURHHi, dl, LD[0].first->getVTList(), LdrhOps);
+  CurDAG->setNodeMemRefs(Ldrh, {LdrhMMO});
+
+  SDNode *Rev16 =
+      CurDAG->getMachineNode(AArch64::REV16Wr, dl, MVT::i32, SDValue(Ldrh, 0));
+
+  SDValue Sub32 = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i64);
+  SDValue Subreg0Ops[] = {Const0, SDValue(Rev16, 0), Sub32};
+  SDNode *Subreg0 =
+      CurDAG->getMachineNode(AArch64::SUBREG_TO_REG, dl, MVT::i64, Subreg0Ops);
+
+  MachineMemOperand *LdurMemOp = LD[2].first->getMemOperand();
+  MachineMemOperand *LdurMMO =
+      CurDAG->getMachineFunction().getMachineMemOperand(
+          LdurMemOp, LdurMemOp->getPointerInfo(), getLLTForMVT(MVT::i32));
+
+  SDValue LdurConst = CurDAG->getTargetConstant(Offset + 2, dl, MVT::i64);
+  SDValue LdurOps[] = {Addr, LdurConst, Chain};
+
+  MachineSDNode *Ldur = CurDAG->getMachineNode(
+      AArch64::LDURWi, dl, LD[2].first->getVTList(), LdurOps);
+  CurDAG->setNodeMemRefs(Ldur, {LdurMMO});
+
+  SDNode *Rev =
+      CurDAG->getMachineNode(AArch64::REVWr, dl, MVT::i32, SDValue(Ldur, 0));
+
+  SDValue Subreg1Ops[] = {Const0, SDValue(Rev, 0), Sub32};
+  SDNode *Subreg1 =
+      CurDAG->getMachineNode(AArch64::SUBREG_TO_REG, dl, MVT::i64, Subreg1Ops);
+
+  SDValue Const = CurDAG->getTargetConstant(32, dl, MVT::i64);
+  SDValue Ops[] = {SDValue(Subreg1, 0), SDValue(Subreg0, 0), Const};
+  SDNode *Or = CurDAG->getMachineNode(AArch64::ORRXrs, dl, N->getVTList(), Ops);
+
+  ReplaceNode(N, Or);
+}
+
+void AArch64DAGToDAGISel::SelectRead7(
+    SDNode *N, SmallVector<std::pair<LoadSDNode *, int64_t>> &LD) {
+  assert(N->getOpcode() == ISD::OR && "Expected OR instruction");
+
+  if (N->getValueType(0) != MVT::i64)
+    return;
+
+  SDValue Addr = getLoadStoreAddrWithoutOffset(LD[0].first);
+  int64_t Offset = getLoadStoreOffset(LD[0].first);
+  SDValue Chain = LD[0].first->getChain();
+
+  SDLoc dl(N);
+  SDValue Const0 = CurDAG->getTargetConstant(0, dl, MVT::i32);
+
+  MachineMemOperand *LdrhMemOp = LD[0].first->getMemOperand();
+  MachineMemOperand *LdrhMMO =
+      CurDAG->getMachineFunction().getMachineMemOperand(
+          LdrhMemOp, LdrhMemOp->getPointerInfo(), getLLTForMVT(MVT::i16));
+
+  SDValue LdrhConst = CurDAG->getTargetConstant(Offset, dl, MVT::i64);
+  SDValue LdrhOps[] = {Addr, LdrhConst, Chain};
+
+  MachineSDNode *Ldrh = CurDAG->getMachineNode(
+      AArch64::LDURHHi, dl, LD[0].first->getVTList(), LdrhOps);
+  CurDAG->setNodeMemRefs(Ldrh, {LdrhMMO});
+
+  SDNode *Rev16 =
+      CurDAG->getMachineNode(AArch64::REV16Wr, dl, MVT::i32, SDValue(Ldrh, 0));
+
+  SDValue OrrConst = CurDAG->getTargetConstant(8, dl, MVT::i64);
+  SDValue OrrOps[] = {SDValue(LD[2].first, 0), SDValue(Rev16, 0), OrrConst};
+  SDNode *Orr = CurDAG->getMachineNode(AArch64::ORRWrs, dl, MVT::i32, OrrOps);
+
+  SDValue Sub32 = CurDAG->getTargetConstant(AArch64::sub_32, dl, MVT::i64);
+  SDValue Subreg0Ops[] = {Const0, SDValue(Orr, 0), Sub32};
+  SDNode *Subreg0 =
+      CurDAG->getMachineNode(AArch64::SUBREG_TO_REG, dl, MVT::i64, Subreg0Ops);
+
+  MachineMemOperand *LdurMemOp = LD[3].first->getMemOperand();
+  MachineMemOperand *LdurMMO =
+      CurDAG->getMachineFunction().getMachineMemOperand(
+          LdurMemOp, LdurMemOp->getPointerInfo(), getLLTForMVT(MVT::i32));
+
+  SDValue LdurConst = CurDAG->getTargetConstant(Offset + 3, dl, MVT::i64);
+  SDValue LdurOps[] = {Addr, LdurConst, Chain};
+
+  MachineSDNode *Ldur = CurDAG->getMachineNode(
+      AArch64::LDURWi, dl, LD[3].first->getVTList(), LdurOps);
+  CurDAG->setNodeMemRefs(Ldur, {LdurMMO});
+
+  SDNode *Rev =
+      CurDAG->getMachineNode(AArch64::REVWr, dl, MVT::i32, SDValue(Ldur, 0));
+
+  SDValue Subreg1Ops[] = {Const0, SDValue(Rev, 0), Sub32};
+  SDNode *Subreg1 =
+      CurDAG->getMachineNode(AArch64::SUBREG_TO_REG, dl, MVT::i64, Subreg1Ops);
+
+  SDValue Const = CurDAG->getTargetConstant(32, dl, MVT::i64);
+  SDValue Ops[] = {SDValue(Subreg1, 0), SDValue(Subreg0, 0), Const};
+  SDNode *Or = CurDAG->getMachineNode(AArch64::ORRXrs, dl, N->getVTList(), Ops);
+
+  ReplaceNode(N, Or);
+}
+
+bool AArch64DAGToDAGISel::tryReadOpt(SDNode *N) {
+  assert(N->getOpcode() == ISD::OR && "Expected OR instruction");
+
+  // AllLd is a vector of all reachable (load, shift) pairs.
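+  // e.g. for a 3-byte big-endian read, it holds the byte loads at
+  // offsets 0, 1, 2 paired with shifts 16, 8, 0.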
+  SmallVector<std::pair<LoadSDNode *, int64_t>> AllLd;
+  if (!getAllLoads(N, AllLd))
+    return false;
+
+  if (N->getValueType(0) == MVT::i64 && canCombineLoads(AllLd, 7)) {
+    SelectRead7(N, AllLd);
+    return true;
+  }
+  if (N->getValueType(0) == MVT::i64 && canCombineLoads(AllLd, 6)) {
+    SelectRead6(N, AllLd);
+    return true;
+  }
+  if (N->getValueType(0) == MVT::i32 && canCombineLoads(AllLd, 3)) {
+    SelectRead3(N, AllLd);
+    return true;
+  }
+
+  return false;
+}
+
 void AArch64DAGToDAGISel::Select(SDNode *Node) {
   // If we have a custom node, we already have selected!
   if (Node->isMachineOpcode()) {
@@ -4245,6 +4942,10 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
   case ISD::OR:
     if (tryBitfieldInsertOp(Node))
       return;
+    if (EnableEndiannessOpts) {
+      if (tryReadOpt(Node))
+        return;
+    }
     break;
 
   case ISD::EXTRACT_SUBVECTOR: {
@@ -6424,6 +7125,11 @@ void AArch64DAGToDAGISel::Select(SDNode *Node) {
     }
     break;
   }
+  case ISD::TokenFactor:
+    if (EnableEndiannessOpts) {
+      tryWriteOpt(Node);
+    }
+    break;
   }
 
   // Select the default instruction
diff --git a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
index 0f2cb775e98e..d3e23770c065 100644
--- a/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
+++ b/llvm/test/CodeGen/AArch64/merge-trunc-store.ll
@@ -1,6 +1,8 @@
 ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py
-; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,LE
-; RUN: llc < %s -mtriple=aarch64_be-- | FileCheck %s --check-prefixes=CHECK,BE
+; RUN: llc < %s -mtriple=aarch64-- | FileCheck %s --check-prefixes=CHECK,LE,NO-ENDIAN-OPTS
+; RUN: llc < %s -mtriple=aarch64_be-- | FileCheck %s --check-prefixes=CHECK,BE,NO-ENDIAN-OPTS
+; RUN: llc < %s -mtriple=aarch64-- -aarch64-endianness-opts | FileCheck %s --check-prefixes=CHECK,LE,ENDIAN-OPTS
+; RUN: llc < %s -mtriple=aarch64_be-- -aarch64-endianness-opts | FileCheck %s --check-prefixes=CHECK,BE,ENDIAN-OPTS
 
 define void @le_i16_to_i8(i16 %x, ptr %p0) {
 ; LE-LABEL: le_i16_to_i8:
@@ -744,22 +746,38 @@ define void @i32_to_i8_incomplete(i32 %x, ptr %p0) {
 ; Negative test - no store of 't3'
 
 define void @i64_to_i8_incomplete(i64 %x, ptr %p0) {
-; CHECK-LABEL: i64_to_i8_incomplete:
-; CHECK:       // %bb.0:
-; CHECK-NEXT:    lsr x8, x0, #56
-; CHECK-NEXT:    lsr x9, x0, #48
-; CHECK-NEXT:    lsr x10, x0, #40
-; CHECK-NEXT:    lsr x11, x0, #32
-; CHECK-NEXT:    strb w0, [x1, #7]
-; CHECK-NEXT:    strb w8, [x1]
-; CHECK-NEXT:    lsr x8, x0, #16
-; CHECK-NEXT:    strb w9, [x1, #1]
-; CHECK-NEXT:    lsr x9, x0, #8
-; CHECK-NEXT:    strb w10, [x1, #2]
-; CHECK-NEXT:    strb w11, [x1, #3]
-; CHECK-NEXT:    strb w8, [x1, #5]
-; CHECK-NEXT:    strb w9, [x1, #6]
-; CHECK-NEXT:    ret
+; NO-ENDIAN-OPTS-LABEL: i64_to_i8_incomplete:
+; NO-ENDIAN-OPTS:       // %bb.0:
+; NO-ENDIAN-OPTS-NEXT:    lsr x8, x0, #56
+; NO-ENDIAN-OPTS-NEXT:    lsr x9, x0, #48
+; NO-ENDIAN-OPTS-NEXT:    lsr x10, x0, #40
+; NO-ENDIAN-OPTS-NEXT:    lsr x11, x0, #32
+; NO-ENDIAN-OPTS-NEXT:    strb w0, [x1, #7]
+; NO-ENDIAN-OPTS-NEXT:    strb w8, [x1]
+; NO-ENDIAN-OPTS-NEXT:    lsr x8, x0, #16
+; NO-ENDIAN-OPTS-NEXT:    strb w9, [x1, #1]
+; NO-ENDIAN-OPTS-NEXT:    lsr x9, x0, #8
+; NO-ENDIAN-OPTS-NEXT:    strb w10, [x1, #2]
+; NO-ENDIAN-OPTS-NEXT:    strb w11, [x1, #3]
+; NO-ENDIAN-OPTS-NEXT:    strb w8, [x1, #5]
+; NO-ENDIAN-OPTS-NEXT:    strb w9, [x1, #6]
+; NO-ENDIAN-OPTS-NEXT:    ret
+;
+; ENDIAN-OPTS-LABEL: i64_to_i8_incomplete:
+; ENDIAN-OPTS:       // %bb.0:
+; ENDIAN-OPTS-NEXT:    rev16 w8, w0
+; ENDIAN-OPTS-NEXT:    lsr x9, x0, #56
+; ENDIAN-OPTS-NEXT:    lsr x10, x0, #48
+; ENDIAN-OPTS-NEXT:    lsr x11, x0, #40
+; ENDIAN-OPTS-NEXT:    sturh w8, [x1, #6]
+; ENDIAN-OPTS-NEXT:    lsr x8, x0, #32
+; ENDIAN-OPTS-NEXT:    strb w9, [x1]
+; ENDIAN-OPTS-NEXT:    lsr x9, x0, #16
+; ENDIAN-OPTS-NEXT:    strb w10, [x1, #1]
+; ENDIAN-OPTS-NEXT:    strb w11, [x1, #2]
+; ENDIAN-OPTS-NEXT:    strb w8, [x1, #3]
+; ENDIAN-OPTS-NEXT:    strb w9, [x1, #5]
+; ENDIAN-OPTS-NEXT:    ret
   %sh1 = lshr i64 %x, 8
   %sh2 = lshr i64 %x, 16
   %sh3 = lshr i64 %x, 24
diff --git a/llvm/test/CodeGen/AArch64/reduce-load-store.ll b/llvm/test/CodeGen/AArch64/reduce-load-store.ll
new file mode 100644
index 000000000000..bc5fc901cf93
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/reduce-load-store.ll
@@ -0,0 +1,285 @@
+; RUN: llc < %s -mtriple=aarch64 -aarch64-endianness-opts | FileCheck %s --check-prefixes=CHECK
+
+define i32 @test_read_3(ptr %b) {
+; CHECK-LABEL: test_read_3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldurh w8, [x0]
+; CHECK-NEXT:    ldrb w9, [x0, #2]
+; CHECK-NEXT:    rev16 w8, w8
+; CHECK-NEXT:    orr w0, w9, w8, lsl #8
+; CHECK-NEXT:    ret
+entry:
+  %0 = load i8, ptr %b, align 1
+  %conv = zext i8 %0 to i32
+  %shl = shl nuw nsw i32 %conv, 16
+  %arrayidx1 = getelementptr inbounds i8, ptr %b, i64 1
+  %1 = load i8, ptr %arrayidx1, align 1
+  %conv2 = zext i8 %1 to i32
+  %shl3 = shl nuw nsw i32 %conv2, 8
+  %or = or i32 %shl3, %shl
+  %arrayidx4 = getelementptr inbounds i8, ptr %b, i64 2
+  %2 = load i8, ptr %arrayidx4, align 1
+  %conv5 = zext i8 %2 to i32
+  %or6 = or i32 %or, %conv5
+  ret i32 %or6
+}
+
+define i64 @test_read_6(ptr %b) {
+; CHECK-LABEL: test_read_6:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldurh w8, [x0]
+; CHECK-NEXT:    ldur w9, [x0, #2]
+; CHECK-NEXT:    rev16 w8, w8
+; CHECK-NEXT:    rev w9, w9
+; CHECK-NEXT:    orr x0, x9, x8, lsl #32
+; CHECK-NEXT:    ret
+entry:
+  %0 = load i8, ptr %b, align 1
+  %conv.i = zext i8 %0 to i64
+  %arrayidx1.i = getelementptr inbounds i8, ptr %b, i64 1
+  %1 = load i8, ptr %arrayidx1.i, align 1
+  %conv2.i = zext i8 %1 to i64
+  %add.ptr = getelementptr inbounds i8, ptr %b, i64 2
+  %2 = load i8, ptr %add.ptr, align 1
+  %conv.i5 = zext i8 %2 to i64
+  %shl.i6 = shl nuw nsw i64 %conv.i5, 24
+  %arrayidx1.i7 = getelementptr inbounds i8, ptr %b, i64 3
+  %3 = load i8, ptr %arrayidx1.i7, align 1
+  %conv2.i8 = zext i8 %3 to i64
+  %shl3.i = shl nuw nsw i64 %conv2.i8, 16
+  %arrayidx4.i = getelementptr inbounds i8, ptr %b, i64 4
+  %4 = load i8, ptr %arrayidx4.i, align 1
+  %conv5.i = zext i8 %4 to i64
+  %shl6.i = shl nuw nsw i64 %conv5.i, 8
+  %arrayidx8.i = getelementptr inbounds i8, ptr %b, i64 5
+  %5 = load i8, ptr %arrayidx8.i, align 1
+  %conv9.i = zext i8 %5 to i64
+  %6 = shl nuw nsw i64 %conv.i, 40
+  %7 = shl nuw nsw i64 %conv2.i, 32
+  %or.i9 = or i64 %7, %6
+  %or7.i = or i64 %or.i9, %shl.i6
+  %or10.i = or i64 %or7.i, %shl3.i
+  %shl.i10 = or i64 %or10.i, %shl6.i
+  %or.i11 = or i64 %shl.i10, %conv9.i
+  ret i64 %or.i11
+}
+
+define i64 @test_read_7(ptr %b) {
+; CHECK-LABEL: test_read_7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldurh w8, [x0]
+; CHECK-NEXT:    ldrb w9, [x0, #2]
+; CHECK-NEXT:    ldur w10, [x0, #3]
+; CHECK-NEXT:    rev16 w8, w8
+; CHECK-NEXT:    orr w8, w9, w8, lsl #8
+; CHECK-NEXT:    rev w9, w10
+; CHECK-NEXT:    orr x0, x9, x8, lsl #32
+; CHECK-NEXT:    ret
+entry:
+  %0 = load i8, ptr %b, align 1
+  %conv.i = zext i8 %0 to i64
+  %shl.i = shl nuw nsw i64 %conv.i, 16
+  %arrayidx1.i = getelementptr inbounds i8, ptr %b, i64 1
+  %1 = load i8, ptr %arrayidx1.i, align 1
+  %conv2.i = zext i8 %1 to i64
+  %shl3.i = shl nuw nsw i64 %conv2.i, 8
+  %or.i = or i64 %shl3.i, %shl.i
+  %arrayidx4.i = getelementptr inbounds i8, ptr %b, i64 2
+  %2 = load i8, ptr %arrayidx4.i, align 1
+  %conv5.i = zext i8 %2 to i64
+  %or6.i = or i64 %or.i, %conv5.i
+  %add.ptr = getelementptr inbounds i8, ptr %b, i64 3
+  %3 = load i8, ptr %add.ptr, align 1
+  %conv.i5 = zext i8 %3 to i64
+  %shl.i6 = shl nuw nsw i64 %conv.i5, 24
+  %arrayidx1.i7 = getelementptr inbounds i8, ptr %b, i64 4
+  %4 = load i8, ptr %arrayidx1.i7, align 1
+  %conv2.i8 = zext i8 %4 to i64
+  %shl3.i9 = shl nuw nsw i64 %conv2.i8, 16
+  %or.i10 = or i64 %shl3.i9, %shl.i6
+  %arrayidx4.i11 = getelementptr inbounds i8, ptr %b, i64 5
+  %5 = load i8, ptr %arrayidx4.i11, align 1
+  %conv5.i12 = zext i8 %5 to i64
+  %shl6.i = shl nuw nsw i64 %conv5.i12, 8
+  %arrayidx8.i = getelementptr inbounds i8, ptr %b, i64 6
+  %6 = load i8, ptr %arrayidx8.i, align 1
+  %conv9.i = zext i8 %6 to i64
+  %shl.i13 = shl nuw nsw i64 %or6.i, 32
+  %or7.i = or i64 %or.i10, %shl.i13
+  %or10.i = or i64 %or7.i, %shl6.i
+  %or.i14 = or i64 %or10.i, %conv9.i
+  ret i64 %or.i14
+}
+
+define void @test_write_3(ptr %b, i64 %n) {
+; CHECK-LABEL: test_write_3:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev16 w8, w1
+; CHECK-NEXT:    lsr x9, x1, #16
+; CHECK-NEXT:    sturh w8, [x0, #1]
+; CHECK-NEXT:    strb w9, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %shr = lshr i64 %n, 16
+  %conv = trunc i64 %shr to i8
+  store i8 %conv, ptr %b, align 1
+  %shr1 = lshr i64 %n, 8
+  %conv2 = trunc i64 %shr1 to i8
+  %arrayidx3 = getelementptr inbounds i8, ptr %b, i64 1
+  store i8 %conv2, ptr %arrayidx3, align 1
+  %conv4 = trunc i64 %n to i8
+  %arrayidx5 = getelementptr inbounds i8, ptr %b, i64 2
+  store i8 %conv4, ptr %arrayidx5, align 1
+  ret void
+}
+
+define void @test_write_6(ptr %b, i64 %n) {
+; CHECK-LABEL: test_write_6:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    lsr x9, x1, #32
+; CHECK-NEXT:    rev w8, w1
+; CHECK-NEXT:    rev16 w9, w9
+; CHECK-NEXT:    stur w8, [x0, #2]
+; CHECK-NEXT:    sturh w9, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %shr = lshr i64 %n, 32
+  %shr.i = lshr i64 %n, 40
+  %conv.i = trunc i64 %shr.i to i8
+  store i8 %conv.i, ptr %b, align 1
+  %conv1.i = trunc i64 %shr to i8
+  %arrayidx2.i = getelementptr inbounds i8, ptr %b, i64 1
+  store i8 %conv1.i, ptr %arrayidx2.i, align 1
+  %add.ptr = getelementptr inbounds i8, ptr %b, i64 2
+  %shr.i3 = lshr i64 %n, 24
+  %conv.i4 = trunc i64 %shr.i3 to i8
+  store i8 %conv.i4, ptr %add.ptr, align 1
+  %shr1.i = lshr i64 %n, 16
+  %conv2.i = trunc i64 %shr1.i to i8
+  %arrayidx3.i = getelementptr inbounds i8, ptr %b, i64 3
+  store i8 %conv2.i, ptr %arrayidx3.i, align 1
+  %shr4.i = lshr i64 %n, 8
+  %conv5.i = trunc i64 %shr4.i to i8
+  %arrayidx6.i = getelementptr inbounds i8, ptr %b, i64 4
+  store i8 %conv5.i, ptr %arrayidx6.i, align 1
+  %conv7.i = trunc i64 %n to i8
+  %arrayidx8.i = getelementptr inbounds i8, ptr %b, i64 5
+  store i8 %conv7.i, ptr %arrayidx8.i, align 1
+  ret void
+}
+
+define void @test_write_7(ptr %b, i64 %n) {
+; CHECK-LABEL: test_write_7:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    lsr x9, x1, #32
+; CHECK-NEXT:    rev w8, w1
+; CHECK-NEXT:    rev16 w9, w9
+; CHECK-NEXT:    lsr w10, w9, #24
+; CHECK-NEXT:    stur w8, [x0, #3]
+; CHECK-NEXT:    sturh w9, [x0, #1]
+; CHECK-NEXT:    strb w10, [x0]
+; CHECK-NEXT:    ret
+entry:
+  %shr = lshr i64 %n, 32
+  %shr.i = lshr i64 %n, 48
+  %conv.i = trunc i64 %shr.i to i8
+  store i8 %conv.i, ptr %b, align 1
+  %shr1.i = lshr i64 %n, 40
+  %conv2.i = trunc i64 %shr1.i to i8
+  %arrayidx3.i = getelementptr inbounds i8, ptr %b, i64 1
+  store i8 %conv2.i, ptr %arrayidx3.i, align 1
+  %conv4.i = trunc i64 %shr to i8
+  %arrayidx5.i = getelementptr inbounds i8, ptr %b, i64 2
+  store i8 %conv4.i, ptr %arrayidx5.i, align 1
+  %add.ptr = getelementptr inbounds i8, ptr %b, i64 3
+  %shr.i3 = lshr i64 %n, 24
+  %conv.i4 = trunc i64 %shr.i3 to i8
+  store i8 %conv.i4, ptr %add.ptr, align 1
+  %shr1.i5 = lshr i64 %n, 16
+  %conv2.i6 = trunc i64 %shr1.i5 to i8
+  %arrayidx3.i7 = getelementptr inbounds i8, ptr %b, i64 4
+  store i8 %conv2.i6, ptr %arrayidx3.i7, align 1
+  %shr4.i = lshr i64 %n, 8
+  %conv5.i = trunc i64 %shr4.i to i8
+  %arrayidx6.i = getelementptr inbounds i8, ptr %b, i64 5
+  store i8 %conv5.i, ptr %arrayidx6.i, align 1
+  %conv7.i = trunc i64 %n to i8
+  %arrayidx8.i = getelementptr inbounds i8, ptr %b, i64 6
+  store i8 %conv7.i, ptr %arrayidx8.i, align 1
+  ret void
+}
+
+define void @test_write_with_different_chains(ptr noalias %b, i64 %n, ptr noalias %c) {
+; CHECK-LABEL: test_write_with_different_chains:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    ldr x8, [x2]
+; CHECK-NEXT:    lsr x9, x1, #16
+; CHECK-NEXT:    rev16 w10, w1
+; CHECK-NEXT:    rev16 w11, w8
+; CHECK-NEXT:    lsr x8, x8, #16
+; CHECK-NEXT:    strb w9, [x0]
+; CHECK-NEXT:    sturh w10, [x0, #1]
+; CHECK-NEXT:    sturh w11, [x2, #1]
+; CHECK-NEXT:    strb w8, [x2]
+; CHECK-NEXT:    ret
+entry:
+  %shr = lshr i64 %n, 16
+  %conv = trunc i64 %shr to i8
+  store i8 %conv, ptr %b, align 1
+  %shr1 = lshr i64 %n, 8
+  %conv2 = trunc i64 %shr1 to i8
+  %arrayidx3 = getelementptr inbounds i8, ptr %b, i64 1
+  store i8 %conv2, ptr %arrayidx3, align 1
+  %conv4 = trunc i64 %n to i8
+  %arrayidx5 = getelementptr inbounds i8, ptr %b, i64 2
+  store i8 %conv4, ptr %arrayidx5, align 1
+  %0 = load i64, ptr %c, align 1
+  %shr10 = lshr i64 %0, 16
+  %conv10 = trunc i64 %shr10 to i8
+  store i8 %conv10, ptr %c, align 1
+  %shr11 = lshr i64 %0, 8
+  %conv12 = trunc i64 %shr11 to i8
+  %arrayidx13 = getelementptr inbounds i8, ptr %c, i64 1
+  store i8 %conv12, ptr %arrayidx13, align 1
+  %conv14 = trunc i64 %0 to i8
+  %arrayidx15 = getelementptr inbounds i8, ptr %c, i64 2
+  store i8 %conv14, ptr %arrayidx15, align 1
+  ret void
+}
+
+define void @test_write_multiple_to_same_address(ptr %b, i64 %n, i64 %m) {
+; CHECK-LABEL: test_write_multiple_to_same_address:
+; CHECK:       // %bb.0: // %entry
+; CHECK-NEXT:    rev16 w8, w2
+; CHECK-NEXT:    rev16 w9, w1
+; CHECK-NEXT:    lsr x10, x1, #16
+; CHECK-NEXT:    lsr x11, x2, #16
+; CHECK-NEXT:    sturh w8, [x0, #8]
+; CHECK-NEXT:    sturh w9, [x0, #1]
+; CHECK-NEXT:    strb w10, [x0]
+; CHECK-NEXT:    strb w11, [x0, #7]
+; CHECK-NEXT:    ret
+entry:
+  %shr = lshr i64 %n, 16
+  %conv = trunc i64 %shr to i8
+  store i8 %conv, ptr %b, align 1
+  %shr1 = lshr i64 %n, 8
+  %conv2 = trunc i64 %shr1 to i8
+  %arrayidx3 = getelementptr inbounds i8, ptr %b, i64 1
+  store i8 %conv2, ptr %arrayidx3, align 1
+  %conv4 = trunc i64 %n to i8
+  %arrayidx5 = getelementptr inbounds i8, ptr %b, i64 2
+  store i8 %conv4, ptr %arrayidx5, align 1
+  %shr10 = lshr i64 %m, 16
+  %conv10 = trunc i64 %shr10 to i8
+  %arrayidx11 = getelementptr inbounds i8, ptr %b, i64 7
+  store i8 %conv10, ptr %arrayidx11, align 1
+  %shr11 = lshr i64 %m, 8
+  %conv12 = trunc i64 %shr11 to i8
+  %arrayidx13 = getelementptr inbounds i8, ptr %b, i64 8
+  store i8 %conv12, ptr %arrayidx13, align 1
+  %conv14 = trunc i64 %m to i8
+  %arrayidx15 = getelementptr inbounds i8, ptr %b, i64 9
+  store i8 %conv14, ptr %arrayidx15, align 1
+  ret void
+}
\ No newline at end of file
-- 
Gitee
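
For reference, a minimal sketch (not part of the patch) of the C-level
byte-access patterns these new selectors target; the function names here are
illustrative only. Compiled for little-endian AArch64 with
-aarch64-endianness-opts, read_be24 should lower to the ldurh/ldrb/rev16/orr
sequence checked in test_read_3, and write_be24 to the rev16/sturh/strb
sequence checked in test_write_3:

    #include <stdint.h>

    /* 3-byte big-endian read: the zext/shl/or chain of byte loads that
       SelectRead3 merges into ldurh + ldrb, fixed up with rev16. */
    uint32_t read_be24(const uint8_t *b) {
      return ((uint32_t)b[0] << 16) | ((uint32_t)b[1] << 8) | (uint32_t)b[2];
    }

    /* 3-byte big-endian write: the lshr/trunc/store chain of byte stores
       that SelectWrite3 merges into sturh + strb, with rev16 producing the
       byte-swapped halfword. */
    void write_be24(uint8_t *b, uint64_t n) {
      b[0] = (uint8_t)(n >> 16);
      b[1] = (uint8_t)(n >> 8);
      b[2] = (uint8_t)n;
    }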