From 6296ebd45d3f916bea6bf434c1b5580441f9234a Mon Sep 17 00:00:00 2001 From: Tobias Hieta Date: Tue, 15 Jul 2025 15:59:05 +0200 Subject: [PATCH 1/9] Bump version to 21.1.0-git --- cmake/Modules/LLVMVersion.cmake | 2 +- libcxx/include/__config | 2 +- llvm/utils/gn/secondary/llvm/version.gni | 2 +- llvm/utils/lit/lit/__init__.py | 2 +- llvm/utils/mlgo-utils/mlgo/__init__.py | 2 +- 5 files changed, 5 insertions(+), 5 deletions(-) diff --git a/cmake/Modules/LLVMVersion.cmake b/cmake/Modules/LLVMVersion.cmake index f14aae172f077..c12240f98e97f 100644 --- a/cmake/Modules/LLVMVersion.cmake +++ b/cmake/Modules/LLVMVersion.cmake @@ -4,7 +4,7 @@ if(NOT DEFINED LLVM_VERSION_MAJOR) set(LLVM_VERSION_MAJOR 21) endif() if(NOT DEFINED LLVM_VERSION_MINOR) - set(LLVM_VERSION_MINOR 0) + set(LLVM_VERSION_MINOR 1) endif() if(NOT DEFINED LLVM_VERSION_PATCH) set(LLVM_VERSION_PATCH 0) diff --git a/libcxx/include/__config b/libcxx/include/__config index d940461c30234..8f215bbe47928 100644 --- a/libcxx/include/__config +++ b/libcxx/include/__config @@ -28,7 +28,7 @@ // _LIBCPP_VERSION represents the version of libc++, which matches the version of LLVM. // Given a LLVM release LLVM XX.YY.ZZ (e.g. LLVM 17.0.1 == 17.00.01), _LIBCPP_VERSION is // defined to XXYYZZ. -# define _LIBCPP_VERSION 210000 +# define _LIBCPP_VERSION 210100 # define _LIBCPP_CONCAT_IMPL(_X, _Y) _X##_Y # define _LIBCPP_CONCAT(_X, _Y) _LIBCPP_CONCAT_IMPL(_X, _Y) diff --git a/llvm/utils/gn/secondary/llvm/version.gni b/llvm/utils/gn/secondary/llvm/version.gni index 2b1a9076afe4a..ece4106de4aca 100644 --- a/llvm/utils/gn/secondary/llvm/version.gni +++ b/llvm/utils/gn/secondary/llvm/version.gni @@ -1,4 +1,4 @@ llvm_version_major = 21 -llvm_version_minor = 0 +llvm_version_minor = 1 llvm_version_patch = 0 llvm_version = "$llvm_version_major.$llvm_version_minor.$llvm_version_patch" diff --git a/llvm/utils/lit/lit/__init__.py b/llvm/utils/lit/lit/__init__.py index b5aa8edc03dc7..520ff22dc6fb0 100644 --- a/llvm/utils/lit/lit/__init__.py +++ b/llvm/utils/lit/lit/__init__.py @@ -2,7 +2,7 @@ __author__ = "Daniel Dunbar" __email__ = "daniel@minormatter.com" -__versioninfo__ = (21, 0, 0) +__versioninfo__ = (21, 1, 0) __version__ = ".".join(str(v) for v in __versioninfo__) + "dev" __all__ = [] diff --git a/llvm/utils/mlgo-utils/mlgo/__init__.py b/llvm/utils/mlgo-utils/mlgo/__init__.py index d3369abae70b9..03eee0028b3cc 100644 --- a/llvm/utils/mlgo-utils/mlgo/__init__.py +++ b/llvm/utils/mlgo-utils/mlgo/__init__.py @@ -4,7 +4,7 @@ from datetime import timezone, datetime -__versioninfo__ = (20, 0, 0) +__versioninfo__ = (21, 1, 0) __version__ = ( ".".join(str(v) for v in __versioninfo__) + "dev" From 18624ae54bc979e47ad990721eb20eb9ca982a2f Mon Sep 17 00:00:00 2001 From: Martin Erhart Date: Tue, 15 Jul 2025 14:48:05 +0100 Subject: [PATCH 2/9] [mlir][SliceAnalysis] Fix stack overflow in graph regions (#139694) This analysis currently just crashes when applied to a graph region that has a use-def cycle. This PR fixes that by keeping track of the operations the DFS has already visited when following use-def edges and stopping once we visit an operation again. 
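
For illustration, here is a minimal standalone sketch of the traversal pattern this fix adopts, written in plain C++ with standard containers rather than the actual MLIR API; the Node type and all function names below are illustrative only and are not part of this patch. The idea is that the slice itself acts as the "already finished" memo, while a separate set marks the operations on the current DFS path, so a use-def cycle is cut the first time an operation would be revisited instead of recursing until the stack overflows:

    #include <unordered_set>
    #include <vector>

    struct Node { std::vector<Node *> users; };

    // Postorder, cycle-tolerant DFS: 'finished' is the memo of nodes already
    // in the slice, 'onPath' marks the nodes on the current DFS path so that
    // a use-def cycle is cut when a node would be revisited.
    static void forwardSliceImpl(Node *n, std::vector<Node *> &slice,
                                 std::unordered_set<Node *> &finished,
                                 std::unordered_set<Node *> &onPath) {
      for (Node *user : n->users) {
        if (finished.count(user) || !onPath.insert(user).second)
          continue; // already sliced, or a cycle back onto the current path
        forwardSliceImpl(user, slice, finished, onPath);
        onPath.erase(user); // unwind so 'onPath' only holds the current path
      }
      if (finished.insert(n).second)
        slice.push_back(n); // postorder: users first, then the node itself
    }

    // Entry point: seed the path set with the root before recursing,
    // mirroring how the patch seeds 'visited' with the query op.
    std::vector<Node *> getForwardSlice(Node *root) {
      std::vector<Node *> slice;
      std::unordered_set<Node *> finished, onPath{root};
      forwardSliceImpl(root, slice, finished, onPath);
      return slice;
    }

The new slicing-utils.mlir test added below exercises exactly this situation: two "slicing-test-op"s inside an isolated graph region that use each other's results.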
--- mlir/include/mlir/Analysis/SliceAnalysis.h | 10 ++-- mlir/lib/Analysis/SliceAnalysis.cpp | 65 ++++++++++++++++----- mlir/test/Dialect/Affine/slicing-utils.mlir | 23 ++++++++ 3 files changed, 79 insertions(+), 19 deletions(-) diff --git a/mlir/include/mlir/Analysis/SliceAnalysis.h b/mlir/include/mlir/Analysis/SliceAnalysis.h index d082d2d9f758b..18349d071bb2e 100644 --- a/mlir/include/mlir/Analysis/SliceAnalysis.h +++ b/mlir/include/mlir/Analysis/SliceAnalysis.h @@ -65,8 +65,9 @@ using ForwardSliceOptions = SliceOptions; /// /// The implementation traverses the use chains in postorder traversal for /// efficiency reasons: if an operation is already in `forwardSlice`, no -/// need to traverse its uses again. Since use-def chains form a DAG, this -/// terminates. +/// need to traverse its uses again. In the presence of use-def cycles in a +/// graph region, the traversal stops at the first operation that was already +/// visited (which is not added to the slice anymore). /// /// Upon return to the root call, `forwardSlice` is filled with a /// postorder list of uses (i.e. a reverse topological order). To get a proper @@ -114,8 +115,9 @@ void getForwardSlice(Value root, SetVector *forwardSlice, /// /// The implementation traverses the def chains in postorder traversal for /// efficiency reasons: if an operation is already in `backwardSlice`, no -/// need to traverse its definitions again. Since useuse-def chains form a DAG, -/// this terminates. +/// need to traverse its definitions again. In the presence of use-def cycles +/// in a graph region, the traversal stops at the first operation that was +/// already visited (which is not added to the slice anymore). /// /// Upon return to the root call, `backwardSlice` is filled with a /// postorder list of defs. This happens to be a topological order, from the diff --git a/mlir/lib/Analysis/SliceAnalysis.cpp b/mlir/lib/Analysis/SliceAnalysis.cpp index 36a9812bd7972..991c71e3f689a 100644 --- a/mlir/lib/Analysis/SliceAnalysis.cpp +++ b/mlir/lib/Analysis/SliceAnalysis.cpp @@ -26,7 +26,8 @@ using namespace mlir; static void -getForwardSliceImpl(Operation *op, SetVector *forwardSlice, +getForwardSliceImpl(Operation *op, DenseSet &visited, + SetVector *forwardSlice, const SliceOptions::TransitiveFilter &filter = nullptr) { if (!op) return; @@ -40,20 +41,41 @@ getForwardSliceImpl(Operation *op, SetVector *forwardSlice, for (Region ®ion : op->getRegions()) for (Block &block : region) for (Operation &blockOp : block) - if (forwardSlice->count(&blockOp) == 0) - getForwardSliceImpl(&blockOp, forwardSlice, filter); - for (Value result : op->getResults()) { - for (Operation *userOp : result.getUsers()) - if (forwardSlice->count(userOp) == 0) - getForwardSliceImpl(userOp, forwardSlice, filter); - } + if (forwardSlice->count(&blockOp) == 0) { + // We don't have to check if the 'blockOp' is already visited because + // there cannot be a traversal path from this nested op to the parent + // and thus a cycle cannot be closed here. We still have to mark it + // as visited to stop before visiting this operation again if it is + // part of a cycle. 
+ visited.insert(&blockOp); + getForwardSliceImpl(&blockOp, visited, forwardSlice, filter); + visited.erase(&blockOp); + } + + for (Value result : op->getResults()) + for (Operation *userOp : result.getUsers()) { + // A cycle can only occur within a basic block (not across regions or + // basic blocks) because the parent region must be a graph region, graph + // regions are restricted to always have 0 or 1 blocks, and there cannot + // be a def-use edge from a nested operation to an operation in an + // ancestor region. Therefore, we don't have to but may use the same + // 'visited' set across regions/blocks as long as we remove operations + // from the set again when the DFS traverses back from the leaf to the + // root. + if (forwardSlice->count(userOp) == 0 && visited.insert(userOp).second) + getForwardSliceImpl(userOp, visited, forwardSlice, filter); + + visited.erase(userOp); + } forwardSlice->insert(op); } void mlir::getForwardSlice(Operation *op, SetVector *forwardSlice, const ForwardSliceOptions &options) { - getForwardSliceImpl(op, forwardSlice, options.filter); + DenseSet visited; + visited.insert(op); + getForwardSliceImpl(op, visited, forwardSlice, options.filter); if (!options.inclusive) { // Don't insert the top level operation, we just queried on it and don't // want it in the results. @@ -69,8 +91,12 @@ void mlir::getForwardSlice(Operation *op, SetVector *forwardSlice, void mlir::getForwardSlice(Value root, SetVector *forwardSlice, const SliceOptions &options) { - for (Operation *user : root.getUsers()) - getForwardSliceImpl(user, forwardSlice, options.filter); + DenseSet visited; + for (Operation *user : root.getUsers()) { + visited.insert(user); + getForwardSliceImpl(user, visited, forwardSlice, options.filter); + visited.erase(user); + } // Reverse to get back the actual topological order. 
// std::reverse does not work out of the box on SetVector and I want an @@ -80,6 +106,7 @@ void mlir::getForwardSlice(Value root, SetVector *forwardSlice, } static LogicalResult getBackwardSliceImpl(Operation *op, + DenseSet &visited, SetVector *backwardSlice, const BackwardSliceOptions &options) { if (!op || op->hasTrait()) @@ -93,8 +120,12 @@ static LogicalResult getBackwardSliceImpl(Operation *op, auto processValue = [&](Value value) { if (auto *definingOp = value.getDefiningOp()) { - if (backwardSlice->count(definingOp) == 0) - return getBackwardSliceImpl(definingOp, backwardSlice, options); + if (backwardSlice->count(definingOp) == 0 && + visited.insert(definingOp).second) + return getBackwardSliceImpl(definingOp, visited, backwardSlice, + options); + + visited.erase(definingOp); } else if (auto blockArg = dyn_cast(value)) { if (options.omitBlockArguments) return success(); @@ -107,7 +138,8 @@ static LogicalResult getBackwardSliceImpl(Operation *op, if (parentOp && backwardSlice->count(parentOp) == 0) { if (parentOp->getNumRegions() == 1 && llvm::hasSingleElement(parentOp->getRegion(0).getBlocks())) { - return getBackwardSliceImpl(parentOp, backwardSlice, options); + return getBackwardSliceImpl(parentOp, visited, backwardSlice, + options); } } } else { @@ -145,7 +177,10 @@ static LogicalResult getBackwardSliceImpl(Operation *op, LogicalResult mlir::getBackwardSlice(Operation *op, SetVector *backwardSlice, const BackwardSliceOptions &options) { - LogicalResult result = getBackwardSliceImpl(op, backwardSlice, options); + DenseSet visited; + visited.insert(op); + LogicalResult result = + getBackwardSliceImpl(op, visited, backwardSlice, options); if (!options.inclusive) { // Don't insert the top level operation, we just queried on it and don't diff --git a/mlir/test/Dialect/Affine/slicing-utils.mlir b/mlir/test/Dialect/Affine/slicing-utils.mlir index 0848a924b9d96..c53667a98cfbe 100644 --- a/mlir/test/Dialect/Affine/slicing-utils.mlir +++ b/mlir/test/Dialect/Affine/slicing-utils.mlir @@ -292,3 +292,26 @@ func.func @slicing_test_multiple_return(%arg0: index) -> (index, index) { %0:2 = "slicing-test-op"(%arg0, %arg0): (index, index) -> (index, index) return %0#0, %0#1 : index, index } + +// ----- + +// FWD-LABEL: graph_region_with_cycle +// BWD-LABEL: graph_region_with_cycle +// FWDBWD-LABEL: graph_region_with_cycle +func.func @graph_region_with_cycle() { + test.isolated_graph_region { + // FWD: matched: [[V0:%.+]] = "slicing-test-op"([[V1:%.+]]) : (i1) -> i1 forward static slice: + // FWD: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 + // FWD: matched: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 forward static slice: + // FWD: [[V0]] = "slicing-test-op"([[V1]]) : (i1) -> i1 + + // BWD: matched: [[V0:%.+]] = "slicing-test-op"([[V1:%.+]]) : (i1) -> i1 backward static slice: + // BWD: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 + // BWD: matched: [[V1]] = "slicing-test-op"([[V0]]) : (i1) -> i1 backward static slice: + // BWD: [[V0]] = "slicing-test-op"([[V1]]) : (i1) -> i1 + %0 = "slicing-test-op"(%1) : (i1) -> i1 + %1 = "slicing-test-op"(%0) : (i1) -> i1 + } + + return +} From 588b8130794f7ce53fe30237f6bf5614b7122f45 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 15 Jul 2025 14:53:05 +0100 Subject: [PATCH 3/9] [AArch64] Use correct regclass for spills of ZPR2/ZPR4 (#148806) Commit a6293228fdd5aba8c04c63f02f3d017443feb3f2 forced the register class of ZPR[24]StridedOrContiguous for spills/fills of ZPR2 and ZPR4, but this may result in issues when the regclass for the 
fill is a ZPR2/ZPR4 which would allow the register allocator to pick `z1_z2`, which is not a supported register for ZPR2StridedOrContiguous that only supports tuples of the form (strided) `z0_z8`, `z1_z9` or (contiguous, start at multiple of 2) `z0_z1`, `z2_z3`. For spills we could add a new register class that supports any of the tuple forms, but I've decided to use two pseudos similar to the fills for consistency. Fixes https://github.com/llvm/llvm-project/issues/148655 --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 4 ++ llvm/lib/Target/AArch64/AArch64InstrInfo.cpp | 40 ++++++++++++--- .../lib/Target/AArch64/AArch64SVEInstrInfo.td | 18 ++++--- llvm/test/CodeGen/AArch64/spillfill-sve.mir | 49 ++++++++++--------- 4 files changed, 73 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 36f3a670808d4..07b36d20b0c6d 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -1591,18 +1591,22 @@ bool AArch64ExpandPseudo::expandMI(MachineBasicBlock &MBB, "Non-writeback variants of STGloop / STZGloop should not " "survive past PrologEpilogInserter."); case AArch64::STR_ZZZZXI: + case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS: return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 4); case AArch64::STR_ZZZXI: return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 3); case AArch64::STR_ZZXI: + case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS: return expandSVESpillFill(MBB, MBBI, AArch64::STR_ZXI, 2); case AArch64::STR_PPXI: return expandSVESpillFill(MBB, MBBI, AArch64::STR_PXI, 2); case AArch64::LDR_ZZZZXI: + case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 4); case AArch64::LDR_ZZZXI: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 3); case AArch64::LDR_ZZXI: + case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_ZXI, 2); case AArch64::LDR_PPXI: return expandSVESpillFill(MBB, MBBI, AArch64::LDR_PXI, 2); diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp index c1474773faa76..5420545cc3cec 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.cpp @@ -2482,8 +2482,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::LDR_PXI: case AArch64::LDR_ZXI: case AArch64::LDR_ZZXI: + case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS: case AArch64::LDR_ZZZXI: case AArch64::LDR_ZZZZXI: + case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS: case AArch64::LDRBBui: case AArch64::LDRBui: case AArch64::LDRDui: @@ -2525,8 +2527,10 @@ unsigned AArch64InstrInfo::getLoadStoreImmIdx(unsigned Opc) { case AArch64::STR_PXI: case AArch64::STR_ZXI: case AArch64::STR_ZZXI: + case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS: case AArch64::STR_ZZZXI: case AArch64::STR_ZZZZXI: + case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS: case AArch64::STRBBui: case AArch64::STRBui: case AArch64::STRDui: @@ -4318,7 +4322,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, break; // SVE case AArch64::STR_ZZZZXI: + case AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS: case AArch64::LDR_ZZZZXI: + case AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS: Scale = TypeSize::getScalable(16); Width = TypeSize::getScalable(16 * 4); MinOffset = -256; @@ -4332,7 +4338,9 @@ bool AArch64InstrInfo::getMemOpInfo(unsigned Opcode, TypeSize &Scale, MaxOffset = 253; break; case AArch64::STR_ZZXI: + 
case AArch64::STR_ZZXI_STRIDED_CONTIGUOUS: case AArch64::LDR_ZZXI: + case AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS: Scale = TypeSize::getScalable(16); Width = TypeSize::getScalable(16 * 2); MinOffset = -256; @@ -5559,8 +5567,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Twov2d; Offset = false; - } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || - AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected register store without SVE store instructions"); + Opc = AArch64::STR_ZZXI_STRIDED_CONTIGUOUS; + StackID = TargetStackID::ScalableVector; + } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZZXI; @@ -5584,8 +5596,12 @@ void AArch64InstrInfo::storeRegToStackSlot(MachineBasicBlock &MBB, assert(Subtarget.hasNEON() && "Unexpected register store without NEON"); Opc = AArch64::ST1Fourv2d; Offset = false; - } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || - AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected register store without SVE store instructions"); + Opc = AArch64::STR_ZZZZXI_STRIDED_CONTIGUOUS; + StackID = TargetStackID::ScalableVector; + } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register store without SVE store instructions"); Opc = AArch64::STR_ZZZZXI; @@ -5736,8 +5752,12 @@ void AArch64InstrInfo::loadRegFromStackSlot( assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Twov2d; Offset = false; - } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC) || - AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR2StridedOrContiguousRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected register load without SVE load instructions"); + Opc = AArch64::LDR_ZZXI_STRIDED_CONTIGUOUS; + StackID = TargetStackID::ScalableVector; + } else if (AArch64::ZPR2RegClass.hasSubClassEq(RC)) { assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZZXI; @@ -5761,8 +5781,12 @@ void AArch64InstrInfo::loadRegFromStackSlot( assert(Subtarget.hasNEON() && "Unexpected register load without NEON"); Opc = AArch64::LD1Fourv2d; Offset = false; - } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC) || - AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { + } else if (AArch64::ZPR4StridedOrContiguousRegClass.hasSubClassEq(RC)) { + assert(Subtarget.isSVEorStreamingSVEAvailable() && + "Unexpected register load without SVE load instructions"); + Opc = AArch64::LDR_ZZZZXI_STRIDED_CONTIGUOUS; + StackID = TargetStackID::ScalableVector; + } else if (AArch64::ZPR4RegClass.hasSubClassEq(RC)) { assert(Subtarget.isSVEorStreamingSVEAvailable() && "Unexpected register load without SVE load instructions"); Opc = AArch64::LDR_ZZZZXI; diff --git a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td index eddb96979f7b8..0c4b4f4c3ed88 100644 --- 
a/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64SVEInstrInfo.td @@ -2625,16 +2625,22 @@ let Predicates = [HasSVE_or_SME] in { // These get expanded to individual LDR_ZXI/STR_ZXI instructions in // AArch64ExpandPseudoInsts. let mayLoad = 1, hasSideEffects = 0 in { - def LDR_ZZXI : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + + def LDR_ZZXI : Pseudo<(outs ZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; def LDR_ZZZXI : Pseudo<(outs ZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; - def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b_strided_and_contiguous:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; - def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_ZZZZXI : Pseudo<(outs ZZZZ_b:$Zd), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def LDR_PPXI : Pseudo<(outs PPR2:$pp), (ins GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; } let mayStore = 1, hasSideEffects = 0 in { - def STR_ZZXI : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZZZXI_STRIDED_CONTIGUOUS : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + + def STR_ZZXI : Pseudo<(outs), (ins ZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; def STR_ZZZXI : Pseudo<(outs), (ins ZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; - def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b_strided_and_contiguous:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; - def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_ZZZZXI : Pseudo<(outs), (ins ZZZZ_b:$Zs, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; + def STR_PPXI : Pseudo<(outs), (ins PPR2:$pp, GPR64sp:$sp, simm4s1:$offset),[]>, Sched<[]>; } let AddedComplexity = 1 in { diff --git a/llvm/test/CodeGen/AArch64/spillfill-sve.mir b/llvm/test/CodeGen/AArch64/spillfill-sve.mir index 83c9b73c57570..2b16dd0f29ecc 100644 --- a/llvm/test/CodeGen/AArch64/spillfill-sve.mir +++ b/llvm/test/CodeGen/AArch64/spillfill-sve.mir @@ -1,5 +1,5 @@ -# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy %s -o - | FileCheck %s -# RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs %s -o - | FileCheck %s --check-prefix=EXPAND +# RUN: llc -mtriple=aarch64-linux-gnu -run-pass=greedy -aarch64-stack-hazard-size=0 %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-linux-gnu -start-before=greedy -stop-after=aarch64-expand-pseudo -verify-machineinstrs -aarch64-stack-hazard-size=0 %s -o - | FileCheck %s --check-prefix=EXPAND --- | ; ModuleID = '' source_filename = "" @@ -14,13 +14,14 @@ define aarch64_sve_vector_pcs void @spills_fills_stack_id_virtreg_ppr_to_pnr() #1 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2() #0 { entry: unreachable } - define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #0 
{ entry: unreachable } + define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr2strided() #2 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr3() #0 { entry: unreachable } define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4() #0 { entry: unreachable } - define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #0 { entry: unreachable } + define aarch64_sve_vector_pcs void @spills_fills_stack_id_zpr4strided() #2 { entry: unreachable } attributes #0 = { nounwind "target-features"="+sve" } attributes #1 = { nounwind "target-features"="+sve2p1" } + attributes #2 = { nounwind "target-features"="+sve,+sme2" "aarch64_pstate_sm_enabled" } ... --- @@ -318,10 +319,10 @@ registers: - { id: 0, class: zpr2 } stack: liveins: - - { reg: '$z0_z1', virtual-reg: '%0' } + - { reg: '$z1_z2', virtual-reg: '%0' } body: | bb.0.entry: - liveins: $z0_z1 + liveins: $z1_z2 ; CHECK-LABEL: name: spills_fills_stack_id_zpr2 ; CHECK: stack: @@ -329,12 +330,12 @@ body: | ; CHECK-NEXT: stack-id: scalable-vector ; EXPAND-LABEL: name: spills_fills_stack_id_zpr2 - ; EXPAND: STR_ZXI $z0, $sp, 0 - ; EXPAND: STR_ZXI $z1, $sp, 1 - ; EXPAND: $z0 = LDR_ZXI $sp, 0 - ; EXPAND: $z1 = LDR_ZXI $sp, 1 + ; EXPAND: STR_ZXI $z1, $sp, 0 + ; EXPAND: STR_ZXI $z2, $sp, 1 + ; EXPAND: $z1 = LDR_ZXI $sp, 0 + ; EXPAND: $z2 = LDR_ZXI $sp, 1 - %0:zpr2 = COPY $z0_z1 + %0:zpr2 = COPY $z1_z2 $z0_z1_z2_z3 = IMPLICIT_DEF $z4_z5_z6_z7 = IMPLICIT_DEF @@ -345,7 +346,7 @@ body: | $z24_z25_z26_z27 = IMPLICIT_DEF $z28_z29_z30_z31 = IMPLICIT_DEF - $z0_z1 = COPY %0 + $z1_z2 = COPY %0 RET_ReallyLR ... --- @@ -439,10 +440,10 @@ registers: - { id: 0, class: zpr4 } stack: liveins: - - { reg: '$z0_z1_z2_z3', virtual-reg: '%0' } + - { reg: '$z1_z2_z3_z4', virtual-reg: '%0' } body: | bb.0.entry: - liveins: $z0_z1_z2_z3 + liveins: $z1_z2_z3_z4 ; CHECK-LABEL: name: spills_fills_stack_id_zpr4 ; CHECK: stack: @@ -450,16 +451,16 @@ body: | ; CHECK-NEXT: stack-id: scalable-vector ; EXPAND-LABEL: name: spills_fills_stack_id_zpr4 - ; EXPAND: STR_ZXI $z0, $sp, 0 - ; EXPAND: STR_ZXI $z1, $sp, 1 - ; EXPAND: STR_ZXI $z2, $sp, 2 - ; EXPAND: STR_ZXI $z3, $sp, 3 - ; EXPAND: $z0 = LDR_ZXI $sp, 0 - ; EXPAND: $z1 = LDR_ZXI $sp, 1 - ; EXPAND: $z2 = LDR_ZXI $sp, 2 - ; EXPAND: $z3 = LDR_ZXI $sp, 3 + ; EXPAND: STR_ZXI $z1, $sp, 0 + ; EXPAND: STR_ZXI $z2, $sp, 1 + ; EXPAND: STR_ZXI $z3, $sp, 2 + ; EXPAND: STR_ZXI $z4, $sp, 3 + ; EXPAND: $z1 = LDR_ZXI $sp, 0 + ; EXPAND: $z2 = LDR_ZXI $sp, 1 + ; EXPAND: $z3 = LDR_ZXI $sp, 2 + ; EXPAND: $z4 = LDR_ZXI $sp, 3 - %0:zpr4 = COPY $z0_z1_z2_z3 + %0:zpr4 = COPY $z1_z2_z3_z4 $z0_z1_z2_z3 = IMPLICIT_DEF $z4_z5_z6_z7 = IMPLICIT_DEF @@ -470,7 +471,7 @@ body: | $z24_z25_z26_z27 = IMPLICIT_DEF $z28_z29_z30_z31 = IMPLICIT_DEF - $z0_z1_z2_z3 = COPY %0 + $z1_z2_z3_z4 = COPY %0 RET_ReallyLR ... --- From d1517ec62222584304951fcf63ce35d8fd0942f2 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Tue, 15 Jul 2025 14:53:47 +0100 Subject: [PATCH 4/9] [AArch64] Ensure bundle expansion of MOVPRFX gets correct implicit ops (#148824) By finalizing the bundle _after_ copying over the implicit-ops, it also adds any implicit-defs to the BUNDLE. 
Fixes https://github.com/llvm/llvm-project/issues/148645 --- .../AArch64/AArch64ExpandPseudoInsts.cpp | 2 +- .../AArch64/sve-pseudos-expand-undef.mir | 20 ++++++++++++++++--- 2 files changed, 18 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp index 07b36d20b0c6d..7de66ccbf6f29 100644 --- a/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp +++ b/llvm/lib/Target/AArch64/AArch64ExpandPseudoInsts.cpp @@ -671,8 +671,8 @@ bool AArch64ExpandPseudo::expand_DestructiveOp( } if (PRFX) { - finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator()); transferImpOps(MI, PRFX, DOP); + finalizeBundle(MBB, PRFX->getIterator(), MBBI->getIterator()); } else transferImpOps(MI, DOP, DOP); diff --git a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir index ae70f91a4ec64..a1d615c910792 100644 --- a/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir +++ b/llvm/test/CodeGen/AArch64/sve-pseudos-expand-undef.mir @@ -12,7 +12,7 @@ body: | bb.0: liveins: $p0, $z0 - ; CHECK: add_x + ; CHECK: name: add_x ; CHECK-NOT: MOVPRFX ; CHECK: $z0 = FADD_ZPmZ_S renamable $p0, killed $z0, renamable $z0 ; CHECK-NEXT: RET @@ -21,22 +21,36 @@ body: | ... -# CHECK: {{.*}} MSB_ZPmZZ_B {{.*}} --- name: expand_mls_to_msb body: | bb.0: + ; CHECK: name: expand_mls_to_msb + ; CHECK: {{.*}} MSB_ZPmZZ_B {{.*}} renamable $p0 = PTRUE_B 31, implicit $vg renamable $z0 = MLS_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1 RET_ReallyLR implicit $z0 ... -# CHECK: {{.*}} MAD_ZPmZZ_B {{.*}} --- name: expand_mla_to_mad body: | bb.0: + ; CHECK: name: expand_mla_to_mad + ; CHECK: {{.*}} MAD_ZPmZZ_B {{.*}} renamable $p0 = PTRUE_B 31, implicit $vg renamable $z0 = MLA_ZPZZZ_B_UNDEF killed renamable $p0, killed renamable $z2, killed renamable $z0, killed renamable $z1 RET_ReallyLR implicit $z0 ... + +--- +name: expand_transfer_implicit_defs +body: | + bb.0: + ; CHECK: name: expand_transfer_implicit_defs + ; CHECK: BUNDLE + ; CHECK-SAME: implicit-def $z0_z1_z2_z3 + liveins: $z1, $z2, $p0 + renamable $z0 = FADD_ZPZZ_D_UNDEF killed $p0, killed $z1, killed $z2, implicit-def $z0_z1_z2_z3 + RET_ReallyLR implicit $z0_z1_z2_z3 +... From 7d803c868ab96dabbd4cb47d0b3e60a78057e1b0 Mon Sep 17 00:00:00 2001 From: Sjoerd Meijer Date: Tue, 15 Jul 2025 14:59:18 +0100 Subject: [PATCH 5/9] [AArch64] Set the cache line size to 64 for the V2 and V3. (#148213) This sets the cache line size to 64 for the Neoverse V2 and V3. I've tested this with loop-interchange: it doesn't result in extra compile-times, but it does enable a lot more interchange. 
--- llvm/lib/Target/AArch64/AArch64Subtarget.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp index 0956823346795..2409cc862f21c 100644 --- a/llvm/lib/Target/AArch64/AArch64Subtarget.cpp +++ b/llvm/lib/Target/AArch64/AArch64Subtarget.cpp @@ -270,6 +270,7 @@ void AArch64Subtarget::initializeProperties(bool HasMinSize) { break; case NeoverseV2: case NeoverseV3: + CacheLineSize = 64; EpilogueVectorizationMinVF = 8; MaxInterleaveFactor = 4; ScatterOverhead = 13; From a0895b4581bac8634596263b42d49c7f2e2d957f Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Tue, 15 Jul 2025 09:00:15 -0500 Subject: [PATCH 6/9] [Frontend][OpenMP] Move isPrivatizingClause to OMP.h, NFC (#148644) --- .../Frontend/OpenMP/ConstructDecompositionT.h | 18 +----------------- llvm/include/llvm/Frontend/OpenMP/OMP.h | 16 ++++++++++++++++ 2 files changed, 17 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h index cdc80c88b7425..611bfe3f8aced 100644 --- a/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h +++ b/llvm/include/llvm/Frontend/OpenMP/ConstructDecompositionT.h @@ -795,25 +795,9 @@ bool ConstructDecompositionT::applyClause( // assigned to which leaf constructs. // [5.2:340:33] - auto canMakePrivateCopy = [](llvm::omp::Clause id) { - switch (id) { - // Clauses with "privatization" property: - case llvm::omp::Clause::OMPC_firstprivate: - case llvm::omp::Clause::OMPC_in_reduction: - case llvm::omp::Clause::OMPC_lastprivate: - case llvm::omp::Clause::OMPC_linear: - case llvm::omp::Clause::OMPC_private: - case llvm::omp::Clause::OMPC_reduction: - case llvm::omp::Clause::OMPC_task_reduction: - return true; - default: - return false; - } - }; - bool applied = applyIf(node, [&](const auto &leaf) { return llvm::any_of(leaf.clauses, [&](const ClauseTy *n) { - return canMakePrivateCopy(n->id); + return llvm::omp::isPrivatizingClause(n->id); }); }); diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.h b/llvm/include/llvm/Frontend/OpenMP/OMP.h index 35dafc6d246f0..d44c33301bde7 100644 --- a/llvm/include/llvm/Frontend/OpenMP/OMP.h +++ b/llvm/include/llvm/Frontend/OpenMP/OMP.h @@ -48,6 +48,22 @@ static constexpr inline bool canHaveIterator(Clause C) { } } +// Can clause C create a private copy of a variable. +static constexpr inline bool isPrivatizingClause(Clause C) { + switch (C) { + case OMPC_firstprivate: + case OMPC_in_reduction: + case OMPC_lastprivate: + case OMPC_linear: + case OMPC_private: + case OMPC_reduction: + case OMPC_task_reduction: + return true; + default: + return false; + } +} + static constexpr unsigned FallbackVersion = 52; LLVM_ABI ArrayRef getOpenMPVersions(); From 49722f1df1ef62de3b1b671c2d4a11c08be11774 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 15 Jul 2025 10:49:23 -0700 Subject: [PATCH 7/9] [RISCV] Remove incorrect and untested FrameIndex support from SelectAddrRegImm9. (#148779) To fold a FrameIndex, we need to teach eliminateFrameIndex to respect the uimm9 range. 
(cherry picked from commit 63d099af146a19bc8fd5a791d6184125e6cc42e7) --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 666c76b21e631..880e6b0d48892 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -2936,8 +2936,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm(SDValue Addr, SDValue &Base, /// Similar to SelectAddrRegImm, except that the offset is restricted to uimm9. bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, SDValue &Offset) { - if (SelectAddrFrameIndex(Addr, Base, Offset)) - return true; + // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only + // a 9-bit immediate can be folded. SDLoc DL(Addr); MVT VT = Addr.getSimpleValueType(); @@ -2947,8 +2947,8 @@ bool RISCVDAGToDAGISel::SelectAddrRegImm9(SDValue Addr, SDValue &Base, if (isUInt<9>(CVal)) { Base = Addr.getOperand(0); - if (auto *FIN = dyn_cast(Base)) - Base = CurDAG->getTargetFrameIndex(FIN->getIndex(), VT); + // FIXME: Support FrameIndex. Need to teach eliminateFrameIndex that only + // a 9-bit immediate can be folded. Offset = CurDAG->getSignedTargetConstant(CVal, DL, VT); return true; } From b71c9a43664101ff46fa0a46041a238d369a7784 Mon Sep 17 00:00:00 2001 From: Sudharsan Veeravalli Date: Wed, 16 Jul 2025 00:31:33 +0530 Subject: [PATCH 8/9] [RISCV] Fix issues in ORI to QC.INSBI transformation (#148809) The transformation done in #147349 was incorrect since we were not passing the input node of the `OR` instruction to the `QC.INSBI` instruction leading to the generated instruction doing the wrong thing. In order to do this we first needed to add the output register to `QC.INSBI` as being both an input and output. The code produced after the above fix will need a copy (mv) to preserve the register input to the OR instruction if it has more than one use making the transformation net neutral ( `6-byte QC.E.ORI/ORAI` vs `2-byte C.MV + 4-byte QC.INSB`I). Avoid doing the transformation if there is more than one use of the input register to the OR instruction. (cherry picked from commit d67d91a9906366585162cebf292f923a3f28c8a6) --- llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp | 10 +++- llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td | 5 +- .../test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll | 1 + llvm/test/CodeGen/RISCV/xqcibm-insert.ll | 53 +++++++++++++++++++ 4 files changed, 65 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp index 880e6b0d48892..186191abe12a2 100644 --- a/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelDAGToDAG.cpp @@ -689,10 +689,16 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) { if (!isShiftedMask_32(C1) || isInt<12>(C1)) return false; + // INSBI will clobber the input register in N0. Bail out if we need a copy to + // preserve this value. + SDValue N0 = Node->getOperand(0); + if (!N0.hasOneUse()) + return false; + // If C1 is a shifted mask (but can't be formed as an ORI), // use a bitfield insert of -1. 
// Transform (or x, C1) - // -> (qc.insbi x, width, shift) + // -> (qc.insbi x, -1, width, shift) const unsigned Leading = llvm::countl_zero((uint32_t)C1); const unsigned Trailing = llvm::countr_zero((uint32_t)C1); const unsigned Width = 32 - Leading - Trailing; @@ -705,7 +711,7 @@ bool RISCVDAGToDAGISel::trySignedBitfieldInsertInMask(SDNode *Node) { SDLoc DL(Node); MVT VT = Node->getSimpleValueType(0); - SDValue Ops[] = {CurDAG->getSignedTargetConstant(-1, DL, VT), + SDValue Ops[] = {N0, CurDAG->getSignedTargetConstant(-1, DL, VT), CurDAG->getTargetConstant(Width, DL, VT), CurDAG->getTargetConstant(Trailing, DL, VT)}; SDNode *BitIns = CurDAG->getMachineNode(RISCV::QC_INSBI, DL, VT, Ops); diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td index 26bb1e8d17857..c7cb6e237aeac 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoXqci.td @@ -845,10 +845,11 @@ let Predicates = [HasVendorXqcibi, IsRV32] in { let Predicates = [HasVendorXqcibm, IsRV32] in { let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in { def QC_INSBRI : QCIRVInstRI<0b1, simm11, "qc.insbri">; - def QC_INSBI : RVInstIBase<0b001, OPC_CUSTOM_0, (outs GPRNoX0:$rd), - (ins simm5:$imm5, uimm5_plus1:$width, + def QC_INSBI : RVInstIBase<0b001, OPC_CUSTOM_0, (outs GPRNoX0:$rd_wb), + (ins GPRNoX0:$rd, simm5:$imm5, uimm5_plus1:$width, uimm5:$shamt), "qc.insbi", "$rd, $imm5, $width, $shamt"> { + let Constraints = "$rd = $rd_wb"; bits<5> imm5; bits<5> shamt; bits<5> width; diff --git a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll index f227fa9aa423d..2fa06517508ce 100644 --- a/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll +++ b/llvm/test/CodeGen/RISCV/xqcibm-cto-clo-brev.ll @@ -105,6 +105,7 @@ define i16 @test_cttz_i16(i16 %a) nounwind { ; ; RV32ZBBXQCIBM-LABEL: test_cttz_i16: ; RV32ZBBXQCIBM: # %bb.0: +; RV32ZBBXQCIBM-NEXT: not a0, a0 ; RV32ZBBXQCIBM-NEXT: qc.insbi a0, -1, 1, 16 ; RV32ZBBXQCIBM-NEXT: ctz a0, a0 ; RV32ZBBXQCIBM-NEXT: ret diff --git a/llvm/test/CodeGen/RISCV/xqcibm-insert.ll b/llvm/test/CodeGen/RISCV/xqcibm-insert.ll index 6b7f9ae856625..88054a691bad1 100644 --- a/llvm/test/CodeGen/RISCV/xqcibm-insert.ll +++ b/llvm/test/CodeGen/RISCV/xqcibm-insert.ll @@ -47,6 +47,29 @@ define i32 @test_insbi_mask(i32 %a) nounwind { ret i32 %or } +define i32 @test_insbi_mask_mv(i32 %a, i32 %b) nounwind { +; RV32I-LABEL: test_insbi_mask_mv: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a0, 16 +; RV32I-NEXT: addi a0, a0, -1 +; RV32I-NEXT: or a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IXQCIBM-LABEL: test_insbi_mask_mv: +; RV32IXQCIBM: # %bb.0: +; RV32IXQCIBM-NEXT: mv a0, a1 +; RV32IXQCIBM-NEXT: qc.insbi a0, -1, 16, 0 +; RV32IXQCIBM-NEXT: ret +; +; RV32IXQCIBMZBS-LABEL: test_insbi_mask_mv: +; RV32IXQCIBMZBS: # %bb.0: +; RV32IXQCIBMZBS-NEXT: mv a0, a1 +; RV32IXQCIBMZBS-NEXT: qc.insbi a0, -1, 16, 0 +; RV32IXQCIBMZBS-NEXT: ret + %or = or i32 %b, 65535 + ret i32 %or +} + define i32 @test_insbi_shifted_mask(i32 %a) nounwind { ; RV32I-LABEL: test_insbi_shifted_mask: ; RV32I: # %bb.0: @@ -67,6 +90,36 @@ define i32 @test_insbi_shifted_mask(i32 %a) nounwind { ret i32 %or } +define i32 @test_insbi_shifted_mask_multiple_uses(i32 %a) nounwind { +; RV32I-LABEL: test_insbi_shifted_mask_multiple_uses: +; RV32I: # %bb.0: +; RV32I-NEXT: lui a1, 15 +; RV32I-NEXT: or a1, a0, a1 +; RV32I-NEXT: addi a0, a0, 10 +; RV32I-NEXT: xor a0, a1, a0 +; RV32I-NEXT: ret +; +; RV32IXQCIBM-LABEL: test_insbi_shifted_mask_multiple_uses: +; 
RV32IXQCIBM: # %bb.0: +; RV32IXQCIBM-NEXT: lui a1, 15 +; RV32IXQCIBM-NEXT: or a1, a1, a0 +; RV32IXQCIBM-NEXT: addi a0, a0, 10 +; RV32IXQCIBM-NEXT: xor a0, a0, a1 +; RV32IXQCIBM-NEXT: ret +; +; RV32IXQCIBMZBS-LABEL: test_insbi_shifted_mask_multiple_uses: +; RV32IXQCIBMZBS: # %bb.0: +; RV32IXQCIBMZBS-NEXT: lui a1, 15 +; RV32IXQCIBMZBS-NEXT: or a1, a1, a0 +; RV32IXQCIBMZBS-NEXT: addi a0, a0, 10 +; RV32IXQCIBMZBS-NEXT: xor a0, a0, a1 +; RV32IXQCIBMZBS-NEXT: ret + %or = or i32 %a, 61440 + %add = add i32 %a, 10 + %xor = xor i32 %or, %add + ret i32 %xor +} + define i32 @test_single_bit_set(i32 %a) nounwind { ; RV32I-LABEL: test_single_bit_set: ; RV32I: # %bb.0: From 04d4be501dc83fe411193a46c10e898898552731 Mon Sep 17 00:00:00 2001 From: Eli Friedman Date: Tue, 15 Jul 2025 15:39:51 -0700 Subject: [PATCH 9/9] [libclang] Fix version for symbol clang_visitCXXMethods (#148958) Happened to spot this while looking at libclang.map for other reasons. clang_visitCXXMethods was added in LLVM 21, not LLVM 20. (cherry picked from commit 116110e1a93531a64d82f049b6e36403bc14f278) --- clang/tools/libclang/libclang.map | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/tools/libclang/libclang.map b/clang/tools/libclang/libclang.map index d140a71e771a0..49c472e3833fd 100644 --- a/clang/tools/libclang/libclang.map +++ b/clang/tools/libclang/libclang.map @@ -435,12 +435,12 @@ LLVM_20 { clang_getTypePrettyPrinted; clang_isBeforeInTranslationUnit; clang_visitCXXBaseClasses; - clang_visitCXXMethods; }; LLVM_21 { global: clang_getFullyQualifiedName; + clang_visitCXXMethods; clang_Cursor_getGCCAssemblyTemplate; clang_Cursor_isGCCAssemblyHasGoto; clang_Cursor_getGCCAssemblyNumOutputs;