From c9f27275c1330a325661bdf14fb3bc444a5e3648 Mon Sep 17 00:00:00 2001
From: Daniel Paoliello
Date: Tue, 15 Oct 2024 10:37:36 -0700
Subject: [PATCH 001/329] [clang][aarch64] Add support for the MSVC qualifiers
 __ptr32, __ptr64, __sptr, __uptr for AArch64 (#111879)

MSVC has a set of qualifiers to allow using 32-bit signed/unsigned
pointers when building 64-bit targets. This is useful for WoW code
(i.e., the part of Windows that handles running 32-bit applications on a
64-bit OS). Currently this is supported on x64 using the 270, 271 and
272 address spaces, but it does not work for AArch64 at all.

This change adds the same 270, 271 and 272 address spaces to AArch64
and adjusts the data layout string accordingly. Clang will generate the
correct address space casts, but these will currently be ignored until
the AArch64 backend is updated to handle them.

Partially fixes #62536

This is a resurrected version of an earlier patch (originally created by
@a_vorobev) - I've cleaned it up a little, fixed the rest of the tests,
and added auto-upgrade support for the data layout.
---
 clang/lib/Basic/Targets/AArch64.cpp           | 22 +++++++---
 clang/lib/Basic/Targets/AArch64.h             | 40 +++++++++++++++++++
 clang/test/CodeGen/aarch64-type-sizes.c       |  2 +-
 clang/test/CodeGen/coff-aarch64-type-sizes.c  |  2 +-
 clang/test/CodeGen/ms-mixed-ptr-sizes.c       | 19 +++++++++
 clang/test/CodeGen/target-data.c              |  6 +--
 llvm/lib/IR/AutoUpgrade.cpp                   | 23 ++++++-----
 .../Target/AArch64/AArch64TargetMachine.cpp   | 12 ++++--
 .../Bitcode/DataLayoutUpgradeTest.cpp         | 13 ++++--
 9 files changed, 112 insertions(+), 27 deletions(-)

diff --git a/clang/lib/Basic/Targets/AArch64.cpp b/clang/lib/Basic/Targets/AArch64.cpp
index 61889861c9c8..b96fab978a3f 100644
--- a/clang/lib/Basic/Targets/AArch64.cpp
+++ b/clang/lib/Basic/Targets/AArch64.cpp
@@ -143,6 +143,8 @@ AArch64TargetInfo::AArch64TargetInfo(const llvm::Triple &Triple,
     IntMaxType = SignedLong;
   }

+  AddrSpaceMap = &ARM64AddrSpaceMap;
+
   // All AArch64 implementations support ARMv8 FP, which makes half a legal type.
   HasLegalHalfType = true;
   HalfArgsAndReturns = true;
@@ -1533,11 +1535,16 @@ AArch64leTargetInfo::AArch64leTargetInfo(const llvm::Triple &Triple,

 void AArch64leTargetInfo::setDataLayout() {
   if (getTriple().isOSBinFormatMachO()) {
     if(getTriple().isArch32Bit())
-      resetDataLayout("e-m:o-p:32:32-i64:64-i128:128-n32:64-S128-Fn32", "_");
+      resetDataLayout("e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-"
+                      "i128:128-n32:64-S128-Fn32",
+                      "_");
     else
-      resetDataLayout("e-m:o-i64:64-i128:128-n32:64-S128-Fn32", "_");
+      resetDataLayout("e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-"
+                      "n32:64-S128-Fn32",
+                      "_");
   } else
-    resetDataLayout("e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32");
+    resetDataLayout("e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-"
+                    "i64:64-i128:128-n32:64-S128-Fn32");
 }

 void AArch64leTargetInfo::getTargetDefines(const LangOptions &Opts,
@@ -1560,7 +1567,8 @@ void AArch64beTargetInfo::getTargetDefines(const LangOptions &Opts,

 void AArch64beTargetInfo::setDataLayout() {
   assert(!getTriple().isOSBinFormatMachO());
-  resetDataLayout("E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32");
+  resetDataLayout("E-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-"
+                  "i64:64-i128:128-n32:64-S128-Fn32");
 }

 WindowsARM64TargetInfo::WindowsARM64TargetInfo(const llvm::Triple &Triple,
@@ -1583,8 +1591,10 @@ WindowsARM64TargetInfo::WindowsARM64TargetInfo(const llvm::Triple &Triple,

 void WindowsARM64TargetInfo::setDataLayout() {
   resetDataLayout(Triple.isOSBinFormatMachO()
-                      ? "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
-                      : "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32",
+                      ? "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:"
+                        "128-n32:64-S128-Fn32"
+                      : "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-"
+                        "i64:64-i128:128-n32:64-S128-Fn32",
                   Triple.isOSBinFormatMachO() ? "_" : "");
 }

diff --git a/clang/lib/Basic/Targets/AArch64.h b/clang/lib/Basic/Targets/AArch64.h
index 1226ce4d4355..16a02e102e04 100644
--- a/clang/lib/Basic/Targets/AArch64.h
+++ b/clang/lib/Basic/Targets/AArch64.h
@@ -21,6 +21,34 @@
 namespace clang {
 namespace targets {

+enum AArch64AddrSpace { ptr32_sptr = 270, ptr32_uptr = 271, ptr64 = 272 };
+
+static const unsigned ARM64AddrSpaceMap[] = {
+    0, // Default
+    0, // opencl_global
+    0, // opencl_local
+    0, // opencl_constant
+    0, // opencl_private
+    0, // opencl_generic
+    0, // opencl_global_device
+    0, // opencl_global_host
+    0, // cuda_device
+    0, // cuda_constant
+    0, // cuda_shared
+    0, // sycl_global
+    0, // sycl_global_device
+    0, // sycl_global_host
+    0, // sycl_local
+    0, // sycl_private
+    static_cast<unsigned>(AArch64AddrSpace::ptr32_sptr),
+    static_cast<unsigned>(AArch64AddrSpace::ptr32_uptr),
+    static_cast<unsigned>(AArch64AddrSpace::ptr64),
+    0, // hlsl_groupshared
+    // Wasm address space values for this target are dummy values,
+    // as it is only enabled for Wasm targets.
+    20, // wasm_funcref
+};
+
 class LLVM_LIBRARY_VISIBILITY AArch64TargetInfo : public TargetInfo {
   virtual void setDataLayout() = 0;
   static const TargetInfo::GCCRegAlias GCCRegAliases[];
@@ -207,6 +235,18 @@ public:
   bool validateGlobalRegisterVariable(StringRef RegName, unsigned RegSize,
                                       bool &HasSizeMismatch) const override;
+
+  uint64_t getPointerWidthV(LangAS AddrSpace) const override {
+    if (AddrSpace == LangAS::ptr32_sptr || AddrSpace == LangAS::ptr32_uptr)
+      return 32;
+    if (AddrSpace == LangAS::ptr64)
+      return 64;
+    return PointerWidth;
+  }
+
+  uint64_t getPointerAlignV(LangAS AddrSpace) const override {
+    return getPointerWidthV(AddrSpace);
+  }
 };

 class LLVM_LIBRARY_VISIBILITY AArch64leTargetInfo : public AArch64TargetInfo {
diff --git a/clang/test/CodeGen/aarch64-type-sizes.c b/clang/test/CodeGen/aarch64-type-sizes.c
index a40423c1f8de..f6129b3943d2 100644
--- a/clang/test/CodeGen/aarch64-type-sizes.c
+++ b/clang/test/CodeGen/aarch64-type-sizes.c
@@ -1,7 +1,7 @@
 // RUN: %clang_cc1 -triple aarch64_be-none-linux-gnu -emit-llvm -w -o - %s | FileCheck %s

 // char by definition has size 1
-// CHECK: target datalayout = "E-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+// CHECK: target datalayout = "E-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"

 int check_short(void) {
   return sizeof(short);
diff --git a/clang/test/CodeGen/coff-aarch64-type-sizes.c b/clang/test/CodeGen/coff-aarch64-type-sizes.c
index 9cb0ddbaef3f..2e5b94c14e6a 100644
--- a/clang/test/CodeGen/coff-aarch64-type-sizes.c
+++ b/clang/test/CodeGen/coff-aarch64-type-sizes.c
@@ -1,6 +1,6 @@
 // RUN: %clang_cc1 -triple aarch64-windows -emit-llvm -w -o - %s | FileCheck %s

-// CHECK: target datalayout = "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32"
+// CHECK: target datalayout = "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32"
 // CHECK: target triple = "aarch64-unknown-windows-msvc"

 int check_short(void) {
diff --git a/clang/test/CodeGen/ms-mixed-ptr-sizes.c b/clang/test/CodeGen/ms-mixed-ptr-sizes.c
index 0bc1925b13db..f99c6196557e 100644
--- a/clang/test/CodeGen/ms-mixed-ptr-sizes.c
+++ 
b/clang/test/CodeGen/ms-mixed-ptr-sizes.c @@ -1,5 +1,6 @@ // RUN: %clang_cc1 -triple x86_64-windows-msvc -fms-extensions -emit-llvm -O2 < %s | FileCheck %s --check-prefixes=X64,ALL // RUN: %clang_cc1 -triple i386-pc-win32 -fms-extensions -emit-llvm -O2 < %s | FileCheck %s --check-prefixes=X86,ALL +// RUN: %clang_cc1 -triple aarch64-windows-msvc -fms-extensions -emit-llvm -O2 < %s | FileCheck %s --check-prefixes=AARCH64,ALL struct Foo { int * __ptr32 p32; @@ -9,32 +10,40 @@ void use_foo(struct Foo *f); void test_sign_ext(struct Foo *f, int * __ptr32 __sptr i) { // X64-LABEL: define dso_local void @test_sign_ext({{.*}}ptr addrspace(270) noundef %i) // X86-LABEL: define dso_local void @test_sign_ext(ptr noundef %f, ptr noundef %i) +// AARCH64-LABEL: define dso_local void @test_sign_ext({{.*}}ptr addrspace(270) noundef %i) local_unnamed_addr #0 // X64: %{{.+}} = addrspacecast ptr addrspace(270) %i to ptr // X86: %{{.+}} = addrspacecast ptr %i to ptr addrspace(272) +// AARCH64: %{{.+}} = addrspacecast ptr addrspace(270) %i to ptr f->p64 = i; use_foo(f); } void test_zero_ext(struct Foo *f, int * __ptr32 __uptr i) { // X64-LABEL: define dso_local void @test_zero_ext({{.*}}ptr addrspace(271) noundef %i) // X86-LABEL: define dso_local void @test_zero_ext({{.*}}ptr addrspace(271) noundef %i) +// AARCH64-LABEL: define dso_local void @test_zero_ext({{.*}}ptr addrspace(271) noundef %i) local_unnamed_addr #0 // X64: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr // X86: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr addrspace(272) +// AARCH64: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr f->p64 = i; use_foo(f); } void test_trunc(struct Foo *f, int * __ptr64 i) { // X64-LABEL: define dso_local void @test_trunc(ptr noundef %f, ptr noundef %i) // X86-LABEL: define dso_local void @test_trunc({{.*}}ptr addrspace(272) noundef %i) +// AARCH64-LABEL: define dso_local void @test_trunc(ptr noundef %f, ptr noundef %i) local_unnamed_addr #0 // X64: %{{.+}} = addrspacecast ptr %i to ptr addrspace(270) // X86: %{{.+}} = addrspacecast ptr addrspace(272) %i to ptr +// AARCH64: %{{.+}} = addrspacecast ptr %i to ptr addrspace(270) f->p32 = i; use_foo(f); } void test_noop(struct Foo *f, int * __ptr32 i) { // X64-LABEL: define dso_local void @test_noop({{.*}}ptr addrspace(270) noundef %i) // X86-LABEL: define dso_local void @test_noop({{.*}}ptr noundef %i) +// AARCH64-LABEL: define dso_local void @test_noop({{.*}}ptr addrspace(270) noundef %i) local_unnamed_addr #0 // X64-NOT: addrspacecast // X86-NOT: addrspacecast +// AARCH64-NOT: addrspacecast f->p32 = i; use_foo(f); } @@ -42,8 +51,10 @@ void test_noop(struct Foo *f, int * __ptr32 i) { void test_other(struct Foo *f, __attribute__((address_space(10))) int *i) { // X64-LABEL: define dso_local void @test_other({{.*}}ptr addrspace(10) noundef %i) // X86-LABEL: define dso_local void @test_other({{.*}}ptr addrspace(10) noundef %i) +// AARCH64-LABEL: define dso_local void @test_other({{.*}}ptr addrspace(10) noundef %i) local_unnamed_addr #0 // X64: %{{.+}} = addrspacecast ptr addrspace(10) %i to ptr addrspace(270) // X86: %{{.+}} = addrspacecast ptr addrspace(10) %i to ptr +// AARCH64: %{{.+}} = addrspacecast ptr addrspace(10) %i to ptr addrspace(270) f->p32 = (int * __ptr32)i; use_foo(f); } @@ -54,6 +65,8 @@ int test_compare1(int *__ptr32 __uptr i, int *__ptr64 j) { // X64: %cmp = icmp eq ptr addrspace(271) %i, %{{.+}} // X86: %{{.+}} = addrspacecast ptr addrspace(272) %j to ptr addrspace(271) // X86: %cmp = icmp eq ptr addrspace(271) %i, %{{.+}} + // 
AARCH64: %{{.+}} = addrspacecast ptr %j to ptr addrspace(271)
+  // AARCH64: %cmp = icmp eq ptr addrspace(271) %i, %{{.+}}
   return (i == j);
 }

@@ -63,6 +76,8 @@ int test_compare2(int *__ptr32 __sptr i, int *__ptr64 j) {
   // X64: %cmp = icmp eq ptr addrspace(270) %i, %{{.+}}
   // X86: %{{.+}} = addrspacecast ptr addrspace(272) %j to ptr
   // X86: %cmp = icmp eq ptr %i, %{{.+}}
+  // AARCH64: %{{.+}} = addrspacecast ptr %j to ptr addrspace(270)
+  // AARCH64: %cmp = icmp eq ptr addrspace(270) %i, %{{.+}}
   return (i == j);
 }

@@ -72,6 +87,8 @@ int test_compare3(int *__ptr32 __uptr i, int *__ptr64 j) {
   // X64: %cmp = icmp eq ptr %j, %{{.+}}
   // X86: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr addrspace(272)
   // X86: %cmp = icmp eq ptr addrspace(272) %j, %{{.+}}
+  // AARCH64: %{{.+}} = addrspacecast ptr addrspace(271) %i to ptr
+  // AARCH64: %cmp = icmp eq ptr %j, %{{.+}}
   return (j == i);
 }

@@ -81,5 +98,7 @@ int test_compare4(int *__ptr32 __sptr i, int *__ptr64 j) {
   // X64: %cmp = icmp eq ptr %j, %{{.+}}
   // X86: %{{.+}} = addrspacecast ptr %i to ptr addrspace(272)
   // X86: %cmp = icmp eq ptr addrspace(272) %j, %{{.+}}
+  // AARCH64: %{{.+}} = addrspacecast ptr addrspace(270) %i to ptr
+  // AARCH64: %cmp = icmp eq ptr %j, %{{.+}}
   return (j == i);
 }
diff --git a/clang/test/CodeGen/target-data.c b/clang/test/CodeGen/target-data.c
index 8548aa00cfe8..26a1bf2a1a57 100644
--- a/clang/test/CodeGen/target-data.c
+++ b/clang/test/CodeGen/target-data.c
@@ -185,15 +185,15 @@
 // RUN: %clang_cc1 -triple arm64-unknown -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=AARCH64
-// AARCH64: target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"
+// AARCH64: target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32"

 // RUN: %clang_cc1 -triple arm64_32-apple-ios7.0 -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=AARCH64-ILP32
-// AARCH64-ILP32: target datalayout = "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128-Fn32"
+// AARCH64-ILP32: target datalayout = "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"

 // RUN: %clang_cc1 -triple arm64-pc-win32-macho -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=AARCH64-WIN32-MACHO
-// AARCH64-WIN32-MACHO: target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128-Fn32"
+// AARCH64-WIN32-MACHO: target datalayout = "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32"

 // RUN: %clang_cc1 -triple thumb-unknown-gnueabi -o - -emit-llvm %s | \
 // RUN: FileCheck %s -check-prefix=THUMB
diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp
index 477b77a6dd53..32f66f77f19f 100644
--- a/llvm/lib/IR/AutoUpgrade.cpp
+++ b/llvm/lib/IR/AutoUpgrade.cpp
@@ -5577,11 +5577,24 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
     return Res;
   }

+  auto AddPtr32Ptr64AddrSpaces = [&DL, &Res]() {
+    // If the datalayout matches the expected format, add pointer size address
+    // spaces to the datalayout.
+    StringRef AddrSpaces{"-p270:32:32-p271:32:32-p272:64:64"};
+    if (!DL.contains(AddrSpaces)) {
+      SmallVector<StringRef, 4> Groups;
+      Regex R("^([Ee]-m:[a-z](-p:32:32)?)(-.*)$");
+      if (R.match(Res, &Groups))
+        Res = (Groups[1] + AddrSpaces + Groups[3]).str();
+    }
+  };
+
   // AArch64 data layout upgrades.
   if (T.isAArch64()) {
     // Add "-Fn32"
     if (!DL.empty() && !DL.contains("-Fn32"))
       Res.append("-Fn32");
+    AddPtr32Ptr64AddrSpaces();
     return Res;
   }

@@ -5600,15 +5613,7 @@ std::string llvm::UpgradeDataLayoutString(StringRef DL, StringRef TT) {
   if (!T.isX86())
     return Res;

-  // If the datalayout matches the expected format, add pointer size address
-  // spaces to the datalayout.
-  std::string AddrSpaces = "-p270:32:32-p271:32:32-p272:64:64";
-  if (StringRef Ref = Res; !Ref.contains(AddrSpaces)) {
-    SmallVector<StringRef, 4> Groups;
-    Regex R("(e-m:[a-z](-p:32:32)?)(-[if]64:.*$)");
-    if (R.match(Res, &Groups))
-      Res = (Groups[1] + AddrSpaces + Groups[3]).str();
-  }
+  AddPtr32Ptr64AddrSpaces();

   // i128 values need to be 16-byte-aligned. LLVM already called into libgcc
   // for i128 operations prior to this being reflected in the data layout, and
diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
index 7b0ae2335867..21b86f5fe5d9 100644
--- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
+++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp
@@ -290,15 +290,19 @@ static std::string computeDataLayout(const Triple &TT, bool LittleEndian) {
   if (TT.isOSBinFormatMachO()) {
     if (TT.getArch() == Triple::aarch64_32)
-      return "e-m:o-p:32:32-i64:64-i128:128-n32:64-S128-Fn32";
-    return "e-m:o-i64:64-i128:128-n32:64-S128-Fn32";
+      return "e-m:o-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-"
+             "n32:64-S128-Fn32";
+    return "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-"
+           "Fn32";
   }
   if (TT.isOSBinFormatCOFF())
-    return "e-m:w-p:64:64-i32:32-i64:64-i128:128-n32:64-S128-Fn32";
+    return "e-m:w-p270:32:32-p271:32:32-p272:64:64-p:64:64-i32:32-i64:64-i128:"
+           "128-n32:64-S128-Fn32";
   std::string Endian = LittleEndian ? "e" : "E";
   std::string Ptr32 = TT.getEnvironment() == Triple::GNUILP32 ? "-p:32:32" : "";
   return Endian + "-m:e" + Ptr32 +
-         "-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128-Fn32";
+         "-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:64-i128:128-"
+         "n32:64-S128-Fn32";
 }

 static StringRef computeDefaultCPU(const Triple &TT, StringRef CPU) {
diff --git a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
index 1cd4a47c7573..e7fb4e071864 100644
--- a/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
+++ b/llvm/unittests/Bitcode/DataLayoutUpgradeTest.cpp
@@ -23,6 +23,8 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
       "e-m:o-i64:64-f80:128-n8:16:32:64-S128", "x86_64-apple-macosx");
   std::string DL4 =
       UpgradeDataLayoutString("e-m:o-i64:64-i128:128-n32:64-S128", "aarch64--");
+  std::string DL5 = UpgradeDataLayoutString(
+      "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128", "aarch64--");
   EXPECT_EQ(DL1,
             "e-m:e-p:32:32-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128"
             "-f80:128-n8:16:32:64-S128");
@@ -31,7 +33,10 @@ TEST(DataLayoutUpgradeTest, ValidDataLayoutUpgrade) {
             "-f80:128-n8:16:32-S32");
   EXPECT_EQ(DL3, "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:"
                  "128-n8:16:32:64-S128");
-  EXPECT_EQ(DL4, "e-m:o-i64:64-i128:128-n32:64-S128-Fn32");
+  EXPECT_EQ(DL4, "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:"
+                 "64-S128-Fn32");
+  EXPECT_EQ(DL5, "e-m:e-p270:32:32-p271:32:32-p272:64:64-i8:8:32-i16:16:32-i64:"
+                 "64-i128:128-n32:64-S128-Fn32");

   // Check that AMDGPU targets add -G1 if it's not present.
  EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32", "r600"), "e-p:32:32-G1");
@@ -94,14 +99,16 @@ TEST(DataLayoutUpgradeTest, NoDataLayoutUpgrade) {
   std::string DL2 = UpgradeDataLayoutString("e-m:e-i64:64-n32:64",
                                             "powerpc64le-unknown-linux-gnu");
   std::string DL3 = UpgradeDataLayoutString(
-      "e-m:o-i64:64-i128:128-n32:64-S128-Fn32", "aarch64--");
+      "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:64-S128-Fn32",
+      "aarch64--");

   EXPECT_EQ(
       DL1,
       "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-i128:128:128"
       "-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64"
       "-f80:128:128-n8:16:32:64-S128");
   EXPECT_EQ(DL2, "e-m:e-i64:64-n32:64");
-  EXPECT_EQ(DL3, "e-m:o-i64:64-i128:128-n32:64-S128-Fn32");
+  EXPECT_EQ(DL3, "e-m:o-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-n32:"
+                 "64-S128-Fn32");

   // Check that AMDGPU targets don't add -G1 if there is already a -G flag.
   EXPECT_EQ(UpgradeDataLayoutString("e-p:32:32-G2", "r600"), "e-p:32:32-G2");
-- 
GitLab


From e511026bf04b9d10d91d107174037b48f531278a Mon Sep 17 00:00:00 2001
From: Andrew Luo
Date: Tue, 15 Oct 2024 10:38:45 -0700
Subject: [PATCH 002/329] [MLIR] Make More Specific Function Header For
 StringLiteral Optimization in `Diagnostic` (#112154)

Diagnostic stores various notes/error messages which might help the
user in debugging. For the most part, when `Diagnostic` receives an
error message it will copy and own the contents of the string. However,
there is one optimization where, given a `const char*`, the class
assumes it is a StringLiteral, which is immutable and whose lifetime
matches that of the entire program. As a result, instead of copying the
message in these cases, the class simply stores the underlying pointer.

This is problematic since `const char*` is not specific enough to
always imply a StringLiteral, which can lead to bugs, e.g. if the
underlying pointer is freed before the diagnostic is reported.

We solve this problem by choosing a more specific function signature.
While not foolproof, this should cover many more cases.

A potentially better alternative is just deleting this special handling
of string literals, but I am unsure of the implications (it does sound
safe to do, however, with a negligible impact on performance).
---
 mlir/include/mlir/IR/Diagnostics.h |  3 +-
 mlir/unittests/IR/CMakeLists.txt   |  1 +
 mlir/unittests/IR/Diagnostic.cpp   | 63 ++++++++++++++++++++++++++++++
 3 files changed, 66 insertions(+), 1 deletion(-)
 create mode 100644 mlir/unittests/IR/Diagnostic.cpp

diff --git a/mlir/include/mlir/IR/Diagnostics.h b/mlir/include/mlir/IR/Diagnostics.h
index cb30bb3f5968..8429325412dc 100644
--- a/mlir/include/mlir/IR/Diagnostics.h
+++ b/mlir/include/mlir/IR/Diagnostics.h
@@ -183,7 +183,8 @@ public:
   Diagnostic &operator<<(StringAttr val);

   /// Stream in a string literal.
-  Diagnostic &operator<<(const char *val) {
+  template <size_t n>
+  Diagnostic &operator<<(const char (&val)[n]) {
     arguments.push_back(DiagnosticArgument(val));
     return *this;
   }
diff --git a/mlir/unittests/IR/CMakeLists.txt b/mlir/unittests/IR/CMakeLists.txt
index 547e536dd9cb..384116ba5c45 100644
--- a/mlir/unittests/IR/CMakeLists.txt
+++ b/mlir/unittests/IR/CMakeLists.txt
@@ -4,6 +4,7 @@ add_mlir_unittest(MLIRIRTests
   AffineMapTest.cpp
   AttributeTest.cpp
   AttrTypeReplacerTest.cpp
+  Diagnostic.cpp
   DialectTest.cpp
   InterfaceTest.cpp
   IRMapping.cpp
diff --git a/mlir/unittests/IR/Diagnostic.cpp b/mlir/unittests/IR/Diagnostic.cpp
new file mode 100644
index 000000000000..96e09d333092
--- /dev/null
+++ b/mlir/unittests/IR/Diagnostic.cpp
@@ -0,0 +1,63 @@
+//===- Diagnostic.cpp - Dialect unit tests -------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "mlir/IR/Diagnostics.h"
+#include "mlir/Support/TypeID.h"
+#include "gtest/gtest.h"
+
+using namespace mlir;
+using namespace mlir::detail;
+
+namespace {
+
+TEST(DiagnosticLifetime, TestCopiesConstCharStar) {
+  const auto *expectedMessage = "Error 1, don't mutate this";
+
+  // Copy the expected message into a mutable container, and call the constructor.
+  std::string myStr(expectedMessage);
+
+  mlir::MLIRContext context;
+  Diagnostic diagnostic(mlir::UnknownLoc::get(&context),
+                        DiagnosticSeverity::Note);
+  diagnostic << myStr.c_str();
+
+  // Mutate the underlying pointer, but ensure the diagnostic keeps the original message.
+  myStr[0] = '^';
+
+  std::string resultMessage;
+  llvm::raw_string_ostream stringStream(resultMessage);
+  diagnostic.print(stringStream);
+  ASSERT_STREQ(expectedMessage, resultMessage.c_str());
+}
+
+TEST(DiagnosticLifetime, TestLazyCopyStringLiteral) {
+  char charArr[21] = "Error 1, mutate this";
+  mlir::MLIRContext context;
+  Diagnostic diagnostic(mlir::UnknownLoc::get(&context),
+                        DiagnosticSeverity::Note);
+
+  // Diagnostic contains an optimization which assumes string literals are
+  // represented by the `const char[]` type. This is imperfect, as we can
+  // sometimes trick the type system, as seen below.
+  //
+  // Still, we use this to check that the diagnostic lazily stores the pointer.
+  auto addToDiagnosticAsConst = [&diagnostic](const char(&charArr)[21]) {
+    diagnostic << charArr;
+  };
+  addToDiagnosticAsConst(charArr);
+
+  // Mutate the underlying pointer and ensure the string does change.
+  charArr[0] = '^';
+
+  std::string resultMessage;
+  llvm::raw_string_ostream stringStream(resultMessage);
+  diagnostic.print(stringStream);
+  ASSERT_STREQ("^rror 1, mutate this", resultMessage.c_str());
+}
+
+} // namespace
-- 
GitLab


From 66f968cf3725a9d3554765c900f9c2de75190a87 Mon Sep 17 00:00:00 2001
From: Nick Desaulniers
Date: Tue, 15 Oct 2024 10:39:39 -0700
Subject: [PATCH 003/329] [libc][setjmp] fix setjmp test via naked fn attr
 (#88054)

This would consistently fail for me locally, to the point where I could
not run `ninja libc-unit-tests` without `ninja libc_setjmp_unittests`
failing.

Turns out that since I enabled -ftrivial-auto-var-init=pattern in commit
1d5c16d ("[libc] default enable -ftrivial-auto-var-init=pattern
(#78776)"), this has been a problem.
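To make the failure mode concrete, here is a minimal sketch (illustrative
only; the function name and code are invented for this note, not taken from
the libc sources):

    /* sketch.c (hypothetical): clang -O2 -ftrivial-auto-var-init=pattern -c sketch.c */
    unsigned long save_rbx(void) {
      /* Intentionally "uninitialized": we want the caller's %rbx as-is.
         (The real code suppressed the -Wuninitialized warning this triggers.) */
      register unsigned long rbx __asm__("rbx");
      unsigned long saved;
      /* Under -ftrivial-auto-var-init=pattern the compiler may first emit
         movabs $0xAAAAAAAAAAAAAAAA, %rbx to pattern-initialize `rbx`, so the
         copy below saves the fill pattern instead of the caller's register. */
      __asm__("mov %1, %0" : "=r"(saved) : "r"(rbx));
      return saved;
    }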
Our x86_64 setjmp definition disabled -Wuninitialized, so we wound up clobbering these registers and instead backing up 0xAAAAAAAAAAAAAAAA rather than the actual register value. Use `naked` function attribute to avoid function prolog/epilog. --- libc/src/setjmp/x86_64/CMakeLists.txt | 6 --- libc/src/setjmp/x86_64/setjmp.cpp | 60 +++++++++++---------------- 2 files changed, 24 insertions(+), 42 deletions(-) diff --git a/libc/src/setjmp/x86_64/CMakeLists.txt b/libc/src/setjmp/x86_64/CMakeLists.txt index c789e5def7fe..b5b0d9ba6559 100644 --- a/libc/src/setjmp/x86_64/CMakeLists.txt +++ b/libc/src/setjmp/x86_64/CMakeLists.txt @@ -8,12 +8,6 @@ add_entrypoint_object( libc.hdr.types.jmp_buf COMPILE_OPTIONS -O3 - -fno-omit-frame-pointer - # TODO: Remove once one of these lands: - # https://github.com/llvm/llvm-project/pull/87837 - # https://github.com/llvm/llvm-project/pull/88054 - # https://github.com/llvm/llvm-project/pull/88157 - -ftrivial-auto-var-init=uninitialized ) add_entrypoint_object( diff --git a/libc/src/setjmp/x86_64/setjmp.cpp b/libc/src/setjmp/x86_64/setjmp.cpp index 62d9c13c68e4..c9ca578fb1e6 100644 --- a/libc/src/setjmp/x86_64/setjmp.cpp +++ b/libc/src/setjmp/x86_64/setjmp.cpp @@ -6,6 +6,7 @@ // //===----------------------------------------------------------------------===// +#include "include/llvm-libc-macros/offsetof-macro.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" #include "src/setjmp/setjmp_impl.h" @@ -16,42 +17,29 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(int, setjmp, (jmp_buf buf)) { - register __UINT64_TYPE__ rbx __asm__("rbx"); - register __UINT64_TYPE__ r12 __asm__("r12"); - register __UINT64_TYPE__ r13 __asm__("r13"); - register __UINT64_TYPE__ r14 __asm__("r14"); - register __UINT64_TYPE__ r15 __asm__("r15"); - - // We want to store the register values as is. So, we will suppress the - // compiler warnings about the uninitialized variables declared above. -#pragma GCC diagnostic push -#pragma GCC diagnostic ignored "-Wuninitialized" - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=m"(buf->rbx) : "r"(rbx) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=m"(buf->r12) : "r"(r12) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=m"(buf->r13) : "r"(r13) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=m"(buf->r14) : "r"(r14) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=m"(buf->r15) : "r"(r15) :); -#pragma GCC diagnostic pop - - // We want the rbp of the caller, which is what __builtin_frame_address(1) - // should return. But, compilers generate a warning that calling - // __builtin_frame_address with non-zero argument is unsafe. So, we use - // the knowledge of the x86_64 ABI to fetch the callers rbp. As per the ABI, - // the rbp of the caller is pushed on to the stack and then new top is saved - // in this function's rbp. So, we fetch it from location at which this - // functions's rbp is pointing. - buf->rbp = *reinterpret_cast<__UINTPTR_TYPE__ *>(__builtin_frame_address(0)); - - // The callers stack address is exactly 2 pointer widths ahead of the current - // frame pointer - between the current frame pointer and the rsp of the caller - // are the return address (pushed by the x86_64 call instruction) and the - // previous stack pointer as required by the x86_64 ABI. - // The stack pointer is ahead because the stack grows down on x86_64. 
-  buf->rsp = reinterpret_cast<__UINTPTR_TYPE__>(__builtin_frame_address(0)) +
-             sizeof(__UINTPTR_TYPE__) * 2;
-  buf->rip = reinterpret_cast<__UINTPTR_TYPE__>(__builtin_return_address(0));
-  return 0;
+[[gnu::naked]]
+LLVM_LIBC_FUNCTION(int, setjmp, (__jmp_buf * buf)) {
+  asm(R"(
+      mov %%rbx, %c[rbx](%%rdi)
+      mov %%rbp, %c[rbp](%%rdi)
+      mov %%r12, %c[r12](%%rdi)
+      mov %%r13, %c[r13](%%rdi)
+      mov %%r14, %c[r14](%%rdi)
+      mov %%r15, %c[r15](%%rdi)
+
+      lea 8(%%rsp), %%rax
+      mov %%rax, %c[rsp](%%rdi)
+
+      mov (%%rsp), %%rax
+      mov %%rax, %c[rip](%%rdi)
+
+      xorl %%eax, %%eax
+      retq)" ::[rbx] "i"(offsetof(__jmp_buf, rbx)),
+      [rbp] "i"(offsetof(__jmp_buf, rbp)), [r12] "i"(offsetof(__jmp_buf, r12)),
+      [r13] "i"(offsetof(__jmp_buf, r13)), [r14] "i"(offsetof(__jmp_buf, r14)),
+      [r15] "i"(offsetof(__jmp_buf, r15)), [rsp] "i"(offsetof(__jmp_buf, rsp)),
+      [rip] "i"(offsetof(__jmp_buf, rip))
+      : "rax");
 }

 } // namespace LIBC_NAMESPACE_DECL
-- 
GitLab


From 7d7fb7ce5f4f33d668cdf4c5063c58ec444c274c Mon Sep 17 00:00:00 2001
From: Artem Belevich
Date: Tue, 15 Oct 2024 10:45:12 -0700
Subject: [PATCH 004/329] [NVPTX] restrict `cvta.param` use to kernels only.
 (#112278)

If `cvta.param` is used in regular functions, it may produce an invalid
pointer. It's unclear if it's a bug in ptxas or we're not using
`cvta.param` correctly, but, regardless of the underlying reason, the
instruction has to be disabled for non-kernels, at least for now.
---
 llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp      |  3 +-
 .../CodeGen/NVPTX/lower-args-gridconstant.ll  | 82 +++++++++++++++++--
 2 files changed, 77 insertions(+), 8 deletions(-)

diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
index 4a184037add4..3041c16c7a76 100644
--- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
+++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp
@@ -545,7 +545,8 @@ struct ArgUseChecker : PtrUseVisitor<ArgUseChecker> {
 void NVPTXLowerArgs::handleByValParam(const NVPTXTargetMachine &TM,
                                       Argument *Arg) {
   Function *Func = Arg->getParent();
-  bool HasCvtaParam = TM.getSubtargetImpl(*Func)->hasCvtaParam();
+  bool HasCvtaParam =
+      TM.getSubtargetImpl(*Func)->hasCvtaParam() && isKernelFunction(*Func);
   bool IsGridConstant = HasCvtaParam && isParamGridConstant(*Arg);
   const DataLayout &DL = Func->getDataLayout();
   BasicBlock::iterator FirstInst = Func->getEntryBlock().begin();
diff --git a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
index b203a78d6773..33fa3afc94b8 100644
--- a/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
+++ b/llvm/test/CodeGen/NVPTX/lower-args-gridconstant.ll
@@ -2,6 +2,72 @@
 ; RUN: opt < %s -S -nvptx-lower-args --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes OPT
 ; RUN: llc < %s --mtriple nvptx64-nvidia-cuda -mcpu=sm_70 -mattr=+ptx77 | FileCheck %s --check-prefixes PTX

+%struct.uint4 = type { i32, i32, i32, i32 }
+
+@gi = dso_local addrspace(1) externally_initialized global %struct.uint4 { i32 50462976, i32 117835012, i32 185207048, i32 252579084 }, align 16
+
+; Function Attrs: mustprogress nofree noinline norecurse nosync nounwind willreturn memory(read, inaccessiblemem: none)
+; Regular functions must still make a copy. `cvta.param` does not always work there.
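+; (Editor's note, not part of the upstream patch: the OPT lines below check for
+; exactly this copy, i.e. an `alloca %struct.uint4` followed by an
+; `@llvm.memcpy` from param address space 101, instead of a `cvta.param` cast.)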
+define dso_local noundef i32 @non_kernel_function(ptr nocapture noundef readonly byval(%struct.uint4) align 16 %a, i1 noundef zeroext %b, i32 noundef %c) local_unnamed_addr #0 { +; OPT-LABEL: define dso_local noundef i32 @non_kernel_function( +; OPT-SAME: ptr nocapture noundef readonly byval([[STRUCT_UINT4:%.*]]) align 16 [[A:%.*]], i1 noundef zeroext [[B:%.*]], i32 noundef [[C:%.*]]) local_unnamed_addr #[[ATTR0:[0-9]+]] { +; OPT-NEXT: [[ENTRY:.*:]] +; OPT-NEXT: [[A1:%.*]] = alloca [[STRUCT_UINT4]], align 16 +; OPT-NEXT: [[A2:%.*]] = addrspacecast ptr [[A]] to ptr addrspace(101) +; OPT-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr align 16 [[A1]], ptr addrspace(101) align 16 [[A2]], i64 16, i1 false) +; OPT-NEXT: [[A_:%.*]] = select i1 [[B]], ptr [[A1]], ptr addrspacecast (ptr addrspace(1) @gi to ptr) +; OPT-NEXT: [[IDX_EXT:%.*]] = sext i32 [[C]] to i64 +; OPT-NEXT: [[ADD_PTR:%.*]] = getelementptr inbounds i8, ptr [[A_]], i64 [[IDX_EXT]] +; OPT-NEXT: [[TMP0:%.*]] = load i32, ptr [[ADD_PTR]], align 1 +; OPT-NEXT: ret i32 [[TMP0]] +; +; PTX-LABEL: non_kernel_function( +; PTX: { +; PTX-NEXT: .local .align 16 .b8 __local_depot0[16]; +; PTX-NEXT: .reg .b64 %SP; +; PTX-NEXT: .reg .b64 %SPL; +; PTX-NEXT: .reg .pred %p<2>; +; PTX-NEXT: .reg .b16 %rs<3>; +; PTX-NEXT: .reg .b32 %r<11>; +; PTX-NEXT: .reg .b64 %rd<10>; +; PTX-EMPTY: +; PTX-NEXT: // %bb.0: // %entry +; PTX-NEXT: mov.u64 %SPL, __local_depot0; +; PTX-NEXT: cvta.local.u64 %SP, %SPL; +; PTX-NEXT: ld.param.u8 %rs1, [non_kernel_function_param_1]; +; PTX-NEXT: and.b16 %rs2, %rs1, 1; +; PTX-NEXT: setp.eq.b16 %p1, %rs2, 1; +; PTX-NEXT: ld.param.s32 %rd1, [non_kernel_function_param_2]; +; PTX-NEXT: add.u64 %rd2, %SP, 0; +; PTX-NEXT: or.b64 %rd3, %rd2, 8; +; PTX-NEXT: ld.param.u64 %rd4, [non_kernel_function_param_0+8]; +; PTX-NEXT: st.u64 [%rd3], %rd4; +; PTX-NEXT: ld.param.u64 %rd5, [non_kernel_function_param_0]; +; PTX-NEXT: st.u64 [%SP+0], %rd5; +; PTX-NEXT: mov.u64 %rd6, gi; +; PTX-NEXT: cvta.global.u64 %rd7, %rd6; +; PTX-NEXT: selp.b64 %rd8, %rd2, %rd7, %p1; +; PTX-NEXT: add.s64 %rd9, %rd8, %rd1; +; PTX-NEXT: ld.u8 %r1, [%rd9]; +; PTX-NEXT: ld.u8 %r2, [%rd9+1]; +; PTX-NEXT: shl.b32 %r3, %r2, 8; +; PTX-NEXT: or.b32 %r4, %r3, %r1; +; PTX-NEXT: ld.u8 %r5, [%rd9+2]; +; PTX-NEXT: shl.b32 %r6, %r5, 16; +; PTX-NEXT: ld.u8 %r7, [%rd9+3]; +; PTX-NEXT: shl.b32 %r8, %r7, 24; +; PTX-NEXT: or.b32 %r9, %r8, %r6; +; PTX-NEXT: or.b32 %r10, %r9, %r4; +; PTX-NEXT: st.param.b32 [func_retval0+0], %r10; +; PTX-NEXT: ret; +entry: + %a. 
= select i1 %b, ptr %a, ptr addrspacecast (ptr addrspace(1) @gi to ptr), !dbg !17 + %idx.ext = sext i32 %c to i64, !dbg !18 + %add.ptr = getelementptr inbounds i8, ptr %a., i64 %idx.ext, !dbg !18 + %0 = load i32, ptr %add.ptr, align 1, !dbg !19 + ret i32 %0, !dbg !23 +} + define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %out, i32 %n) { ; PTX-LABEL: grid_const_int( ; PTX: { @@ -17,7 +83,7 @@ define void @grid_const_int(ptr byval(i32) align 4 %input1, i32 %input2, ptr %ou ; PTX-NEXT: st.global.u32 [%rd2], %r3; ; PTX-NEXT: ret; ; OPT-LABEL: define void @grid_const_int( -; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0:[0-9]+]] { +; OPT-SAME: ptr byval(i32) align 4 [[INPUT1:%.*]], i32 [[INPUT2:%.*]], ptr [[OUT:%.*]], i32 [[N:%.*]]) #[[ATTR0]] { ; OPT-NEXT: [[OUT2:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) ; OPT-NEXT: [[OUT3:%.*]] = addrspacecast ptr addrspace(1) [[OUT2]] to ptr ; OPT-NEXT: [[INPUT11:%.*]] = addrspacecast ptr [[INPUT1]] to ptr addrspace(101) @@ -106,14 +172,14 @@ define void @grid_const_escape(ptr byval(%struct.s) align 4 %input) { define void @multiple_grid_const_escape(ptr byval(%struct.s) align 4 %input, i32 %a, ptr byval(i32) align 4 %b) { ; PTX-LABEL: multiple_grid_const_escape( ; PTX: { -; PTX-NEXT: .local .align 4 .b8 __local_depot3[4]; +; PTX-NEXT: .local .align 4 .b8 __local_depot4[4]; ; PTX-NEXT: .reg .b64 %SP; ; PTX-NEXT: .reg .b64 %SPL; ; PTX-NEXT: .reg .b32 %r<4>; ; PTX-NEXT: .reg .b64 %rd<10>; ; PTX-EMPTY: ; PTX-NEXT: // %bb.0: -; PTX-NEXT: mov.u64 %SPL, __local_depot3; +; PTX-NEXT: mov.u64 %SPL, __local_depot4; ; PTX-NEXT: cvta.local.u64 %SP, %SPL; ; PTX-NEXT: mov.b64 %rd2, multiple_grid_const_escape_param_0; ; PTX-NEXT: mov.b64 %rd3, multiple_grid_const_escape_param_2; @@ -342,10 +408,10 @@ define void @grid_const_phi(ptr byval(%struct.s) align 4 %input1, ptr %inout) { ; PTX-NEXT: cvta.param.u64 %rd8, %rd7; ; PTX-NEXT: ld.global.u32 %r1, [%rd1]; ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; -; PTX-NEXT: @%p1 bra $L__BB8_2; +; PTX-NEXT: @%p1 bra $L__BB9_2; ; PTX-NEXT: // %bb.1: // %second ; PTX-NEXT: add.s64 %rd8, %rd8, 4; -; PTX-NEXT: $L__BB8_2: // %merge +; PTX-NEXT: $L__BB9_2: // %merge ; PTX-NEXT: ld.u32 %r2, [%rd8]; ; PTX-NEXT: st.global.u32 [%rd1], %r2; ; PTX-NEXT: ret; @@ -402,13 +468,13 @@ define void @grid_const_phi_ngc(ptr byval(%struct.s) align 4 %input1, ptr byval( ; PTX-NEXT: cvta.param.u64 %rd11, %rd10; ; PTX-NEXT: ld.global.u32 %r1, [%rd1]; ; PTX-NEXT: setp.lt.s32 %p1, %r1, 0; -; PTX-NEXT: @%p1 bra $L__BB9_2; +; PTX-NEXT: @%p1 bra $L__BB10_2; ; PTX-NEXT: // %bb.1: // %second ; PTX-NEXT: mov.b64 %rd8, grid_const_phi_ngc_param_1; ; PTX-NEXT: mov.u64 %rd9, %rd8; ; PTX-NEXT: cvta.param.u64 %rd2, %rd9; ; PTX-NEXT: add.s64 %rd11, %rd2, 4; -; PTX-NEXT: $L__BB9_2: // %merge +; PTX-NEXT: $L__BB10_2: // %merge ; PTX-NEXT: ld.u32 %r2, [%rd11]; ; PTX-NEXT: st.global.u32 [%rd1], %r2; ; PTX-NEXT: ret; @@ -567,3 +633,5 @@ declare dso_local ptr @escape3(ptr, ptr, ptr) local_unnamed_addr !22 = !{ptr @grid_const_ptrtoint, !"kernel", i32 1, !"grid_constant", !23} !23 = !{i32 1} + + -- GitLab From fe7f5f9126cea9ceba703d5bd07b766181f2bd72 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Tue, 15 Oct 2024 10:45:42 -0700 Subject: [PATCH 005/329] [NFC][Intrumentation] Don't brake long RUN: lines (#112281) I find multiline 'RUN:' statements hard to read. 
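The patch joins each continued 'RUN:' line into a single line. For reference,
a hypothetical one-liner that performs the join (my reconstruction; the commit
does not record the exact command used):

    perl -0777 -pi -e 's/ *\\\n; RUN: */ /g' $(find llvm/test/Instrumentation -name '*.ll')

The substitution applied was: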
` *\\\n; RUN: *` -> ` ` for ./llvm/test/Instrumentation/ --- .../asan-detect-invalid-pointer-pair.ll | 9 +++---- .../AddressSanitizer/asan-funclet.ll | 6 ++--- .../asan-masked-load-store.ll | 6 ++--- .../asan-optimize-callbacks.ll | 7 ++--- .../AddressSanitizer/asan-vp-load-store.ll | 6 ++--- .../AddressSanitizer/freebsd.ll | 26 +++++-------------- .../AddressSanitizer/no-global-ctors.ll | 7 ++--- .../AddressSanitizer/no_global_dtors.ll | 7 ++--- .../AddressSanitizer/stack_dynamic_alloca.ll | 18 +++---------- .../AddressSanitizer/stack_layout.ll | 6 ++--- .../Instrumentation/AddressSanitizer/ubsan.ll | 3 +-- .../AddressSanitizer/vector-load-store.ll | 6 ++--- .../AddressSanitizer/with-ifunc.ll | 18 +++++-------- .../HWAddressSanitizer/prologue.ll | 21 +++++---------- .../HWAddressSanitizer/stack-coloring.ll | 8 ++---- .../HWAddressSanitizer/use-after-scope.ll | 26 +++++-------------- .../HeapProfiler/masked-load-store.ll | 12 +++------ .../MemorySanitizer/X86/msan_x86_bts_asm.ll | 8 ++---- .../MemorySanitizer/X86/msan_x86intrinsics.ll | 7 ++--- .../MemorySanitizer/X86/vararg-too-large.ll | 3 +-- .../MemorySanitizer/X86/vararg_call.ll | 12 +++------ .../MemorySanitizer/array_types.ll | 7 ++--- .../Instrumentation/MemorySanitizer/bmi.ll | 3 +-- .../MemorySanitizer/byval-alignment.ll | 3 +-- .../MemorySanitizer/check-array.ll | 3 +-- .../MemorySanitizer/check-struct.ll | 3 +-- .../Instrumentation/MemorySanitizer/clmul.ll | 3 +-- .../MemorySanitizer/i386/msan_x86_bts_asm.ll | 8 ++---- .../i386/msan_x86intrinsics.ll | 7 ++--- .../MemorySanitizer/i386/vararg-too-large.ll | 3 +-- .../MemorySanitizer/i386/vararg_call.ll | 12 +++------ .../instrumentation-with-call-threshold.ll | 14 +++------- .../MemorySanitizer/manual-shadow.ll | 16 ++++-------- .../MemorySanitizer/missing_origin.ll | 3 +-- .../MemorySanitizer/msan_asm_conservative.ll | 14 +++------- .../MemorySanitizer/msan_eager.ll | 6 ++--- .../MemorySanitizer/msan_kernel_basic.ll | 3 +-- .../MemorySanitizer/msan_llvm_is_constant.ll | 3 +-- .../MemorySanitizer/mul_by_constant.ll | 3 +-- .../MemorySanitizer/origin-alignment.ll | 8 ++---- .../MemorySanitizer/origin-array.ll | 3 +-- .../Instrumentation/MemorySanitizer/reduce.ll | 3 +-- .../stable_set_alloca_origin.ll | 6 ++--- .../MemorySanitizer/store-long-origin.ll | 3 +-- .../MemorySanitizer/vector_arith.ll | 3 +-- .../MemorySanitizer/vector_cmp.ll | 3 +-- .../MemorySanitizer/vector_pack.ll | 3 +-- .../MemorySanitizer/vector_shift.ll | 3 +-- .../SanitizerCoverage/crit-edge-sancov.ll | 3 +-- .../stack-depth-variable-declared-by-user.ll | 3 +-- 50 files changed, 112 insertions(+), 265 deletions(-) diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-detect-invalid-pointer-pair.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-detect-invalid-pointer-pair.ll index 9cf40fe06c12..a98df8b1d104 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/asan-detect-invalid-pointer-pair.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-detect-invalid-pointer-pair.ll @@ -1,9 +1,6 @@ -; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-cmp -S \ -; RUN: | FileCheck %s --check-prefixes=CMP,NOSUB,ALL -; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-sub -S \ -; RUN: | FileCheck %s --check-prefixes=SUB,NOCMP,ALL -; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-pair -S \ -; RUN: | FileCheck %s --check-prefixes=CMP,SUB,ALL +; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-cmp -S | FileCheck %s --check-prefixes=CMP,NOSUB,ALL +; RUN: opt < %s -passes=asan 
-asan-detect-invalid-pointer-sub -S | FileCheck %s --check-prefixes=SUB,NOCMP,ALL +; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-pair -S | FileCheck %s --check-prefixes=CMP,SUB,ALL ; Support instrumentation of invalid pointer pair detection. target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll index e986f1085426..e9c1075a2cb9 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-funclet.ll @@ -1,10 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 ; Test appropriate tagging of funclet for function calls generated by asan. -; RUN: opt -S -passes=asan,win-eh-prepare -asan-use-stack-safety=0 -asan-max-inline-poisoning-size=0 \ -; RUN: -asan-detect-invalid-pointer-cmp -asan-detect-invalid-pointer-sub -asan-use-after-scope < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INLINE -; RUN: opt -S -passes=asan,win-eh-prepare -asan-use-stack-safety=0 -asan-max-inline-poisoning-size=0 -asan-instrumentation-with-call-threshold=0 \ -; RUN: -asan-detect-invalid-pointer-cmp -asan-detect-invalid-pointer-sub -asan-use-after-scope < %s | FileCheck %s --check-prefixes=CHECK,CHECK-OUTLINE +; RUN: opt -S -passes=asan,win-eh-prepare -asan-use-stack-safety=0 -asan-max-inline-poisoning-size=0 -asan-detect-invalid-pointer-cmp -asan-detect-invalid-pointer-sub -asan-use-after-scope < %s | FileCheck %s --check-prefixes=CHECK,CHECK-INLINE +; RUN: opt -S -passes=asan,win-eh-prepare -asan-use-stack-safety=0 -asan-max-inline-poisoning-size=0 -asan-instrumentation-with-call-threshold=0 -asan-detect-invalid-pointer-cmp -asan-detect-invalid-pointer-sub -asan-use-after-scope < %s | FileCheck %s --check-prefixes=CHECK,CHECK-OUTLINE ; REQUIRES: x86-registered-target diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll index 9a641287960f..597b3bb855b4 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-masked-load-store.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S \ -; RUN: | FileCheck %s -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-instrument-reads=0 -asan-instrument-writes=0 -S \ -; RUN: | FileCheck %s -check-prefix=DISABLED +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S | FileCheck %s +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-instrument-reads=0 -asan-instrument-writes=0 -S | FileCheck %s -check-prefix=DISABLED ; Support ASan instrumentation for constant-mask llvm.masked.{load,store} diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-optimize-callbacks.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-optimize-callbacks.ll index ea028066e585..c4ea78a0c91a 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/asan-optimize-callbacks.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-optimize-callbacks.ll @@ -1,8 +1,5 @@ -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 \ -; RUN: -asan-optimize-callbacks -S | FileCheck %s --check-prefixes=LOAD,STORE -; RUN: opt < %s -passes=asan 
-asan-instrumentation-with-call-threshold=0 \ -; RUN: -asan-optimize-callbacks --asan-kernel -S | \ -; RUN: FileCheck %s --check-prefixes=LOAD-KERNEL,STORE-KERNEL +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-optimize-callbacks -S | FileCheck %s --check-prefixes=LOAD,STORE +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-optimize-callbacks --asan-kernel -S | FileCheck %s --check-prefixes=LOAD-KERNEL,STORE-KERNEL target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/AddressSanitizer/asan-vp-load-store.ll b/llvm/test/Instrumentation/AddressSanitizer/asan-vp-load-store.ll index ee752c8c61da..d22671aa84f8 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/asan-vp-load-store.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/asan-vp-load-store.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S \ -; RUN: | FileCheck %s -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-instrument-reads=0 -asan-instrument-writes=0 -S \ -; RUN: | FileCheck %s -check-prefix=DISABLED +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S | FileCheck %s +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -asan-instrument-reads=0 -asan-instrument-writes=0 -S | FileCheck %s -check-prefix=DISABLED ; Support ASan instrumentation for constant-mask llvm.vp.{load,store} diff --git a/llvm/test/Instrumentation/AddressSanitizer/freebsd.ll b/llvm/test/Instrumentation/AddressSanitizer/freebsd.ll index 5e09f7400588..c09f7a513072 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/freebsd.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/freebsd.ll @@ -1,22 +1,10 @@ -; RUN: opt < %s -passes=asan -S \ -; RUN: -mtriple=i386-unknown-freebsd \ -; RUN: -data-layout="e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" | \ -; RUN: FileCheck --check-prefix=CHECK-32 %s - -; RUN: opt < %s -passes=asan -S \ -; RUN: -mtriple=x86_64-unknown-freebsd \ -; RUN: -data-layout="e-m:e-i64:64-f80:128-n8:16:32:64-S128" | \ -; RUN: FileCheck --check-prefix=CHECK-64 %s - -; RUN: opt < %s -passes=asan -S \ -; RUN: -mtriple=aarch64-unknown-freebsd \ -; RUN: -data-layout="e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" | \ -; RUN: FileCheck --check-prefix=CHECK-AARCH64 %s - -; RUN: opt < %s -passes=asan -S \ -; RUN: -mtriple=mips64-unknown-freebsd \ -; RUN: -data-layout="E-m:e-i64:64-n32:64-S128" | \ -; RUN: FileCheck --check-prefix=CHECK-MIPS64 %s +; RUN: opt < %s -passes=asan -S -mtriple=i386-unknown-freebsd -data-layout="e-m:e-p:32:32-f64:32:64-f80:32-n8:16:32-S128" | FileCheck --check-prefix=CHECK-32 %s + +; RUN: opt < %s -passes=asan -S -mtriple=x86_64-unknown-freebsd -data-layout="e-m:e-i64:64-f80:128-n8:16:32:64-S128" | FileCheck --check-prefix=CHECK-64 %s + +; RUN: opt < %s -passes=asan -S -mtriple=aarch64-unknown-freebsd -data-layout="e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" | FileCheck --check-prefix=CHECK-AARCH64 %s + +; RUN: opt < %s -passes=asan -S -mtriple=mips64-unknown-freebsd -data-layout="E-m:e-i64:64-n32:64-S128" | FileCheck --check-prefix=CHECK-MIPS64 %s define i32 @read_4_bytes(ptr %a) sanitize_address { entry: diff --git a/llvm/test/Instrumentation/AddressSanitizer/no-global-ctors.ll b/llvm/test/Instrumentation/AddressSanitizer/no-global-ctors.ll index 79ed0b78fd6a..abbc5bda1238 100644 --- 
a/llvm/test/Instrumentation/AddressSanitizer/no-global-ctors.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/no-global-ctors.ll @@ -1,13 +1,10 @@ ; Check Default behaviour still emits ctors -; RUN: opt < %s -passes=asan -S | \ -; RUN: FileCheck -check-prefix=CHECK-DEFAULT %s +; RUN: opt < %s -passes=asan -S | FileCheck -check-prefix=CHECK-DEFAULT %s ; CHECK-DEFAULT: llvm.global_ctor{{.+}}asan.module_ctor ; CHECK-DEFAULT: define internal void @asan.module_ctor ; Check with ctor emission disabled -; RUN: opt < %s -passes=asan \ -; RUN: -asan-constructor-kind=none -S | \ -; RUN: FileCheck %s +; RUN: opt < %s -passes=asan -asan-constructor-kind=none -S | FileCheck %s ; CHECK-NOT: llvm.global_ctor{{.+}}asan.module_ctor ; CHECK-NOT: define internal void @asan.module_ctor diff --git a/llvm/test/Instrumentation/AddressSanitizer/no_global_dtors.ll b/llvm/test/Instrumentation/AddressSanitizer/no_global_dtors.ll index b927322e403a..5204d80f1db5 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/no_global_dtors.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/no_global_dtors.ll @@ -1,13 +1,10 @@ ; Check Default behaviour still emits dtors -; RUN: opt < %s -passes=asan -S | \ -; RUN: FileCheck -check-prefix=CHECK-DEFAULT %s +; RUN: opt < %s -passes=asan -S | FileCheck -check-prefix=CHECK-DEFAULT %s ; CHECK-DEFAULT: llvm.global_dtor{{.+}}asan.module_dtor ; CHECK-DEFAULT: define internal void @asan.module_dtor ; Check with dtor emission disabled -; RUN: opt < %s -passes=asan \ -; RUN: -asan-destructor-kind=none -S | \ -; RUN: FileCheck %s +; RUN: opt < %s -passes=asan -asan-destructor-kind=none -S | FileCheck %s ; CHECK-NOT: llvm.global_dtor{{.+}}asan.module_dtor ; CHECK-NOT: define internal void @asan.module_dtor diff --git a/llvm/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll b/llvm/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll index cbb2001c45e6..d56cd340d70a 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/stack_dynamic_alloca.ll @@ -1,17 +1,7 @@ -; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-use-stack-safety=0 \ -; RUN: -asan-use-after-return=runtime -S | FileCheck %s \ -; RUN: --check-prefixes=CHECK,CHECK-RUNTIME -; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-mapping-scale=5 -asan-use-stack-safety=0 \ -; RUN: -asan-use-after-return=runtime -S | FileCheck %s \ -; RUN: --check-prefixes=CHECK,CHECK-RUNTIME -; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-use-stack-safety=0 \ -; RUN: -asan-use-after-return=always -S | FileCheck %s \ -; RUN: --check-prefixes=CHECK,CHECK-ALWAYS \ -; RUN: --implicit-check-not=__asan_option_detect_stack_use_after_return -; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-use-stack-safety=0 \ -; RUN: -asan-use-after-return=always -S | FileCheck %s \ -; RUN: --check-prefixes=CHECK,CHECK-ALWAYS \ -; RUN: --implicit-check-not=__asan_option_detect_stack_use_after_return +; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-use-stack-safety=0 -asan-use-after-return=runtime -S | FileCheck %s --check-prefixes=CHECK,CHECK-RUNTIME +; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-mapping-scale=5 -asan-use-stack-safety=0 -asan-use-after-return=runtime -S | FileCheck %s --check-prefixes=CHECK,CHECK-RUNTIME +; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-use-stack-safety=0 -asan-use-after-return=always -S | FileCheck %s --check-prefixes=CHECK,CHECK-ALWAYS 
--implicit-check-not=__asan_option_detect_stack_use_after_return +; RUN: opt < %s -passes=asan -asan-stack-dynamic-alloca -asan-use-stack-safety=0 -asan-use-after-return=always -S | FileCheck %s --check-prefixes=CHECK,CHECK-ALWAYS --implicit-check-not=__asan_option_detect_stack_use_after_return target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/AddressSanitizer/stack_layout.ll b/llvm/test/Instrumentation/AddressSanitizer/stack_layout.ll index 48465be36789..afbfbd6c7a96 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/stack_layout.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/stack_layout.ll @@ -1,9 +1,7 @@ ; Test the ASan's stack layout. ; More tests in tests/Transforms/Utils/ASanStackFrameLayoutTest.cpp -; RUN: opt < %s -passes=asan -asan-use-stack-safety=0 -asan-stack-dynamic-alloca=0 -asan-use-after-scope -S \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-STATIC -; RUN: opt < %s -passes=asan -asan-use-stack-safety=0 -asan-stack-dynamic-alloca=1 -asan-use-after-scope -S \ -; RUN: | FileCheck %s --check-prefixes=CHECK,CHECK-DYNAMIC +; RUN: opt < %s -passes=asan -asan-use-stack-safety=0 -asan-stack-dynamic-alloca=0 -asan-use-after-scope -S | FileCheck %s --check-prefixes=CHECK,CHECK-STATIC +; RUN: opt < %s -passes=asan -asan-use-stack-safety=0 -asan-stack-dynamic-alloca=1 -asan-use-after-scope -S | FileCheck %s --check-prefixes=CHECK,CHECK-DYNAMIC target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/AddressSanitizer/ubsan.ll b/llvm/test/Instrumentation/AddressSanitizer/ubsan.ll index 62bbc985cb1b..5953feb9a598 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/ubsan.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/ubsan.ll @@ -1,8 +1,7 @@ ; ASan shouldn't instrument code added by UBSan. 
; RUN: opt < %s -passes=asan -S | FileCheck %s -; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-cmp -S \ -; RUN: | FileCheck %s --check-prefixes=NOCMP +; RUN: opt < %s -passes=asan -asan-detect-invalid-pointer-cmp -S | FileCheck %s --check-prefixes=NOCMP target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/AddressSanitizer/vector-load-store.ll b/llvm/test/Instrumentation/AddressSanitizer/vector-load-store.ll index 833dc9641d8e..51bef51d6b35 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/vector-load-store.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/vector-load-store.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=100 -S \ -; RUN: | FileCheck %s -; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S \ -; RUN: | FileCheck %s -check-prefix=CALLS +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=100 -S | FileCheck %s +; RUN: opt < %s -passes=asan -asan-instrumentation-with-call-threshold=0 -S | FileCheck %s -check-prefix=CALLS target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/AddressSanitizer/with-ifunc.ll b/llvm/test/Instrumentation/AddressSanitizer/with-ifunc.ll index 5a9603757be3..f198fc01d263 100644 --- a/llvm/test/Instrumentation/AddressSanitizer/with-ifunc.ll +++ b/llvm/test/Instrumentation/AddressSanitizer/with-ifunc.ll @@ -1,19 +1,13 @@ ; Test -asan-with-ifunc flag. ; -; RUN: opt -passes=asan -S -asan-with-ifunc=0 < %s | \ -; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC -; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 < %s | \ -; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC -; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=1 < %s | \ -; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC-NOREMAT +; RUN: opt -passes=asan -S -asan-with-ifunc=0 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC +; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC +; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=1 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC-NOREMAT ; Pre-Lollipop Android does not support ifunc. 
-; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android20 < %s | \ -; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC -; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android < %s | \ -; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC -; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android21 < %s | \ -; RUN: FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC +; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android20 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC +; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android < %s | FileCheck %s --check-prefixes=CHECK,CHECK-NOIFUNC +; RUN: opt -passes=asan -S -asan-with-ifunc=1 -asan-with-ifunc-suppress-remat=0 -mtriple=armv7-linux-android21 < %s | FileCheck %s --check-prefixes=CHECK,CHECK-IFUNC target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" target triple = "armv7--linux-android22" diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/prologue.ll b/llvm/test/Instrumentation/HWAddressSanitizer/prologue.ll index 73fc077c9562..1698592bafc6 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/prologue.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/prologue.ll @@ -1,20 +1,13 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals --global-value-regex "![0-9]+" --version 2 ; Test -hwasan-with-ifunc flag. ; -; RUN: opt -passes=hwasan -S < %s | \ -; RUN: FileCheck %s -; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=tls -hwasan-record-stack-history=instr < %s | \ -; RUN: FileCheck %s --check-prefixes=NOIFUNC-TLS-HISTORY -; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=tls -hwasan-record-stack-history=none < %s | \ -; RUN: FileCheck %s --check-prefixes=NOIFUNC-TLS-NOHISTORY -; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=global -hwasan-with-frame-record=0 < %s | \ -; RUN: FileCheck %s --check-prefixes=NOIFUNC-NOTLS -; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=ifunc -hwasan-with-frame-record=0 < %s | \ -; RUN: FileCheck %s --check-prefixes=IFUNC-NOTLS -; RUN: opt -passes=hwasan -S -mtriple=aarch64-fuchsia < %s | \ -; RUN: FileCheck %s --check-prefixes=FUCHSIA -; RUN: opt -passes=hwasan -S -mtriple=aarch64-fuchsia -hwasan-record-stack-history=libcall < %s | \ -; RUN: FileCheck %s --check-prefixes=FUCHSIA-LIBCALL +; RUN: opt -passes=hwasan -S < %s | FileCheck %s +; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=tls -hwasan-record-stack-history=instr < %s | FileCheck %s --check-prefixes=NOIFUNC-TLS-HISTORY +; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=tls -hwasan-record-stack-history=none < %s | FileCheck %s --check-prefixes=NOIFUNC-TLS-NOHISTORY +; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=global -hwasan-with-frame-record=0 < %s | FileCheck %s --check-prefixes=NOIFUNC-NOTLS +; RUN: opt -passes=hwasan -S -hwasan-mapping-offset-dynamic=ifunc -hwasan-with-frame-record=0 < %s | FileCheck %s --check-prefixes=IFUNC-NOTLS +; RUN: opt -passes=hwasan -S -mtriple=aarch64-fuchsia < %s | FileCheck %s --check-prefixes=FUCHSIA +; RUN: opt -passes=hwasan -S -mtriple=aarch64-fuchsia -hwasan-record-stack-history=libcall < %s | FileCheck %s --check-prefixes=FUCHSIA-LIBCALL target datalayout = 
"e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64--linux-android22" diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/stack-coloring.ll b/llvm/test/Instrumentation/HWAddressSanitizer/stack-coloring.ll index 253e976a7d60..ae6fe5776f20 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/stack-coloring.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/stack-coloring.ll @@ -1,12 +1,8 @@ ; Test that storage for allocas with disjoint lifetimes is reused with ; use-after-scope. -; RUN: opt -S -passes=hwasan %s -hwasan-use-after-scope -o - | \ -; RUN: llc -no-stack-coloring=false -o - | \ -; RUN: FileCheck %s --check-prefix=COLOR -; RUN: opt -S -passes=hwasan -hwasan-use-after-scope %s -o - | \ -; RUN: llc -no-stack-coloring=true -o - | \ -; RUN: FileCheck %s --check-prefix=NOCOLOR +; RUN: opt -S -passes=hwasan %s -hwasan-use-after-scope -o - | llc -no-stack-coloring=false -o - | FileCheck %s --check-prefix=COLOR +; RUN: opt -S -passes=hwasan -hwasan-use-after-scope %s -o - | llc -no-stack-coloring=true -o - | FileCheck %s --check-prefix=NOCOLOR target datalayout = "e-m:e-i8:8:32-i16:16:32-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-android29" diff --git a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll index 643b2a8c274c..16e6cda59a61 100644 --- a/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll +++ b/llvm/test/Instrumentation/HWAddressSanitizer/use-after-scope.ll @@ -1,26 +1,12 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; REQUIRES: aarch64-registered-target, x86-registered-target -; RUN: opt -mtriple=x86_64-unknown-linux-gnu -passes=hwasan \ -; RUN: -hwasan-use-after-scope=1 -hwasan-generate-tags-with-calls -S < %s | \ -; RUN: FileCheck %s --check-prefixes=X86-SCOPE -; RUN: opt -mtriple=x86_64-unknown-linux-gnu -passes=hwasan \ -; RUN: -hwasan-use-after-scope=0 -hwasan-generate-tags-with-calls -S < %s | \ -; RUN: FileCheck %s --check-prefixes=X86-NOSCOPE +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -passes=hwasan -hwasan-use-after-scope=1 -hwasan-generate-tags-with-calls -S < %s | FileCheck %s --check-prefixes=X86-SCOPE +; RUN: opt -mtriple=x86_64-unknown-linux-gnu -passes=hwasan -hwasan-use-after-scope=0 -hwasan-generate-tags-with-calls -S < %s | FileCheck %s --check-prefixes=X86-NOSCOPE -; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan \ -; RUN: -hwasan-use-after-scope=1 -hwasan-generate-tags-with-calls -S < %s | \ -; RUN: FileCheck %s --check-prefixes=AARCH64-SCOPE -; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan \ -; RUN: -hwasan-use-after-scope=0 -hwasan-generate-tags-with-calls -S < %s | \ -; RUN: FileCheck %s --check-prefixes=AARCH64-NOSCOPE -; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan \ -; RUN: -hwasan-use-after-scope=1 -hwasan-generate-tags-with-calls \ -; RUN: -hwasan-use-short-granules=1 -S < %s | \ -; RUN: FileCheck %s --check-prefixes=AARCH64-SHORT-SCOPE -; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan \ -; RUN: -hwasan-use-after-scope=0 -hwasan-generate-tags-with-calls \ -; RUN: -hwasan-use-short-granules=1 -S < %s | \ -; RUN: FileCheck %s --check-prefixes=AARCH64-SHORT-NOSCOPE +; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan -hwasan-use-after-scope=1 -hwasan-generate-tags-with-calls -S < %s | FileCheck %s --check-prefixes=AARCH64-SCOPE +; RUN: opt 
-mtriple=aarch64-unknown-linux-android29 -passes=hwasan -hwasan-use-after-scope=0 -hwasan-generate-tags-with-calls -S < %s | FileCheck %s --check-prefixes=AARCH64-NOSCOPE +; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan -hwasan-use-after-scope=1 -hwasan-generate-tags-with-calls -hwasan-use-short-granules=1 -S < %s | FileCheck %s --check-prefixes=AARCH64-SHORT-SCOPE +; RUN: opt -mtriple=aarch64-unknown-linux-android29 -passes=hwasan -hwasan-use-after-scope=0 -hwasan-generate-tags-with-calls -hwasan-use-short-granules=1 -S < %s | FileCheck %s --check-prefixes=AARCH64-SHORT-NOSCOPE define dso_local i32 @standard_lifetime() local_unnamed_addr sanitize_hwaddress { diff --git a/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll index a0a309149e8f..80d6e0f3b36d 100644 --- a/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll +++ b/llvm/test/Instrumentation/HeapProfiler/masked-load-store.ll @@ -1,11 +1,7 @@ -; RUN: opt < %s -passes=memprof -memprof-use-callbacks -S \ -; RUN: | FileCheck %s -check-prefix=LOAD -check-prefix=STORE -check-prefix=ALL -; RUN: opt < %s -passes=memprof -memprof-use-callbacks -memprof-instrument-reads=0 -S \ -; RUN: | FileCheck %s -check-prefix=NOLOAD -check-prefix=STORE -check-prefix=ALL -; RUN: opt < %s -passes=memprof -memprof-use-callbacks -memprof-instrument-writes=0 -S \ -; RUN: | FileCheck %s -check-prefix=LOAD -check-prefix=NOSTORE -check-prefix=ALL -; RUN: opt < %s -passes=memprof -memprof-use-callbacks -memprof-instrument-reads=0 -memprof-instrument-writes=0 -S \ -; RUN: | FileCheck %s -check-prefix=NOLOAD -check-prefix=NOSTORE -check-prefix=ALL +; RUN: opt < %s -passes=memprof -memprof-use-callbacks -S | FileCheck %s -check-prefix=LOAD -check-prefix=STORE -check-prefix=ALL +; RUN: opt < %s -passes=memprof -memprof-use-callbacks -memprof-instrument-reads=0 -S | FileCheck %s -check-prefix=NOLOAD -check-prefix=STORE -check-prefix=ALL +; RUN: opt < %s -passes=memprof -memprof-use-callbacks -memprof-instrument-writes=0 -S | FileCheck %s -check-prefix=LOAD -check-prefix=NOSTORE -check-prefix=ALL +; RUN: opt < %s -passes=memprof -memprof-use-callbacks -memprof-instrument-reads=0 -memprof-instrument-writes=0 -S | FileCheck %s -check-prefix=NOLOAD -check-prefix=NOSTORE -check-prefix=ALL ; Support memory profiling instrumentation for constant-mask llvm.masked.{load,store} target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86_bts_asm.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86_bts_asm.ll index dd2fecb081be..57a2599dbab8 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86_bts_asm.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86_bts_asm.ll @@ -1,10 +1,6 @@ ; Test for the conservative assembly handling mode used by KMSAN. 
-; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: "-check-prefix=CHECK" %s -; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck \ -; RUN: "-check-prefixes=CHECK,CHECK-CONS" %s +; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck "-check-prefix=CHECK" %s +; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck "-check-prefixes=CHECK,CHECK-CONS" %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86intrinsics.ll index a83a94a06b98..a0e7b13e5c9e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/msan_x86intrinsics.ll @@ -1,8 +1,5 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: -check-prefix=CHECK-ORIGINS %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg-too-large.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg-too-large.ll index adb3e208d855..9a7f4b985293 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg-too-large.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg-too-large.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S 2>&1 -passes=msan | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S 2>&1 -passes=msan | FileCheck %s ; Test that MSan doesn't generate code overflowing __msan_va_arg_tls when too many arguments are ; passed to a variadic function. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_call.ll b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_call.ll index 32d43e11fbd9..7a3f0dd88f9c 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_call.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/X86/vararg_call.ll @@ -1,11 +1,7 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" -; RUN: opt < %s -msan-check-access-address=0 -S \ -; RUN: -passes="msan" 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" +; RUN: opt < %s -msan-check-access-address=0 -S -passes="msan" 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" ; Test that shadow and origin are stored for variadic function params. diff --git a/llvm/test/Instrumentation/MemorySanitizer/array_types.ll b/llvm/test/Instrumentation/MemorySanitizer/array_types.ll index d9e4eeb7f7cb..236b01914703 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/array_types.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/array_types.ll @@ -1,8 +1,5 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: %s --allow-empty +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK %s --allow-empty target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll index 327fec0ed702..2f60bd8b357b 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/bmi.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/bmi.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/byval-alignment.ll b/llvm/test/Instrumentation/MemorySanitizer/byval-alignment.ll index f83a92287d24..e06576e2fead 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/byval-alignment.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/byval-alignment.ll @@ -1,7 +1,6 @@ ; Test that copy alignment for byval arguments is limited by param-tls slot alignment. 
-; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/check-array.ll b/llvm/test/Instrumentation/MemorySanitizer/check-array.ll index a4e5e37eaaf8..3ffa0d254ab0 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/check-array.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/check-array.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-eager-checks -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | \ -; RUN: FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s +; RUN: opt < %s -msan-eager-checks -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/check-struct.ll b/llvm/test/Instrumentation/MemorySanitizer/check-struct.ll index d24ffe329619..0be50758bd21 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/check-struct.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/check-struct.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | \ -; RUN: FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/clmul.ll b/llvm/test/Instrumentation/MemorySanitizer/clmul.ll index ae4f2d2c868a..a498e1932d4b 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/clmul.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/clmul.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86_bts_asm.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86_bts_asm.ll index dd2fecb081be..57a2599dbab8 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86_bts_asm.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86_bts_asm.ll @@ -1,10 +1,6 @@ ; Test for the conservative assembly handling mode used by KMSAN. 
-; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: "-check-prefix=CHECK" %s -; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck \ -; RUN: "-check-prefixes=CHECK,CHECK-CONS" %s +; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck "-check-prefix=CHECK" %s +; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck "-check-prefixes=CHECK,CHECK-CONS" %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86intrinsics.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86intrinsics.ll index a83a94a06b98..a0e7b13e5c9e 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86intrinsics.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/msan_x86intrinsics.ll @@ -1,8 +1,5 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: -check-prefix=CHECK-ORIGINS %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg-too-large.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg-too-large.ll index adb3e208d855..9a7f4b985293 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg-too-large.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg-too-large.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S 2>&1 -passes=msan | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S 2>&1 -passes=msan | FileCheck %s ; Test that MSan doesn't generate code overflowing __msan_va_arg_tls when too many arguments are ; passed to a variadic function. 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_call.ll b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_call.ll index 32d43e11fbd9..7a3f0dd88f9c 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_call.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/i386/vararg_call.ll @@ -1,11 +1,7 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" -; RUN: opt < %s -msan-check-access-address=0 -S \ -; RUN: -passes="msan" 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" +; RUN: opt < %s -msan-check-access-address=0 -S -passes="msan" 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S -passes=msan 2>&1 | FileCheck %s "--check-prefixes=CHECK,CHECK-ORIGIN" ; Test that shadow and origin are stored for variadic function params. diff --git a/llvm/test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll b/llvm/test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll index d7156540e003..b0069d2bae88 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/instrumentation-with-call-threshold.ll @@ -2,17 +2,9 @@ ; Test that in with-calls mode there are no calls to __msan_chain_origin - they ; are done from __msan_maybe_store_origin_*. 
-; RUN: opt < %s -msan-check-access-address=0 \ -; RUN: -msan-instrumentation-with-call-threshold=0 -S -passes=msan 2>&1 | \ -; RUN: FileCheck %s -; RUN: opt < %s -msan-check-access-address=0 \ -; RUN: -msan-instrumentation-with-call-threshold=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: -check-prefix=CHECK-ORIGINS %s -; RUN: opt < %s -msan-check-access-address=0 \ -; RUN: -msan-instrumentation-with-call-threshold=0 -msan-track-origins=2 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: -check-prefix=CHECK-ORIGINS %s +; RUN: opt < %s -msan-check-access-address=0 -msan-instrumentation-with-call-threshold=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-instrumentation-with-call-threshold=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS %s +; RUN: opt < %s -msan-check-access-address=0 -msan-instrumentation-with-call-threshold=0 -msan-track-origins=2 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/manual-shadow.ll b/llvm/test/Instrumentation/MemorySanitizer/manual-shadow.ll index 42c3656d6a7d..4c7466c7b772 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/manual-shadow.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/manual-shadow.ll @@ -1,16 +1,10 @@ ; Test that the msan layout customization options work as expected ; -; RUN: opt < %s -msan-shadow-base 3735928559 -S -passes=msan 2>&1 | FileCheck \ -; RUN: --check-prefix=CHECK-BASE %s -; RUN: opt < %s -msan-shadow-base 3735928559 -msan-and-mask 4294901760 -S \ -; RUN: -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-AND %s -; RUN: opt < %s -msan-shadow-base 3735928559 -msan-xor-mask 48879 -S \ -; RUN: -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-XOR %s -; RUN: opt < %s -msan-shadow-base 3735928559 -msan-xor-mask 48879 \ -; RUN: -msan-and-mask 4294901760 -S -passes=msan 2>&1 | FileCheck \ -; RUN: --check-prefix=CHECK-XOR-AND %s -; RUN: opt < %s -msan-track-origins 1 -msan-origin-base 1777777 -S -passes=msan\ -; RUN: 2>&1 | FileCheck --check-prefix=CHECK-ORIGIN-BASE %s +; RUN: opt < %s -msan-shadow-base 3735928559 -S -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-BASE %s +; RUN: opt < %s -msan-shadow-base 3735928559 -msan-and-mask 4294901760 -S -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-AND %s +; RUN: opt < %s -msan-shadow-base 3735928559 -msan-xor-mask 48879 -S -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-XOR %s +; RUN: opt < %s -msan-shadow-base 3735928559 -msan-xor-mask 48879 -msan-and-mask 4294901760 -S -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-XOR-AND %s +; RUN: opt < %s -msan-track-origins 1 -msan-origin-base 1777777 -S -passes=msan 2>&1 | FileCheck --check-prefix=CHECK-ORIGIN-BASE %s target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/missing_origin.ll b/llvm/test/Instrumentation/MemorySanitizer/missing_origin.ll index 1c3c3ddd3fcf..94e0f746b093 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/missing_origin.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/missing_origin.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: 
-passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll index 2a643e12a487..1b9ffaf9e505 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_asm_conservative.ll @@ -1,14 +1,8 @@ ; Test for handling of asm constraints in MSan instrumentation. -; RUN: opt < %s -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | \ -; RUN: FileCheck %s -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | \ -; RUN: FileCheck --check-prefixes=CHECK,USER-CONS %s -; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: --check-prefixes=CHECK,KMSAN %s -; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 \ -; RUN: -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck \ -; RUN: --check-prefixes=CHECK,KMSAN,CHECK-CONS %s +; RUN: opt < %s -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck --check-prefixes=CHECK,USER-CONS %s +; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=0 -S -passes=msan 2>&1 | FileCheck --check-prefixes=CHECK,KMSAN %s +; RUN: opt < %s -msan-kernel=1 -msan-check-access-address=0 -msan-handle-asm-conservative=1 -S -passes=msan 2>&1 | FileCheck --check-prefixes=CHECK,KMSAN,CHECK-CONS %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_eager.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_eager.ll index 30ab3280bdec..946c95b072ea 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/msan_eager.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_eager.ll @@ -1,8 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -msan-eager-checks -S -passes='module(msan)' 2>&1 | \ -; RUN: FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='msan' 2>&1 | \ -; RUN: FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -msan-eager-checks -S -passes='module(msan)' 2>&1 | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='msan' 2>&1 | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll index 309921bfc352..4b7a910af08b 100644 --- 
a/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_kernel_basic.ll @@ -1,6 +1,5 @@ ; KMSAN instrumentation tests -; RUN: opt < %s -msan-kernel=1 -S -passes=msan 2>&1 | FileCheck %s \ -; RUN: -check-prefixes=CHECK +; RUN: opt < %s -msan-kernel=1 -S -passes=msan 2>&1 | FileCheck %s -check-prefixes=CHECK target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_is_constant.ll b/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_is_constant.ll index 7ca92a1cecb2..b81ae888efc6 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_is_constant.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/msan_llvm_is_constant.ll @@ -1,7 +1,6 @@ ; Make sure MSan doesn't insert shadow checks for @llvm.is.constant.* arguments. -; RUN: opt < %s -msan-kernel=1 -S -passes=msan 2>&1 | FileCheck \ -; RUN: -check-prefixes=CHECK %s +; RUN: opt < %s -msan-kernel=1 -S -passes=msan 2>&1 | FileCheck -check-prefixes=CHECK %s ; RUN: opt < %s -S -passes=msan 2>&1 | FileCheck -check-prefixes=CHECK %s target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/mul_by_constant.ll b/llvm/test/Instrumentation/MemorySanitizer/mul_by_constant.ll index 50a4a1b17df2..89522a603b23 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/mul_by_constant.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/mul_by_constant.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/origin-alignment.ll b/llvm/test/Instrumentation/MemorySanitizer/origin-alignment.ll index 7a3ef6695682..addfdf40f7e0 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/origin-alignment.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/origin-alignment.ll @@ -1,9 +1,5 @@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: -check-prefix=CHECK-ORIGINS1 %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S \ -; RUN: -passes=msan 2>&1 | FileCheck -check-prefix=CHECK \ -; RUN: -check-prefix=CHECK-ORIGINS2 %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS1 %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S -passes=msan 2>&1 | FileCheck -check-prefix=CHECK -check-prefix=CHECK-ORIGINS2 %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/origin-array.ll b/llvm/test/Instrumentation/MemorySanitizer/origin-array.ll index 228d686ad7f9..1f6d1bf0c207 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/origin-array.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/origin-array.ll @@ -1,5 +1,4 
@@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/reduce.ll b/llvm/test/Instrumentation/MemorySanitizer/reduce.ll index 0c688811611b..2a79b5e72bca 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/reduce.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/reduce.ll @@ -1,6 +1,5 @@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | \ -; RUN: FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes='module(msan)' 2>&1 | FileCheck -allow-deprecated-dag-overlap --check-prefix=CHECK %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/stable_set_alloca_origin.ll b/llvm/test/Instrumentation/MemorySanitizer/stable_set_alloca_origin.ll index 999085575f08..bb6b9b62507d 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/stable_set_alloca_origin.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/stable_set_alloca_origin.ll @@ -1,7 +1,5 @@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=2 -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-f80:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/store-long-origin.ll b/llvm/test/Instrumentation/MemorySanitizer/store-long-origin.ll index 43fec99d86d7..a91ac2654b41 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/store-long-origin.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/store-long-origin.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S \ -; RUN: -passes=msan 2>&1 | FileCheck %s +; RUN: opt < %s -msan-check-access-address=0 -msan-track-origins=1 -S -passes=msan 2>&1 | FileCheck %s target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" target triple = "x86_64-unknown-linux-gnu" diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll index fe5cf9dcc65b..c0d738145f28 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_arith.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" 
diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_cmp.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_cmp.ll index f4868294a3e7..f4092c542d6b 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_cmp.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_cmp.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll index 13f7a1612de9..0f6f1fe4a7dc 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_pack.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target target datalayout = "e-p:64:64:64-i1:8:8-i8:8:8-i16:16:16-i32:32:32-i64:64:64-f32:32:32-f64:64:64-v64:64:64-v128:128:128-a0:0:64-s0:64:64-f80:128:128-n8:16:32:64-S128" diff --git a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll index 441dd8f64e28..461d6cb9217d 100644 --- a/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll +++ b/llvm/test/Instrumentation/MemorySanitizer/vector_shift.ll @@ -1,5 +1,4 @@ -; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck \ -; RUN: %s +; RUN: opt < %s -msan-check-access-address=0 -S -passes=msan 2>&1 | FileCheck %s ; REQUIRES: x86-registered-target ; Test instrumentation of vector shift instructions. diff --git a/llvm/test/Instrumentation/SanitizerCoverage/crit-edge-sancov.ll b/llvm/test/Instrumentation/SanitizerCoverage/crit-edge-sancov.ll index f42fa7139fd5..81848b69aa3b 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/crit-edge-sancov.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/crit-edge-sancov.ll @@ -1,5 +1,4 @@ -; RUN: opt -passes='module(sancov-module)' -sanitizer-coverage-trace-pc \ -; RUN: -sanitizer-coverage-level=3 %s -S -o - | FileCheck %s +; RUN: opt -passes='module(sancov-module)' -sanitizer-coverage-trace-pc -sanitizer-coverage-level=3 %s -S -o - | FileCheck %s ; The edge between %entry and %for.inc.i is a critical edge. ; SanitizerCoveragePass must split this critical edge in order to track diff --git a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-variable-declared-by-user.ll b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-variable-declared-by-user.ll index 4d11e1cb41e6..73a1ef64aa95 100644 --- a/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-variable-declared-by-user.ll +++ b/llvm/test/Instrumentation/SanitizerCoverage/stack-depth-variable-declared-by-user.ll @@ -1,7 +1,6 @@ ; Ensure that we terminate with a useful error message (instead of crash) if the ; user declares `__sancov_lowest_stack` with an unexpected type. 
-; RUN: not opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 \
-; RUN:     -sanitizer-coverage-stack-depth -S 2>&1 | FileCheck %s
+; RUN: not opt < %s -passes='module(sancov-module)' -sanitizer-coverage-level=1 -sanitizer-coverage-stack-depth -S 2>&1 | FileCheck %s

 target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128"
 target triple = "x86_64-unknown-linux-gnu"
--
GitLab


From a24e8a7f8c4159b9605d2fd0a687ff06e0e36c5b Mon Sep 17 00:00:00 2001
From: Longsheng Mou
Date: Wed, 16 Oct 2024 01:50:36 +0800
Subject: [PATCH 006/329] [mlir][vector] Update documentation for `vector.splat` (NFC) (#112363)

This PR updates the documentation for `vector.splat`, specifying that the
operand type must match the element type of the result.
---
 mlir/include/mlir/Dialect/Vector/IR/VectorOps.td | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
index b0de7c11b9d4..c02b16ea9317 100644
--- a/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
+++ b/mlir/include/mlir/Dialect/Vector/IR/VectorOps.td
@@ -2808,7 +2808,7 @@ def Vector_SplatOp : Vector_Op<"splat", [

     ```mlir
     %s = arith.constant 10.1 : f32
-    %t = vector.splat %s : vector<8x16xi32>
+    %t = vector.splat %s : vector<8x16xf32>
     ```
   }];
--
GitLab


From 060d151476b871b48662dbd1947b67d9b0ae9d13 Mon Sep 17 00:00:00 2001
From: Alexey Bataev
Date: Tue, 15 Oct 2024 10:49:55 -0700
Subject: [PATCH 007/329] [SLP][NFCI] Check early for deleted instructions

Check for deleted instructions as early as possible, before trying to
vectorize the code. This may reduce the number of vectorization attempts.
---
 .../Transforms/Vectorize/SLPVectorizer.cpp    | 50 ++++++++++++-------
 1 file changed, 31 insertions(+), 19 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
index 0d4a0b3745dd..84d77f917bbb 100644
--- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
+++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp
@@ -18212,13 +18212,22 @@ bool SLPVectorizerPass::tryToVectorizeList(ArrayRef<Value *> VL, BoUpSLP &R,
     if ((VF > MinVF && ActualVF <= VF / 2) || (VF == MinVF && ActualVF < 2))
       break;

-    ArrayRef<Value *> Ops = VL.slice(I, ActualVF);
-    // Check that a previous iteration of this loop did not delete the Value.
-    if (llvm::any_of(Ops, [&R](Value *V) {
-          auto *I = dyn_cast<Instruction>(V);
-          return I && R.isDeleted(I);
-        }))
-      continue;
+    SmallVector<Value *> Ops(ActualVF, nullptr);
+    unsigned Idx = 0;
+    for (Value *V : VL.drop_front(I)) {
+      // Check that a previous iteration of this loop did not delete the
+      // Value.
+      if (auto *Inst = dyn_cast<Instruction>(V);
+          !Inst || !R.isDeleted(Inst)) {
+        Ops[Idx] = V;
+        ++Idx;
+        if (Idx == ActualVF)
+          break;
+      }
+    }
+    // Not enough vectorizable instructions - exit.
+    if (Idx != ActualVF)
+      break;

     LLVM_DEBUG(dbgs() << "SLP: Analyzing " << ActualVF << " operations "
                       << "\n");
@@ -18286,7 +18295,8 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
   // Vectorize in current basic block only.
   auto *Op0 = dyn_cast<Instruction>(I->getOperand(0));
   auto *Op1 = dyn_cast<Instruction>(I->getOperand(1));
-  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P)
+  if (!Op0 || !Op1 || Op0->getParent() != P || Op1->getParent() != P ||
+      R.isDeleted(Op0) || R.isDeleted(Op1))
     return false;

   // First collect all possible candidates
@@ -18299,18 +18309,18 @@ bool SLPVectorizerPass::tryToVectorize(Instruction *I, BoUpSLP &R) {
   if (A && B && B->hasOneUse()) {
     auto *B0 = dyn_cast<Instruction>(B->getOperand(0));
     auto *B1 = dyn_cast<Instruction>(B->getOperand(1));
-    if (B0 && B0->getParent() == P)
+    if (B0 && B0->getParent() == P && !R.isDeleted(B0))
       Candidates.emplace_back(A, B0);
-    if (B1 && B1->getParent() == P)
+    if (B1 && B1->getParent() == P && !R.isDeleted(B1))
       Candidates.emplace_back(A, B1);
   }
   // Try to skip A.
   if (B && A && A->hasOneUse()) {
     auto *A0 = dyn_cast<Instruction>(A->getOperand(0));
     auto *A1 = dyn_cast<Instruction>(A->getOperand(1));
-    if (A0 && A0->getParent() == P)
+    if (A0 && A0->getParent() == P && !R.isDeleted(A0))
       Candidates.emplace_back(A0, B);
-    if (A1 && A1->getParent() == P)
+    if (A1 && A1->getParent() == P && !R.isDeleted(A1))
       Candidates.emplace_back(A1, B);
   }
@@ -19769,16 +19779,16 @@ static void findBuildAggregate_rec(Instruction *LastInsertInst,
                                    TargetTransformInfo *TTI,
                                    SmallVectorImpl<Value *> &BuildVectorOpds,
                                    SmallVectorImpl<Value *> &InsertElts,
-                                   unsigned OperandOffset) {
+                                   unsigned OperandOffset, const BoUpSLP &R) {
   do {
     Value *InsertedOperand = LastInsertInst->getOperand(1);
     std::optional<unsigned> OperandIndex =
         getElementIndex(LastInsertInst, OperandOffset);
-    if (!OperandIndex)
+    if (!OperandIndex || R.isDeleted(LastInsertInst))
       return;
     if (isa<Instruction>(InsertedOperand)) {
       findBuildAggregate_rec(cast<Instruction>(InsertedOperand), TTI,
-                             BuildVectorOpds, InsertElts, *OperandIndex);
+                             BuildVectorOpds, InsertElts, *OperandIndex, R);
     } else {
       BuildVectorOpds[*OperandIndex] = InsertedOperand;
@@ -19807,7 +19817,8 @@ static void findBuildAggregate_rec(Instruction *LastInsertInst,
 static bool findBuildAggregate(Instruction *LastInsertInst,
                                TargetTransformInfo *TTI,
                                SmallVectorImpl<Value *> &BuildVectorOpds,
-                               SmallVectorImpl<Value *> &InsertElts) {
+                               SmallVectorImpl<Value *> &InsertElts,
+                               const BoUpSLP &R) {
   assert((isa<InsertElementInst>(LastInsertInst) ||
           isa<InsertValueInst>(LastInsertInst)) &&
@@ -19822,7 +19833,8 @@ static bool findBuildAggregate(Instruction *LastInsertInst,
   BuildVectorOpds.resize(*AggregateSize);
   InsertElts.resize(*AggregateSize);

-  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0);
+  findBuildAggregate_rec(LastInsertInst, TTI, BuildVectorOpds, InsertElts, 0,
+                         R);
   llvm::erase(BuildVectorOpds, nullptr);
   llvm::erase(InsertElts, nullptr);
   if (BuildVectorOpds.size() >= 2)
@@ -20068,7 +20080,7 @@ bool SLPVectorizerPass::vectorizeInsertValueInst(InsertValueInst *IVI,

   SmallVector<Value *, 16> BuildVectorOpds;
   SmallVector<Value *, 16> BuildVectorInsts;
-  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts))
+  if (!findBuildAggregate(IVI, TTI, BuildVectorOpds, BuildVectorInsts, R))
     return false;

   if (MaxVFOnly && BuildVectorOpds.size() == 2) {
@@ -20090,7 +20102,7 @@ bool SLPVectorizerPass::vectorizeInsertElementInst(InsertElementInst *IEI,
   SmallVector<Value *, 16> BuildVectorInsts;
   SmallVector<Value *, 16> BuildVectorOpds;
   SmallVector<int> Mask;
-  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts) ||
+  if (!findBuildAggregate(IEI, TTI, BuildVectorOpds, BuildVectorInsts, R) ||
      (llvm::all_of(BuildVectorOpds, IsaPred<ExtractElementInst, UndefValue>) &&
       isFixedVectorShuffle(BuildVectorOpds, Mask)))
     return false;
--
GitLab


From 583fa4f5b7c1d114d44cfe8c357e1c12a0d0544e Mon Sep 17 00:00:00 2001
From: Alexey Bader
Date: Tue, 15 Oct 2024 11:05:16 -0700
Subject: [PATCH 008/329] [InstCombine] Extend fcmp+select folding to minnum/maxnum intrinsics (#112088)

Today, InstCombine can fold fcmp+select patterns to minnum/maxnum
intrinsics when the nnan and nsz flags are set. The ordering of the
operands in both the fcmp and select instructions is important for the
folding to occur.

maxnum patterns:

1. (a op b) ? a : b -> maxnum(a, b), where op is one of {ogt, oge}
2. (a op b) ? b : a -> maxnum(a, b), where op is one of {ule, ult}

The second pattern is supposed to make the order of the operands in the
select instruction irrelevant. However, the pattern matching code uses the
CmpInst::getInversePredicate method to invert the comparison predicate.
This method doesn't take the fast-math flags into account, which can lead
to missing the folding opportunity.

The patch extends the pattern matching code to handle unordered fcmp
instructions. This allows the folding to occur even when the select
instruction has the operands in the inverse order.

New maxnum patterns:

1. (a op b) ? a : b -> maxnum(a, b), where op is one of {ugt, uge}
2. (a op b) ? b : a -> maxnum(a, b), where op is one of {ole, olt}

The same changes are applied to the minnum intrinsic.
---
 llvm/include/llvm/IR/PatternMatch.h           |  26 ++++
 llvm/lib/Analysis/IVDescriptors.cpp           |   8 +-
 llvm/lib/Analysis/ValueTracking.cpp           |   8 +-
 .../InstCombine/InstCombineSelect.cpp         |   4 +-
 .../Transforms/InstCombine/clamp-to-minmax.ll |  47 +++---
 .../Transforms/InstCombine/minmax-fold.ll     |   6 +-
 llvm/test/Transforms/InstCombine/minmax-fp.ll |   9 +-
 .../InstCombine/unordered-fcmp-select.ll      |   2 +-
 llvm/unittests/IR/PatternMatch.cpp            | 134 ++++++++++++++++++
 9 files changed, 194 insertions(+), 50 deletions(-)

diff --git a/llvm/include/llvm/IR/PatternMatch.h b/llvm/include/llvm/IR/PatternMatch.h
index 8c6b7895470b..c3349c9772c7 100644
--- a/llvm/include/llvm/IR/PatternMatch.h
+++ b/llvm/include/llvm/IR/PatternMatch.h
@@ -2387,6 +2387,32 @@ m_UnordFMin(const LHS &L, const RHS &R) {
   return MaxMin_match<FCmpInst, LHS, RHS, ufmin_pred_ty>(L, R);
 }

+/// Match an 'ordered' or 'unordered' floating point maximum function.
+/// Floating point has one special value 'NaN'. Therefore, there is no total
+/// order. However, if we can ignore the 'NaN' value (for example, because of a
+/// 'no-nans-float-math' flag) a combination of a fcmp and select has 'maximum'
+/// semantics.
+template <typename LHS, typename RHS>
+inline match_combine_or<MaxMin_match<FCmpInst, LHS, RHS, ofmax_pred_ty>,
+                        MaxMin_match<FCmpInst, LHS, RHS, ufmax_pred_ty>>
+m_OrdOrUnordFMax(const LHS &L, const RHS &R) {
+  return m_CombineOr(MaxMin_match<FCmpInst, LHS, RHS, ofmax_pred_ty>(L, R),
+                     MaxMin_match<FCmpInst, LHS, RHS, ufmax_pred_ty>(L, R));
+}
+
+/// Match an 'ordered' or 'unordered' floating point minimum function.
+/// Floating point has one special value 'NaN'. Therefore, there is no total
+/// order. However, if we can ignore the 'NaN' value (for example, because of a
+/// 'no-nans-float-math' flag) a combination of a fcmp and select has 'minimum'
+/// semantics.
+template <typename LHS, typename RHS>
+inline match_combine_or<MaxMin_match<FCmpInst, LHS, RHS, ofmin_pred_ty>,
+                        MaxMin_match<FCmpInst, LHS, RHS, ufmin_pred_ty>>
+m_OrdOrUnordFMin(const LHS &L, const RHS &R) {
+  return m_CombineOr(MaxMin_match<FCmpInst, LHS, RHS, ofmin_pred_ty>(L, R),
+                     MaxMin_match<FCmpInst, LHS, RHS, ufmin_pred_ty>(L, R));
+}
+
 /// Matches a 'Not' as 'xor V, -1' or 'xor -1, V'.
 /// NOTE: we first match the 'Not' (by matching '-1'),
 /// and only then match the inner matcher!
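As a concrete illustration of what the new matchers accept (a sketch only;
the function name below is hypothetical and not part of this change), an
unordered compare that previously escaped the fold now becomes the
intrinsic when nnan and nsz are present:

```llvm
; (a ugt b) ? a : b -> maxnum(a, b), the first new maxnum pattern above.
define float @max_ugt_example(float %a, float %b) {
  %cmp = fcmp nnan nsz ugt float %a, %b
  %sel = select nnan nsz i1 %cmp, float %a, float %b
  ret float %sel
}
; After InstCombine, the select is replaced by:
;   %sel = call nnan nsz float @llvm.maxnum.f32(float %a, float %b)
```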
diff --git a/llvm/lib/Analysis/IVDescriptors.cpp b/llvm/lib/Analysis/IVDescriptors.cpp
index 53001421ce6f..76cde01782bb 100644
--- a/llvm/lib/Analysis/IVDescriptors.cpp
+++ b/llvm/lib/Analysis/IVDescriptors.cpp
@@ -690,13 +690,9 @@ RecurrenceDescriptor::isMinMaxPattern(Instruction *I, RecurKind Kind,
     return InstDesc(Kind == RecurKind::SMax, I);
   if (match(I, m_SMin(m_Value(), m_Value())))
     return InstDesc(Kind == RecurKind::SMin, I);
-  if (match(I, m_OrdFMin(m_Value(), m_Value())))
+  if (match(I, m_OrdOrUnordFMin(m_Value(), m_Value())))
     return InstDesc(Kind == RecurKind::FMin, I);
-  if (match(I, m_OrdFMax(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::FMax, I);
-  if (match(I, m_UnordFMin(m_Value(), m_Value())))
-    return InstDesc(Kind == RecurKind::FMin, I);
-  if (match(I, m_UnordFMax(m_Value(), m_Value())))
+  if (match(I, m_OrdOrUnordFMax(m_Value(), m_Value())))
     return InstDesc(Kind == RecurKind::FMax, I);
   if (match(I, m_Intrinsic<Intrinsic::minnum>(m_Value(), m_Value())))
     return InstDesc(Kind == RecurKind::FMin, I);
diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp
index c71d17011d7a..eb8d17044a17 100644
--- a/llvm/lib/Analysis/ValueTracking.cpp
+++ b/llvm/lib/Analysis/ValueTracking.cpp
@@ -8248,9 +8248,7 @@ static SelectPatternResult matchFastFloatClamp(CmpInst::Predicate Pred,
   case CmpInst::FCMP_OLE:
   case CmpInst::FCMP_ULT:
   case CmpInst::FCMP_ULE:
-    if (match(FalseVal,
-              m_CombineOr(m_OrdFMin(m_Specific(CmpLHS), m_APFloat(FC2)),
-                          m_UnordFMin(m_Specific(CmpLHS), m_APFloat(FC2)))) &&
+    if (match(FalseVal, m_OrdOrUnordFMin(m_Specific(CmpLHS), m_APFloat(FC2))) &&
         *FC1 < *FC2)
       return {SPF_FMAXNUM, SPNB_RETURNS_ANY, false};
     break;
@@ -8258,9 +8256,7 @@
   case CmpInst::FCMP_OGT:
   case CmpInst::FCMP_OGE:
   case CmpInst::FCMP_UGT:
   case CmpInst::FCMP_UGE:
-    if (match(FalseVal,
-              m_CombineOr(m_OrdFMax(m_Specific(CmpLHS), m_APFloat(FC2)),
-                          m_UnordFMax(m_Specific(CmpLHS), m_APFloat(FC2)))) &&
+    if (match(FalseVal, m_OrdOrUnordFMax(m_Specific(CmpLHS), m_APFloat(FC2))) &&
         *FC1 > *FC2)
       return {SPF_FMINNUM, SPNB_RETURNS_ANY, false};
     break;
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
index 820d3608c8dc..8be2eeed84ad 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp
@@ -3837,11 +3837,11 @@ Instruction *InstCombinerImpl::visitSelectInst(SelectInst &SI) {
   // minnum/maxnum intrinsics.
   if (SIFPOp->hasNoNaNs() && SIFPOp->hasNoSignedZeros()) {
     Value *X, *Y;
-    if (match(&SI, m_OrdFMax(m_Value(X), m_Value(Y))))
+    if (match(&SI, m_OrdOrUnordFMax(m_Value(X), m_Value(Y))))
       return replaceInstUsesWith(
           SI, Builder.CreateBinaryIntrinsic(Intrinsic::maxnum, X, Y, &SI));

-    if (match(&SI, m_OrdFMin(m_Value(X), m_Value(Y))))
+    if (match(&SI, m_OrdOrUnordFMin(m_Value(X), m_Value(Y))))
       return replaceInstUsesWith(
           SI, Builder.CreateBinaryIntrinsic(Intrinsic::minnum, X, Y, &SI));
   }
diff --git a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll
index 1dd0b17e9f46..c6fee0914f0e 100644
--- a/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll
+++ b/llvm/test/Transforms/InstCombine/clamp-to-minmax.ll
@@ -67,10 +67,10 @@ define float @clamp_float_fast_ordered_nonstrict_minmax(float %x) {
 ; (X < C1) ?
C1 : MIN(X, C2) define float @clamp_float_fast_unordered_strict_maxmin(float %x) { ; CHECK-LABEL: @clamp_float_fast_unordered_strict_maxmin( -; CHECK-NEXT: [[CMP2_INV:%.*]] = fcmp fast oge float [[X:%.*]], 2.550000e+02 -; CHECK-NEXT: [[MIN:%.*]] = select fast i1 [[CMP2_INV]], float 2.550000e+02, float [[X]] -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.maxnum.f32(float [[MIN]], float 1.000000e+00) -; CHECK-NEXT: ret float [[R1]] +; CHECK-NEXT: [[MIN:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) +; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast ult float [[X]], 1.000000e+00 +; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP1]], float 1.000000e+00, float [[MIN]] +; CHECK-NEXT: ret float [[R]] ; %cmp2 = fcmp fast ult float %x, 255.0 %min = select i1 %cmp2, float %x, float 255.0 @@ -82,10 +82,10 @@ define float @clamp_float_fast_unordered_strict_maxmin(float %x) { ; (X <= C1) ? C1 : MIN(X, C2) define float @clamp_float_fast_unordered_nonstrict_maxmin(float %x) { ; CHECK-LABEL: @clamp_float_fast_unordered_nonstrict_maxmin( -; CHECK-NEXT: [[CMP2_INV:%.*]] = fcmp fast oge float [[X:%.*]], 2.550000e+02 -; CHECK-NEXT: [[MIN:%.*]] = select fast i1 [[CMP2_INV]], float 2.550000e+02, float [[X]] -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.maxnum.f32(float [[MIN]], float 1.000000e+00) -; CHECK-NEXT: ret float [[R1]] +; CHECK-NEXT: [[MIN:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) +; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast ule float [[X]], 1.000000e+00 +; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP1]], float 1.000000e+00, float [[MIN]] +; CHECK-NEXT: ret float [[R]] ; %cmp2 = fcmp fast ult float %x, 255.0 %min = select i1 %cmp2, float %x, float 255.0 @@ -97,10 +97,10 @@ define float @clamp_float_fast_unordered_nonstrict_maxmin(float %x) { ; (X > C1) ? C1 : MAX(X, C2) define float @clamp_float_fast_unordered_strict_minmax(float %x) { ; CHECK-LABEL: @clamp_float_fast_unordered_strict_minmax( -; CHECK-NEXT: [[CMP2_INV:%.*]] = fcmp fast ole float [[X:%.*]], 1.000000e+00 -; CHECK-NEXT: [[MAX:%.*]] = select fast i1 [[CMP2_INV]], float 1.000000e+00, float [[X]] -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.minnum.f32(float [[MAX]], float 2.550000e+02) -; CHECK-NEXT: ret float [[R1]] +; CHECK-NEXT: [[MAX:%.*]] = call fast float @llvm.maxnum.f32(float [[X:%.*]], float 1.000000e+00) +; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast ugt float [[X]], 2.550000e+02 +; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP1]], float 2.550000e+02, float [[MAX]] +; CHECK-NEXT: ret float [[R]] ; %cmp2 = fcmp fast ugt float %x, 1.0 %max = select i1 %cmp2, float %x, float 1.0 @@ -112,10 +112,10 @@ define float @clamp_float_fast_unordered_strict_minmax(float %x) { ; (X >= C1) ? 
C1 : MAX(X, C2) define float @clamp_float_fast_unordered_nonstrict_minmax(float %x) { ; CHECK-LABEL: @clamp_float_fast_unordered_nonstrict_minmax( -; CHECK-NEXT: [[CMP2_INV:%.*]] = fcmp fast ole float [[X:%.*]], 1.000000e+00 -; CHECK-NEXT: [[MAX:%.*]] = select fast i1 [[CMP2_INV]], float 1.000000e+00, float [[X]] -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.minnum.f32(float [[MAX]], float 2.550000e+02) -; CHECK-NEXT: ret float [[R1]] +; CHECK-NEXT: [[MAX:%.*]] = call fast float @llvm.maxnum.f32(float [[X:%.*]], float 1.000000e+00) +; CHECK-NEXT: [[CMP1:%.*]] = fcmp fast uge float [[X]], 2.550000e+02 +; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP1]], float 2.550000e+02, float [[MAX]] +; CHECK-NEXT: ret float [[R]] ; %cmp2 = fcmp fast ugt float %x, 1.0 %max = select i1 %cmp2, float %x, float 1.0 @@ -127,13 +127,12 @@ define float @clamp_float_fast_unordered_nonstrict_minmax(float %x) { ; Some more checks with fast ; (X > 1.0) ? min(x, 255.0) : 1.0 -; That did not match because select was in inverse order. define float @clamp_test_1(float %x) { ; CHECK-LABEL: @clamp_test_1( -; CHECK-NEXT: [[INNER_CMP_INV:%.*]] = fcmp fast oge float [[X:%.*]], 2.550000e+02 -; CHECK-NEXT: [[INNER_SEL:%.*]] = select fast i1 [[INNER_CMP_INV]], float 2.550000e+02, float [[X]] -; CHECK-NEXT: [[R1:%.*]] = call fast float @llvm.maxnum.f32(float [[INNER_SEL]], float 1.000000e+00) -; CHECK-NEXT: ret float [[R1]] +; CHECK-NEXT: [[INNER_SEL:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) +; CHECK-NEXT: [[OUTER_CMP:%.*]] = fcmp fast ugt float [[X]], 1.000000e+00 +; CHECK-NEXT: [[R:%.*]] = select i1 [[OUTER_CMP]], float [[INNER_SEL]], float 1.000000e+00 +; CHECK-NEXT: ret float [[R]] ; %inner_cmp = fcmp fast ult float %x, 255.0 %inner_sel = select i1 %inner_cmp, float %x, float 255.0 @@ -147,8 +146,7 @@ define float @clamp_test_1(float %x) { ; Like @clamp_test_1 but HighConst < LowConst define float @clamp_negative_wrong_const(float %x) { ; CHECK-LABEL: @clamp_negative_wrong_const( -; CHECK-NEXT: [[INNER_CMP_INV:%.*]] = fcmp fast oge float [[X:%.*]], 2.550000e+02 -; CHECK-NEXT: [[INNER_SEL:%.*]] = select fast i1 [[INNER_CMP_INV]], float 2.550000e+02, float [[X]] +; CHECK-NEXT: [[INNER_SEL:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) ; CHECK-NEXT: [[OUTER_CMP:%.*]] = fcmp fast ugt float [[X]], 5.120000e+02 ; CHECK-NEXT: [[R:%.*]] = select i1 [[OUTER_CMP]], float [[INNER_SEL]], float 5.120000e+02 ; CHECK-NEXT: ret float [[R]] @@ -163,8 +161,7 @@ define float @clamp_negative_wrong_const(float %x) { ; Like @clamp_test_1 but both are min define float @clamp_negative_same_op(float %x) { ; CHECK-LABEL: @clamp_negative_same_op( -; CHECK-NEXT: [[INNER_CMP_INV:%.*]] = fcmp fast oge float [[X:%.*]], 2.550000e+02 -; CHECK-NEXT: [[INNER_SEL:%.*]] = select fast i1 [[INNER_CMP_INV]], float 2.550000e+02, float [[X]] +; CHECK-NEXT: [[INNER_SEL:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 2.550000e+02) ; CHECK-NEXT: [[OUTER_CMP:%.*]] = fcmp fast ult float [[X]], 1.000000e+00 ; CHECK-NEXT: [[R:%.*]] = select i1 [[OUTER_CMP]], float [[INNER_SEL]], float 1.000000e+00 ; CHECK-NEXT: ret float [[R]] diff --git a/llvm/test/Transforms/InstCombine/minmax-fold.ll b/llvm/test/Transforms/InstCombine/minmax-fold.ll index 26cd4996e687..ec1c7aff4096 100644 --- a/llvm/test/Transforms/InstCombine/minmax-fold.ll +++ b/llvm/test/Transforms/InstCombine/minmax-fold.ll @@ -852,10 +852,8 @@ define i32 @common_factor_umax_extra_use_both(i32 %a, i32 %b, i32 %c) { define float 
@not_min_of_min(i8 %i, float %x) { ; CHECK-LABEL: @not_min_of_min( -; CHECK-NEXT: [[CMP1_INV:%.*]] = fcmp fast oge float [[X:%.*]], 1.000000e+00 -; CHECK-NEXT: [[MIN1:%.*]] = select fast i1 [[CMP1_INV]], float 1.000000e+00, float [[X]] -; CHECK-NEXT: [[CMP2_INV:%.*]] = fcmp fast oge float [[X]], 2.000000e+00 -; CHECK-NEXT: [[MIN2:%.*]] = select fast i1 [[CMP2_INV]], float 2.000000e+00, float [[X]] +; CHECK-NEXT: [[MIN1:%.*]] = call fast float @llvm.minnum.f32(float [[X:%.*]], float 1.000000e+00) +; CHECK-NEXT: [[MIN2:%.*]] = call fast float @llvm.minnum.f32(float [[X]], float 2.000000e+00) ; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i8 [[I:%.*]], 16 ; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP3]], float [[MIN1]], float [[MIN2]] ; CHECK-NEXT: ret float [[R]] diff --git a/llvm/test/Transforms/InstCombine/minmax-fp.ll b/llvm/test/Transforms/InstCombine/minmax-fp.ll index b9e46caa6375..1276b7b3e386 100644 --- a/llvm/test/Transforms/InstCombine/minmax-fp.ll +++ b/llvm/test/Transforms/InstCombine/minmax-fp.ll @@ -160,8 +160,7 @@ define i8 @t9(float %a) { ; Either operand could be NaN, but fast modifier applied. define i8 @t11(float %a, float %b) { ; CHECK-LABEL: @t11( -; CHECK-NEXT: [[DOTINV:%.*]] = fcmp fast oge float [[B:%.*]], [[A:%.*]] -; CHECK-NEXT: [[DOTV:%.*]] = select fast i1 [[DOTINV]], float [[A]], float [[B]] +; CHECK-NEXT: [[DOTV:%.*]] = call fast float @llvm.minnum.f32(float [[B:%.*]], float [[A:%.*]]) ; CHECK-NEXT: [[TMP1:%.*]] = fptosi float [[DOTV]] to i8 ; CHECK-NEXT: ret i8 [[TMP1]] ; @@ -282,8 +281,7 @@ define float @fneg_fmax(float %x, float %y) { define <2 x float> @fsub_fmax(<2 x float> %x, <2 x float> %y) { ; CHECK-LABEL: @fsub_fmax( -; CHECK-NEXT: [[COND_INV:%.*]] = fcmp nnan nsz ogt <2 x float> [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[MAX_V:%.*]] = select nnan nsz <2 x i1> [[COND_INV]], <2 x float> [[Y]], <2 x float> [[X]] +; CHECK-NEXT: [[MAX_V:%.*]] = call nnan nsz <2 x float> @llvm.minnum.v2f32(<2 x float> [[X:%.*]], <2 x float> [[Y:%.*]]) ; CHECK-NEXT: [[MAX:%.*]] = fneg <2 x float> [[MAX_V]] ; CHECK-NEXT: ret <2 x float> [[MAX]] ; @@ -310,8 +308,7 @@ define <2 x double> @fsub_fmin(<2 x double> %x, <2 x double> %y) { define double @fneg_fmin(double %x, double %y) { ; CHECK-LABEL: @fneg_fmin( -; CHECK-NEXT: [[COND_INV:%.*]] = fcmp nnan nsz olt double [[X:%.*]], [[Y:%.*]] -; CHECK-NEXT: [[MAX_V:%.*]] = select nnan nsz i1 [[COND_INV]], double [[Y]], double [[X]] +; CHECK-NEXT: [[MAX_V:%.*]] = call nnan nsz double @llvm.maxnum.f64(double [[X:%.*]], double [[Y:%.*]]) ; CHECK-NEXT: [[MAX:%.*]] = fneg double [[MAX_V]] ; CHECK-NEXT: ret double [[MAX]] ; diff --git a/llvm/test/Transforms/InstCombine/unordered-fcmp-select.ll b/llvm/test/Transforms/InstCombine/unordered-fcmp-select.ll index 62c12c15a075..b164dd983a89 100644 --- a/llvm/test/Transforms/InstCombine/unordered-fcmp-select.ll +++ b/llvm/test/Transforms/InstCombine/unordered-fcmp-select.ll @@ -115,7 +115,7 @@ define float @select_max_ugt_2_use_cmp(float %a, float %b) { ; CHECK-LABEL: @select_max_ugt_2_use_cmp( ; CHECK-NEXT: [[CMP:%.*]] = fcmp reassoc ugt float [[A:%.*]], [[B:%.*]] ; CHECK-NEXT: call void @foo(i1 [[CMP]]) -; CHECK-NEXT: [[SEL:%.*]] = select fast i1 [[CMP]], float [[A]], float [[B]] +; CHECK-NEXT: [[SEL:%.*]] = call fast float @llvm.maxnum.f32(float [[A]], float [[B]]) ; CHECK-NEXT: ret float [[SEL]] ; %cmp = fcmp reassoc ugt float %a, %b diff --git a/llvm/unittests/IR/PatternMatch.cpp b/llvm/unittests/IR/PatternMatch.cpp index 7dc4b9f448d3..367ba6ab52a5 100644 --- a/llvm/unittests/IR/PatternMatch.cpp +++ 
b/llvm/unittests/IR/PatternMatch.cpp @@ -1040,6 +1040,140 @@ TEST_F(PatternMatchTest, FloatingPointUnorderedMax) { EXPECT_EQ(R, MatchR); } +TEST_F(PatternMatchTest, FloatingPointMin) { + Type *FltTy = IRB.getFloatTy(); + Value *L = ConstantFP::get(FltTy, 1.0); + Value *R = ConstantFP::get(FltTy, 2.0); + Value *MatchL, *MatchR; + + // Test OLT. + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOLT(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test OLE. + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOLE(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test ULT. + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpULT(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test ULE. + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpULE(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test no match on OGE. + EXPECT_FALSE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOGE(L, R), L, R))); + + // Test no match on OGT. + EXPECT_FALSE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOGT(L, R), L, R))); + + // Test no match on UGE. + EXPECT_FALSE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpUGE(L, R), L, R))); + + // Test no match on UGT. + EXPECT_FALSE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpUGT(L, R), L, R))); + + // Test inverted selects. Note, that this "inverts" the ordering, e.g.: + // %cmp = fcmp oge L, R + // %min = select %cmp R, L + + // [OU]GE with inverted select. + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOGE(L, R), R, L))); + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpUGE(L, R), R, L))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // [OU]GT with inverted select. + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOGT(L, R), R, L))); + EXPECT_TRUE(m_OrdOrUnordFMin(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpUGT(L, R), R, L))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); +} + +TEST_F(PatternMatchTest, FloatingPointMax) { + Type *FltTy = IRB.getFloatTy(); + Value *L = ConstantFP::get(FltTy, 1.0); + Value *R = ConstantFP::get(FltTy, 2.0); + Value *MatchL, *MatchR; + + // Test OGT. + EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOGT(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test OGE. + EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpOGE(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test UGT. + EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpUGT(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test UGE. + EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR)) + .match(IRB.CreateSelect(IRB.CreateFCmpUGE(L, R), L, R))); + EXPECT_EQ(L, MatchL); + EXPECT_EQ(R, MatchR); + + // Test no match on OLE. 
+  EXPECT_FALSE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR))
+                   .match(IRB.CreateSelect(IRB.CreateFCmpOLE(L, R), L, R)));
+
+  // Test no match on OLT.
+  EXPECT_FALSE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR))
+                   .match(IRB.CreateSelect(IRB.CreateFCmpOLT(L, R), L, R)));
+
+  // Test no match on ULE.
+  EXPECT_FALSE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR))
+                   .match(IRB.CreateSelect(IRB.CreateFCmpULE(L, R), L, R)));
+
+  // Test no match on ULT.
+  EXPECT_FALSE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR))
+                   .match(IRB.CreateSelect(IRB.CreateFCmpULT(L, R), L, R)));
+
+  // Test inverted selects. Note, that this "inverts" the ordering, e.g.:
+  // %cmp = fcmp ole L, R
+  // %max = select %cmp, R, L
+
+  // [OU]LE with inverted select.
+  EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR))
+                  .match(IRB.CreateSelect(IRB.CreateFCmpOLE(L, R), R, L)));
+  EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR))
+                  .match(IRB.CreateSelect(IRB.CreateFCmpULE(L, R), R, L)));
+  EXPECT_EQ(L, MatchL);
+  EXPECT_EQ(R, MatchR);
+
+  // [OU]LT with inverted select.
+  EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR))
+                  .match(IRB.CreateSelect(IRB.CreateFCmpOLT(L, R), R, L)));
+  EXPECT_TRUE(m_OrdOrUnordFMax(m_Value(MatchL), m_Value(MatchR))
+                  .match(IRB.CreateSelect(IRB.CreateFCmpULT(L, R), R, L)));
+  EXPECT_EQ(L, MatchL);
+  EXPECT_EQ(R, MatchR);
+}
+
 TEST_F(PatternMatchTest, OverflowingBinOps) {
   Value *L = IRB.getInt32(1);
   Value *R = IRB.getInt32(2);
-- 
GitLab


From 7e72e5ba86e59d77dccd1db472b78637a8dc1cbc Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?=
 =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?=
 =?UTF-8?q?=E3=83=B3=29?=
Date: Tue, 15 Oct 2024 11:12:03 -0700
Subject: [PATCH 009/329] Reland '[flang][cuda] Add cuf.register_kernel
 operation' (#112389)

The operation will be used in the CUF constructor to register the
kernel functions. This allows delaying the registration until codegen,
when the gpu.binary will be available.

Reland of #112268 with correct shared library build support.
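
For illustration, a minimal sketch of how a CUF constructor could use
the new operation (the module and kernel names below are made up for
the example, they are not taken from this patch):

  llvm.func internal @__cudaFortranConstructor() {
    // The symbol is verified against the enclosing gpu.module; the
    // actual registration happens at codegen, once the gpu.binary is
    // available.
    cuf.register_kernel @device_mod::@kernel_sub
    llvm.return
  }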
---
 .../flang/Optimizer/Dialect/CUF/CUFOps.td     | 19 +++++++
 .../lib/Optimizer/Dialect/CUF/CMakeLists.txt  |  1 +
 flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp    | 37 ++++++++++++++
 flang/test/Fir/CUDA/cuda-register-func.fir    | 20 ++++++++
 flang/test/Fir/cuf-invalid.fir                | 50 +++++++++++++++++++
 flang/tools/fir-opt/fir-opt.cpp               |  1 +
 6 files changed, 128 insertions(+)
 create mode 100644 flang/test/Fir/CUDA/cuda-register-func.fir

diff --git a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
index f643674f1d5d..98d1ef529738 100644
--- a/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
+++ b/flang/include/flang/Optimizer/Dialect/CUF/CUFOps.td
@@ -288,4 +288,23 @@ def cuf_KernelOp : cuf_Op<"kernel", [AttrSizedOperandSegments,
   let hasVerifier = 1;
 }
 
+def cuf_RegisterKernelOp : cuf_Op<"register_kernel", []> {
+  let summary = "Register a CUDA kernel";
+
+  let arguments = (ins
+    SymbolRefAttr:$name
+  );
+
+  let assemblyFormat = [{
+    $name attr-dict
+  }];
+
+  let hasVerifier = 1;
+
+  let extraClassDeclaration = [{
+    mlir::StringAttr getKernelName();
+    mlir::StringAttr getKernelModuleName();
+  }];
+}
+
 #endif // FORTRAN_DIALECT_CUF_CUF_OPS
diff --git a/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt
index 83d468bafdfe..b2221199995d 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt
+++ b/flang/lib/Optimizer/Dialect/CUF/CMakeLists.txt
@@ -14,6 +14,7 @@ add_flang_library(CUFDialect
   FIRDialect
   FIRDialectSupport
   MLIRIR
+  MLIRGPUDialect
   MLIRTargetLLVMIRExport
 
   LINK_COMPONENTS
diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
index 7fb2dcf4af11..9e3bbd1f9cbe 100644
--- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
+++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp
@@ -15,6 +15,7 @@
 #include "flang/Optimizer/Dialect/CUF/CUFDialect.h"
 #include "flang/Optimizer/Dialect/FIRAttr.h"
 #include "flang/Optimizer/Dialect/FIRType.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/IR/Attributes.h"
 #include "mlir/IR/BuiltinAttributes.h"
 #include "mlir/IR/BuiltinOps.h"
@@ -253,6 +254,42 @@ llvm::LogicalResult cuf::KernelOp::verify() {
   return mlir::success();
 }
 
+//===----------------------------------------------------------------------===//
+// RegisterKernelOp
+//===----------------------------------------------------------------------===//
+
+mlir::StringAttr cuf::RegisterKernelOp::getKernelModuleName() {
+  return getName().getRootReference();
+}
+
+mlir::StringAttr cuf::RegisterKernelOp::getKernelName() {
+  return getName().getLeafReference();
+}
+
+mlir::LogicalResult cuf::RegisterKernelOp::verify() {
+  if (getKernelName() == getKernelModuleName())
+    return emitOpError("expect a module and a kernel name");
+
+  auto mod = getOperation()->getParentOfType<mlir::ModuleOp>();
+  if (!mod)
+    return emitOpError("expect to be in a module");
+
+  mlir::SymbolTable symTab(mod);
+  auto gpuMod = symTab.lookup<mlir::gpu::GPUModuleOp>(getKernelModuleName());
+  if (!gpuMod)
+    return emitOpError("gpu module not found");
+
+  mlir::SymbolTable gpuSymTab(gpuMod);
+  auto func = gpuSymTab.lookup<mlir::gpu::GPUFuncOp>(getKernelName());
+  if (!func)
+    return emitOpError("device function not found");
+
+  if (!func.isKernel())
+    return emitOpError("only kernel gpu.func can be registered");
+
+  return mlir::success();
+}
+
 // Tablegen operators
 #define GET_OP_CLASSES
diff --git a/flang/test/Fir/CUDA/cuda-register-func.fir b/flang/test/Fir/CUDA/cuda-register-func.fir
new file mode 100644
index 000000000000..a428f68eb3bf
--- /dev/null
+++ 
b/flang/test/Fir/CUDA/cuda-register-func.fir @@ -0,0 +1,20 @@ +// RUN: fir-opt %s | FileCheck %s + +module attributes {gpu.container_module} { + gpu.module @cuda_device_mod { + gpu.func @_QPsub_device1() kernel { + gpu.return + } + gpu.func @_QPsub_device2(%arg0: !fir.ref) kernel { + gpu.return + } + } + llvm.func internal @__cudaFortranConstructor() { + cuf.register_kernel @cuda_device_mod::@_QPsub_device1 + cuf.register_kernel @cuda_device_mod::@_QPsub_device2 + llvm.return + } +} + +// CHECK: cuf.register_kernel @cuda_device_mod::@_QPsub_device1 +// CHECK: cuf.register_kernel @cuda_device_mod::@_QPsub_device2 diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir index e9aeaa281e2a..a5747b8ee4a3 100644 --- a/flang/test/Fir/cuf-invalid.fir +++ b/flang/test/Fir/cuf-invalid.fir @@ -125,3 +125,53 @@ func.func @_QPsub1(%arg0: !fir.ref> {cuf.data_attr = #cuf.cuda cuf.data_transfer %20#0 to %11#0, %19 : !fir.shape<1> {transfer_kind = #cuf.cuda_transfer} : !fir.box>, !fir.box> return } + +// ----- + +module attributes {gpu.container_module} { + gpu.module @cuda_device_mod { + gpu.func @_QPsub_device1() { + gpu.return + } + } + llvm.func internal @__cudaFortranConstructor() { + // expected-error@+1{{'cuf.register_kernel' op only kernel gpu.func can be registered}} + cuf.register_kernel @cuda_device_mod::@_QPsub_device1 + llvm.return + } +} + +// ----- + +module attributes {gpu.container_module} { + gpu.module @cuda_device_mod { + gpu.func @_QPsub_device1() { + gpu.return + } + } + llvm.func internal @__cudaFortranConstructor() { + // expected-error@+1{{'cuf.register_kernel' op device function not found}} + cuf.register_kernel @cuda_device_mod::@_QPsub_device2 + llvm.return + } +} + +// ----- + +module attributes {gpu.container_module} { + llvm.func internal @__cudaFortranConstructor() { + // expected-error@+1{{'cuf.register_kernel' op gpu module not found}} + cuf.register_kernel @cuda_device_mod::@_QPsub_device1 + llvm.return + } +} + +// ----- + +module attributes {gpu.container_module} { + llvm.func internal @__cudaFortranConstructor() { + // expected-error@+1{{'cuf.register_kernel' op expect a module and a kernel name}} + cuf.register_kernel @_QPsub_device1 + llvm.return + } +} diff --git a/flang/tools/fir-opt/fir-opt.cpp b/flang/tools/fir-opt/fir-opt.cpp index f75fba27c68f..84a74770cf03 100644 --- a/flang/tools/fir-opt/fir-opt.cpp +++ b/flang/tools/fir-opt/fir-opt.cpp @@ -42,6 +42,7 @@ int main(int argc, char **argv) { #endif DialectRegistry registry; fir::support::registerDialects(registry); + registry.insert(); fir::support::addFIRExtensions(registry); return failed(MlirOptMain(argc, argv, "FIR modular optimizer driver\n", registry)); -- GitLab From 87db0c06013412dd34953b0aaa3c3c02e45bd571 Mon Sep 17 00:00:00 2001 From: Michael Jones Date: Tue, 15 Oct 2024 11:18:38 -0700 Subject: [PATCH 010/329] [libc] Add bigint casting between word types (#111914) Previously you could cast between bigints with different numbers of bits, but only if they had the same underlying type. 
This patch adds the ability to cast between bigints with different underlying types, which is needed for #110894 --- libc/src/__support/big_int.h | 99 ++++++++++++++-- libc/test/src/__support/big_int_test.cpp | 142 ++++++++++++++++++++++- 2 files changed, 229 insertions(+), 12 deletions(-) diff --git a/libc/src/__support/big_int.h b/libc/src/__support/big_int.h index 681782d57319..246b89f08f2f 100644 --- a/libc/src/__support/big_int.h +++ b/libc/src/__support/big_int.h @@ -14,7 +14,7 @@ #include "src/__support/CPP/limits.h" #include "src/__support/CPP/optional.h" #include "src/__support/CPP/type_traits.h" -#include "src/__support/macros/attributes.h" // LIBC_INLINE +#include "src/__support/macros/attributes.h" // LIBC_INLINE #include "src/__support/macros/config.h" #include "src/__support/macros/optimization.h" // LIBC_UNLIKELY #include "src/__support/macros/properties/compiler.h" // LIBC_COMPILER_IS_CLANG @@ -361,17 +361,94 @@ public: LIBC_INLINE constexpr BigInt(const BigInt &other) = default; - template + template LIBC_INLINE constexpr BigInt( - const BigInt &other) { - if (OtherBits >= Bits) { // truncate - for (size_t i = 0; i < WORD_COUNT; ++i) - val[i] = other[i]; - } else { // zero or sign extend - size_t i = 0; - for (; i < OtherBits / WORD_SIZE; ++i) - val[i] = other[i]; - extend(i, Signed && other.is_neg()); + const BigInt &other) { + using BigIntOther = BigInt; + const bool should_sign_extend = Signed && other.is_neg(); + + static_assert(!(Bits == OtherBits && WORD_SIZE != BigIntOther::WORD_SIZE) && + "This is currently untested for casting between bigints with " + "the same bit width but different word sizes."); + + if constexpr (BigIntOther::WORD_SIZE < WORD_SIZE) { + // OtherWordType is smaller + constexpr size_t WORD_SIZE_RATIO = WORD_SIZE / BigIntOther::WORD_SIZE; + static_assert( + (WORD_SIZE % BigIntOther::WORD_SIZE) == 0 && + "Word types must be multiples of each other for correct conversion."); + if constexpr (OtherBits >= Bits) { // truncate + // for each big word + for (size_t i = 0; i < WORD_COUNT; ++i) { + WordType cur_word = 0; + // combine WORD_SIZE_RATIO small words into a big word + for (size_t j = 0; j < WORD_SIZE_RATIO; ++j) + cur_word |= static_cast(other[(i * WORD_SIZE_RATIO) + j]) + << (BigIntOther::WORD_SIZE * j); + + val[i] = cur_word; + } + } else { // zero or sign extend + size_t i = 0; + WordType cur_word = 0; + // for each small word + for (; i < BigIntOther::WORD_COUNT; ++i) { + // combine WORD_SIZE_RATIO small words into a big word + cur_word |= static_cast(other[i]) + << (BigIntOther::WORD_SIZE * (i % WORD_SIZE_RATIO)); + // if we've completed a big word, copy it into place and reset + if ((i % WORD_SIZE_RATIO) == WORD_SIZE_RATIO - 1) { + val[i / WORD_SIZE_RATIO] = cur_word; + cur_word = 0; + } + } + // Pretend there are extra words of the correct sign extension as needed + + const WordType extension_bits = + should_sign_extend ? cpp::numeric_limits::max() + : cpp::numeric_limits::min(); + if ((i % WORD_SIZE_RATIO) != 0) { + cur_word |= static_cast(extension_bits) + << (BigIntOther::WORD_SIZE * (i % WORD_SIZE_RATIO)); + } + // Copy the last word into place. 
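+      // (E.g., when casting BigInt<96, true, uint32_t> to
+      // BigInt<128, true, uint64_t>, the leftover third 32-bit word fills
+      // the low half of the second 64-bit word and the extension bits
+      // fill its high half before it is stored below.)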
+ val[(i / WORD_SIZE_RATIO)] = cur_word; + extend((i / WORD_SIZE_RATIO) + 1, should_sign_extend); + } + } else if constexpr (BigIntOther::WORD_SIZE == WORD_SIZE) { + if constexpr (OtherBits >= Bits) { // truncate + for (size_t i = 0; i < WORD_COUNT; ++i) + val[i] = other[i]; + } else { // zero or sign extend + size_t i = 0; + for (; i < BigIntOther::WORD_COUNT; ++i) + val[i] = other[i]; + extend(i, should_sign_extend); + } + } else { + // OtherWordType is bigger. + constexpr size_t WORD_SIZE_RATIO = BigIntOther::WORD_SIZE / WORD_SIZE; + static_assert( + (BigIntOther::WORD_SIZE % WORD_SIZE) == 0 && + "Word types must be multiples of each other for correct conversion."); + if constexpr (OtherBits >= Bits) { // truncate + // for each small word + for (size_t i = 0; i < WORD_COUNT; ++i) { + // split each big word into WORD_SIZE_RATIO small words + val[i] = static_cast(other[i / WORD_SIZE_RATIO] >> + ((i % WORD_SIZE_RATIO) * WORD_SIZE)); + } + } else { // zero or sign extend + size_t i = 0; + // for each big word + for (; i < BigIntOther::WORD_COUNT; ++i) { + // split each big word into WORD_SIZE_RATIO small words + for (size_t j = 0; j < WORD_SIZE_RATIO; ++j) + val[(i * WORD_SIZE_RATIO) + j] = + static_cast(other[i] >> (j * WORD_SIZE)); + } + extend(i * WORD_SIZE_RATIO, should_sign_extend); + } } } diff --git a/libc/test/src/__support/big_int_test.cpp b/libc/test/src/__support/big_int_test.cpp index a1ce69baaae2..471ca72a8f6e 100644 --- a/libc/test/src/__support/big_int_test.cpp +++ b/libc/test/src/__support/big_int_test.cpp @@ -8,7 +8,7 @@ #include "src/__support/CPP/optional.h" #include "src/__support/big_int.h" -#include "src/__support/integer_literals.h" // parse_unsigned_bigint +#include "src/__support/integer_literals.h" // parse_unsigned_bigint #include "src/__support/macros/config.h" #include "src/__support/macros/properties/types.h" // LIBC_TYPES_HAS_INT128 @@ -208,6 +208,7 @@ TYPED_TEST(LlvmLibcUIntClassTest, CountBits, Types) { } using LL_UInt16 = UInt<16>; +using LL_UInt32 = UInt<32>; using LL_UInt64 = UInt<64>; // We want to test UInt<128> explicitly. So, for // convenience, we use a sugar which does not conflict with the UInt128 type @@ -927,4 +928,143 @@ TEST(LlvmLibcUIntClassTest, OtherWordTypeTests) { ASSERT_EQ(static_cast(a >> 64), 1); } +TEST(LlvmLibcUIntClassTest, OtherWordTypeCastTests) { + using LL_UInt96 = BigInt<96, false, uint32_t>; + + LL_UInt96 a({123, 456, 789}); + + ASSERT_EQ(static_cast(a), 123); + ASSERT_EQ(static_cast(a >> 32), 456); + ASSERT_EQ(static_cast(a >> 64), 789); + + // Bigger word with more bits to smaller word with less bits. + LL_UInt128 b(a); + + ASSERT_EQ(static_cast(b), 123); + ASSERT_EQ(static_cast(b >> 32), 456); + ASSERT_EQ(static_cast(b >> 64), 789); + ASSERT_EQ(static_cast(b >> 96), 0); + + b = (b << 32) + 987; + + ASSERT_EQ(static_cast(b), 987); + ASSERT_EQ(static_cast(b >> 32), 123); + ASSERT_EQ(static_cast(b >> 64), 456); + ASSERT_EQ(static_cast(b >> 96), 789); + + // Smaller word with less bits to bigger word with more bits. 
+ LL_UInt96 c(b); + + ASSERT_EQ(static_cast(c), 987); + ASSERT_EQ(static_cast(c >> 32), 123); + ASSERT_EQ(static_cast(c >> 64), 456); + + // Smaller word with more bits to bigger word with less bits + LL_UInt64 d(c); + + ASSERT_EQ(static_cast(d), 987); + ASSERT_EQ(static_cast(d >> 32), 123); + + // Bigger word with less bits to smaller word with more bits + + LL_UInt96 e(d); + + ASSERT_EQ(static_cast(e), 987); + ASSERT_EQ(static_cast(e >> 32), 123); + + e = (e << 32) + 654; + + ASSERT_EQ(static_cast(e), 654); + ASSERT_EQ(static_cast(e >> 32), 987); + ASSERT_EQ(static_cast(e >> 64), 123); +} + +TEST(LlvmLibcUIntClassTest, SignedOtherWordTypeCastTests) { + using LL_Int64 = BigInt<64, true, uint64_t>; + using LL_Int96 = BigInt<96, true, uint32_t>; + + LL_Int64 zero_64(0); + LL_Int96 zero_96(0); + LL_Int192 zero_192(0); + + LL_Int96 plus_a({0x1234, 0x5678, 0x9ABC}); + + ASSERT_EQ(static_cast(plus_a), 0x1234); + ASSERT_EQ(static_cast(plus_a >> 32), 0x5678); + ASSERT_EQ(static_cast(plus_a >> 64), 0x9ABC); + + LL_Int96 minus_a(-plus_a); + + // The reason that the numbers are inverted and not negated is that we're + // using two's complement. To negate a two's complement number you flip the + // bits and add 1, so minus_a is {~0x1234, ~0x5678, ~0x9ABC} + {1,0,0}. + ASSERT_EQ(static_cast(minus_a), (~0x1234) + 1); + ASSERT_EQ(static_cast(minus_a >> 32), ~0x5678); + ASSERT_EQ(static_cast(minus_a >> 64), ~0x9ABC); + + ASSERT_TRUE(plus_a + minus_a == zero_96); + + // 192 so there's an extra block to get sign extended to + LL_Int192 bigger_plus_a(plus_a); + + ASSERT_EQ(static_cast(bigger_plus_a), 0x1234); + ASSERT_EQ(static_cast(bigger_plus_a >> 32), 0x5678); + ASSERT_EQ(static_cast(bigger_plus_a >> 64), 0x9ABC); + ASSERT_EQ(static_cast(bigger_plus_a >> 96), 0); + ASSERT_EQ(static_cast(bigger_plus_a >> 128), 0); + ASSERT_EQ(static_cast(bigger_plus_a >> 160), 0); + + LL_Int192 bigger_minus_a(minus_a); + + ASSERT_EQ(static_cast(bigger_minus_a), (~0x1234) + 1); + ASSERT_EQ(static_cast(bigger_minus_a >> 32), ~0x5678); + ASSERT_EQ(static_cast(bigger_minus_a >> 64), ~0x9ABC); + ASSERT_EQ(static_cast(bigger_minus_a >> 96), ~0); + ASSERT_EQ(static_cast(bigger_minus_a >> 128), ~0); + ASSERT_EQ(static_cast(bigger_minus_a >> 160), ~0); + + ASSERT_TRUE(bigger_plus_a + bigger_minus_a == zero_192); + + LL_Int64 smaller_plus_a(plus_a); + + ASSERT_EQ(static_cast(smaller_plus_a), 0x1234); + ASSERT_EQ(static_cast(smaller_plus_a >> 32), 0x5678); + + LL_Int64 smaller_minus_a(minus_a); + + ASSERT_EQ(static_cast(smaller_minus_a), (~0x1234) + 1); + ASSERT_EQ(static_cast(smaller_minus_a >> 32), ~0x5678); + + ASSERT_TRUE(smaller_plus_a + smaller_minus_a == zero_64); + + // Also try going from bigger word size to smaller word size + LL_Int96 smaller_back_plus_a(smaller_plus_a); + + ASSERT_EQ(static_cast(smaller_back_plus_a), 0x1234); + ASSERT_EQ(static_cast(smaller_back_plus_a >> 32), 0x5678); + ASSERT_EQ(static_cast(smaller_back_plus_a >> 64), 0); + + LL_Int96 smaller_back_minus_a(smaller_minus_a); + + ASSERT_EQ(static_cast(smaller_back_minus_a), (~0x1234) + 1); + ASSERT_EQ(static_cast(smaller_back_minus_a >> 32), ~0x5678); + ASSERT_EQ(static_cast(smaller_back_minus_a >> 64), ~0); + + ASSERT_TRUE(smaller_back_plus_a + smaller_back_minus_a == zero_96); + + LL_Int96 bigger_back_plus_a(bigger_plus_a); + + ASSERT_EQ(static_cast(bigger_back_plus_a), 0x1234); + ASSERT_EQ(static_cast(bigger_back_plus_a >> 32), 0x5678); + ASSERT_EQ(static_cast(bigger_back_plus_a >> 64), 0x9ABC); + + LL_Int96 bigger_back_minus_a(bigger_minus_a); + + 
ASSERT_EQ(static_cast(bigger_back_minus_a), (~0x1234) + 1);
+  ASSERT_EQ(static_cast(bigger_back_minus_a >> 32), ~0x5678);
+  ASSERT_EQ(static_cast(bigger_back_minus_a >> 64), ~0x9ABC);
+
+  ASSERT_TRUE(bigger_back_plus_a + bigger_back_minus_a == zero_96);
+}
+
 } // namespace LIBC_NAMESPACE_DECL
-- 
GitLab


From a758bcdbd92efb64a3482eb95d2769d74e33f5bb Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?=
Date: Tue, 15 Oct 2024 19:24:43 +0100
Subject: [PATCH 011/329] [mlir][td] Rename pack_paddings in structured.pad
 (#111036)

The pack_paddings attribute in the structured.pad TD Op is used to set
the `nofold` attribute in the generated tensor.pad Op. The current name
is confusing and suggests that there's a relation with the tensor.pack
Op. This patch renames it to `nofold_flags` to better match the actual
usage.
---
 .../Linalg/TransformOps/LinalgTransformOps.td |  2 +-
 .../Dialect/Linalg/Transforms/Transforms.h    |  6 ++---
 .../TransformOps/LinalgTransformOps.cpp       | 26 +++++++++----------
 .../lib/Dialect/Linalg/Transforms/Padding.cpp |  6 ++---
 .../mlir/dialects/transform/structured.py     |  4 +--
 .../Linalg/matmul-shared-memory-padding.mlir  |  4 +--
 .../Linalg/pad-to-specific-memory-space.mlir  |  4 +--
 .../test/Dialect/Linalg/transform-op-pad.mlir | 26 +++++++++----------
 .../Dialect/Linalg/transform-ops-invalid.mlir |  4 +--
 .../dialects/transform_structured_ext.py      |  6 ++---
 10 files changed, 44 insertions(+), 44 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index f9036cf96e9a..98b915138122 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1028,7 +1028,7 @@ def PadOp : Op:$pad_to_multiple_of,
                        DefaultValuedOptionalAttr: $static_pad_to_multiple_of,
-                       DefaultValuedAttr:$pack_paddings,
+                       DefaultValuedAttr:$nofold_flags,
                        DefaultValuedAttr<
                            TypedArrayAttrBase, "{}">:$transpose_paddings,
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 0693e31b4f70..96e0b3c978d5 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -296,9 +296,9 @@ struct LinalgPaddingOptions {
   }
   /// A flag for every operand to mark the PadOp as nofold which enables
   /// packing for statically shaped operands.
-  SmallVector<bool> packPaddings;
+  SmallVector<bool> nofoldFlags;
   LinalgPaddingOptions &setPackPaddings(ArrayRef<bool> pp) {
-    packPaddings.assign(pp.begin(), pp.end());
+    nofoldFlags.assign(pp.begin(), pp.end());
     return *this;
   }
   /// A number of loops to hoist the PadOp out for every operand.
@@ -530,7 +530,7 @@ void peelLoops(RewriterBase &rewriter, ArrayRef<scf::ForOp> loops);
 ///
 /// * "options.padToMultipleOf" indicates that each padding dimension should be
 ///   padded to the specified multiple.
-/// * Use "options.paddingValues" and "options.packPaddings" to set padding
+/// * Use "options.paddingValues" and "options.nofoldFlags" to set padding
 ///   value and nofold attribute of the created tensor::PadOps, respectively.
 /// * The unpadded results (extracted slice of the cloned operation) are
 ///   returned via `replacements`. 
diff --git a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp index 8e7621754f76..ad72b5d7becc 100644 --- a/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp +++ b/mlir/lib/Dialect/Linalg/TransformOps/LinalgTransformOps.cpp @@ -1727,7 +1727,7 @@ transform::PackTransposeOp::apply(transform::TransformRewriter &rewriter, void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target, ArrayRef paddingDimensions, ArrayRef padToMultipleOf, - ArrayRef packPaddings, + ArrayRef nofoldFlags, ArrayRef transposePaddings, StringRef copyBackOp) { auto resultType = transform::AnyOpType::get(b.getContext()); @@ -1742,7 +1742,7 @@ void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target, (padToMultipleOf.empty() ? DenseI64ArrayAttr() : b.getDenseI64ArrayAttr(padToMultipleOf)), - /*packPaddings=*/b.getI64ArrayAttr(packPaddings), + /*nofoldFlags=*/b.getI64ArrayAttr(nofoldFlags), /*transposePaddings=*/b.getArrayAttr(transposePaddings), /*copyBackOp=*/b.getStringAttr(copyBackOp)); } @@ -1750,7 +1750,7 @@ void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target, void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target, ArrayRef paddingDimensions, ArrayRef mixedPadToMultipleOf, - ArrayRef packPaddings, + ArrayRef nofoldFlags, ArrayRef transposePaddings, StringRef copyBackOp) { auto resultType = transform::AnyOpType::get(b.getContext()); @@ -1766,7 +1766,7 @@ void transform::PadOp::build(OpBuilder &b, OperationState &result, Value target, /*paddingDimensions=*/b.getI64ArrayAttr(paddingDimensions), /*padToMultipleOf=*/dynamicPadToMultipleOf, /*padToMultipleOf=*/staticPadToMultipleOf, - /*packPaddings=*/b.getI64ArrayAttr(packPaddings), + /*nofoldFlags=*/b.getI64ArrayAttr(nofoldFlags), /*transposePaddings=*/b.getArrayAttr(transposePaddings), /*copyBackOp=*/b.getStringAttr(copyBackOp)); } @@ -1800,10 +1800,10 @@ transform::PadOp::apply(transform::TransformRewriter &rewriter, } // Convert the integer packing flags to booleans. - SmallVector packPaddings; + SmallVector nofoldFlags; for (int64_t packPadding : - extractFromIntegerArrayAttr(getPackPaddings())) - packPaddings.push_back(static_cast(packPadding)); + extractFromIntegerArrayAttr(getNofoldFlags())) + nofoldFlags.push_back(static_cast(packPadding)); // Convert the padding values to attributes. 
SmallVector paddingValues; @@ -1861,7 +1861,7 @@ transform::PadOp::apply(transform::TransformRewriter &rewriter, options.padToMultipleOf = padToMultipleOf; options.paddingValues = paddingValues; - options.packPaddings = packPaddings; + options.nofoldFlags = nofoldFlags; if (getCopyBackOp() == bufferization::MaterializeInDestinationOp::getOperationName()) { options.copyBackOp = LinalgPaddingOptions::CopyBackOp:: @@ -1907,14 +1907,14 @@ transform::PadOp::apply(transform::TransformRewriter &rewriter, } LogicalResult transform::PadOp::verify() { - SmallVector packPaddings = - extractFromIntegerArrayAttr(getPackPaddings()); - if (any_of(packPaddings, [](int64_t packPadding) { + SmallVector nofoldFlags = + extractFromIntegerArrayAttr(getNofoldFlags()); + if (any_of(nofoldFlags, [](int64_t packPadding) { return packPadding != 0 && packPadding != 1; })) { return emitOpError() - << "expects pack_paddings to contain booleans (0/1), found " - << getPackPaddings(); + << "expects nofold_flags to contain booleans (0/1), found " + << getNofoldFlags(); } SmallVector paddingDimensions = diff --git a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp index a066c4440891..9a685f6dc96a 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Padding.cpp @@ -88,7 +88,7 @@ static LogicalResult computePaddedShape(linalg::LinalgOp opToPad, } /// Pad the `opOperand` in the "paddingDimensions" using the padding value and -/// the nofold flag found in "paddingValues" and "packPaddings", respectively. +/// the nofold flag found in "paddingValues" and "nofoldFlags", respectively. /// /// Exit early and return the `opOperand` value if it already has the requested /// shape. i.e.: @@ -117,8 +117,8 @@ static FailureOr padOperandToSmallestStaticBoundingBox( // Return the unpadded operand if padding to a static shape is not needed and // if the nofold flag is not set. - bool nofold = opOperand->getOperandNumber() < options.packPaddings.size() - ? options.packPaddings[opOperand->getOperandNumber()] + bool nofold = opOperand->getOperandNumber() < options.nofoldFlags.size() + ? 
bool(options.nofoldFlags[opOperand->getOperandNumber()]) : false; if (!nofold && alreadyHasRequestedShape) return opOperand->get(); diff --git a/mlir/python/mlir/dialects/transform/structured.py b/mlir/python/mlir/dialects/transform/structured.py index 41051c0d5b2f..f6111f516f8c 100644 --- a/mlir/python/mlir/dialects/transform/structured.py +++ b/mlir/python/mlir/dialects/transform/structured.py @@ -377,7 +377,7 @@ class PadOp(PadOp): pad_to_multiple_of: Optional[Union[DynamicIndexList, ArrayAttr]] = None, padding_values: Optional[Union[ArrayAttr, Sequence[Attribute]]] = None, padding_dimensions: OptionalIntList = None, - pack_paddings: OptionalIntList = None, + nofold_flags: OptionalIntList = None, transpose_paddings: Optional[ Union[ArrayAttr, Sequence[Union[ArrayAttr, IntOrAttrList]]] ] = None, @@ -407,7 +407,7 @@ class PadOp(PadOp): padding_values=padding_values, padding_dimensions=padding_dimensions, static_pad_to_multiple_of=static_pad_to_multiple_of, - pack_paddings=pack_paddings, + nofold_flags=nofold_flags, transpose_paddings=transpose_paddings, copy_back_op=copy_back_op, loc=loc, diff --git a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir index 9c223737750a..8a3bb1bc52dc 100644 --- a/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir +++ b/mlir/test/Dialect/Linalg/matmul-shared-memory-padding.mlir @@ -59,7 +59,7 @@ module attributes {transform.with_named_sequence} { // Pad linalg.matmul. %padded, %pad, %copy_back = transform.structured.pad %tiled_linalg_op {padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], - padding_dimensions=[0, 1, 2], pack_paddings=[1, 1, 1], + padding_dimensions=[0, 1, 2], nofold_flags=[1, 1, 1], copy_back_op = "linalg.copy"} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) @@ -180,7 +180,7 @@ module attributes {transform.with_named_sequence} { // Pad linalg.matmul. 
%padded, %pad, %copy_back = transform.structured.pad %tiled_linalg_op {padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], - padding_dimensions=[0, 1, 2], pack_paddings=[1, 1, 1], + padding_dimensions=[0, 1, 2], nofold_flags=[1, 1, 1], copy_back_op = "linalg.copy"} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) diff --git a/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir b/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir index 5e5657980ba1..373ed5f0d790 100644 --- a/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir +++ b/mlir/test/Dialect/Linalg/pad-to-specific-memory-space.mlir @@ -54,7 +54,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 1] + nofold_flags=[1, 1, 1] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) %buffer, %new_ops = transform.structured.bufferize_to_allocation %pad {memory_space = 3, emit_dealloc} : !transform.any_op %2 = transform.bufferization.one_shot_bufferize %arg1 {bufferize_function_boundaries=true} : (!transform.any_op) -> !transform.any_op @@ -115,7 +115,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 1] + nofold_flags=[1, 1, 1] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.structured.vectorize %pad vector_sizes [10, 12] : !transform.any_op %vector_write = transform.structured.match ops{["vector.transfer_write"]} in %arg1 : (!transform.any_op) -> !transform.any_op diff --git a/mlir/test/Dialect/Linalg/transform-op-pad.mlir b/mlir/test/Dialect/Linalg/transform-op-pad.mlir index 120a525f3bda..ab2711545405 100644 --- a/mlir/test/Dialect/Linalg/transform-op-pad.mlir +++ b/mlir/test/Dialect/Linalg/transform-op-pad.mlir @@ -39,7 +39,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.op<"bufferization.materialize_in_destination">) %p = transform.num_associations %copy_back : (!transform.op<"bufferization.materialize_in_destination">) -> !transform.param // expected-remark @below {{1}} @@ -76,7 +76,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 pad_to_multiple_of [2, 2, 1] { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -112,7 +112,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 pad_to_multiple_of [%c2, 2, 1] { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op, !transform.param) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -155,7 +155,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : 
f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -178,7 +178,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0: i32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -201,7 +201,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=["{foo}", 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -210,7 +210,7 @@ module attributes {transform.with_named_sequence} { // ----- // With all padded being static, there's nothing to pad. However, with the -// `nofold` attribute set (see `pack_paddings`), the corresponding pad Ops are +// `nofold` attribute set (see `nofold_flags`), the corresponding pad Ops are // preserved. // CHECK-LABEL: @zero_pad_static( @@ -239,7 +239,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 0] + nofold_flags=[1, 1, 0] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -248,7 +248,7 @@ module attributes {transform.with_named_sequence} { // ----- // With all padded dims being static, there's nothing to pad. However, with the -// `nofold` attribute set (see `pack_paddings`), the corresponding pad Ops are +// `nofold` attribute set (see `nofold_flags`), the corresponding pad Ops are // preserved. Same as above, but some dims are now dynamic. 
// CHECK-LABEL: @zero_pad_dynamic( @@ -278,7 +278,7 @@ module attributes {transform.with_named_sequence} { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], // Note - only the static dim is padded padding_dimensions=[2], - pack_paddings=[1, 1, 1] + nofold_flags=[1, 1, 1] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -305,7 +305,7 @@ module attributes {transform.with_named_sequence} { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], // Note - attempting to pad non-static dim padding_dimensions=[1], - pack_paddings=[1, 1, 1] + nofold_flags=[1, 1, 1] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -362,7 +362,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 1] + nofold_flags=[1, 1, 1] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } @@ -414,7 +414,7 @@ module attributes {transform.with_named_sequence} { %padded, %pad, %copy_back = transform.structured.pad %0 { padding_values=[0.0 : f32, 0.0 : f32, 0.0 : f32], padding_dimensions=[0, 1, 2], - pack_paddings=[1, 1, 1] + nofold_flags=[1, 1, 1] } : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) transform.yield } diff --git a/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir b/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir index e86d4962530a..a30b56c7c58e 100644 --- a/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir +++ b/mlir/test/Dialect/Linalg/transform-ops-invalid.mlir @@ -18,8 +18,8 @@ transform.sequence failures(propagate) { transform.sequence failures(propagate) { ^bb0(%arg0: !transform.any_op): - // expected-error@below {{expects pack_paddings to contain booleans (0/1), found [1, 7]}} - transform.structured.pad %arg0 {pack_paddings=[1, 7]} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) + // expected-error@below {{expects nofold_flags to contain booleans (0/1), found [1, 7]}} + transform.structured.pad %arg0 {nofold_flags=[1, 7]} : (!transform.any_op) -> (!transform.any_op, !transform.any_op, !transform.any_op) } // ----- diff --git a/mlir/test/python/dialects/transform_structured_ext.py b/mlir/test/python/dialects/transform_structured_ext.py index 3ea73e8beea3..d029b3bfa6b1 100644 --- a/mlir/test/python/dialects/transform_structured_ext.py +++ b/mlir/test/python/dialects/transform_structured_ext.py @@ -304,7 +304,7 @@ def testPadOpNoArgs(target): # CHECK: transform.sequence # CHECK: transform.structured.pad # CHECK-NOT: copy_back_op - # CHECK-NOT: pack_paddings + # CHECK-NOT: nofold_flags # CHECK-NOT: pad_to_multiple_of # CHECK-NOT: padding_dimensions # CHECK-NOT: padding_values @@ -319,7 +319,7 @@ def testPadOpArgs(target): pad_to_multiple_of=[128], padding_values=[FloatAttr.get_f32(42.0), StringAttr.get("0")], padding_dimensions=Attribute.parse("[1]"), - pack_paddings=[0], + nofold_flags=[0], transpose_paddings=[[1, Attribute.parse("0")], Attribute.parse("[0, 1]")], copy_back_op="linalg.copy", ) @@ -328,7 +328,7 @@ def testPadOpArgs(target): # CHECK: transform.structured.pad # CHECK-DAG: pad_to_multiple_of [128] # CHECK-DAG: copy_back_op = "linalg.copy" - # CHECK-DAG: pack_paddings = [0] + # CHECK-DAG: nofold_flags = [0] # CHECK-DAG: padding_dimensions = [1] # CHECK-DAG: padding_values = [4.200000e+01 : f32, "0"] # CHECK-DAG: 
transpose_paddings = {{\[}}[1, 0], [0, 1]] -- GitLab From b0a25468faca32d9db4d35e56fb120ed4eaeec09 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 15 Oct 2024 22:29:24 +0400 Subject: [PATCH 012/329] AMDGPU: Add baseline tests for flat-may-alias private atomic expansions (#109406) --- .../CodeGen/AMDGPU/flat-atomic-fadd.f64.ll | 51 + .../AMDGPU/flat_atomics_i64_noprivate.ll | 6804 ++++++++++++ .../flat_atomics_i64_system_noprivate.ll | 9196 +++++++++++++++++ ...expand-atomicrmw-flat-noalias-addrspace.ll | 1523 +++ 4 files changed, 17574 insertions(+) create mode 100644 llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll create mode 100644 llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll create mode 100644 llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll diff --git a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll index e5dcf9ce309c..32cb1056022d 100644 --- a/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/flat-atomic-fadd.f64.ll @@ -77,6 +77,29 @@ define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw(ptr %ptr, double %d ret void } +define amdgpu_ps void @flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate(ptr %ptr, double %data) #0 { + ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_no_rtn_atomicrmw_noprivate + ; GFX90A_GFX940: bb.0 (%ir-block.0): + ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr2 + ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX940-NEXT: FLAT_ATOMIC_ADD_F64 killed [[COPY4]], killed [[COPY5]], 0, 0, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) + ; GFX90A_GFX940-NEXT: S_ENDPGM 0 + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret void +} + define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %data) #0 { ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw ; GFX90A_GFX940: bb.0 (%ir-block.0): @@ -104,8 +127,36 @@ define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw(ptr %ptr, double %da ret double %ret } +define amdgpu_ps double @flat_atomic_fadd_f64_rtn_atomicrmw__noprivate(ptr %ptr, double %data) #0 { + ; GFX90A_GFX940-LABEL: name: flat_atomic_fadd_f64_rtn_atomicrmw__noprivate + ; GFX90A_GFX940: bb.0 (%ir-block.0): + ; GFX90A_GFX940-NEXT: liveins: $vgpr0, $vgpr1, $vgpr2, $vgpr3 + ; GFX90A_GFX940-NEXT: {{ $}} + ; GFX90A_GFX940-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr3 + ; GFX90A_GFX940-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = 
COPY $vgpr2 + ; GFX90A_GFX940-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX90A_GFX940-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX90A_GFX940-NEXT: [[DEF:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[DEF1:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY1]], %subreg.sub0, [[COPY]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[DEF2:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[DEF3:%[0-9]+]]:sgpr_32 = IMPLICIT_DEF + ; GFX90A_GFX940-NEXT: [[REG_SEQUENCE1:%[0-9]+]]:vreg_64_align2 = REG_SEQUENCE [[COPY3]], %subreg.sub0, [[COPY2]], %subreg.sub1 + ; GFX90A_GFX940-NEXT: [[COPY4:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE1]] + ; GFX90A_GFX940-NEXT: [[COPY5:%[0-9]+]]:vreg_64_align2 = COPY [[REG_SEQUENCE]] + ; GFX90A_GFX940-NEXT: [[FLAT_ATOMIC_ADD_F64_RTN:%[0-9]+]]:vreg_64_align2 = FLAT_ATOMIC_ADD_F64_RTN killed [[COPY4]], killed [[COPY5]], 0, 1, implicit $exec, implicit $flat_scr :: (load store syncscope("wavefront") monotonic (s64) on %ir.ptr) + ; GFX90A_GFX940-NEXT: [[COPY6:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub0 + ; GFX90A_GFX940-NEXT: [[COPY7:%[0-9]+]]:vgpr_32 = COPY [[FLAT_ATOMIC_ADD_F64_RTN]].sub1 + ; GFX90A_GFX940-NEXT: $sgpr0 = COPY [[COPY6]] + ; GFX90A_GFX940-NEXT: $sgpr1 = COPY [[COPY7]] + ; GFX90A_GFX940-NEXT: SI_RETURN_TO_EPILOG $sgpr0, $sgpr1 + %ret = atomicrmw fadd ptr %ptr, double %data syncscope("wavefront") monotonic, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %ret +} + declare double @llvm.amdgcn.flat.atomic.fadd.f64.p1.f64(ptr, double) attributes #0 = { nounwind } !0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll new file mode 100644 index 000000000000..64bd4804ccd5 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_noprivate.ll @@ -0,0 +1,6804 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=bonaire < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1200 < %s | FileCheck -check-prefix=GFX12 %s + +define amdgpu_kernel void @atomic_add_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_add_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 
:: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_add_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_add_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; 
GFX8-LABEL: atomic_add_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_add_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu 
instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile add ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_add_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_add_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_add_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; 
GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile add ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_add_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_add_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_add_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; 
GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_add_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_add_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_add_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile add ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_and_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_and_i64_ret_offset: +; GFX7: ; %bb.0: ; 
%entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_and_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_and_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile and ptr %gep, i64 %in syncscope("agent") seq_cst, 
!noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_and_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_and_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw 
volatile and ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_and_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_and_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_and_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_and_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_and_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_and_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile and ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_sub_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_sub_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: 
flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_sub_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: 
v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_sub_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile sub ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_sub_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; 
GFX8-LABEL: atomic_sub_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_sub_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile sub ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_sub_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: 
v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_sub_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_sub_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_sub_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_sub_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 
:: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_sub_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile sub ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_max_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_max_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) 
+; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile max ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_max_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: 
s_endpgm +entry: + %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_max_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile max ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; 
GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_max_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile max ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_umax_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: 
v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_umax_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void 
+} + +define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, 
s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile umax ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_umax_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_umax_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], 
s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile umax ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: 
s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umax_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umax_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_max_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile umax ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_min_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] offset:32 
scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_min_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) 
+; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv 
scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile min ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_min_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_min_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; 
GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile min ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: 
v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_smin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_min_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_i64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile min ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_umin_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_umin_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: 
v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umin_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, 
s0 +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umin_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile umin ptr %gep, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_umin_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: 
s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_umin_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile umin ptr %out, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umin_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: 
flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[2:3], v[0:1] scope:SCOPE_SE +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SE +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_umin_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umin_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: s_waitcnt vmcnt(0) +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umin_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_umin_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: s_waitcnt vmcnt(0) +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_umin_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_min_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_SE +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv 
scope:SCOPE_SE +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile umin ptr %ptr, i64 %in syncscope("workgroup") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_or_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_or_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: 
atomic_or_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_or_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_or_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], 
s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile or ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_or_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_or_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; 
GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_or_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile or ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_or_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_or_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_or_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_or_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_or_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_or_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile or ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_xchg_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: 
s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_f64_offset(ptr %out, double %in) { +; GFX7-LABEL: atomic_xchg_f64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_f64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_f64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr double, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, double %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_pointer_offset(ptr %out, ptr %in) { +; GFX7-LABEL: atomic_xchg_pointer_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 
+; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_pointer_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_pointer_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr ptr, ptr %out, i32 4 + %val = atomicrmw volatile xchg ptr %gep, ptr %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_xchg_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, 
ptr %out, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xchg_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xchg_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile xchg ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_xchg_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_xchg_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: 
flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile xchg ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xchg_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; 
GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xchg_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xchg_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xchg_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_swap_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile xchg ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_xor_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 
v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_xor_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xor_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], 
s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xor_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; 
GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile xor ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_xor_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_xor_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; 
GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile xor ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xor_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_xor_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_xor_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: 
v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_xor_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_xor_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_xor_b64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile xor ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_load_i64_offset(ptr %in, ptr %out) { +; GFX7-LABEL: atomic_load_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: 
v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %in, i64 4 + %val = load atomic i64, ptr %gep seq_cst, align 8 + store i64 %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_i64(ptr %in, ptr %out) { +; GFX7-LABEL: atomic_load_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %val = load atomic i64, ptr %in syncscope("agent") seq_cst, align 8 + store i64 %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_i64_addr64_offset(ptr %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_load_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] 
+; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %in, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %val = load atomic i64, ptr %gep seq_cst, align 8 + store i64 %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_i64_addr64(ptr %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_load_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %in, i64 %index + %val = load atomic i64, ptr %ptr seq_cst, align 8 + store i64 %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_store_i64_offset(i64 %in, ptr %out) { +; GFX7-LABEL: atomic_store_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s0, s2, 32 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_addc_u32 s1, s3, 0 
+; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_u32 s0, s2, 32 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + store atomic i64 %in, ptr %gep seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_i64(i64 %in, ptr %out) { +; GFX7-LABEL: atomic_store_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + store atomic i64 %in, ptr %out seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_i64_addr64_offset(i64 %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_store_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s6, s0 +; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s6, s0 +; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; 
GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + store atomic i64 %in, ptr %gep seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_i64_addr64(i64 %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_store_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s6, s0 +; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s6, s0 +; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + store atomic i64 %in, ptr %ptr seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_offset(ptr %out, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s2, s4, 32 +; GFX7-NEXT: s_addc_u32 s3, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: 
atomic_cmpxchg_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s2, s4, 32 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_soffset(ptr %out, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_soffset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s2, s4, 0x11940 +; GFX7-NEXT: s_addc_u32 s3, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_soffset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s2, s4, 0x11940 +; GFX8-NEXT: s_addc_u32 s3, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_soffset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:72000 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 9000 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void 
@atomic_cmpxchg_i64_ret_offset(ptr %out, ptr %out2, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64_offset(ptr %out, i64 %in, i64 %index, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: 
s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: s_add_u32 s0, s4, s2 +; GFX7-NEXT: s_addc_u32 s3, s5, s3 +; GFX7-NEXT: s_add_u32 s2, s0, 32 +; GFX7-NEXT: s_addc_u32 s3, s3, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: s_add_u32 s0, s4, s2 +; GFX8-NEXT: s_addc_u32 s3, s5, s3 +; GFX8-NEXT: s_add_u32 s2, s0, 32 +; GFX8-NEXT: s_addc_u32 s3, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: 
v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %val = cmpxchg volatile ptr %gep, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64(ptr %out, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s4 :: v_dual_mov_b32 v5, s5 +; GFX12-NEXT: v_dual_mov_b32 v0, s6 :: v_dual_mov_b32 v1, s7 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_ret(ptr %out, ptr %out2, i64 %in, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: 
v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v4, s0 :: v_dual_mov_b32 v5, s1 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %val = cmpxchg volatile ptr %out, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_addr64(ptr %out, i64 %in, i64 %index, i64 %old) { +; GFX7-LABEL: atomic_cmpxchg_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: v_dual_mov_b32 v5, s1 :: v_dual_mov_b32 v4, s0 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[4:5], v[0:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + ret void +} + +define amdgpu_kernel void @atomic_cmpxchg_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index, i64 %old) { +; GFX7-LABEL: 
atomic_cmpxchg_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x11 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX7-NEXT: s_add_u32 s2, s4, s2 +; GFX7-NEXT: s_addc_u32 s3, s5, s3 +; GFX7-NEXT: v_mov_b32_e32 v5, s3 +; GFX7-NEXT: v_mov_b32_e32 v0, s8 +; GFX7-NEXT: v_mov_b32_e32 v1, s9 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s2 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_cmpxchg_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x44 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX8-NEXT: s_add_u32 s2, s4, s2 +; GFX8-NEXT: s_addc_u32 s3, s5, s3 +; GFX8-NEXT: v_mov_b32_e32 v5, s3 +; GFX8-NEXT: v_mov_b32_e32 v0, s8 +; GFX8-NEXT: v_mov_b32_e32 v1, s9 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s2 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_cmpxchg_i64_ret_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b256 s[4:11], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x44 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[2:3], s[10:11], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s8 :: v_dual_mov_b32 v1, s9 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[4:5], s[2:3] +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: v_dual_mov_b32 v5, s3 :: v_dual_mov_b32 v4, s2 +; GFX12-NEXT: flat_atomic_cmpswap_b64 v[0:1], v[4:5], v[0:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %val = cmpxchg volatile ptr %ptr, i64 %old, i64 %in syncscope("agent") seq_cst seq_cst + %extract0 = extractvalue { i64, i1 } %val, 0 + store i64 %extract0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_load_f64_offset(ptr %in, ptr %out) { +; GFX7-LABEL: atomic_load_f64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_f64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; 
GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr double, ptr %in, i64 4 + %val = load atomic double, ptr %gep seq_cst, align 8 + store double %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_f64(ptr %in, ptr %out) { +; GFX7-LABEL: atomic_load_f64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_f64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %val = load atomic double, ptr %in syncscope("agent") seq_cst, align 8 + store double %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_f64_addr64_offset(ptr %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_load_f64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_f64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 
s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr double, ptr %in, i64 %index + %gep = getelementptr double, ptr %ptr, i64 4 + %val = load atomic double, ptr %gep seq_cst, align 8 + store double %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_load_f64_addr64(ptr %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_load_f64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_load_f64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[0:1], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_load_f64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: flat_load_b64 v[0:1], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; 
GFX12-NEXT: global_inv scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr double, ptr %in, i64 %index + %val = load atomic double, ptr %ptr seq_cst, align 8 + store double %val, ptr %out + ret void +} + +define amdgpu_kernel void @atomic_store_f64_offset(double %in, ptr %out) { +; GFX7-LABEL: atomic_store_f64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: s_add_u32 s0, s2, 32 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: s_addc_u32 s1, s3, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_f64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: s_add_u32 s0, s2, 32 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: s_addc_u32 s1, s3, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr double, ptr %out, i64 4 + store atomic double %in, ptr %gep seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_f64(double %in, ptr %out) { +; GFX7-LABEL: atomic_store_f64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_f64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + store atomic double %in, ptr %out seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_f64_addr64_offset(double %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_store_f64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s6, s0 +; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 
v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_f64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s6, s0 +; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64_addr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] offset:32 scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr double, ptr %out, i64 %index + %gep = getelementptr double, ptr %ptr, i64 4 + store atomic double %in, ptr %gep seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_store_f64_addr64(double %in, ptr %out, i64 %index) { +; GFX7-LABEL: atomic_store_f64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s6, s0 +; GFX7-NEXT: s_addc_u32 s1, s7, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_store_f64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s6, s0 +; GFX8-NEXT: s_addc_u32 s1, s7, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_store_f64_addr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: s_add_nc_u64 s[2:3], s[2:3], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: global_wb scope:SCOPE_SYS +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] scope:SCOPE_SYS +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr double, ptr %out, i64 %index + store atomic double %in, ptr %ptr seq_cst, align 8 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: 
atomic_inc_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_inc_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN 
scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_incr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_inc_i64_incr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_incr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_incr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_ret_incr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_inc_i64_ret_incr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; 
GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_ret_incr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret_incr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile uinc_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_inc_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_inc_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 
0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile uinc_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_incr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_inc_i64_incr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_incr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_incr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, 
s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_inc_i64_ret_incr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_inc_i64_ret_incr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_inc_i64_ret_incr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_inc_i64_ret_incr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_inc_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile uinc_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_offset(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_dec_i64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_offset: +; 
GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_ret_offset(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_dec_i64_ret_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_ret_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_decr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: 
atomic_dec_i64_decr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_decr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_decr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] offset:32 scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_ret_decr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_dec_i64_ret_decr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_ret_decr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: 
v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret_decr64_offset: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] offset:32 th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw volatile udec_wrap ptr %gep, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_dec_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s0 :: v_dual_mov_b32 v1, s1 +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_ret(ptr %out, ptr %out2, i64 %in) { +; GFX7-LABEL: atomic_dec_i64_ret: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s7 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: 
atomic_dec_i64_ret: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s7 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX12-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: v_dual_mov_b32 v2, s0 :: v_dual_mov_b32 v3, s1 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[0:1], v[2:3] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s6 :: v_dual_mov_b32 v3, s7 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw volatile udec_wrap ptr %out, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_dec_i64_decr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_dec_i64_decr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x9 +; GFX7-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0xd +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX7-NEXT: s_add_u32 s0, s4, s0 +; GFX7-NEXT: s_addc_u32 s1, s5, s1 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_decr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX8-NEXT: s_add_u32 s0, s4, s0 +; GFX8-NEXT: s_addc_u32 s1, s5, s1 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_decr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_clause 0x1 +; GFX12-NEXT: s_load_b64 s[4:5], s[2:3], 0x34 +; GFX12-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s2 :: v_dual_mov_b32 v1, s3 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[4:5] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[2:3], v[0:1] scope:SCOPE_DEV +; GFX12-NEXT: s_wait_storecnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + ret void +} + +define 
amdgpu_kernel void @atomic_dec_i64_ret_decr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_dec_i64_ret_decr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v3, s1 +; GFX7-NEXT: v_mov_b32_e32 v2, s0 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_mov_b32_e32 v2, s2 +; GFX7-NEXT: v_mov_b32_e32 v3, s3 +; GFX7-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_dec_i64_ret_decr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_lshl_b64 s[4:5], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v3, s1 +; GFX8-NEXT: v_mov_b32_e32 v2, s0 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_mov_b32_e32 v2, s2 +; GFX8-NEXT: v_mov_b32_e32 v3, s3 +; GFX8-NEXT: flat_store_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_endpgm +; +; GFX12-LABEL: atomic_dec_i64_ret_decr64: +; GFX12: ; %bb.0: ; %entry +; GFX12-NEXT: s_load_b256 s[0:7], s[2:3], 0x24 +; GFX12-NEXT: s_wait_kmcnt 0x0 +; GFX12-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX12-NEXT: v_dual_mov_b32 v0, s4 :: v_dual_mov_b32 v1, s5 +; GFX12-NEXT: s_add_nc_u64 s[0:1], s[0:1], s[6:7] +; GFX12-NEXT: s_delay_alu instid0(SALU_CYCLE_1) +; GFX12-NEXT: v_dual_mov_b32 v3, s1 :: v_dual_mov_b32 v2, s0 +; GFX12-NEXT: flat_atomic_dec_u64 v[0:1], v[2:3], v[0:1] th:TH_ATOMIC_RETURN scope:SCOPE_DEV +; GFX12-NEXT: s_wait_loadcnt_dscnt 0x0 +; GFX12-NEXT: global_inv scope:SCOPE_DEV +; GFX12-NEXT: v_dual_mov_b32 v2, s2 :: v_dual_mov_b32 v3, s3 +; GFX12-NEXT: flat_store_b64 v[2:3], v[0:1] +; GFX12-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw volatile udec_wrap ptr %ptr, i64 %in syncscope("agent") seq_cst, !noalias.addrspace !0 + store i64 %tmp0, ptr %out2 + ret void +} + +!0 = !{i32 5, i32 6} diff --git a/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll new file mode 100644 index 000000000000..edd5620dc411 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/flat_atomics_i64_system_noprivate.ll @@ -0,0 +1,9196 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -mtriple=amdgcn -mcpu=bonaire -verify-machineinstrs < %s | FileCheck -check-prefix=GFX7 %s +; RUN: llc -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx900 -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX9 %s + +; --------------------------------------------------------------------- +; atomicrmw xchg +; --------------------------------------------------------------------- + +define void @flat_atomic_xchg_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_xchg_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_xchg_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_xchg_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) 
lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_xchg_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_xchg_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: 
v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xchg ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_xchg_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void 
@flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw xchg f64 +; --------------------------------------------------------------------- + +define void @flat_atomic_xchg_f64_noret(ptr %ptr, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; 
GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_xchg_f64_noret_offset(ptr %out, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i32 4 + %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 + ret void +} + +define double @flat_atomic_xchg_f64_ret(ptr %ptr, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1 + ret double %result +} + +define double @flat_atomic_xchg_f64_ret_offset(ptr %out, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; 
GFX9-LABEL: flat_atomic_xchg_f64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i32 4 + %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 + ret double %result +} + +define amdgpu_gfx void @flat_atomic_xchg_f64_noret_scalar(ptr inreg %ptr, double inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_xchg_f64_noret_offset_scalar(ptr inreg %out, double inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i32 4 + %tmp0 = atomicrmw xchg ptr %gep, 
double %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx double @flat_atomic_xchg_f64_ret_scalar(ptr inreg %ptr, double inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xchg ptr %ptr, double %in seq_cst, !noalias.addrspace !1 + ret double %result +} + +define amdgpu_gfx double @flat_atomic_xchg_f64_ret_offset_scalar(ptr inreg %out, double inreg %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i32 4 + %result = atomicrmw xchg ptr %gep, double %in seq_cst, !noalias.addrspace !1 + ret double %result +} + +define void @flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory(ptr %out, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; 
GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i64 4 + %tmp0 = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define double @flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory(ptr %out, double %in) { +; GFX7-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xchg_f64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_swap_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr double, ptr %out, i64 4 + %result = atomicrmw xchg ptr %gep, double %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret double %result +} + +; --------------------------------------------------------------------- +; atomicrmw add +; --------------------------------------------------------------------- + +define void @flat_atomic_add_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_add_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_add_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_add_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: 
s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_add_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_add_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_add_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_add_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw add ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_add_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw add ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_add_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_add_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw add ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw sub +; --------------------------------------------------------------------- + +define void @flat_atomic_sub_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_sub_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; 
GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_sub_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_sub_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_sub_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 
+; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_sub_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_sub_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, 
s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw sub ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_sub_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) 
expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_sub_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_sub_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw sub ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw and +; --------------------------------------------------------------------- + +define void @flat_atomic_and_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_and_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 
0, v1, vcc +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_and_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_and_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_and_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_and_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_and_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_and_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) 
+; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw and ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_and_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw and ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 
@flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_and_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_and_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw and ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw nand +; --------------------------------------------------------------------- + +define void @flat_atomic_nand_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v4 +; GFX7-NEXT: v_not_b32_e32 v4, v8 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB50_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX8-NEXT: v_not_b32_e32 v5, v4 +; GFX8-NEXT: v_not_b32_e32 v4, v8 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; 
GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB50_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB50_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB50_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_nand_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB51_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB51_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB51_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB51_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_nand_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v4 +; GFX7-NEXT: v_not_b32_e32 v4, v8 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB52_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX8-NEXT: v_not_b32_e32 v5, v4 +; GFX8-NEXT: v_not_b32_e32 v4, v8 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB52_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB52_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB52_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_nand_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v9, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v8, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v0 +; GFX7-NEXT: v_not_b32_e32 v6, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB53_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v9, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v8, v2 +; GFX8-NEXT: 
v_not_b32_e32 v7, v0 +; GFX8-NEXT: v_not_b32_e32 v6, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB53_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB53_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB53_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_nand_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_not_b32_e32 v1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB54_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, 
v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB54_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB54_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB54_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_nand_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX7-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX7-NEXT: v_not_b32_e32 v1, v0 +; GFX7-NEXT: v_not_b32_e32 v0, v6 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB55_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; 
GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX8-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX8-NEXT: v_not_b32_e32 v1, v0 +; GFX8-NEXT: v_not_b32_e32 v0, v6 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB55_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB55_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v0, s7, v3 +; GFX9-NEXT: v_and_b32_e32 v6, s6, v2 +; GFX9-NEXT: v_not_b32_e32 v1, v0 +; GFX9-NEXT: v_not_b32_e32 v0, v6 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB55_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX7-NEXT: 
v_and_b32_e32 v1, s6, v6 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB56_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB56_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB56_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v0 +; GFX9-NEXT: v_not_b32_e32 v4, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB56_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw nand ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_nand_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; 
GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX7-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB57_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX8-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB57_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB57_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v1 +; GFX9-NEXT: v_mov_b32_e32 v6, v0 +; GFX9-NEXT: v_and_b32_e32 v0, s7, v7 +; GFX9-NEXT: v_and_b32_e32 v1, s6, v6 +; GFX9-NEXT: v_not_b32_e32 v5, v0 +; GFX9-NEXT: v_not_b32_e32 v4, v1 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB57_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = 
atomicrmw nand ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_and_b32_e32 v0, v7, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX7-NEXT: v_not_b32_e32 v5, v0 +; GFX7-NEXT: v_not_b32_e32 v4, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB58_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_and_b32_e32 v0, v7, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v6, v2 +; GFX8-NEXT: v_not_b32_e32 v5, v0 +; GFX8-NEXT: v_not_b32_e32 v4, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB58_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB58_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; 
GFX9-NEXT: s_cbranch_execnz .LBB58_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_and_b32_e32 v0, v9, v3 +; GFX7-NEXT: v_and_b32_e32 v1, v8, v2 +; GFX7-NEXT: v_not_b32_e32 v7, v0 +; GFX7-NEXT: v_not_b32_e32 v6, v1 +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB59_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_and_b32_e32 v0, v9, v3 +; GFX8-NEXT: v_and_b32_e32 v1, v8, v2 +; GFX8-NEXT: v_not_b32_e32 v7, v0 +; GFX8-NEXT: v_not_b32_e32 v6, v1 +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB59_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_nand_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB59_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_and_b32_e32 v4, v7, v3 +; GFX9-NEXT: v_and_b32_e32 v8, v6, v2 +; GFX9-NEXT: v_not_b32_e32 v5, v4 +; GFX9-NEXT: v_not_b32_e32 v4, v8 +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 
glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB59_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw nand ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw or +; --------------------------------------------------------------------- + +define void @flat_atomic_or_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_or_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_or_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; 
GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_or_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_or_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_or_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: 
v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_or_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_or_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw or ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_or_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: 
s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw or ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_or_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_or_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, 
ptr %out, i64 4 + %result = atomicrmw or ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw xor +; --------------------------------------------------------------------- + +define void @flat_atomic_xor_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_xor_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_xor_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define 
i64 @flat_atomic_xor_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_xor_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_xor_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 
v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_xor_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw xor ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_xor_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_xor_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_xor_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw xor ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw max +; --------------------------------------------------------------------- + +define void @flat_atomic_max_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret: +; 
GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB80_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB80_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB80_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB80_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_max_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; 
GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB81_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB81_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB81_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB81_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_max_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: 
v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB82_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB82_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB82_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB82_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_max_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt 
vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB83_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB83_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB83_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB83_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 
%result +} + +define amdgpu_gfx void @flat_atomic_max_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB84_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB84_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB84_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; 
GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB84_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_max_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB85_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB85_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 
s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB85_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB85_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_max_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB86_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; 
GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB86_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB86_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB86_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_max_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB87_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
flat_atomic_max_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB87_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB87_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB87_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_kernel void @atomic_max_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB88_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB88_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_max_i64_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: .LBB88_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB88_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void 
@atomic_max_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB89_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB89_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB89_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_max_i64_ret_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB89_1: ; 
%atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB89_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_addr64(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB90_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; 
GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB90_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_max_i64_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: .LBB90_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB90_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_max_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_max_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB91_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_max_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: 
v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB91_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_max_i64_ret_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB91_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB91_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw max ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define void @flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB92_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB92_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB92_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB92_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB92_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB92_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc 
+; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB93_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_gt_i64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB93_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_max_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB93_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB93_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw max ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw umax +; --------------------------------------------------------------------- + +define void @flat_atomic_umax_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 
0 +; GFX7-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB94_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB94_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB94_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB94_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_umax_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner 
Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB95_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB95_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB95_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB95_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_umax_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: 
Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB96_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB96_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB96_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB96_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_umax_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: 
flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB97_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB97_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB97_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB97_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_umax_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; 
GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB98_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB98_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB98_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB98_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB98_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, 
v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB98_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_umax_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .LBB99_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB99_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB99_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB99_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; 
GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB99_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB99_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB100_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB100_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB100_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; 
GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB100_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB100_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB100_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_umax_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB101_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB101_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: 
v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB101_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB101_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB101_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB101_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_kernel void @atomic_umax_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] 
glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB102_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB102_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_umax_i64_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: .LBB102_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB102_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 
+; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB103_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB103_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_umax_i64_ret_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB103_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; 
GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB103_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_umax_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_umax_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB104_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_umax_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_lt_u64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB104_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, 
s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_umax_i64_ret_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB104_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_lt_u64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB104_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %tmp0 = atomicrmw umax ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define void @flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB105_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB105_1: ; %atomicrmw.start +; 
GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB105_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB105_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB105_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB106_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 
v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_gt_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB106_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umax_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB106_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_gt_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB106_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umax ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw umin +; --------------------------------------------------------------------- + +define void @flat_atomic_umin_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB107_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, 
exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB107_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB107_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB107_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_umin_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB108_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: 
flat_atomic_umin_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB108_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB108_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB108_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_umin_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB109_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: 
v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB109_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB109_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB109_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_umin_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: 
s_cbranch_execnz .LBB110_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB110_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB110_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB110_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_umin_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, 
v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB111_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB111_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB111_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB111_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_umin_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 
s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB112_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB112_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB112_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; 
GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB112_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB113_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB113_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: 
v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB113_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB113_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw umin ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_umin_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB114_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: 
flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB114_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB114_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_u64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB114_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB115_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: 
flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB115_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB115_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB115_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB116_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_le_u64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB116_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_umin_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB116_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_u64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB116_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw umin ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw min +; --------------------------------------------------------------------- + +define void @flat_atomic_min_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v6, v[0:1] +; GFX7-NEXT: flat_load_dword v7, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; 
GFX7-NEXT: s_cbranch_execnz .LBB117_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v6, v[0:1] +; GFX8-NEXT: flat_load_dword v7, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB117_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB117_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB117_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_min_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v7, v[0:1] +; GFX7-NEXT: flat_load_dword v6, v[8:9] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX7-NEXT: v_mov_b32_e32 v7, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v6, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB118_1 +; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v7, v[0:1] +; GFX8-NEXT: flat_load_dword v6, v[8:9] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7] +; GFX8-NEXT: v_mov_b32_e32 v7, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB118_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB118_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB118_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_min_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v5, vcc, 4, v0 +; GFX7-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v4, v[0:1] +; GFX7-NEXT: flat_load_dword v5, v[5:6] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v7, v5 +; GFX7-NEXT: v_mov_b32_e32 v6, v4 +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB119_1 +; GFX7-NEXT: ; %bb.2: ; 
%atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v0, v4 +; GFX7-NEXT: v_mov_b32_e32 v1, v5 +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v5, vcc, 4, v0 +; GFX8-NEXT: v_addc_u32_e32 v6, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v4, v[0:1] +; GFX8-NEXT: flat_load_dword v5, v[5:6] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v7, v5 +; GFX8-NEXT: v_mov_b32_e32 v6, v4 +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB119_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v0, v4 +; GFX8-NEXT: v_mov_b32_e32 v1, v5 +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB119_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB119_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_min_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: 
s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB120_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB120_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB120_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB120_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_min_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_min_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v3, s34 +; GFX7-NEXT: v_mov_b32_e32 v4, s35 +; GFX7-NEXT: flat_load_dword v2, v[0:1] +; GFX7-NEXT: flat_load_dword v3, v[3:4] +; GFX7-NEXT: v_mov_b32_e32 v4, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: v_mov_b32_e32 v5, s5 +; GFX7-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; 
GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB121_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v3, s34 +; GFX8-NEXT: v_mov_b32_e32 v4, s35 +; GFX8-NEXT: flat_load_dword v2, v[0:1] +; GFX8-NEXT: flat_load_dword v3, v[3:4] +; GFX8-NEXT: v_mov_b32_e32 v4, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: v_mov_b32_e32 v5, s5 +; GFX8-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB121_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB121_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB121_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_min_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_min_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: 
s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v4, s34 +; GFX7-NEXT: v_mov_b32_e32 v5, s35 +; GFX7-NEXT: flat_load_dword v3, v[0:1] +; GFX7-NEXT: flat_load_dword v2, v[4:5] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s7 +; GFX7-NEXT: v_mov_b32_e32 v7, s6 +; GFX7-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB122_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v4, s34 +; GFX8-NEXT: v_mov_b32_e32 v5, s35 +; GFX8-NEXT: flat_load_dword v3, v[0:1] +; GFX8-NEXT: flat_load_dword v2, v[4:5] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s7 +; GFX8-NEXT: v_mov_b32_e32 v7, s6 +; GFX8-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB122_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB122_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: 
v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB122_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_min_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s4 +; GFX7-NEXT: s_add_u32 s34, s4, 4 +; GFX7-NEXT: v_mov_b32_e32 v1, s5 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v0, v[0:1] +; GFX7-NEXT: flat_load_dword v1, v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: .LBB123_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB123_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: s_add_u32 s34, s4, 4 +; GFX8-NEXT: v_mov_b32_e32 v1, s5 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v0, v[0:1] +; GFX8-NEXT: flat_load_dword v1, v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: .LBB123_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB123_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 
v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB123_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB123_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_min_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: s_add_u32 s36, s4, 36 +; GFX7-NEXT: s_addc_u32 s37, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s36 +; GFX7-NEXT: v_mov_b32_e32 v1, s37 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[2:3] +; GFX7-NEXT: s_mov_b64 s[34:35], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s7 +; GFX7-NEXT: v_mov_b32_e32 v5, s6 +; GFX7-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_cbranch_execnz .LBB124_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: s_add_u32 s36, s4, 36 +; GFX8-NEXT: s_addc_u32 s37, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s36 +; GFX8-NEXT: v_mov_b32_e32 v1, s37 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[2:3] +; GFX8-NEXT: s_mov_b64 s[34:35], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s7 +; GFX8-NEXT: v_mov_b32_e32 v5, s6 +; GFX8-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: 
v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_cbranch_execnz .LBB124_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[0:1], v[0:1] offset:32 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: s_mov_b64 s[34:35], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s7 +; GFX9-NEXT: v_mov_b32_e32 v5, s6 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: .LBB124_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v1 +; GFX9-NEXT: v_mov_b32_e32 v8, v0 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[2:3], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX9-NEXT: s_or_b64 s[34:35], vcc, s[34:35] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_cbranch_execnz .LBB124_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[34:35] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_kernel void @atomic_min_i64_addr64_offset(ptr %out, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0xd +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[4:5], s[4:5], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s4 +; GFX7-NEXT: s_addc_u32 s1, s1, s5 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB125_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x34 +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[4:5], s[4:5], 
3 +; GFX8-NEXT: s_add_u32 s0, s0, s4 +; GFX8-NEXT: s_addc_u32 s1, s1, s5 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[4:5] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB125_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i64_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x34 +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[0:1], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v5, s1 +; GFX9-NEXT: v_mov_b32_e32 v4, s0 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[4:5] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: .LBB125_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB125_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr %out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_addr64_offset(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: s_add_u32 s0, s0, 32 +; GFX7-NEXT: s_addc_u32 s1, s1, 0 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: 
v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB126_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: s_add_u32 s0, s0, 32 +; GFX8-NEXT: s_addc_u32 s1, s1, 0 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB126_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i64_ret_addr64_offset: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: v_mov_b32_e32 v4, s9 +; GFX9-NEXT: v_mov_b32_e32 v5, s8 +; GFX9-NEXT: .LBB126_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v9, v3 +; GFX9-NEXT: v_mov_b32_e32 v8, v2 +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9] +; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB126_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX9-NEXT: s_endpgm +entry: + %ptr = getelementptr i64, ptr 
%out, i64 %index + %gep = getelementptr i64, ptr %ptr, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + store i64 %tmp0, ptr %out2 + ret void +} + +define amdgpu_kernel void @atomic_min_i64(ptr %out, i64 %in) { +; GFX7-LABEL: atomic_min_i64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: v_mov_b32_e32 v5, s1 +; GFX7-NEXT: v_mov_b32_e32 v6, s3 +; GFX7-NEXT: v_mov_b32_e32 v7, s2 +; GFX7-NEXT: v_mov_b32_e32 v4, s0 +; GFX7-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX7-NEXT: v_mov_b32_e32 v3, v1 +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: v_mov_b32_e32 v2, v0 +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB127_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x24 +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: v_mov_b32_e32 v5, s1 +; GFX8-NEXT: v_mov_b32_e32 v6, s3 +; GFX8-NEXT: v_mov_b32_e32 v7, s2 +; GFX8-NEXT: v_mov_b32_e32 v4, s0 +; GFX8-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[2:3], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX8-NEXT: v_mov_b32_e32 v3, v1 +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v2, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB127_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0x24 +; GFX9-NEXT: s_mov_b64 s[0:1], 0 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s4 +; GFX9-NEXT: v_mov_b32_e32 v1, s5 +; GFX9-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX9-NEXT: v_mov_b32_e32 v4, s4 +; GFX9-NEXT: v_mov_b32_e32 v6, s7 +; GFX9-NEXT: v_mov_b32_e32 v7, s6 +; GFX9-NEXT: v_mov_b32_e32 v5, s5 +; GFX9-NEXT: .LBB127_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v1, v6, v3, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v0, v7, v2, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[0:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[2:3] +; GFX9-NEXT: v_mov_b32_e32 v3, v1 +; GFX9-NEXT: 
s_or_b64 s[0:1], vcc, s[0:1] +; GFX9-NEXT: v_mov_b32_e32 v2, v0 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX9-NEXT: s_cbranch_execnz .LBB127_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_endpgm +entry: + %tmp0 = atomicrmw min ptr %out, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_kernel void @atomic_min_i64_ret_addr64(ptr %out, ptr %out2, i64 %in, i64 %index) { +; GFX7-LABEL: atomic_min_i64_ret_addr64: +; GFX7: ; %bb.0: ; %entry +; GFX7-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x9 +; GFX7-NEXT: s_waitcnt lgkmcnt(0) +; GFX7-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX7-NEXT: s_add_u32 s0, s0, s6 +; GFX7-NEXT: s_addc_u32 s1, s1, s7 +; GFX7-NEXT: v_mov_b32_e32 v0, s0 +; GFX7-NEXT: v_mov_b32_e32 v1, s1 +; GFX7-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX7-NEXT: s_mov_b64 s[0:1], 0 +; GFX7-NEXT: v_mov_b32_e32 v4, s5 +; GFX7-NEXT: v_mov_b32_e32 v5, s4 +; GFX7-NEXT: .LBB128_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v3 +; GFX7-NEXT: v_mov_b32_e32 v8, v2 +; GFX7-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX7-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX7-NEXT: s_cbranch_execnz .LBB128_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX7-NEXT: v_mov_b32_e32 v0, s2 +; GFX7-NEXT: v_mov_b32_e32 v1, s3 +; GFX7-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX7-NEXT: s_endpgm +; +; GFX8-LABEL: atomic_min_i64_ret_addr64: +; GFX8: ; %bb.0: ; %entry +; GFX8-NEXT: s_load_dwordx8 s[0:7], s[2:3], 0x24 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_lshl_b64 s[6:7], s[6:7], 3 +; GFX8-NEXT: s_add_u32 s0, s0, s6 +; GFX8-NEXT: s_addc_u32 s1, s1, s7 +; GFX8-NEXT: v_mov_b32_e32 v0, s0 +; GFX8-NEXT: v_mov_b32_e32 v1, s1 +; GFX8-NEXT: flat_load_dwordx2 v[2:3], v[0:1] +; GFX8-NEXT: s_mov_b64 s[0:1], 0 +; GFX8-NEXT: v_mov_b32_e32 v4, s5 +; GFX8-NEXT: v_mov_b32_e32 v5, s4 +; GFX8-NEXT: .LBB128_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v3 +; GFX8-NEXT: v_mov_b32_e32 v8, v2 +; GFX8-NEXT: v_cmp_ge_i64_e32 vcc, s[4:5], v[8:9] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc +; GFX8-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9] +; GFX8-NEXT: s_or_b64 s[0:1], vcc, s[0:1] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[0:1] +; GFX8-NEXT: s_cbranch_execnz .LBB128_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[0:1] +; GFX8-NEXT: v_mov_b32_e32 v0, s2 +; GFX8-NEXT: v_mov_b32_e32 v1, s3 +; GFX8-NEXT: flat_store_dwordx2 v[0:1], v[2:3] +; GFX8-NEXT: s_endpgm +; +; GFX9-LABEL: atomic_min_i64_ret_addr64: +; GFX9: ; %bb.0: ; %entry +; GFX9-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x24 +; GFX9-NEXT: s_waitcnt lgkmcnt(0) +; GFX9-NEXT: s_lshl_b64 s[0:1], s[10:11], 3 +; GFX9-NEXT: s_add_u32 s0, s4, s0 +; GFX9-NEXT: s_addc_u32 s1, s5, s1 +; GFX9-NEXT: v_mov_b32_e32 v0, s0 +; GFX9-NEXT: v_mov_b32_e32 v1, s1 +; GFX9-NEXT: 
flat_load_dwordx2 v[2:3], v[0:1]
+; GFX9-NEXT: s_mov_b64 s[0:1], 0
+; GFX9-NEXT: v_mov_b32_e32 v4, s9
+; GFX9-NEXT: v_mov_b32_e32 v5, s8
+; GFX9-NEXT: .LBB128_1: ; %atomicrmw.start
+; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: v_mov_b32_e32 v9, v3
+; GFX9-NEXT: v_mov_b32_e32 v8, v2
+; GFX9-NEXT: v_cmp_ge_i64_e32 vcc, s[8:9], v[8:9]
+; GFX9-NEXT: v_cndmask_b32_e32 v7, v4, v9, vcc
+; GFX9-NEXT: v_cndmask_b32_e32 v6, v5, v8, vcc
+; GFX9-NEXT: flat_atomic_cmpswap_x2 v[2:3], v[0:1], v[6:9] glc
+; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX9-NEXT: buffer_wbinvl1_vol
+; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[2:3], v[8:9]
+; GFX9-NEXT: s_or_b64 s[0:1], vcc, s[0:1]
+; GFX9-NEXT: s_andn2_b64 exec, exec, s[0:1]
+; GFX9-NEXT: s_cbranch_execnz .LBB128_1
+; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX9-NEXT: s_or_b64 exec, exec, s[0:1]
+; GFX9-NEXT: v_mov_b32_e32 v0, s6
+; GFX9-NEXT: v_mov_b32_e32 v1, s7
+; GFX9-NEXT: flat_store_dwordx2 v[0:1], v[2:3]
+; GFX9-NEXT: s_endpgm
+entry:
+ %ptr = getelementptr i64, ptr %out, i64 %index
+ %tmp0 = atomicrmw min ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1
+ store i64 %tmp0, ptr %out2
+ ret void
+}
+
+define void @flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) {
+; GFX7-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
+; GFX7: ; %bb.0:
+; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_add_i32_e32 v8, vcc, 32, v0
+; GFX7-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0
+; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX7-NEXT: flat_load_dword v7, v[0:1]
+; GFX7-NEXT: flat_load_dword v6, v[8:9]
+; GFX7-NEXT: s_mov_b64 s[4:5], 0
+; GFX7-NEXT: .LBB129_1: ; %atomicrmw.start
+; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
+; GFX7-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GFX7-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
+; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX7-NEXT: buffer_wbinvl1_vol
+; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX7-NEXT: v_mov_b32_e32 v7, v1
+; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5]
+; GFX7-NEXT: v_mov_b32_e32 v6, v0
+; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_cbranch_execnz .LBB129_1
+; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end
+; GFX7-NEXT: s_or_b64 exec, exec, s[4:5]
+; GFX7-NEXT: s_setpc_b64 s[30:31]
+;
+; GFX8-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory:
+; GFX8: ; %bb.0:
+; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_add_u32_e32 v8, vcc, 32, v0
+; GFX8-NEXT: v_addc_u32_e32 v9, vcc, 0, v1, vcc
+; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0
+; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc
+; GFX8-NEXT: flat_load_dword v7, v[0:1]
+; GFX8-NEXT: flat_load_dword v6, v[8:9]
+; GFX8-NEXT: s_mov_b64 s[4:5], 0
+; GFX8-NEXT: .LBB129_1: ; %atomicrmw.start
+; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3]
+; GFX8-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc
+; GFX8-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc
+; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[8:9], v[4:7] glc
+; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0)
+; GFX8-NEXT: buffer_wbinvl1_vol
+; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[6:7]
+; GFX8-NEXT: v_mov_b32_e32 v7, v1
+; GFX8-NEXT: s_or_b64 
s[4:5], vcc, s[4:5] +; GFX8-NEXT: v_mov_b32_e32 v6, v0 +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB129_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[6:7], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB129_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB129_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v4, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 36, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_load_dword v1, v[0:1] +; GFX7-NEXT: flat_load_dword v0, v[4:5] +; GFX7-NEXT: s_mov_b64 s[4:5], 0 +; GFX7-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX7-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v9, v1 +; GFX7-NEXT: v_mov_b32_e32 v8, v0 +; GFX7-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GFX7-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; GFX7-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX7-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX7-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX7-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_cbranch_execnz .LBB130_1 +; GFX7-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX7-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v4, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v5, vcc, 0, v1, vcc +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 36, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_load_dword v1, v[0:1] +; GFX8-NEXT: flat_load_dword v0, v[4:5] +; GFX8-NEXT: s_mov_b64 s[4:5], 0 +; GFX8-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX8-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v9, v1 +; GFX8-NEXT: v_mov_b32_e32 v8, v0 +; GFX8-NEXT: v_cmp_le_i64_e32 vcc, v[8:9], v[2:3] +; GFX8-NEXT: v_cndmask_b32_e32 v7, v3, v9, vcc +; 
GFX8-NEXT: v_cndmask_b32_e32 v6, v2, v8, vcc +; GFX8-NEXT: flat_atomic_cmpswap_x2 v[0:1], v[4:5], v[6:9] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: v_cmp_eq_u64_e32 vcc, v[0:1], v[8:9] +; GFX8-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX8-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_cbranch_execnz .LBB130_1 +; GFX8-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX8-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_min_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_load_dwordx2 v[4:5], v[0:1] offset:32 +; GFX9-NEXT: s_mov_b64 s[4:5], 0 +; GFX9-NEXT: .LBB130_1: ; %atomicrmw.start +; GFX9-NEXT: ; =>This Inner Loop Header: Depth=1 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v7, v5 +; GFX9-NEXT: v_mov_b32_e32 v6, v4 +; GFX9-NEXT: v_cmp_le_i64_e32 vcc, v[6:7], v[2:3] +; GFX9-NEXT: v_cndmask_b32_e32 v5, v3, v7, vcc +; GFX9-NEXT: v_cndmask_b32_e32 v4, v2, v6, vcc +; GFX9-NEXT: flat_atomic_cmpswap_x2 v[4:5], v[0:1], v[4:7] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: v_cmp_eq_u64_e32 vcc, v[4:5], v[6:7] +; GFX9-NEXT: s_or_b64 s[4:5], vcc, s[4:5] +; GFX9-NEXT: s_andn2_b64 exec, exec, s[4:5] +; GFX9-NEXT: s_cbranch_execnz .LBB130_1 +; GFX9-NEXT: ; %bb.2: ; %atomicrmw.end +; GFX9-NEXT: s_or_b64 exec, exec, s[4:5] +; GFX9-NEXT: v_mov_b32_e32 v0, v4 +; GFX9-NEXT: v_mov_b32_e32 v1, v5 +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw min ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw uinc_wrap +; --------------------------------------------------------------------- + +define void @flat_atomic_uinc_wrap_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_uinc_wrap_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: 
v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_uinc_wrap_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_uinc_wrap_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 
s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_uinc_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_inc_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: 
s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw uinc_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_uinc_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) 
lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_uinc_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_inc_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw uinc_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +; --------------------------------------------------------------------- +; atomicrmw udec_wrap +; --------------------------------------------------------------------- + +define void @flat_atomic_udec_wrap_i64_noret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define void @flat_atomic_udec_wrap_i64_noret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; 
GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_udec_wrap_i64_ret(ptr %ptr, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define i64 @flat_atomic_udec_wrap_i64_ret_offset(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: +; GFX8: ; %bb.0: +; 
GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %tmp0 = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx void @flat_atomic_udec_wrap_i64_noret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_dec_x2 v[2:3], v[0:1] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret void +} + +define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_scalar(ptr inreg %ptr, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v2, s4 +; GFX7-NEXT: v_mov_b32_e32 v3, s5 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v2, s4 +; GFX8-NEXT: v_mov_b32_e32 v3, s5 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 
s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %result = atomicrmw udec_wrap ptr %ptr, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define amdgpu_gfx i64 @flat_atomic_udec_wrap_i64_ret_offset_scalar(ptr inreg %out, i64 inreg %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: s_add_u32 s34, s4, 32 +; GFX7-NEXT: s_addc_u32 s35, s5, 0 +; GFX7-NEXT: v_mov_b32_e32 v2, s34 +; GFX7-NEXT: v_mov_b32_e32 v0, s6 +; GFX7-NEXT: v_mov_b32_e32 v1, s7 +; GFX7-NEXT: v_mov_b32_e32 v3, s35 +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: s_add_u32 s34, s4, 32 +; GFX8-NEXT: s_addc_u32 s35, s5, 0 +; GFX8-NEXT: v_mov_b32_e32 v2, s34 +; GFX8-NEXT: v_mov_b32_e32 v0, s6 +; GFX8-NEXT: v_mov_b32_e32 v1, s7 +; GFX8-NEXT: v_mov_b32_e32 v3, s35 +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset_scalar: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: v_mov_b32_e32 v0, s6 +; GFX9-NEXT: v_mov_b32_e32 v1, s7 +; GFX9-NEXT: v_mov_b32_e32 v2, s4 +; GFX9-NEXT: v_mov_b32_e32 v3, s5 +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3], v[0:1] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !noalias.addrspace !1 + ret i64 %result +} + +define void @flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_noret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[2:3] offset:32 +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = 
getelementptr i64, ptr %out, i64 4 + %tmp0 = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret void +} + +define i64 @flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory(ptr %out, i64 %in) { +; GFX7-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX7: ; %bb.0: +; GFX7-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX7-NEXT: v_add_i32_e32 v0, vcc, 32, v0 +; GFX7-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX7-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX7-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX7-NEXT: buffer_wbinvl1_vol +; GFX7-NEXT: s_setpc_b64 s[30:31] +; +; GFX8-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX8-NEXT: v_add_u32_e32 v0, vcc, 32, v0 +; GFX8-NEXT: v_addc_u32_e32 v1, vcc, 0, v1, vcc +; GFX8-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] glc +; GFX8-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX8-NEXT: buffer_wbinvl1_vol +; GFX8-NEXT: s_setpc_b64 s[30:31] +; +; GFX9-LABEL: flat_atomic_udec_wrap_i64_ret_offset__amdgpu_no_remote_memory: +; GFX9: ; %bb.0: +; GFX9-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX9-NEXT: flat_atomic_dec_x2 v[0:1], v[0:1], v[2:3] offset:32 glc +; GFX9-NEXT: s_waitcnt vmcnt(0) lgkmcnt(0) +; GFX9-NEXT: buffer_wbinvl1_vol +; GFX9-NEXT: s_setpc_b64 s[30:31] + %gep = getelementptr i64, ptr %out, i64 4 + %result = atomicrmw udec_wrap ptr %gep, i64 %in seq_cst, !amdgpu.no.remote.memory !0, !noalias.addrspace !1 + ret i64 %result +} + +!0 = !{} +!1 = !{i32 5, i32 6} diff --git a/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll new file mode 100644 index 000000000000..3de502874d32 --- /dev/null +++ b/llvm/test/Transforms/AtomicExpand/AMDGPU/expand-atomicrmw-flat-noalias-addrspace.ll @@ -0,0 +1,1523 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx700 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX7 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx900 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX900 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx908 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX908 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx90a -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX90A %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx940 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX940 %s +; RUN: opt -S -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1200 -passes=atomic-expand %s | FileCheck -check-prefixes=ALL,GFX12 %s + +; -------------------------------------------------------------------- +; Idempotent expansion cases without noalias.addrspace +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_add_0_i64_agent(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_add_0_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0:[0-9]+]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0:![0-9]+]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr %ptr, i64 0 syncscope("agent") 
seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_sub_0_i64_agent(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_0_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw sub ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_or_0_i64_agent(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_or_0_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw or ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw or ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_xor_0_i64_agent(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_xor_0_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xor ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xor ptr %ptr, i64 0 syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +; -------------------------------------------------------------------- +; Idempotent expansion cases with noalias.addrspace +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_add_0_i64_agent__noalias_addrspace_5(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_add_0_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1:![0-9]+]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_sub_0_i64_agent__noalias_addrspace_5(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_sub_0_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw sub ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw sub ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_or_0_i64_agent__noalias_addrspace_5(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_or_0_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw or ptr [[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw or ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_xor_0_i64_agent__noalias_addrspace_5(ptr %ptr) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_xor_0_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xor ptr 
[[PTR]], i64 0 syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xor ptr %ptr, i64 0 syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +; -------------------------------------------------------------------- +; General expansion for add +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_add_i64_agent(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_add_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_add_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_add_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw add ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i32 @test_flat_atomicrmw_add_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { +; ALL-LABEL: define i32 @test_flat_atomicrmw_add_i32_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw add ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i32 [[RES]] +; + %res = atomicrmw add ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +; -------------------------------------------------------------------- +; General expansion for xchg +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_xchg_i64_agent(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_xchg_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xchg ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_xchg_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_xchg_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw xchg ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i32 @test_flat_atomicrmw_xchg_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { +; ALL-LABEL: define i32 
@test_flat_atomicrmw_xchg_i32_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i32 [[RES]] +; + %res = atomicrmw xchg ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +; -------------------------------------------------------------------- +; General expansion for xchg (pointer type) +; -------------------------------------------------------------------- + +define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent(ptr %ptr, ptr addrspace(1) %value) { +; ALL-LABEL: define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent( +; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(1) [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret ptr addrspace(1) [[RES]] +; + %res = atomicrmw xchg ptr %ptr, ptr addrspace(1) %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret ptr addrspace(1) %res +} + +define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent__noalias_addrspace_5(ptr %ptr, ptr addrspace(1) %value) { +; ALL-LABEL: define ptr addrspace(1) @test_flat_atomicrmw_xchg_p1_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(1) [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(1) [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret ptr addrspace(1) [[RES]] +; + %res = atomicrmw xchg ptr %ptr, ptr addrspace(1) %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret ptr addrspace(1) %res +} + +define ptr addrspace(3) @test_flat_atomicrmw_xchg_p3_agent__noalias_addrspace_5(ptr %ptr, ptr addrspace(3) %value) { +; ALL-LABEL: define ptr addrspace(3) @test_flat_atomicrmw_xchg_p3_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], ptr addrspace(3) [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw xchg ptr [[PTR]], ptr addrspace(3) [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret ptr addrspace(3) [[RES]] +; + %res = atomicrmw xchg ptr %ptr, ptr addrspace(3) %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret ptr addrspace(3) %res +} + +; -------------------------------------------------------------------- +; General expansion for and +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_and_i64_agent(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], 
i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_and_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i64 [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw and ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret i64 %res +} + + +define i32 @test_flat_atomicrmw_and_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { +; ALL-LABEL: define i32 @test_flat_atomicrmw_and_i32_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[RES:%.*]] = atomicrmw and ptr [[PTR]], i32 [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; ALL-NEXT: ret i32 [[RES]] +; + %res = atomicrmw and ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +; -------------------------------------------------------------------- +; General expansion for fadd +; -------------------------------------------------------------------- + +define double @test_flat_atomicrmw_fadd_f64_agent(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP5]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: 
[[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load 
double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP5]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX940-SAME: ptr 
[[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP5]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = 
extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fadd_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = fadd double [[LOADED]], [[VALUE]] +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") 
seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fadd ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret double %res +} + +define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5(ptr %ptr, float %value) { +; GFX7-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret float [[TMP5]] +; +; GFX900-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd float [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: 
[[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[IS_SHARED:%.*]] = call i1 @llvm.amdgcn.is.shared(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_SHARED]], label %[[ATOMICRMW_SHARED:.*]], label %[[ATOMICRMW_CHECK_PRIVATE:.*]] +; GFX90A: [[ATOMICRMW_SHARED]]: +; GFX90A-NEXT: [[TMP1:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(3) +; GFX90A-NEXT: [[TMP2:%.*]] = atomicrmw fadd ptr addrspace(3) [[TMP1]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI:.*]] +; GFX90A: [[ATOMICRMW_CHECK_PRIVATE]]: +; GFX90A-NEXT: [[IS_PRIVATE:%.*]] = call i1 @llvm.amdgcn.is.private(ptr [[PTR]]) +; GFX90A-NEXT: br i1 [[IS_PRIVATE]], label %[[ATOMICRMW_PRIVATE:.*]], label %[[ATOMICRMW_GLOBAL:.*]] +; GFX90A: [[ATOMICRMW_PRIVATE]]: +; GFX90A-NEXT: [[TMP3:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(5) +; GFX90A-NEXT: [[LOADED_PRIVATE:%.*]] = load float, ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: [[NEW:%.*]] = fadd float [[LOADED_PRIVATE]], [[VALUE]] +; GFX90A-NEXT: store float [[NEW]], ptr addrspace(5) [[TMP3]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX90A: [[ATOMICRMW_GLOBAL]]: +; GFX90A-NEXT: [[TMP4:%.*]] = addrspacecast ptr [[PTR]] to ptr addrspace(1) +; GFX90A-NEXT: [[TMP5:%.*]] = atomicrmw fadd ptr addrspace(1) [[TMP4]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX90A-NEXT: br label %[[ATOMICRMW_PHI]] +; GFX90A: [[ATOMICRMW_PHI]]: +; GFX90A-NEXT: [[LOADED_PHI:%.*]] = phi float [ [[TMP2]], %[[ATOMICRMW_SHARED]] ], [ [[LOADED_PRIVATE]], %[[ATOMICRMW_PRIVATE]] ], [ [[TMP5]], %[[ATOMICRMW_GLOBAL]] ] +; GFX90A-NEXT: br label %[[ATOMICRMW_END:.*]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret float [[LOADED_PHI]] +; +; GFX940-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX940-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_flat_atomicrmw_fadd_f32_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]], !amdgpu.ignore.denormal.mode [[META0]] +; GFX12-NEXT: ret float [[RES]] +; + %res = atomicrmw fadd ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0, !amdgpu.ignore.denormal.mode !0 + ret float %res +} + +define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5(ptr %ptr, <2 x half> %value) { +; GFX7-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; 
GFX7-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR]], align 4 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret <2 x half> [[TMP5]] +; +; GFX900-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX900-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret <2 x half> [[TMP5]] +; +; GFX908-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret <2 x half> [[TMP5]] +; +; GFX90A-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x half>, ptr [[PTR]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; 
GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x half> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x half> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x half> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x half> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x half> +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret <2 x half> [[TMP5]] +; +; GFX940-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret <2 x half> [[RES]] +; +; GFX12-LABEL: define <2 x half> @test_flat_atomicrmw_fadd_v2f16_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], <2 x half> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x half> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret <2 x half> [[RES]] +; + %res = atomicrmw fadd ptr %ptr, <2 x half> %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret <2 x half> %res +} + +define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5(ptr %ptr, <2 x bfloat> %value) { +; GFX7-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR]], align 4 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX7-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX7-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX7-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX7-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX900-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX900-NEXT: 
[[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX908-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX908-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX90A-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load <2 x bfloat>, ptr [[PTR]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi <2 x bfloat> [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = fadd <2 x bfloat> [[LOADED]], [[VALUE]] +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast <2 x bfloat> [[NEW]] to i32 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast <2 x bfloat> [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to <2 x bfloat> +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret <2 x bfloat> [[TMP5]] +; +; GFX940-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], <2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret <2 x bfloat> [[RES]] +; +; GFX12-LABEL: define <2 x bfloat> @test_flat_atomicrmw_fadd_v2bf16_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], <2 x bfloat> [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fadd ptr [[PTR]], 
<2 x bfloat> [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret <2 x bfloat> [[RES]] +; + %res = atomicrmw fadd ptr %ptr, <2 x bfloat> %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret <2 x bfloat> %res +} + +; -------------------------------------------------------------------- +; General expansion for fmin +; -------------------------------------------------------------------- + +define double @test_flat_atomicrmw_fmin_f64_agent(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret double [[RES]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX940-SAME: ptr [[PTR:%.*]], double 
[[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmin ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret double [[RES]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; 
GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmin ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue 
{ i64, i1 } [[TMP5]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX7-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP6]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = 
bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret double [[TMP5]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmin_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.minnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmin ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret double %res +} + +define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5(ptr %ptr, float %value) { +; GFX7-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret float [[RES]] +; +; GFX900-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: 
[[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[LOADED_PHI:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[LOADED_PHI]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret float [[LOADED_PHI]] +; +; GFX940-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX940-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX940: [[ATOMICRMW_START]]: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.minnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX940: [[ATOMICRMW_END]]: +; GFX940-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_flat_atomicrmw_fmin_f32_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmin ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret float 
[[RES]] +; + %res = atomicrmw fmin ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +; -------------------------------------------------------------------- +; General expansion for fmax +; -------------------------------------------------------------------- + +define double @test_flat_atomicrmw_fmax_f64_agent(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret double [[RES]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !amdgpu.no.fine.grained.memory 
[[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmax ptr %ptr, double %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret double [[RES]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 
[[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[RES]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[RES]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX90A-NEXT: ret double [[RES]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmax ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret double %res +} + +define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, double %value) { +; GFX7-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX7-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX7-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX7: [[ATOMICRMW_START]]: +; GFX7-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP6:%.*]], %[[ATOMICRMW_START]] ] +; GFX7-NEXT: [[TMP2:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX7-NEXT: [[TMP3:%.*]] = bitcast double [[TMP2]] to i64 +; GFX7-NEXT: [[TMP4:%.*]] = bitcast double [[LOADED]] to i64 +; GFX7-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP4]], i64 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX7-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP5]], 1 +; GFX7-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP5]], 0 +; GFX7-NEXT: [[TMP6]] = bitcast i64 [[NEWLOADED]] to double +; GFX7-NEXT: br i1 
[[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX7: [[ATOMICRMW_END]]: +; GFX7-NEXT: ret double [[TMP6]] +; +; GFX900-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX900-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret double [[TMP5]] +; +; GFX908-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX908-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret double [[TMP5]] +; +; GFX90A-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX90A-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX90A-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX90A-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX90A-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret 
double [[TMP5]] +; +; GFX940-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX940-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], double [[VALUE]] syncscope("agent") seq_cst, align 8, !noalias.addrspace [[META1]] +; GFX940-NEXT: ret double [[RES]] +; +; GFX12-LABEL: define double @test_flat_atomicrmw_fmax_f64_agent__noalias_addrspace_5__maybe_fine_grained( +; GFX12-SAME: ptr [[PTR:%.*]], double [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[TMP1:%.*]] = load double, ptr [[PTR]], align 8 +; GFX12-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX12: [[ATOMICRMW_START]]: +; GFX12-NEXT: [[LOADED:%.*]] = phi double [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX12-NEXT: [[NEW:%.*]] = call double @llvm.maxnum.f64(double [[LOADED]], double [[VALUE]]) +; GFX12-NEXT: [[TMP2:%.*]] = bitcast double [[NEW]] to i64 +; GFX12-NEXT: [[TMP3:%.*]] = bitcast double [[LOADED]] to i64 +; GFX12-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i64 [[TMP3]], i64 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 8 +; GFX12-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP4]], 1 +; GFX12-NEXT: [[NEWLOADED:%.*]] = extractvalue { i64, i1 } [[TMP4]], 0 +; GFX12-NEXT: [[TMP5]] = bitcast i64 [[NEWLOADED]] to double +; GFX12-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX12: [[ATOMICRMW_END]]: +; GFX12-NEXT: ret double [[TMP5]] +; + %res = atomicrmw fmax ptr %ptr, double %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret double %res +} + +define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5(ptr %ptr, float %value) { +; GFX7-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX7-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX7-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX7-NEXT: ret float [[RES]] +; +; GFX900-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX900-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX900-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX900-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX900: [[ATOMICRMW_START]]: +; GFX900-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX900-NEXT: [[NEW:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX900-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX900-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX900-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX900-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX900-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX900-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX900-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX900: [[ATOMICRMW_END]]: +; GFX900-NEXT: ret float [[TMP5]] +; +; GFX908-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX908-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX908-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX908-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX908: [[ATOMICRMW_START]]: +; GFX908-NEXT: [[LOADED:%.*]] = phi float [ 
[[TMP1]], [[TMP0:%.*]] ], [ [[TMP5:%.*]], %[[ATOMICRMW_START]] ] +; GFX908-NEXT: [[NEW:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX908-NEXT: [[TMP2:%.*]] = bitcast float [[NEW]] to i32 +; GFX908-NEXT: [[TMP3:%.*]] = bitcast float [[LOADED]] to i32 +; GFX908-NEXT: [[TMP4:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP3]], i32 [[TMP2]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX908-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP4]], 1 +; GFX908-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP4]], 0 +; GFX908-NEXT: [[TMP5]] = bitcast i32 [[NEWLOADED]] to float +; GFX908-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX908: [[ATOMICRMW_END]]: +; GFX908-NEXT: ret float [[TMP5]] +; +; GFX90A-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX90A-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX90A-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX90A-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX90A: [[ATOMICRMW_START]]: +; GFX90A-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[LOADED_PHI:%.*]], %[[ATOMICRMW_START]] ] +; GFX90A-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX90A-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX90A-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX90A-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX90A-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX90A-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX90A-NEXT: [[LOADED_PHI]] = bitcast i32 [[NEWLOADED]] to float +; GFX90A-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX90A: [[ATOMICRMW_END]]: +; GFX90A-NEXT: ret float [[LOADED_PHI]] +; +; GFX940-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX940-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX940-NEXT: [[TMP1:%.*]] = load float, ptr [[PTR]], align 4 +; GFX940-NEXT: br label %[[ATOMICRMW_START:.*]] +; GFX940: [[ATOMICRMW_START]]: +; GFX940-NEXT: [[LOADED:%.*]] = phi float [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; GFX940-NEXT: [[TMP2:%.*]] = call float @llvm.maxnum.f32(float [[LOADED]], float [[VALUE]]) +; GFX940-NEXT: [[TMP3:%.*]] = bitcast float [[TMP2]] to i32 +; GFX940-NEXT: [[TMP4:%.*]] = bitcast float [[LOADED]] to i32 +; GFX940-NEXT: [[TMP5:%.*]] = cmpxchg ptr [[PTR]], i32 [[TMP4]], i32 [[TMP3]] syncscope("agent") seq_cst seq_cst, align 4 +; GFX940-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP5]], 1 +; GFX940-NEXT: [[NEWLOADED:%.*]] = extractvalue { i32, i1 } [[TMP5]], 0 +; GFX940-NEXT: [[RES]] = bitcast i32 [[NEWLOADED]] to float +; GFX940-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; GFX940: [[ATOMICRMW_END]]: +; GFX940-NEXT: ret float [[RES]] +; +; GFX12-LABEL: define float @test_flat_atomicrmw_fmax_f32_agent__noalias_addrspace_5( +; GFX12-SAME: ptr [[PTR:%.*]], float [[VALUE:%.*]]) #[[ATTR0]] { +; GFX12-NEXT: [[RES:%.*]] = atomicrmw fmax ptr [[PTR]], float [[VALUE]] syncscope("agent") seq_cst, align 4, !noalias.addrspace [[META1]], !amdgpu.no.fine.grained.memory [[META0]] +; GFX12-NEXT: ret float [[RES]] +; + %res = atomicrmw fmax ptr %ptr, float %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret float %res +} + +; 
-------------------------------------------------------------------- +; General expansion for nand +; -------------------------------------------------------------------- + +define i64 @test_flat_atomicrmw_nand_i64_agent(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_nand_i64_agent( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] +; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw nand ptr %ptr, i64 %value syncscope("agent") seq_cst, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] +; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw nand ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i64 %res +} + +define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5__maybe_fine_grained(ptr %ptr, i64 %value) { +; ALL-LABEL: define i64 @test_flat_atomicrmw_nand_i64_agent__noalias_addrspace_5__maybe_fine_grained( +; ALL-SAME: ptr [[PTR:%.*]], i64 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[TMP1:%.*]] = load i64, ptr [[PTR]], align 8 +; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED:%.*]] = phi i64 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP2:%.*]] = and i64 [[LOADED]], [[VALUE]] +; ALL-NEXT: [[NEW:%.*]] = xor i64 [[TMP2]], -1 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i64 [[LOADED]], i64 [[NEW]] syncscope("agent") seq_cst seq_cst, align 8 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i64, i1 } [[TMP3]], 1 +; ALL-NEXT: [[RES]] = extractvalue { i64, i1 } [[TMP3]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i64 [[RES]] +; + %res = atomicrmw nand ptr %ptr, i64 %value syncscope("agent") seq_cst, !noalias.addrspace !1 + ret i64 %res +} + + +define i32 
@test_flat_atomicrmw_nand_i32_agent__noalias_addrspace_5(ptr %ptr, i32 %value) { +; ALL-LABEL: define i32 @test_flat_atomicrmw_nand_i32_agent__noalias_addrspace_5( +; ALL-SAME: ptr [[PTR:%.*]], i32 [[VALUE:%.*]]) #[[ATTR0]] { +; ALL-NEXT: [[TMP1:%.*]] = load i32, ptr [[PTR]], align 4 +; ALL-NEXT: br label %[[ATOMICRMW_START:.*]] +; ALL: [[ATOMICRMW_START]]: +; ALL-NEXT: [[LOADED:%.*]] = phi i32 [ [[TMP1]], [[TMP0:%.*]] ], [ [[RES:%.*]], %[[ATOMICRMW_START]] ] +; ALL-NEXT: [[TMP2:%.*]] = and i32 [[LOADED]], [[VALUE]] +; ALL-NEXT: [[NEW:%.*]] = xor i32 [[TMP2]], -1 +; ALL-NEXT: [[TMP3:%.*]] = cmpxchg ptr [[PTR]], i32 [[LOADED]], i32 [[NEW]] syncscope("agent") seq_cst seq_cst, align 4 +; ALL-NEXT: [[SUCCESS:%.*]] = extractvalue { i32, i1 } [[TMP3]], 1 +; ALL-NEXT: [[RES]] = extractvalue { i32, i1 } [[TMP3]], 0 +; ALL-NEXT: br i1 [[SUCCESS]], label %[[ATOMICRMW_END:.*]], label %[[ATOMICRMW_START]] +; ALL: [[ATOMICRMW_END]]: +; ALL-NEXT: ret i32 [[RES]] +; + %res = atomicrmw nand ptr %ptr, i32 %value syncscope("agent") seq_cst, !noalias.addrspace !1, !amdgpu.no.fine.grained.memory !0 + ret i32 %res +} + +!0 = !{} +!1 = !{i32 5, i32 6} + +;. +; GFX7: [[META0]] = !{} +; GFX7: [[META1]] = !{i32 5, i32 6} +;. +; GFX900: [[META0]] = !{} +; GFX900: [[META1]] = !{i32 5, i32 6} +;. +; GFX908: [[META0]] = !{} +; GFX908: [[META1]] = !{i32 5, i32 6} +;. +; GFX90A: [[META0]] = !{} +; GFX90A: [[META1]] = !{i32 5, i32 6} +;. +; GFX940: [[META0]] = !{} +; GFX940: [[META1]] = !{i32 5, i32 6} +;. +; GFX12: [[META0]] = !{} +; GFX12: [[META1]] = !{i32 5, i32 6} +;. -- GitLab From 84ee629bc515e5a2247043c668c7da38447c20e9 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Tue, 15 Oct 2024 22:46:24 +0400 Subject: [PATCH 013/329] clang: Remove some pointer bitcasts (#112324) Obsolete since opaque pointers. --- clang/lib/CodeGen/CGBuiltin.cpp | 39 ++++++++++----------------------- clang/lib/CodeGen/CGExprCXX.cpp | 11 +--------- 2 files changed, 13 insertions(+), 37 deletions(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index c563f2618b42..157e743a39bf 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -1288,9 +1288,8 @@ static llvm::Value *EmitBitTestIntrinsic(CodeGenFunction &CGF, // Bit = BitBaseI8[BitPos >> 3] & (1 << (BitPos & 0x7)) != 0; Value *ByteIndex = CGF.Builder.CreateAShr( BitPos, llvm::ConstantInt::get(BitPos->getType(), 3), "bittest.byteidx"); - Value *BitBaseI8 = CGF.Builder.CreatePointerCast(BitBase, CGF.Int8PtrTy); - Address ByteAddr(CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, BitBaseI8, - ByteIndex, "bittest.byteaddr"), + Address ByteAddr(CGF.Builder.CreateInBoundsGEP(CGF.Int8Ty, BitBase, ByteIndex, + "bittest.byteaddr"), CGF.Int8Ty, CharUnits::One()); Value *PosLow = CGF.Builder.CreateAnd(CGF.Builder.CreateTrunc(BitPos, CGF.Int8Ty), @@ -5658,14 +5657,13 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, *Arg3 = EmitScalarExpr(E->getArg(3)); llvm::FunctionType *FTy = llvm::FunctionType::get( Int32Ty, llvm::ArrayRef(ArgTys), false); - Value *BCast = Builder.CreatePointerCast(Arg3, I8PTy); // We know the third argument is an integer type, but we may need to cast // it to i32. 
if (Arg2->getType() != Int32Ty) Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty); return RValue::get( EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), - {Arg0, Arg1, Arg2, BCast, PacketSize, PacketAlign})); + {Arg0, Arg1, Arg2, Arg3, PacketSize, PacketAlign})); } } // OpenCL v2.0 s6.13.16 ,s9.17.3.5 - Built-in pipe reserve read and write @@ -11317,7 +11315,6 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, Value *Dst = EmitScalarExpr(E->getArg(0)); Value *Val = EmitScalarExpr(E->getArg(1)); Value *Size = EmitScalarExpr(E->getArg(2)); - Dst = Builder.CreatePointerCast(Dst, Int8PtrTy); Val = Builder.CreateTrunc(Val, Int8Ty); Size = Builder.CreateIntCast(Size, Int64Ty, false); return Builder.CreateCall( @@ -11342,34 +11339,27 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, } if (MTEIntrinsicID != Intrinsic::not_intrinsic) { - llvm::Type *T = ConvertType(E->getType()); - if (MTEIntrinsicID == Intrinsic::aarch64_irg) { Value *Pointer = EmitScalarExpr(E->getArg(0)); Value *Mask = EmitScalarExpr(E->getArg(1)); - Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy); Mask = Builder.CreateZExt(Mask, Int64Ty); - Value *RV = Builder.CreateCall( - CGM.getIntrinsic(MTEIntrinsicID), {Pointer, Mask}); - return Builder.CreatePointerCast(RV, T); + return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID), + {Pointer, Mask}); } if (MTEIntrinsicID == Intrinsic::aarch64_addg) { Value *Pointer = EmitScalarExpr(E->getArg(0)); Value *TagOffset = EmitScalarExpr(E->getArg(1)); - Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy); TagOffset = Builder.CreateZExt(TagOffset, Int64Ty); - Value *RV = Builder.CreateCall( - CGM.getIntrinsic(MTEIntrinsicID), {Pointer, TagOffset}); - return Builder.CreatePointerCast(RV, T); + return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID), + {Pointer, TagOffset}); } if (MTEIntrinsicID == Intrinsic::aarch64_gmi) { Value *Pointer = EmitScalarExpr(E->getArg(0)); Value *ExcludedMask = EmitScalarExpr(E->getArg(1)); ExcludedMask = Builder.CreateZExt(ExcludedMask, Int64Ty); - Pointer = Builder.CreatePointerCast(Pointer, Int8PtrTy); return Builder.CreateCall( CGM.getIntrinsic(MTEIntrinsicID), {Pointer, ExcludedMask}); } @@ -11378,25 +11368,20 @@ Value *CodeGenFunction::EmitAArch64BuiltinExpr(unsigned BuiltinID, // return address same as input address. if (MTEIntrinsicID == Intrinsic::aarch64_ldg) { Value *TagAddress = EmitScalarExpr(E->getArg(0)); - TagAddress = Builder.CreatePointerCast(TagAddress, Int8PtrTy); - Value *RV = Builder.CreateCall( - CGM.getIntrinsic(MTEIntrinsicID), {TagAddress, TagAddress}); - return Builder.CreatePointerCast(RV, T); + return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID), + {TagAddress, TagAddress}); } // Although it is possible to supply a different tag (to set) // to this intrinsic (as first arg), for now we supply // the tag that is in input address arg (common use case). 
if (MTEIntrinsicID == Intrinsic::aarch64_stg) { - Value *TagAddress = EmitScalarExpr(E->getArg(0)); - TagAddress = Builder.CreatePointerCast(TagAddress, Int8PtrTy); - return Builder.CreateCall( - CGM.getIntrinsic(MTEIntrinsicID), {TagAddress, TagAddress}); + Value *TagAddress = EmitScalarExpr(E->getArg(0)); + return Builder.CreateCall(CGM.getIntrinsic(MTEIntrinsicID), + {TagAddress, TagAddress}); } if (MTEIntrinsicID == Intrinsic::aarch64_subp) { Value *PointerA = EmitScalarExpr(E->getArg(0)); Value *PointerB = EmitScalarExpr(E->getArg(1)); - PointerA = Builder.CreatePointerCast(PointerA, Int8PtrTy); - PointerB = Builder.CreatePointerCast(PointerB, Int8PtrTy); return Builder.CreateCall( CGM.getIntrinsic(MTEIntrinsicID), {PointerA, PointerB}); } diff --git a/clang/lib/CodeGen/CGExprCXX.cpp b/clang/lib/CodeGen/CGExprCXX.cpp index 1214bb054fb8..648b9b9ed980 100644 --- a/clang/lib/CodeGen/CGExprCXX.cpp +++ b/clang/lib/CodeGen/CGExprCXX.cpp @@ -1771,14 +1771,6 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) { EmitNewInitializer(*this, E, allocType, elementTy, result, numElements, allocSizeWithoutCookie); llvm::Value *resultPtr = result.emitRawPointer(*this); - if (E->isArray()) { - // NewPtr is a pointer to the base element type. If we're - // allocating an array of arrays, we'll need to cast back to the - // array pointer type. - llvm::Type *resultType = ConvertTypeForMem(E->getType()); - if (resultPtr->getType() != resultType) - resultPtr = Builder.CreateBitCast(resultPtr, resultType); - } // Deactivate the 'operator delete' cleanup if we finished // initialization. @@ -1805,7 +1797,7 @@ llvm::Value *CodeGenFunction::EmitCXXNewExpr(const CXXNewExpr *E) { } void CodeGenFunction::EmitDeleteCall(const FunctionDecl *DeleteFD, - llvm::Value *Ptr, QualType DeleteTy, + llvm::Value *DeletePtr, QualType DeleteTy, llvm::Value *NumElements, CharUnits CookieSize) { assert((!NumElements && CookieSize.isZero()) || @@ -1819,7 +1811,6 @@ void CodeGenFunction::EmitDeleteCall(const FunctionDecl *DeleteFD, // Pass the pointer itself. QualType ArgTy = *ParamTypeIt++; - llvm::Value *DeletePtr = Builder.CreateBitCast(Ptr, ConvertType(ArgTy)); DeleteArgs.add(RValue::get(DeletePtr), ArgTy); // Pass the std::destroying_delete tag if present. -- GitLab From f7468a2531dae75d0e18ce22a36bfd6e7d7588ff Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Tue, 15 Oct 2024 11:57:17 -0700 Subject: [PATCH 014/329] [RISCV][VLOpt] Correct the printing of LMUL in the debug messages. (#112413) --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 088f6d62dcbe..53373b7a0f15 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -114,10 +114,10 @@ struct OperandInfo { return; } assert(EMUL && "Expected EMUL to have value"); - OS << "EMUL: "; + OS << "EMUL: m"; if (EMUL->second) - OS << "m"; - OS << "f" << EMUL->first; + OS << "f"; + OS << EMUL->first; OS << ", EEW: " << (1 << Log2EEW); } }; -- GitLab From 224f62de9e34d537b1fd282b47b773b04bea34f1 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Tue, 15 Oct 2024 12:19:21 -0700 Subject: [PATCH 015/329] [lldb-dap] Improving the naming consistency of startDebugging reverse request. (#112396) Adjusting the name from `lldb-dap startDebugging` to `lldb-dap start-debugging` to improve consistency with other names for commands in lldb/lldb-dap. 
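For reference, the renamed command takes a request type and a JSON configuration as its two arguments (per the argument handling in DAP.cpp below). Both invocation forms here are lifted from the updated test and README in this patch; the leading backtick on the first line is the REPL command-escape prefix:

```
# From the debug console (REPL):
`lldb-dap start-debugging attach '{"pid":321}'

# As a postRunCommand in a launch configuration:
lldb-dap start-debugging launch '{"program":"client"}'
```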
--- .../startDebugging/TestDAP_startDebugging.py | 6 ++--- lldb/tools/lldb-dap/DAP.cpp | 23 +++++++------------ lldb/tools/lldb-dap/README.md | 8 +++---- lldb/tools/lldb-dap/lldb-dap.cpp | 2 +- 4 files changed, 15 insertions(+), 24 deletions(-) diff --git a/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py b/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py index fd48e69cae5e..fd452d91e472 100644 --- a/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py +++ b/lldb/test/API/tools/lldb-dap/startDebugging/TestDAP_startDebugging.py @@ -1,12 +1,10 @@ """ -Test lldb-dap startDebugging reverse request +Test lldb-dap start-debugging reverse requests. """ -import dap_server from lldbsuite.test.decorators import * from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil import lldbdap_testcase @@ -25,7 +23,7 @@ class TestDAP_startDebugging(lldbdap_testcase.DAPTestCaseBase): self.set_source_breakpoints(source, [breakpoint_line]) self.continue_to_next_stop() self.dap_server.request_evaluate( - "`lldb-dap startDebugging attach '{\"pid\":321}'", context="repl" + "`lldb-dap start-debugging attach '{\"pid\":321}'", context="repl" ) self.continue_to_exit() diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 5e75d84cf824..119779d7bfec 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -866,42 +866,35 @@ int64_t Variables::InsertVariable(lldb::SBValue variable, bool is_permanent) { bool StartDebuggingRequestHandler::DoExecute( lldb::SBDebugger debugger, char **command, lldb::SBCommandReturnObject &result) { - // Command format like: `startDebugging ` + // Command format like: `start-debugging ` if (!command) { - result.SetError("Invalid use of startDebugging"); - result.SetStatus(lldb::eReturnStatusFailed); + result.SetError("Invalid use of start-debugging, expected format " + "`start-debugging `."); return false; } if (!command[0] || llvm::StringRef(command[0]).empty()) { - result.SetError("startDebugging request type missing."); - result.SetStatus(lldb::eReturnStatusFailed); + result.SetError("start-debugging request type missing."); return false; } if (!command[1] || llvm::StringRef(command[1]).empty()) { - result.SetError("configuration missing."); - result.SetStatus(lldb::eReturnStatusFailed); + result.SetError("start-debugging debug configuration missing."); return false; } llvm::StringRef request{command[0]}; std::string raw_configuration{command[1]}; - int i = 2; - while (command[i]) { - raw_configuration.append(" ").append(command[i]); - } - llvm::Expected configuration = llvm::json::parse(raw_configuration); if (!configuration) { llvm::Error err = configuration.takeError(); - std::string msg = - "Failed to parse json configuration: " + llvm::toString(std::move(err)); + std::string msg = "Failed to parse json configuration: " + + llvm::toString(std::move(err)) + "\n\n" + + raw_configuration; result.SetError(msg.c_str()); - result.SetStatus(lldb::eReturnStatusFailed); return false; } diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md index 3a7d82e887cc..11086eb222d2 100644 --- a/lldb/tools/lldb-dap/README.md +++ b/lldb/tools/lldb-dap/README.md @@ -244,9 +244,9 @@ The escape character can be adjusted via the `commandEscapePrefix` configuration The `lldb-dap` tool includes additional custom commands to support the Debug Adapter Protocol features. 
-#### `lldb-dap startDebugging`
+#### `lldb-dap start-debugging`
 
-Using the command `lldb-dap startDebugging` it is possible to trigger a
+Using the command `lldb-dap start-debugging` it is possible to trigger a
 reverse request to the client requesting a child debug session with the
 specified configuration. For example, this can be used to attached to forked or
 spawned processes. For more information see
@@ -255,7 +255,7 @@ spawned processes. For more information see
 The custom command has the following format:
 
 ```
-lldb-dap startDebugging 
+lldb-dap start-debugging 
 ```
 
 This will launch a server and then request a child debug session for a client.
@@ -264,7 +264,7 @@ This will launch a server and then request a child debug session for a client.
 {
   "program": "server",
   "postRunCommand": [
-    "lldb-dap startDebugging launch '{\"program\":\"client\"}'"
+    "lldb-dap start-debugging launch '{\"program\":\"client\"}'"
   ]
 }
 ```
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp
index a167088c8901..5e351ab11ab6 100644
--- a/lldb/tools/lldb-dap/lldb-dap.cpp
+++ b/lldb/tools/lldb-dap/lldb-dap.cpp
@@ -1889,7 +1889,7 @@ void request_initialize(const llvm::json::Object &request) {
       "lldb-dap", "Commands for managing lldb-dap.");
   if (GetBoolean(arguments, "supportsStartDebuggingRequest", false)) {
     cmd.AddCommand(
-        "startDebugging", new StartDebuggingRequestHandler(),
+        "start-debugging", new StartDebuggingRequestHandler(),
         "Sends a startDebugging request from the debug adapter to the client "
         "to start a child debug session of the same type as the caller.");
   }
--
GitLab


From be0c67c90e045b03b0ffecc06ca6f93e440f48d8 Mon Sep 17 00:00:00 2001
From: Joseph Huber
Date: Tue, 15 Oct 2024 12:31:06 -0700
Subject: [PATCH 016/329] [libc] Remove dependency on `cpp::function` in
 `rpc.h` (#112422)

Summary:
I'm going to attempt to move the `rpc.h` header to a separate folder that we
can install and include outside of `libc`. Before doing this I'm going to try
to trim up the file so there aren't as many things I need to copy to make it
work. This dependency on `cpp::functional` is low-hanging fruit.

I only did it so that I could overload the argument of the work function so
that passing the id was optional in the lambda; that's not a *huge* deal, and
it makes it more explicit I suppose.
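Concretely, every callback now takes the lane id as a second parameter, so the
two `cpp::function`-typed overloads of `invoke_rpc` collapse into a single
function template. A minimal sketch of the calling-convention change
(illustrative only; `port` and `rpc::Buffer` are as in the diffs below):

```cpp
// Before: callbacks came in two shapes, which forced type-erased
// cpp::function overloads of invoke_rpc.
port.send([=](rpc::Buffer *buffer) { buffer->data[0] = size; });

// After: a single shape; callbacks that don't need the lane id ignore it.
port.send([=](rpc::Buffer *buffer, uint32_t /*lane_id*/) {
  buffer->data[0] = size;
});
```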
--- libc/src/__support/GPU/allocator.cpp | 11 +++++----- libc/src/__support/OSUtil/gpu/exit.cpp | 5 +++-- libc/src/__support/OSUtil/gpu/io.cpp | 2 +- libc/src/__support/RPC/rpc.h | 22 ++++--------------- libc/src/gpu/rpc_host_call.cpp | 4 ++-- libc/src/stdio/gpu/clearerr.cpp | 6 ++++-- libc/src/stdio/gpu/fclose.cpp | 5 +++-- libc/src/stdio/gpu/feof.cpp | 8 +++++-- libc/src/stdio/gpu/ferror.cpp | 8 +++++-- libc/src/stdio/gpu/fflush.cpp | 8 +++++-- libc/src/stdio/gpu/fgets.cpp | 2 +- libc/src/stdio/gpu/file.h | 8 +++---- libc/src/stdio/gpu/fopen.cpp | 4 ++-- libc/src/stdio/gpu/fseek.cpp | 6 ++++-- libc/src/stdio/gpu/ftell.cpp | 8 +++++-- libc/src/stdio/gpu/remove.cpp | 5 +++-- libc/src/stdio/gpu/rename.cpp | 5 +++-- libc/src/stdio/gpu/ungetc.cpp | 6 ++++-- libc/src/stdio/gpu/vfprintf_utils.h | 6 +++--- libc/src/stdlib/gpu/abort.cpp | 5 +++-- libc/src/stdlib/gpu/system.cpp | 5 +++-- libc/utils/gpu/server/rpc_server.cpp | 30 +++++++++++++------------- 22 files changed, 92 insertions(+), 77 deletions(-) diff --git a/libc/src/__support/GPU/allocator.cpp b/libc/src/__support/GPU/allocator.cpp index 01273e16a938..f98e61010479 100644 --- a/libc/src/__support/GPU/allocator.cpp +++ b/libc/src/__support/GPU/allocator.cpp @@ -18,17 +18,18 @@ namespace { void *rpc_allocate(uint64_t size) { void *ptr = nullptr; rpc::Client::Port port = rpc::client.open(); - port.send_and_recv([=](rpc::Buffer *buffer) { buffer->data[0] = size; }, - [&](rpc::Buffer *buffer) { - ptr = reinterpret_cast(buffer->data[0]); - }); + port.send_and_recv( + [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = size; }, + [&](rpc::Buffer *buffer, uint32_t) { + ptr = reinterpret_cast(buffer->data[0]); + }); port.close(); return ptr; } void rpc_free(void *ptr) { rpc::Client::Port port = rpc::client.open(); - port.send([=](rpc::Buffer *buffer) { + port.send([=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = reinterpret_cast(ptr); }); port.close(); diff --git a/libc/src/__support/OSUtil/gpu/exit.cpp b/libc/src/__support/OSUtil/gpu/exit.cpp index 360bcca1c6da..8aaa41b4e3ee 100644 --- a/libc/src/__support/OSUtil/gpu/exit.cpp +++ b/libc/src/__support/OSUtil/gpu/exit.cpp @@ -18,8 +18,9 @@ namespace internal { [[noreturn]] void exit(int status) { // We want to first make sure the server is listening before we exit. 
rpc::Client::Port port = rpc::client.open(); - port.send_and_recv([](rpc::Buffer *) {}, [](rpc::Buffer *) {}); - port.send([&](rpc::Buffer *buffer) { + port.send_and_recv([](rpc::Buffer *, uint32_t) {}, + [](rpc::Buffer *, uint32_t) {}); + port.send([&](rpc::Buffer *buffer, uint32_t) { reinterpret_cast(buffer->data)[0] = status; }); port.close(); diff --git a/libc/src/__support/OSUtil/gpu/io.cpp b/libc/src/__support/OSUtil/gpu/io.cpp index f3000bd0f48b..f70c2e798cfe 100644 --- a/libc/src/__support/OSUtil/gpu/io.cpp +++ b/libc/src/__support/OSUtil/gpu/io.cpp @@ -17,7 +17,7 @@ namespace LIBC_NAMESPACE_DECL { void write_to_stderr(cpp::string_view msg) { rpc::Client::Port port = rpc::client.open(); port.send_n(msg.data(), msg.size()); - port.recv([](rpc::Buffer *) { /* void */ }); + port.recv([](rpc::Buffer *, uint32_t) { /* void */ }); port.close(); } diff --git a/libc/src/__support/RPC/rpc.h b/libc/src/__support/RPC/rpc.h index a94b11902c11..c421dd82b294 100644 --- a/libc/src/__support/RPC/rpc.h +++ b/libc/src/__support/RPC/rpc.h @@ -21,7 +21,6 @@ #include "rpc_util.h" #include "src/__support/CPP/algorithm.h" // max #include "src/__support/CPP/atomic.h" -#include "src/__support/CPP/functional.h" #include "src/__support/CPP/optional.h" #include "src/__support/GPU/utils.h" #include "src/__support/macros/config.h" @@ -266,22 +265,9 @@ template struct Process { }; /// Invokes a function accross every active buffer across the total lane size. -static LIBC_INLINE void invoke_rpc(cpp::function fn, - uint32_t lane_size, uint64_t lane_mask, - Buffer *slot) { - if constexpr (is_process_gpu()) { - fn(&slot[gpu::get_lane_id()]); - } else { - for (uint32_t i = 0; i < lane_size; i += gpu::get_lane_size()) - if (lane_mask & (1ul << i)) - fn(&slot[i]); - } -} - -/// Alternate version that also provides the index of the current lane. 
-static LIBC_INLINE void invoke_rpc(cpp::function fn, - uint32_t lane_size, uint64_t lane_mask, - Buffer *slot) { +template +LIBC_INLINE static void invoke_rpc(F &&fn, uint32_t lane_size, + uint64_t lane_mask, Buffer *slot) { if constexpr (is_process_gpu()) { fn(&slot[gpu::get_lane_id()], gpu::get_lane_id()); } else { @@ -444,7 +430,7 @@ template template LIBC_INLINE void Port::recv_and_send(W work) { recv(work); - send([](Buffer *) { /* no-op */ }); + send([](Buffer *, uint32_t) { /* no-op */ }); } /// Helper routine to simplify the interface when sending from the GPU using diff --git a/libc/src/gpu/rpc_host_call.cpp b/libc/src/gpu/rpc_host_call.cpp index f21fadc319c6..1181e9554d16 100644 --- a/libc/src/gpu/rpc_host_call.cpp +++ b/libc/src/gpu/rpc_host_call.cpp @@ -21,11 +21,11 @@ LLVM_LIBC_FUNCTION(unsigned long long, rpc_host_call, (void *fn, void *data, size_t size)) { rpc::Client::Port port = rpc::client.open(); port.send_n(data, size); - port.send([=](rpc::Buffer *buffer) { + port.send([=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = reinterpret_cast(fn); }); unsigned long long ret; - port.recv([&](rpc::Buffer *buffer) { + port.recv([&](rpc::Buffer *buffer, uint32_t) { ret = static_cast(buffer->data[0]); }); port.close(); diff --git a/libc/src/stdio/gpu/clearerr.cpp b/libc/src/stdio/gpu/clearerr.cpp index 5826a7bcb95f..4c631b9f946f 100644 --- a/libc/src/stdio/gpu/clearerr.cpp +++ b/libc/src/stdio/gpu/clearerr.cpp @@ -17,8 +17,10 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(void, clearerr, (::FILE * stream)) { rpc::Client::Port port = rpc::client.open(); port.send_and_recv( - [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); }, - [&](rpc::Buffer *) {}); + [=](rpc::Buffer *buffer, uint32_t) { + buffer->data[0] = file::from_stream(stream); + }, + [&](rpc::Buffer *, uint32_t) {}); port.close(); } diff --git a/libc/src/stdio/gpu/fclose.cpp b/libc/src/stdio/gpu/fclose.cpp index 78caccd90c69..683e0548495d 100644 --- a/libc/src/stdio/gpu/fclose.cpp +++ b/libc/src/stdio/gpu/fclose.cpp @@ -19,8 +19,9 @@ LLVM_LIBC_FUNCTION(int, fclose, (::FILE * stream)) { uint64_t ret = 0; uintptr_t file = reinterpret_cast(stream); rpc::Client::Port port = rpc::client.open(); - port.send_and_recv([=](rpc::Buffer *buffer) { buffer->data[0] = file; }, - [&](rpc::Buffer *buffer) { ret = buffer->data[0]; }); + port.send_and_recv( + [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = file; }, + [&](rpc::Buffer *buffer, uint32_t) { ret = buffer->data[0]; }); port.close(); if (ret != 0) diff --git a/libc/src/stdio/gpu/feof.cpp b/libc/src/stdio/gpu/feof.cpp index 4a8a17332a0a..02adb4ce73d6 100644 --- a/libc/src/stdio/gpu/feof.cpp +++ b/libc/src/stdio/gpu/feof.cpp @@ -18,8 +18,12 @@ LLVM_LIBC_FUNCTION(int, feof, (::FILE * stream)) { int ret; rpc::Client::Port port = rpc::client.open(); port.send_and_recv( - [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); }, - [&](rpc::Buffer *buffer) { ret = static_cast(buffer->data[0]); }); + [=](rpc::Buffer *buffer, uint32_t) { + buffer->data[0] = file::from_stream(stream); + }, + [&](rpc::Buffer *buffer, uint32_t) { + ret = static_cast(buffer->data[0]); + }); port.close(); return ret; } diff --git a/libc/src/stdio/gpu/ferror.cpp b/libc/src/stdio/gpu/ferror.cpp index 1cee96f5ef23..ca777131fd1b 100644 --- a/libc/src/stdio/gpu/ferror.cpp +++ b/libc/src/stdio/gpu/ferror.cpp @@ -18,8 +18,12 @@ LLVM_LIBC_FUNCTION(int, ferror, (::FILE * stream)) { int ret; rpc::Client::Port port = rpc::client.open(); port.send_and_recv( - 
[=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); }, - [&](rpc::Buffer *buffer) { ret = static_cast(buffer->data[0]); }); + [=](rpc::Buffer *buffer, uint32_t) { + buffer->data[0] = file::from_stream(stream); + }, + [&](rpc::Buffer *buffer, uint32_t) { + ret = static_cast(buffer->data[0]); + }); port.close(); return ret; } diff --git a/libc/src/stdio/gpu/fflush.cpp b/libc/src/stdio/gpu/fflush.cpp index be267a2e9ce1..577325b70c4e 100644 --- a/libc/src/stdio/gpu/fflush.cpp +++ b/libc/src/stdio/gpu/fflush.cpp @@ -18,8 +18,12 @@ LLVM_LIBC_FUNCTION(int, fflush, (::FILE * stream)) { int ret; rpc::Client::Port port = rpc::client.open(); port.send_and_recv( - [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); }, - [&](rpc::Buffer *buffer) { ret = static_cast(buffer->data[0]); }); + [=](rpc::Buffer *buffer, uint32_t) { + buffer->data[0] = file::from_stream(stream); + }, + [&](rpc::Buffer *buffer, uint32_t) { + ret = static_cast(buffer->data[0]); + }); port.close(); return ret; } diff --git a/libc/src/stdio/gpu/fgets.cpp b/libc/src/stdio/gpu/fgets.cpp index 942f6f0ff03b..fbc1b0cf7d1a 100644 --- a/libc/src/stdio/gpu/fgets.cpp +++ b/libc/src/stdio/gpu/fgets.cpp @@ -27,7 +27,7 @@ LLVM_LIBC_FUNCTION(char *, fgets, uint64_t recv_size; void *buf = nullptr; rpc::Client::Port port = rpc::client.open(); - port.send([=](rpc::Buffer *buffer) { + port.send([=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = count; buffer->data[1] = file::from_stream(stream); }); diff --git a/libc/src/stdio/gpu/file.h b/libc/src/stdio/gpu/file.h index 0856a3430803..16d64e8f3775 100644 --- a/libc/src/stdio/gpu/file.h +++ b/libc/src/stdio/gpu/file.h @@ -55,13 +55,13 @@ LIBC_INLINE uint64_t write_impl(::FILE *file, const void *data, size_t size) { rpc::Client::Port port = rpc::client.open(); if constexpr (opcode == RPC_WRITE_TO_STREAM) { - port.send([&](rpc::Buffer *buffer) { + port.send([&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = reinterpret_cast(file); }); } port.send_n(data, size); - port.recv([&](rpc::Buffer *buffer) { + port.recv([&](rpc::Buffer *buffer, uint32_t) { ret = reinterpret_cast(buffer->data)[0]; }); port.close(); @@ -81,12 +81,12 @@ LIBC_INLINE uint64_t read_from_stream(::FILE *file, void *buf, size_t size) { uint64_t ret = 0; uint64_t recv_size; rpc::Client::Port port = rpc::client.open(); - port.send([=](rpc::Buffer *buffer) { + port.send([=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = size; buffer->data[1] = from_stream(file); }); port.recv_n(&buf, &recv_size, [&](uint64_t) { return buf; }); - port.recv([&](rpc::Buffer *buffer) { ret = buffer->data[0]; }); + port.recv([&](rpc::Buffer *buffer, uint32_t) { ret = buffer->data[0]; }); port.close(); return ret; } diff --git a/libc/src/stdio/gpu/fopen.cpp b/libc/src/stdio/gpu/fopen.cpp index 76daece68ac9..e165d2acd210 100644 --- a/libc/src/stdio/gpu/fopen.cpp +++ b/libc/src/stdio/gpu/fopen.cpp @@ -21,10 +21,10 @@ LLVM_LIBC_FUNCTION(::FILE *, fopen, rpc::Client::Port port = rpc::client.open(); port.send_n(path, internal::string_length(path) + 1); port.send_and_recv( - [=](rpc::Buffer *buffer) { + [=](rpc::Buffer *buffer, uint32_t) { inline_memcpy(buffer->data, mode, internal::string_length(mode) + 1); }, - [&](rpc::Buffer *buffer) { file = buffer->data[0]; }); + [&](rpc::Buffer *buffer, uint32_t) { file = buffer->data[0]; }); port.close(); return reinterpret_cast(file); diff --git a/libc/src/stdio/gpu/fseek.cpp b/libc/src/stdio/gpu/fseek.cpp index 4f3e9ce6ec02..37c40bc602d8 100644 --- 
a/libc/src/stdio/gpu/fseek.cpp +++ b/libc/src/stdio/gpu/fseek.cpp @@ -18,12 +18,14 @@ LLVM_LIBC_FUNCTION(int, fseek, (::FILE * stream, long offset, int whence)) { int ret; rpc::Client::Port port = rpc::client.open(); port.send_and_recv( - [=](rpc::Buffer *buffer) { + [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = file::from_stream(stream); buffer->data[1] = static_cast(offset); buffer->data[2] = static_cast(whence); }, - [&](rpc::Buffer *buffer) { ret = static_cast(buffer->data[0]); }); + [&](rpc::Buffer *buffer, uint32_t) { + ret = static_cast(buffer->data[0]); + }); port.close(); return ret; } diff --git a/libc/src/stdio/gpu/ftell.cpp b/libc/src/stdio/gpu/ftell.cpp index 483b1ad4fee0..226aeda2f8de 100644 --- a/libc/src/stdio/gpu/ftell.cpp +++ b/libc/src/stdio/gpu/ftell.cpp @@ -18,8 +18,12 @@ LLVM_LIBC_FUNCTION(long, ftell, (::FILE * stream)) { long ret; rpc::Client::Port port = rpc::client.open(); port.send_and_recv( - [=](rpc::Buffer *buffer) { buffer->data[0] = file::from_stream(stream); }, - [&](rpc::Buffer *buffer) { ret = static_cast(buffer->data[0]); }); + [=](rpc::Buffer *buffer, uint32_t) { + buffer->data[0] = file::from_stream(stream); + }, + [&](rpc::Buffer *buffer, uint32_t) { + ret = static_cast(buffer->data[0]); + }); port.close(); return ret; } diff --git a/libc/src/stdio/gpu/remove.cpp b/libc/src/stdio/gpu/remove.cpp index 3f21e8aeff5a..6604be1c31f2 100644 --- a/libc/src/stdio/gpu/remove.cpp +++ b/libc/src/stdio/gpu/remove.cpp @@ -18,8 +18,9 @@ LLVM_LIBC_FUNCTION(int, remove, (const char *path)) { int ret; rpc::Client::Port port = rpc::client.open(); port.send_n(path, internal::string_length(path) + 1); - port.recv( - [&](rpc::Buffer *buffer) { ret = static_cast(buffer->data[0]); }); + port.recv([&](rpc::Buffer *buffer, uint32_t) { + ret = static_cast(buffer->data[0]); + }); port.close(); return ret; } diff --git a/libc/src/stdio/gpu/rename.cpp b/libc/src/stdio/gpu/rename.cpp index 108722883584..e6396e212b8b 100644 --- a/libc/src/stdio/gpu/rename.cpp +++ b/libc/src/stdio/gpu/rename.cpp @@ -20,8 +20,9 @@ LLVM_LIBC_FUNCTION(int, rename, (const char *oldpath, const char *newpath)) { rpc::Client::Port port = rpc::client.open(); port.send_n(oldpath, internal::string_length(oldpath) + 1); port.send_n(newpath, internal::string_length(newpath) + 1); - port.recv( - [&](rpc::Buffer *buffer) { ret = static_cast(buffer->data[0]); }); + port.recv([&](rpc::Buffer *buffer, uint32_t) { + ret = static_cast(buffer->data[0]); + }); port.close(); return ret; diff --git a/libc/src/stdio/gpu/ungetc.cpp b/libc/src/stdio/gpu/ungetc.cpp index e9232a5e43a2..dce14391b7de 100644 --- a/libc/src/stdio/gpu/ungetc.cpp +++ b/libc/src/stdio/gpu/ungetc.cpp @@ -18,11 +18,13 @@ LLVM_LIBC_FUNCTION(int, ungetc, (int c, ::FILE *stream)) { int ret; rpc::Client::Port port = rpc::client.open(); port.send_and_recv( - [=](rpc::Buffer *buffer) { + [=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = c; buffer->data[1] = file::from_stream(stream); }, - [&](rpc::Buffer *buffer) { ret = static_cast(buffer->data[0]); }); + [&](rpc::Buffer *buffer, uint32_t) { + ret = static_cast(buffer->data[0]); + }); port.close(); return ret; } diff --git a/libc/src/stdio/gpu/vfprintf_utils.h b/libc/src/stdio/gpu/vfprintf_utils.h index 7c012d139ba5..93ce1649869f 100644 --- a/libc/src/stdio/gpu/vfprintf_utils.h +++ b/libc/src/stdio/gpu/vfprintf_utils.h @@ -23,14 +23,14 @@ LIBC_INLINE int vfprintf_impl(::FILE *__restrict file, if constexpr (opcode == RPC_PRINTF_TO_STREAM || opcode == RPC_PRINTF_TO_STREAM_PACKED) { - 
port.send([&](rpc::Buffer *buffer) { + port.send([&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = reinterpret_cast(file); }); } size_t args_size = 0; port.send_n(format, format_size); - port.recv([&](rpc::Buffer *buffer) { + port.recv([&](rpc::Buffer *buffer, uint32_t) { args_size = static_cast(buffer->data[0]); }); port.send_n(vlist, args_size); @@ -38,7 +38,7 @@ LIBC_INLINE int vfprintf_impl(::FILE *__restrict file, uint32_t ret = 0; for (;;) { const char *str = nullptr; - port.recv([&](rpc::Buffer *buffer) { + port.recv([&](rpc::Buffer *buffer, uint32_t) { ret = static_cast(buffer->data[0]); str = reinterpret_cast(buffer->data[1]); }); diff --git a/libc/src/stdlib/gpu/abort.cpp b/libc/src/stdlib/gpu/abort.cpp index fee198607cc0..cfc7e9b8e228 100644 --- a/libc/src/stdlib/gpu/abort.cpp +++ b/libc/src/stdlib/gpu/abort.cpp @@ -17,8 +17,9 @@ namespace LIBC_NAMESPACE_DECL { LLVM_LIBC_FUNCTION(void, abort, ()) { // We want to first make sure the server is listening before we abort. rpc::Client::Port port = rpc::client.open(); - port.send_and_recv([](rpc::Buffer *) {}, [](rpc::Buffer *) {}); - port.send([&](rpc::Buffer *) {}); + port.send_and_recv([](rpc::Buffer *, uint32_t) {}, + [](rpc::Buffer *, uint32_t) {}); + port.send([&](rpc::Buffer *, uint32_t) {}); port.close(); gpu::end_program(); diff --git a/libc/src/stdlib/gpu/system.cpp b/libc/src/stdlib/gpu/system.cpp index acf3a8c941ff..1890006512de 100644 --- a/libc/src/stdlib/gpu/system.cpp +++ b/libc/src/stdlib/gpu/system.cpp @@ -19,8 +19,9 @@ LLVM_LIBC_FUNCTION(int, system, (const char *command)) { int ret; rpc::Client::Port port = rpc::client.open(); port.send_n(command, internal::string_length(command) + 1); - port.recv( - [&](rpc::Buffer *buffer) { ret = static_cast(buffer->data[0]); }); + port.recv([&](rpc::Buffer *buffer, uint32_t) { + ret = static_cast(buffer->data[0]); + }); port.close(); return ret; diff --git a/libc/utils/gpu/server/rpc_server.cpp b/libc/utils/gpu/server/rpc_server.cpp index ca10e67509ae..11b6d0e27ab9 100644 --- a/libc/utils/gpu/server/rpc_server.cpp +++ b/libc/utils/gpu/server/rpc_server.cpp @@ -302,8 +302,8 @@ rpc_status_t handle_server_impl( } case RPC_EXIT: { // Send a response to the client to signal that we are ready to exit. - port->recv_and_send([](rpc::Buffer *) {}); - port->recv([](rpc::Buffer *buffer) { + port->recv_and_send([](rpc::Buffer *, uint32_t) {}); + port->recv([](rpc::Buffer *buffer, uint32_t) { int status = 0; std::memcpy(&status, buffer->data, sizeof(int)); exit(status); @@ -312,8 +312,8 @@ rpc_status_t handle_server_impl( } case RPC_ABORT: { // Send a response to the client to signal that we are ready to abort. 
- port->recv_and_send([](rpc::Buffer *) {}); - port->recv([](rpc::Buffer *) {}); + port->recv_and_send([](rpc::Buffer *, uint32_t) {}); + port->recv([](rpc::Buffer *, uint32_t) {}); abort(); break; } @@ -334,25 +334,25 @@ rpc_status_t handle_server_impl( break; } case RPC_FEOF: { - port->recv_and_send([](rpc::Buffer *buffer) { + port->recv_and_send([](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = feof(file::to_stream(buffer->data[0])); }); break; } case RPC_FERROR: { - port->recv_and_send([](rpc::Buffer *buffer) { + port->recv_and_send([](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = ferror(file::to_stream(buffer->data[0])); }); break; } case RPC_CLEARERR: { - port->recv_and_send([](rpc::Buffer *buffer) { + port->recv_and_send([](rpc::Buffer *buffer, uint32_t) { clearerr(file::to_stream(buffer->data[0])); }); break; } case RPC_FSEEK: { - port->recv_and_send([](rpc::Buffer *buffer) { + port->recv_and_send([](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = fseek(file::to_stream(buffer->data[0]), static_cast(buffer->data[1]), static_cast(buffer->data[2])); @@ -360,19 +360,19 @@ rpc_status_t handle_server_impl( break; } case RPC_FTELL: { - port->recv_and_send([](rpc::Buffer *buffer) { + port->recv_and_send([](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = ftell(file::to_stream(buffer->data[0])); }); break; } case RPC_FFLUSH: { - port->recv_and_send([](rpc::Buffer *buffer) { + port->recv_and_send([](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = fflush(file::to_stream(buffer->data[0])); }); break; } case RPC_UNGETC: { - port->recv_and_send([](rpc::Buffer *buffer) { + port->recv_and_send([](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = ungetc(static_cast(buffer->data[0]), file::to_stream(buffer->data[1])); }); @@ -429,7 +429,7 @@ rpc_status_t handle_server_impl( break; } case RPC_NOOP: { - port->recv([](rpc::Buffer *) {}); + port->recv([](rpc::Buffer *, uint32_t) {}); break; } default: { @@ -552,7 +552,7 @@ uint64_t rpc_get_client_size() { return sizeof(rpc::Client); } void rpc_send(rpc_port_t ref, rpc_port_callback_ty callback, void *data) { auto port = reinterpret_cast(ref.handle); - port->send([=](rpc::Buffer *buffer) { + port->send([=](rpc::Buffer *buffer, uint32_t) { callback(reinterpret_cast(buffer), data); }); } @@ -564,7 +564,7 @@ void rpc_send_n(rpc_port_t ref, const void *const *src, uint64_t *size) { void rpc_recv(rpc_port_t ref, rpc_port_callback_ty callback, void *data) { auto port = reinterpret_cast(ref.handle); - port->recv([=](rpc::Buffer *buffer) { + port->recv([=](rpc::Buffer *buffer, uint32_t) { callback(reinterpret_cast(buffer), data); }); } @@ -579,7 +579,7 @@ void rpc_recv_n(rpc_port_t ref, void **dst, uint64_t *size, rpc_alloc_ty alloc, void rpc_recv_and_send(rpc_port_t ref, rpc_port_callback_ty callback, void *data) { auto port = reinterpret_cast(ref.handle); - port->recv_and_send([=](rpc::Buffer *buffer) { + port->recv_and_send([=](rpc::Buffer *buffer, uint32_t) { callback(reinterpret_cast(buffer), data); }); } -- GitLab From 2a46e5d03985620cbc55ed9839a263dc9646c240 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Tue, 15 Oct 2024 20:50:28 +0100 Subject: [PATCH 017/329] [VPlan] Implement VPInterleaveRecipe::computeCost. (#106067) Implement computing costs for VPInterleaveRecipe. 
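In outline, the recipe now queries TTI directly instead of falling back to the
legacy cost model (a condensed sketch of the logic added in the diff below):

```cpp
// Cost one wide memory operation over VF * InterleaveFactor elements,
// restricted to the members actually present in the group...
InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost(
    I->getOpcode(), WideVecTy, IG->getFactor(), Indices, IG->getAlign(), AS,
    CostKind, getMask(), NeedsMaskForGaps);

// ...plus one reverse shuffle per member when the group is reversed.
if (IG->isReverse())
  Cost += IG->getNumMembers() *
          Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, VectorTy,
                                 std::nullopt, CostKind, 0);
```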
PR: https://github.com/llvm/llvm-project/pull/106067 --- .../lib/Transforms/Vectorize/VPlanRecipes.cpp | 28 ++++++++++++++++++- 1 file changed, 27 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 368d6e58a557..b3befce6c92c 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -2941,7 +2941,33 @@ void VPInterleaveRecipe::print(raw_ostream &O, const Twine &Indent, InstructionCost VPInterleaveRecipe::computeCost(ElementCount VF, VPCostContext &Ctx) const { - return Ctx.getLegacyCost(IG->getInsertPos(), VF); + Instruction *I = getInsertPos(); + Type *ValTy = Ctx.Types.inferScalarType( + getNumDefinedValues() > 0 ? getVPValue(0) : getStoredValues()[0]); + auto *VectorTy = cast(ToVectorTy(ValTy, VF)); + unsigned AS = getLoadStoreAddressSpace(I); + enum TTI::TargetCostKind CostKind = TTI::TCK_RecipThroughput; + + unsigned InterleaveFactor = IG->getFactor(); + auto *WideVecTy = VectorType::get(ValTy, VF * InterleaveFactor); + + // Holds the indices of existing members in the interleaved group. + SmallVector Indices; + for (unsigned IF = 0; IF < InterleaveFactor; IF++) + if (IG->getMember(IF)) + Indices.push_back(IF); + + // Calculate the cost of the whole interleaved group. + InstructionCost Cost = Ctx.TTI.getInterleavedMemoryOpCost( + I->getOpcode(), WideVecTy, IG->getFactor(), Indices, IG->getAlign(), AS, + CostKind, getMask(), NeedsMaskForGaps); + + if (!IG->isReverse()) + return Cost; + + return Cost + IG->getNumMembers() * + Ctx.TTI.getShuffleCost(TargetTransformInfo::SK_Reverse, + VectorTy, std::nullopt, CostKind, 0); } void VPCanonicalIVPHIRecipe::execute(VPTransformState &State) { -- GitLab From d1a47915d0f44d7392de1665dbb99cfceec907a5 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Tue, 15 Oct 2024 12:50:44 -0700 Subject: [PATCH 018/329] [Clang][TableGen] Use const pointers for various `Init *` pointers in SA checker emitter (#112321) Use const pointers for various Init objects in SA checker emitter. 
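The change is mechanical and identical in every hunk: a pointer obtained from a
`Record` query is bound as a pointer-to-const. Taking the first function
touched below as the example (template arguments spelled out for clarity):

```cpp
static std::string getParentPackageFullName(const Record *R,
                                            StringRef Sep = ".") {
  std::string name;
  // Was: DefInit *DI = dyn_cast<DefInit>(...)
  if (const DefInit *DI = dyn_cast<DefInit>(R->getValueInit("ParentPackage")))
    name = getPackageFullName(DI->getDef(), Sep);
  return name;
}
```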
This is part of an effort to improve const correctness in TableGen backends:
https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089
---
 clang/utils/TableGen/ClangSACheckersEmitter.cpp | 10 +++++-----
 1 file changed, 5 insertions(+), 5 deletions(-)

diff --git a/clang/utils/TableGen/ClangSACheckersEmitter.cpp b/clang/utils/TableGen/ClangSACheckersEmitter.cpp
index bebdcac32126..36012dbf7079 100644
--- a/clang/utils/TableGen/ClangSACheckersEmitter.cpp
+++ b/clang/utils/TableGen/ClangSACheckersEmitter.cpp
@@ -29,7 +29,7 @@ static std::string getPackageFullName(const Record *R, StringRef Sep = ".");
 static std::string getParentPackageFullName(const Record *R,
                                             StringRef Sep = ".") {
   std::string name;
-  if (DefInit *DI = dyn_cast(R->getValueInit("ParentPackage")))
+  if (const DefInit *DI = dyn_cast(R->getValueInit("ParentPackage")))
     name = getPackageFullName(DI->getDef(), Sep);
   return name;
 }
@@ -53,7 +53,7 @@ static std::string getCheckerFullName(const Record *R, StringRef Sep = ".") {
 }
 
 static std::string getStringValue(const Record &R, StringRef field) {
-  if (StringInit *SI = dyn_cast(R.getValueInit(field)))
+  if (const StringInit *SI = dyn_cast(R.getValueInit(field)))
     return std::string(SI->getValue());
   return std::string();
 }
@@ -94,7 +94,7 @@ static std::string getCheckerDocs(const Record &R) {
 /// the class itself has to be modified for adding a new option type in
 /// CheckerBase.td.
 static std::string getCheckerOptionType(const Record &R) {
-  if (BitsInit *BI = R.getValueAsBitsInit("Type")) {
+  if (const BitsInit *BI = R.getValueAsBitsInit("Type")) {
     switch(getValueFromBitsInit(BI, R)) {
     case 0:
       return "int";
@@ -111,7 +111,7 @@ static std::string getCheckerOptionType(const Record &R) {
 }
 
 static std::string getDevelopmentStage(const Record &R) {
-  if (BitsInit *BI = R.getValueAsBitsInit("DevelopmentStage")) {
+  if (const BitsInit *BI = R.getValueAsBitsInit("DevelopmentStage")) {
     switch(getValueFromBitsInit(BI, R)) {
     case 0:
       return "alpha";
@@ -131,7 +131,7 @@ static bool isHidden(const Record *R) {
     return true;
 
   // Not declared as hidden, check the parent package if it is hidden.
- if (DefInit *DI = dyn_cast(R->getValueInit("ParentPackage"))) + if (const DefInit *DI = dyn_cast(R->getValueInit("ParentPackage"))) return isHidden(DI->getDef()); return false; -- GitLab From 685bec722f008ae26593a5ebe3d58ca8e5c4a7c2 Mon Sep 17 00:00:00 2001 From: Alexey Bataev Date: Tue, 15 Oct 2024 12:59:44 -0700 Subject: [PATCH 019/329] Revert "[SLP]Initial non-power-of-2 support (but still whole register) for reductions" This reverts commit 8287fa8e596d8fc8655c8df3bc99e068ad9f7d4b to investigate and fix compile time regressions reported by https://llvm-compile-time-tracker.com/compare.php?from=ec78f0da0e9b1b8e2b2323e434ea742e272dd913&to=8287fa8e596d8fc8655c8df3bc99e068ad9f7d4b&stat=instructions:u --- .../Transforms/Vectorize/SLPVectorizer.cpp | 20 ++++-------- .../SLPVectorizer/X86/horizontal-list.ll | 32 ++++++++++++++----- 2 files changed, 31 insertions(+), 21 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 84d77f917bbb..336126cc1fbc 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -291,8 +291,6 @@ getFloorFullVectorNumberOfElements(const TargetTransformInfo &TTI, Type *Ty, if (NumParts == 0 || NumParts >= Sz) return bit_floor(Sz); unsigned RegVF = bit_ceil(divideCeil(Sz, NumParts)); - if (RegVF > Sz) - return bit_floor(Sz); return (Sz / RegVF) * RegVF; } @@ -19073,8 +19071,7 @@ public: unsigned ReduxWidth = NumReducedVals; if (!VectorizeNonPowerOf2 || !has_single_bit(ReduxWidth + 1)) - ReduxWidth = getFloorFullVectorNumberOfElements( - *TTI, Candidates.front()->getType(), ReduxWidth); + ReduxWidth = bit_floor(ReduxWidth); ReduxWidth = std::min(ReduxWidth, MaxElts); unsigned Start = 0; @@ -19082,7 +19079,10 @@ public: // Restarts vectorization attempt with lower vector factor. unsigned PrevReduxWidth = ReduxWidth; bool CheckForReusedReductionOpsLocal = false; - auto AdjustReducedVals = [&](bool IgnoreVL = false) { + auto &&AdjustReducedVals = [&Pos, &Start, &ReduxWidth, NumReducedVals, + &CheckForReusedReductionOpsLocal, + &PrevReduxWidth, &V, + &IgnoreList](bool IgnoreVL = false) { bool IsAnyRedOpGathered = !IgnoreVL && V.isAnyGathered(IgnoreList); if (!CheckForReusedReductionOpsLocal && PrevReduxWidth == ReduxWidth) { // Check if any of the reduction ops are gathered. 
If so, worth @@ -19093,10 +19093,7 @@ public: if (Pos < NumReducedVals - ReduxWidth + 1) return IsAnyRedOpGathered; Pos = Start; - --ReduxWidth; - if (ReduxWidth > 1) - ReduxWidth = getFloorFullVectorNumberOfElements( - *TTI, Candidates.front()->getType(), ReduxWidth); + ReduxWidth = bit_ceil(ReduxWidth) / 2; return IsAnyRedOpGathered; }; bool AnyVectorized = false; @@ -19328,10 +19325,7 @@ public: } Pos += ReduxWidth; Start = Pos; - ReduxWidth = NumReducedVals - Pos; - if (ReduxWidth > 1) - ReduxWidth = getFloorFullVectorNumberOfElements( - *TTI, Candidates.front()->getType(), NumReducedVals - Pos); + ReduxWidth = llvm::bit_floor(NumReducedVals - Pos); AnyVectorized = true; } if (OptReusedScalars && !AnyVectorized) { diff --git a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll index c9ff2d6426d2..72e29839230e 100644 --- a/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll +++ b/llvm/test/Transforms/SLPVectorizer/X86/horizontal-list.ll @@ -318,14 +318,22 @@ entry: define float @f(ptr nocapture readonly %x) { ; CHECK-LABEL: @f( ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4 -; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 +; CHECK-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32 +; CHECK-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] ; CHECK-NEXT: ret float [[OP_RDX]] ; ; THRESHOLD-LABEL: @f( ; THRESHOLD-NEXT: entry: -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <48 x float>, ptr [[X:%.*]], align 4 -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v48f32(float 0.000000e+00, <48 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <32 x float>, ptr [[X:%.*]], align 4 +; THRESHOLD-NEXT: [[ARRAYIDX_32:%.*]] = getelementptr inbounds float, ptr [[X]], i64 32 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <16 x float>, ptr [[ARRAYIDX_32]], align 4 +; THRESHOLD-NEXT: [[TMP2:%.*]] = call fast float @llvm.vector.reduce.fadd.v32f32(float 0.000000e+00, <32 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP3:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP2]], [[TMP3]] ; THRESHOLD-NEXT: ret float [[OP_RDX]] ; entry: @@ -598,14 +606,18 @@ define float @loadadd31(ptr nocapture readonly %x) { ; CHECK-LABEL: @loadadd31( ; CHECK-NEXT: entry: ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; CHECK-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17 +; CHECK-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 ; CHECK-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 ; CHECK-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 ; CHECK-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 ; 
CHECK-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; CHECK-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; CHECK-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; CHECK-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) +; CHECK-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; CHECK-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] ; CHECK-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) ; CHECK-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] ; CHECK-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] @@ -615,14 +627,18 @@ define float @loadadd31(ptr nocapture readonly %x) { ; THRESHOLD-LABEL: @loadadd31( ; THRESHOLD-NEXT: entry: ; THRESHOLD-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[X:%.*]], i64 1 -; THRESHOLD-NEXT: [[TMP0:%.*]] = load <24 x float>, ptr [[ARRAYIDX]], align 4 +; THRESHOLD-NEXT: [[TMP0:%.*]] = load <16 x float>, ptr [[ARRAYIDX]], align 4 +; THRESHOLD-NEXT: [[ARRAYIDX_16:%.*]] = getelementptr inbounds float, ptr [[X]], i64 17 +; THRESHOLD-NEXT: [[TMP1:%.*]] = load <8 x float>, ptr [[ARRAYIDX_16]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_24:%.*]] = getelementptr inbounds float, ptr [[X]], i64 25 ; THRESHOLD-NEXT: [[TMP2:%.*]] = load <4 x float>, ptr [[ARRAYIDX_24]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_28:%.*]] = getelementptr inbounds float, ptr [[X]], i64 29 ; THRESHOLD-NEXT: [[TMP3:%.*]] = load float, ptr [[ARRAYIDX_28]], align 4 ; THRESHOLD-NEXT: [[ARRAYIDX_29:%.*]] = getelementptr inbounds float, ptr [[X]], i64 30 ; THRESHOLD-NEXT: [[TMP4:%.*]] = load float, ptr [[ARRAYIDX_29]], align 4 -; THRESHOLD-NEXT: [[OP_RDX:%.*]] = call fast float @llvm.vector.reduce.fadd.v24f32(float 0.000000e+00, <24 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP5:%.*]] = call fast float @llvm.vector.reduce.fadd.v16f32(float 0.000000e+00, <16 x float> [[TMP0]]) +; THRESHOLD-NEXT: [[TMP6:%.*]] = call fast float @llvm.vector.reduce.fadd.v8f32(float 0.000000e+00, <8 x float> [[TMP1]]) +; THRESHOLD-NEXT: [[OP_RDX:%.*]] = fadd fast float [[TMP5]], [[TMP6]] ; THRESHOLD-NEXT: [[TMP7:%.*]] = call fast float @llvm.vector.reduce.fadd.v4f32(float 0.000000e+00, <4 x float> [[TMP2]]) ; THRESHOLD-NEXT: [[OP_RDX1:%.*]] = fadd fast float [[OP_RDX]], [[TMP7]] ; THRESHOLD-NEXT: [[OP_RDX2:%.*]] = fadd fast float [[OP_RDX1]], [[TMP3]] -- GitLab From 46200fcf941d16bc8a494a3915e1178502e37a3e Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Tue, 15 Oct 2024 13:07:20 -0700 Subject: [PATCH 020/329] [libc] fix -Wmissing-attributes in setjmp (#112415) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: llvm-project/libc/src/setjmp/x86_64/setjmp.cpp:21:25: error: ‘int __llvm_libc_19_0_0_git::setjmp(__jmp_buf*)’ specifies less restrictive attribute than its target ‘int __llvm_libc_19_0_0_git::__setjmp_impl__(__jmp_buf*)’: ‘nothrow’ [-Werror=missing-attributes] 21 | LLVM_LIBC_FUNCTION(int, setjmp, (__jmp_buf * buf)) { | ^~~~~~ observed in the GCC build by manually expanding LLVM_LIBC_FUNCTION to add `gnu::nothrow` to the alias. We probably need to revisit adding nothrow throughout our declarations, so there is probably a better way to clean this up in the future. 
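A reduced illustration of the GCC behavior, independent of libc (hypothetical
names; `naked` is only accepted by GCC on some targets, where it implies
`nothrow`):

```c
__attribute__((naked)) int impl(void) { __asm__("ret"); }

/* GCC: 'wrapper' specifies less restrictive attribute than its target
   'impl': 'nothrow' [-Wmissing-attributes] */
int wrapper(void) __attribute__((alias("impl")));

/* Declaring the alias nothrow as well keeps the attributes in sync. */
int fixed(void) __attribute__((alias("impl"), nothrow));
```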
Link: #88054
---
 libc/src/setjmp/setjmp_impl.h     | 13 +++++++++++++
 libc/src/setjmp/x86_64/setjmp.cpp |  2 +-
 2 files changed, 14 insertions(+), 1 deletion(-)

diff --git a/libc/src/setjmp/setjmp_impl.h b/libc/src/setjmp/setjmp_impl.h
index 4175a7397ae1..d035409e5819 100644
--- a/libc/src/setjmp/setjmp_impl.h
+++ b/libc/src/setjmp/setjmp_impl.h
@@ -13,9 +13,22 @@
 // public header setjmp.h which is also included. here.
 #include "hdr/types/jmp_buf.h"
 #include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/compiler.h"
 
 namespace LIBC_NAMESPACE_DECL {
 
+// TODO(https://github.com/llvm/llvm-project/issues/112427)
+// Some of the architecture-specific definitions are marked `naked`, which in
+// GCC implies `nothrow`.
+//
+// Right now, our aliases aren't marked `nothrow`, so we wind up in a situation
+// where clang will emit -Wmissing-exception-spec if we add `nothrow` here, but
+// GCC will emit -Wmissing-attributes here without `nothrow`. We need to update
+// LLVM_LIBC_FUNCTION to denote when a function throws or not.
+
+#ifdef LIBC_COMPILER_IS_GCC
+[[gnu::nothrow]]
+#endif
 int setjmp(jmp_buf buf);
 
 } // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/setjmp/x86_64/setjmp.cpp b/libc/src/setjmp/x86_64/setjmp.cpp
index c9ca578fb1e6..f6e82642edd7 100644
--- a/libc/src/setjmp/x86_64/setjmp.cpp
+++ b/libc/src/setjmp/x86_64/setjmp.cpp
@@ -18,7 +18,7 @@ namespace LIBC_NAMESPACE_DECL {
 
 [[gnu::naked]]
-LLVM_LIBC_FUNCTION(int, setjmp, (__jmp_buf * buf)) {
+LLVM_LIBC_FUNCTION(int, setjmp, (jmp_buf buf)) {
   asm(R"(
       mov %%rbx, %c[rbx](%%rdi)
       mov %%rbp, %c[rbp](%%rdi)
--
GitLab


From de7f7ea884525cca24e8797319452bd8bc150752 Mon Sep 17 00:00:00 2001
From: Dmitry Vasilyev
Date: Wed, 16 Oct 2024 00:08:58 +0400
Subject: [PATCH 021/329] [lldb][test] Fix TestStdCXXDisassembly test (#112357)

The patch #98694 was not enough. This test still fails on the buildbot
https://lab.llvm.org/staging/#/builders/195/builds/4438

Use `USE_LIBSTDCPP := 1` instead for non-Darwin OSes and skip the test
if libstdc++.so is missing.
---
 lldb/test/API/lang/cpp/stl/Makefile                 | 6 +++++-
 lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py | 5 ++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/lldb/test/API/lang/cpp/stl/Makefile b/lldb/test/API/lang/cpp/stl/Makefile
index 8534fa9b0020..4408691f01b7 100644
--- a/lldb/test/API/lang/cpp/stl/Makefile
+++ b/lldb/test/API/lang/cpp/stl/Makefile
@@ -1,5 +1,9 @@
 CXX_SOURCES := main.cpp
 
-USE_SYSTEM_STDLIB := 1
+ifneq ($(OS),Darwin)
+  USE_LIBSTDCPP := 1
+else
+  USE_SYSTEM_STDLIB := 1
+endif
 
 include Makefile.rules
diff --git a/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py b/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py
index 8676ee16d83c..06f338b3ed1d 100644
--- a/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py
+++ b/lldb/test/API/lang/cpp/stl/TestStdCXXDisassembly.py
@@ -43,7 +43,10 @@ class StdCXXDisassembleTestCase(TestBase):
 
         # At this point, lib_stdcxx is the full path to the stdc++ library and
         # module is the corresponding SBModule.
-        self.expect(lib_stdcxx, "Library StdC++ is located", exe=False, substrs=["lib"])
+        if "lib" not in lib_stdcxx:
+            self.skipTest(
+                "This test requires libstdc++.so or libc++.dylib in the target's module list."
+ ) self.runCmd("image dump symtab '%s'" % lib_stdcxx) raw_output = self.res.GetOutput() -- GitLab From 8da5aa16f65bc297663573bacd3030f975b9fcde Mon Sep 17 00:00:00 2001 From: SJW <48454132+sjw36@users.noreply.github.com> Date: Tue, 15 Oct 2024 15:13:49 -0500 Subject: [PATCH 022/329] [mlir][SCF] Fix dynamic loop pipeline peeling for num_stages > total_iters (#112418) When pipelining an `scf.for` with dynamic loop bounds, the epilogue ramp-down must align with the prologue when num_stages > total_iterations. For example: ``` scf.for (0..ub) { load(i) add(i) store(i) } ``` When num_stages=3 the pipeline follows: ``` load(0) - add(0) - scf.for (0..ub-2) - store(ub-2) load(1) - - add(ub-1) - store(ub-1) ``` The trailing `store(ub-2)`, `i=ub-2`, must align with the ramp-up for `i=0` when `ub < num_stages-1`, so the index `i` should be `max(0, ub-2)` and each subsequent index is an increment. The predicate must also handle this scenario, so it becomes `predicate[0] = total_iterations > epilogue_stage`. --- .../Dialect/SCF/Transforms/LoopPipelining.cpp | 53 +++++++++------ mlir/test/Dialect/SCF/loop-pipelining.mlir | 67 ++++++++++--------- 2 files changed, 65 insertions(+), 55 deletions(-) diff --git a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp index 83c9cf69ba03..1b458f410af6 100644 --- a/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp +++ b/mlir/lib/Dialect/SCF/Transforms/LoopPipelining.cpp @@ -642,22 +642,25 @@ LogicalResult LoopPipelinerInternal::emitEpilogue(RewriterBase &rewriter, llvm::SmallVector &returnValues) { Location loc = forOp.getLoc(); + Type t = lb.getType(); + // Emit different versions of the induction variable. They will be // removed by dead code if not used. - // bounds_range = ub - lb - // total_iterations = (bounds_range + step - 1) / step - Type t = lb.getType(); - Value zero = - rewriter.create(loc, rewriter.getIntegerAttr(t, 0)); - Value one = - rewriter.create(loc, rewriter.getIntegerAttr(t, 1)); - Value minusOne = - rewriter.create(loc, rewriter.getIntegerAttr(t, -1)); + auto createConst = [&](int v) { + return rewriter.create(loc, + rewriter.getIntegerAttr(t, v)); + }; + + // total_iterations = cdiv(range_diff, step); + // - range_diff = ub - lb + // - total_iterations = (range_diff + step + (step < 0 ? 1 : -1)) / step + Value zero = createConst(0); + Value one = createConst(1); Value stepLessZero = rewriter.create( loc, arith::CmpIPredicate::slt, step, zero); Value stepDecr = - rewriter.create(loc, stepLessZero, one, minusOne); + rewriter.create(loc, stepLessZero, one, createConst(-1)); Value rangeDiff = rewriter.create(loc, ub, lb); Value rangeIncrStep = rewriter.create(loc, rangeDiff, step); @@ -665,25 +668,31 @@ LoopPipelinerInternal::emitEpilogue(RewriterBase &rewriter, rewriter.create(loc, rangeIncrStep, stepDecr); Value totalIterations = rewriter.create(loc, rangeDecr, step); + // If total_iters < max_stage, start the epilogue at zero to match the + // ramp-up in the prologue. + // start_iter = max(0, total_iters - max_stage) + Value iterI = rewriter.create(loc, totalIterations, + createConst(maxStage)); + iterI = rewriter.create(loc, zero, iterI); + + // Capture predicates for dynamic loops. SmallVector predicates(maxStage + 1); - for (int64_t i = 0; i < maxStage; i++) { - // iterI = total_iters - 1 - i - // May go negative... 
- Value minusI = - rewriter.create(loc, rewriter.getIntegerAttr(t, -i)); - Value iterI = rewriter.create( - loc, rewriter.create(loc, totalIterations, minusOne), - minusI); + + for (int64_t i = 1; i <= maxStage; i++) { // newLastIter = lb + step * iterI Value newlastIter = rewriter.create( loc, lb, rewriter.create(loc, step, iterI)); - setValueMapping(forOp.getInductionVar(), newlastIter, maxStage - i); + setValueMapping(forOp.getInductionVar(), newlastIter, i); + + // increment to next iterI + iterI = rewriter.create(loc, iterI, one); if (dynamicLoop) { - // pred = iterI >= 0 - predicates[i + 1] = rewriter.create( - loc, arith::CmpIPredicate::sge, iterI, zero); + // Disable stages when `i` is greater than total_iters. + // pred = total_iters >= i + predicates[i] = rewriter.create( + loc, arith::CmpIPredicate::sge, totalIterations, createConst(i)); } } diff --git a/mlir/test/Dialect/SCF/loop-pipelining.mlir b/mlir/test/Dialect/SCF/loop-pipelining.mlir index af49d2afc049..c879c83275bf 100644 --- a/mlir/test/Dialect/SCF/loop-pipelining.mlir +++ b/mlir/test/Dialect/SCF/loop-pipelining.mlir @@ -767,6 +767,7 @@ func.func @stage_0_value_escape(%A: memref, %result: memref, %ub: // Check for predicated epilogue for dynamic loop. // CHECK-LABEL: dynamic_loop( // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index +// CHECK-DAG: %[[C2:.*]] = arith.constant 2 : index // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[CM1:.*]] = arith.constant -1 : index // CHECK: %[[UBM:.*]] = arith.subi %[[UB:.*]], %{{.*}} @@ -779,32 +780,32 @@ func.func @stage_0_value_escape(%A: memref, %result: memref, %ub: // CHECK: scf.yield %[[ADDF_24]], %[[LOAD_27]] // CHECK: } // CHECK: %[[CMPI_10:.*]] = arith.cmpi slt, %[[STEP]], %[[C0]] -// CHECK: %[[SEL_10:.*]] = arith.select %[[CMPI_10]], %[[C1]], %[[CM1]] -// CHECK: %[[SUBI_10:.*]] = arith.subi %[[UB]], %[[LB]] -// CHECK: %[[ADDI_11:.*]] = arith.addi %[[SUBI_10]], %[[STEP]] -// CHECK: %[[ADDI_12:.*]] = arith.addi %[[ADDI_11]], %[[SEL_10]] -// CHECK: %[[DIVSI_13:.*]] = arith.divsi %[[ADDI_12]], %[[STEP]] -// CHECK: %[[ADDI_14:.*]] = arith.addi %[[DIVSI_13]], %[[CM1]] -// CHECK: %[[MULI_15:.*]] = arith.muli %{{.*}}, %[[ADDI_14]] -// CHECK: %[[ADDI_16:.*]] = arith.addi %{{.*}}, %[[MULI_15]] -// CHECK: %[[CMPI_17:.*]] = arith.cmpi sge, %[[ADDI_14]], %[[C0]] -// CHECK: %[[ADDI_18:.*]] = arith.addi %[[DIVSI_13]], %{{.*}}-1 -// CHECK: %[[ADDI_19:.*]] = arith.addi %[[ADDI_18]], %{{.*}}-1 -// CHECK: %[[MULI_20:.*]] = arith.muli %{{.*}}, %[[ADDI_19]] -// CHECK: %[[ADDI_21:.*]] = arith.addi %{{.*}}, %[[MULI_20]] -// CHECK: %[[CMPI_22:.*]] = arith.cmpi sge, %[[ADDI_19]], %[[C0]] -// CHECK: scf.if %[[CMPI_17]] { -// CHECK: memref.store %{{.*}}#0, %{{.*}}[%[[ADDI_21]]] +// CHECK: %[[SELECT_11:.*]] = arith.select %[[CMPI_10]], %[[C1]], %[[CM1]] +// CHECK: %[[SUBI_12:.*]] = arith.subi %[[UB]], %[[LB]] +// CHECK: %[[ADDI_13:.*]] = arith.addi %[[SUBI_12]], %[[STEP]] +// CHECK: %[[ADDI_14:.*]] = arith.addi %[[ADDI_13]], %[[SELECT_11]] +// CHECK: %[[DIVSI_15:.*]] = arith.divsi %[[ADDI_14]], %[[STEP]] +// CHECK: %[[SUBI_17:.*]] = arith.subi %[[DIVSI_15]], %[[C2]] +// CHECK: %[[MAXSI_18:.*]] = arith.maxsi %[[SUBI_17]], %[[C0]] +// CHECK: %[[MULI_19:.*]] = arith.muli %[[STEP]], %[[MAXSI_18]] +// CHECK: %[[ADDI_20:.*]] = arith.addi %[[LB]], %[[MULI_19]] +// CHECK: %[[ADDI_21:.*]] = arith.addi %[[MAXSI_18]], %[[C1]] +// CHECK: %[[CMPI_22:.*]] = arith.cmpi sge, %[[DIVSI_15]], %[[C1]] +// CHECK: %[[MULI_23:.*]] = arith.muli %[[STEP]], %[[ADDI_21]] +// CHECK: %[[ADDI_24:.*]] = 
arith.addi %[[LB]], %[[MULI_23]] +// CHECK: %[[CMPI_25:.*]] = arith.cmpi sge, %[[DIVSI_15]], %[[C2]] +// CHECK: scf.if %[[CMPI_22]] { +// CHECK: memref.store %{{.*}}#0, %{{.*}}[%[[ADDI_20]]] // CHECK: } else { // CHECK: } -// CHECK: %[[IF_23:.*]] = scf.if %[[CMPI_22]] -> (f32) { -// CHECK: %[[ADDF_24:.*]] = arith.addf %{{.*}}#1, %{{.*}} -// CHECK: scf.yield %[[ADDF_24]] +// CHECK: %[[IF_26:.*]] = scf.if %[[CMPI_25]] +// CHECK: %[[ADDF_27:.*]] = arith.addf %{{.*}}#1, %{{.*}} +// CHECK: scf.yield %[[ADDF_27]] // CHECK: } else { // CHECK: scf.yield %{{.*}} // CHECK: } -// CHECK: scf.if %[[CMPI_22]] { -// CHECK: memref.store %[[IF_23]], %{{.*}}[%[[ADDI_16]]] +// CHECK: scf.if %[[CMPI_25]] { +// CHECK: memref.store %[[IF_26]], %{{.*}}[%[[ADDI_24]]] // CHECK: } else { // CHECK: } // CHECK: return @@ -842,6 +843,7 @@ func.func @dynamic_loop(%A: memref, %result: memref, %lb: index, % // CHECK-DAG: %[[C1:.*]] = arith.constant 1 : index // CHECK-DAG: %[[C0:.*]] = arith.constant 0 : index // CHECK-DAG: %[[CM1:.*]] = arith.constant -1 : index +// CHECK-DAG: %[[CF0:.*]] = arith.constant 0.000000e+00 // CHECK: %[[UBM:.*]] = arith.subi %[[UB:.*]], %{{.*}} // CHECK: %{{.*}}:2 = scf.for %[[ARG5:.*]] = %[[LB:.*]] to %[[UBM]] step %[[STEP:.*]] iter_args(%[[ARG6:.*]] = %{{.*}}, %[[ARG7:.*]] = %{{.*}}) // CHECK: %[[ADDF_13:.*]] = arith.addf %[[ARG7]], %[[ARG6]] @@ -856,22 +858,21 @@ func.func @dynamic_loop(%A: memref, %result: memref, %lb: index, % // CHECK: %[[ADDI_7:.*]] = arith.addi %[[SUBI_6]], %[[STEP]] // CHECK: %[[ADDI_8:.*]] = arith.addi %[[ADDI_7]], %[[SELECT_5]] // CHECK: %[[DIVSI_9:.*]] = arith.divsi %[[ADDI_8]], %[[STEP]] -// CHECK: %[[ADDI_10:.*]] = arith.addi %[[DIVSI_9]], %[[CM1]] -// CHECK: %[[CMPI_11:.*]] = arith.cmpi sge, %[[ADDI_10]], %[[C0]] -// CHECK: %[[IF_10:.*]] = scf.if %[[CMPI_11]] -// CHECK: %[[ADDF_13:.*]] = arith.addf %{{.*}}#1, %{{.*}}#0 -// CHECK: scf.yield %[[ADDF_13]] +// CHECK: %[[CMPI_10:.*]] = arith.cmpi sge, %[[DIVSI_9]], %[[C1]] +// CHECK: %[[IF_11:.*]] = scf.if %[[CMPI_10]] +// CHECK: %[[ADDF_14:.*]] = arith.addf %{{.*}}#1, %{{.*}}#0 +// CHECK: scf.yield %[[ADDF_14]] // CHECK: } else { -// CHECK: scf.yield %{{.*}} +// CHECK: scf.yield %[[CF0]] // CHECK: } -// CHECK: %[[IF_11:.*]] = scf.if %[[CMPI_11]] -// CHECK: %[[MULF_13:.*]] = arith.mulf %[[IF_10]], %{{.*}} -// CHECK: scf.yield %[[MULF_13]] +// CHECK: %[[IF_12:.*]] = scf.if %[[CMPI_10]] +// CHECK: %[[MULF_14:.*]] = arith.mulf %[[IF_11]], %{{.*}} +// CHECK: scf.yield %[[MULF_14]] // CHECK: } else { -// CHECK: scf.yield %{{.*}} +// CHECK: scf.yield %[[CF0]] // CHECK: } -// CHECK: %[[SELECT_12:.*]] = arith.select %[[CMPI_11]], %[[IF_11]], %{{.*}}#0 -// CHECK: memref.store %[[SELECT_12]], %{{.*}}[%{{.*}}] +// CHECK: %[[SELECT_13:.*]] = arith.select %[[CMPI_10]], %[[IF_12]], %{{.*}}#0 +// CHECK: memref.store %[[SELECT_13]], %{{.*}}[%[[C0]]] func.func @dynamic_loop_result(%A: memref, %result: memref, %lb: index, %ub: index, %step: index) { %cf0 = arith.constant 1.0 : f32 %cf1 = arith.constant 33.0 : f32 -- GitLab From 3cab8827fdb6928d355d82d710695ef7cfeb3a2c Mon Sep 17 00:00:00 2001 From: Peter Collingbourne Date: Tue, 15 Oct 2024 13:18:43 -0700 Subject: [PATCH 023/329] Revert "[AMDGPU] Serialize WWM_REG vreg flag (#110229)" This reverts commit bec839d8eed9dd13fa7eaffd50b28f8f913de2e2. Caused buildbot failures, e.g. 
https://lab.llvm.org/buildbot/#/builders/52/builds/2928 --- llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 11 ----------- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 10 ---------- llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 8 -------- .../MIR/AMDGPU/machine-function-info-no-ir.mir | 15 --------------- 4 files changed, 44 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 16e23879cd73..23ee0c3e896e 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1718,17 +1718,6 @@ bool GCNTargetMachine::parseMachineFunctionInfo( MFI->reserveWWMRegister(ParsedReg); } - for (const auto &[_, Info] : PFS.VRegInfosNamed) { - for (uint8_t Flag : Info->Flags) { - MFI->setFlag(Info->VReg, Flag); - } - } - for (const auto &[_, Info] : PFS.VRegInfos) { - for (uint8_t Flag : Info->Flags) { - MFI->setFlag(Info->VReg, Flag); - } - } - auto parseAndCheckArgument = [&](const std::optional &A, const TargetRegisterClass &RC, ArgDescriptor &Arg, unsigned UserSGPRs, diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp index 20d48aa57adb..de9cbe403ab6 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp @@ -3851,13 +3851,3 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC, } return 0; } - -SmallVector -SIRegisterInfo::getVRegFlagsOfReg(Register Reg, - const MachineFunction &MF) const { - SmallVector RegFlags; - const SIMachineFunctionInfo *FuncInfo = MF.getInfo(); - if (FuncInfo->checkFlag(Reg, AMDGPU::VirtRegFlag::WWM_REG)) - RegFlags.push_back("WWM_REG"); - return RegFlags; -} diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h index fe0b66f75bba..99fa632c0300 100644 --- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h +++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h @@ -457,14 +457,6 @@ public: // No check if the subreg is supported by the current RC is made. unsigned getSubRegAlignmentNumBits(const TargetRegisterClass *RC, unsigned SubReg) const; - - std::optional getVRegFlagValue(StringRef Name) const override { - return Name == "WWM_REG" ? AMDGPU::VirtRegFlag::WWM_REG - : std::optional{}; - } - - SmallVector - getVRegFlagsOfReg(Register Reg, const MachineFunction &MF) const override; }; namespace AMDGPU { diff --git a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir index 51795a4fea51..ebbb89b7816c 100644 --- a/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir +++ b/llvm/test/CodeGen/MIR/AMDGPU/machine-function-info-no-ir.mir @@ -578,18 +578,3 @@ body: | SI_RETURN ... ---- -name: vregs -# FULL: registers: -# FULL-NEXT: - { id: 0, class: vgpr_32, preferred-register: '$vgpr1', flags: [ WWM_REG ] } -# FULL-NEXT: - { id: 1, class: sgpr_64, preferred-register: '$sgpr0_sgpr1', flags: [ ] } -# FULL-NEXT: - { id: 2, class: sgpr_64, preferred-register: '', flags: [ ] } -registers: - - { id: 0, class: vgpr_32, preferred-register: $vgpr1, flags: [ WWM_REG ]} - - { id: 1, class: sgpr_64, preferred-register: $sgpr0_sgpr1 } - - { id: 2, class: sgpr_64, flags: [ ] } -body: | - bb.0: - %2:sgpr_64 = COPY %1 - %1:sgpr_64 = COPY %0 -... 
-- GitLab From 5f2cf99e146ce99d4e148038d9bdd012331b4821 Mon Sep 17 00:00:00 2001 From: Dmitrii Galimzianov Date: Tue, 15 Oct 2024 22:25:01 +0200 Subject: [PATCH 024/329] DynamicLoaderDarwin load images in parallel with preload (#110646) This change enables `DynamicLoaderDarwin` to load modules in parallel using the thread pool. This new behavior is controlled by a new setting `plugin.dynamic-loader.darwin.experimental.enable-parallel-image-load`, which is enabled by default. When disabled, DynamicLoaderDarwin will load modules sequentially as before. --- .../DynamicLoader/MacOSX-DYLD/CMakeLists.txt | 13 +++ .../MacOSX-DYLD/DynamicLoaderDarwin.cpp | 102 ++++++++++++------ .../MacOSX-DYLD/DynamicLoaderDarwin.h | 23 ++-- .../DynamicLoaderDarwinProperties.cpp | 53 +++++++++ .../DynamicLoaderDarwinProperties.h | 34 ++++++ .../DynamicLoaderDarwinProperties.td | 8 ++ .../MacOSX-DYLD/DynamicLoaderMacOS.cpp | 10 +- .../MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp | 13 ++- .../MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h | 2 + 9 files changed, 215 insertions(+), 43 deletions(-) create mode 100644 lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.cpp create mode 100644 lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.h create mode 100644 lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/CMakeLists.txt b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/CMakeLists.txt index 7308374c8bfb..77a560541fcb 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/CMakeLists.txt +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/CMakeLists.txt @@ -1,7 +1,16 @@ +lldb_tablegen(DynamicLoaderDarwinProperties.inc -gen-lldb-property-defs + SOURCE DynamicLoaderDarwinProperties.td + TARGET LLDBPluginDynamicLoaderDarwinPropertiesGen) + +lldb_tablegen(DynamicLoaderDarwinPropertiesEnum.inc -gen-lldb-property-enum-defs + SOURCE DynamicLoaderDarwinProperties.td + TARGET LLDBPluginDynamicLoaderDarwinPropertiesEnumGen) + add_lldb_library(lldbPluginDynamicLoaderMacOSXDYLD PLUGIN DynamicLoaderMacOSXDYLD.cpp DynamicLoaderMacOS.cpp DynamicLoaderDarwin.cpp + DynamicLoaderDarwinProperties.cpp LINK_LIBS lldbBreakpoint @@ -16,3 +25,7 @@ add_lldb_library(lldbPluginDynamicLoaderMacOSXDYLD PLUGIN Support TargetParser ) + +add_dependencies(lldbPluginDynamicLoaderMacOSXDYLD + LLDBPluginDynamicLoaderDarwinPropertiesGen + LLDBPluginDynamicLoaderDarwinPropertiesEnumGen) diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp index 30242038a5f6..3659dfcd3c4c 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.cpp @@ -8,6 +8,7 @@ #include "DynamicLoaderDarwin.h" +#include "DynamicLoaderDarwinProperties.h" #include "lldb/Breakpoint/StoppointCallbackContext.h" #include "lldb/Core/Debugger.h" #include "lldb/Core/Module.h" @@ -31,6 +32,7 @@ #include "lldb/Utility/LLDBLog.h" #include "lldb/Utility/Log.h" #include "lldb/Utility/State.h" +#include "llvm/Support/ThreadPool.h" #include "Plugins/LanguageRuntime/ObjC/ObjCLanguageRuntime.h" #include "Plugins/TypeSystem/Clang/TypeSystemClang.h" @@ -77,6 +79,17 @@ void DynamicLoaderDarwin::DidLaunch() { SetNotificationBreakpoint(); } +void DynamicLoaderDarwin::CreateSettings(lldb_private::Debugger &debugger) { + if (!PluginManager::GetSettingForDynamicLoaderPlugin( + debugger, 
DynamicLoaderDarwinProperties::GetSettingName())) { + const bool is_global_setting = true; + PluginManager::CreateSettingForDynamicLoaderPlugin( + debugger, + DynamicLoaderDarwinProperties::GetGlobal().GetValueProperties(), + "Properties for the DynamicLoaderDarwin plug-in.", is_global_setting); + } +} + // Clear out the state of this class. void DynamicLoaderDarwin::Clear(bool clear_process) { std::lock_guard guard(m_mutex); @@ -88,7 +101,7 @@ void DynamicLoaderDarwin::Clear(bool clear_process) { } ModuleSP DynamicLoaderDarwin::FindTargetModuleForImageInfo( - ImageInfo &image_info, bool can_create, bool *did_create_ptr) { + const ImageInfo &image_info, bool can_create, bool *did_create_ptr) { if (did_create_ptr) *did_create_ptr = false; @@ -517,8 +530,8 @@ bool DynamicLoaderDarwin::JSONImageInformationIntoImageInfo( return true; } -void DynamicLoaderDarwin::UpdateSpecialBinariesFromNewImageInfos( - ImageInfo::collection &image_infos) { +void DynamicLoaderDarwin::UpdateSpecialBinariesFromPreloadedModules( + std::vector> &images) { uint32_t exe_idx = UINT32_MAX; uint32_t dyld_idx = UINT32_MAX; Target &target = m_process->GetTarget(); @@ -526,35 +539,34 @@ void DynamicLoaderDarwin::UpdateSpecialBinariesFromNewImageInfos( ConstString g_dyld_sim_filename("dyld_sim"); ArchSpec target_arch = target.GetArchitecture(); - const size_t image_infos_size = image_infos.size(); - for (size_t i = 0; i < image_infos_size; i++) { - if (image_infos[i].header.filetype == llvm::MachO::MH_DYLINKER) { + const size_t images_size = images.size(); + for (size_t i = 0; i < images_size; i++) { + const auto &image_info = images[i].first; + if (image_info.header.filetype == llvm::MachO::MH_DYLINKER) { // In a "simulator" process we will have two dyld modules -- // a "dyld" that we want to keep track of, and a "dyld_sim" which // we don't need to keep track of here. dyld_sim will have a non-macosx // OS. if (target_arch.GetTriple().getEnvironment() == llvm::Triple::Simulator && - image_infos[i].os_type != llvm::Triple::OSType::MacOSX) { + image_info.os_type != llvm::Triple::OSType::MacOSX) { continue; } dyld_idx = i; } - if (image_infos[i].header.filetype == llvm::MachO::MH_EXECUTE) { + if (image_info.header.filetype == llvm::MachO::MH_EXECUTE) { exe_idx = i; } } // Set the target executable if we haven't found one so far. 
if (exe_idx != UINT32_MAX && !target.GetExecutableModule()) { - const bool can_create = true; - ModuleSP exe_module_sp(FindTargetModuleForImageInfo(image_infos[exe_idx], - can_create, nullptr)); + ModuleSP exe_module_sp = images[exe_idx].second; if (exe_module_sp) { LLDB_LOGF(log, "Found executable module: %s", exe_module_sp->GetFileSpec().GetPath().c_str()); target.GetImages().AppendIfNeeded(exe_module_sp); - UpdateImageLoadAddress(exe_module_sp.get(), image_infos[exe_idx]); + UpdateImageLoadAddress(exe_module_sp.get(), images[exe_idx].first); if (exe_module_sp.get() != target.GetExecutableModulePointer()) target.SetExecutableModule(exe_module_sp, eLoadDependentsNo); @@ -581,14 +593,12 @@ void DynamicLoaderDarwin::UpdateSpecialBinariesFromNewImageInfos( } if (dyld_idx != UINT32_MAX) { - const bool can_create = true; - ModuleSP dyld_sp = FindTargetModuleForImageInfo(image_infos[dyld_idx], - can_create, nullptr); + ModuleSP dyld_sp = images[dyld_idx].second; if (dyld_sp.get()) { LLDB_LOGF(log, "Found dyld module: %s", dyld_sp->GetFileSpec().GetPath().c_str()); target.GetImages().AppendIfNeeded(dyld_sp); - UpdateImageLoadAddress(dyld_sp.get(), image_infos[dyld_idx]); + UpdateImageLoadAddress(dyld_sp.get(), images[dyld_idx].first); SetDYLDModule(dyld_sp); } } @@ -642,26 +652,58 @@ ModuleSP DynamicLoaderDarwin::GetDYLDModule() { void DynamicLoaderDarwin::ClearDYLDModule() { m_dyld_module_wp.reset(); } +std::vector> +DynamicLoaderDarwin::PreloadModulesFromImageInfos( + const ImageInfo::collection &image_infos) { + const auto size = image_infos.size(); + std::vector> images(size); + auto LoadImage = [&](size_t i, ImageInfo::collection::const_iterator it) { + const auto &image_info = *it; + images[i] = std::make_pair( + image_info, FindTargetModuleForImageInfo(image_info, true, nullptr)); + }; + auto it = image_infos.begin(); + bool is_parallel_load = + DynamicLoaderDarwinProperties::GetGlobal().GetEnableParallelImageLoad(); + if (is_parallel_load) { + llvm::ThreadPoolTaskGroup taskGroup(Debugger::GetThreadPool()); + for (size_t i = 0; i < size; ++i, ++it) { + taskGroup.async(LoadImage, i, it); + } + taskGroup.wait(); + } else { + for (size_t i = 0; i < size; ++i, ++it) { + LoadImage(i, it); + } + } + return images; +} + bool DynamicLoaderDarwin::AddModulesUsingImageInfos( ImageInfo::collection &image_infos) { std::lock_guard guard(m_mutex); + auto images = PreloadModulesFromImageInfos(image_infos); + return AddModulesUsingPreloadedModules(images); +} + +bool DynamicLoaderDarwin::AddModulesUsingPreloadedModules( + std::vector> &images) { + std::lock_guard guard(m_mutex); // Now add these images to the main list. 
ModuleList loaded_module_list; Log *log = GetLog(LLDBLog::DynamicLoader); Target &target = m_process->GetTarget(); ModuleList &target_images = target.GetImages(); - for (uint32_t idx = 0; idx < image_infos.size(); ++idx) { + for (uint32_t idx = 0; idx < images.size(); ++idx) { + auto &image_info = images[idx].first; + const auto &image_module_sp = images[idx].second; if (log) { LLDB_LOGF(log, "Adding new image at address=0x%16.16" PRIx64 ".", - image_infos[idx].address); - image_infos[idx].PutToLog(log); + image_info.address); + image_info.PutToLog(log); } - - m_dyld_image_infos.push_back(image_infos[idx]); - - ModuleSP image_module_sp( - FindTargetModuleForImageInfo(image_infos[idx], true, nullptr)); + m_dyld_image_infos.push_back(image_info); if (image_module_sp) { ObjectFile *objfile = image_module_sp->GetObjectFile(); @@ -673,7 +715,7 @@ bool DynamicLoaderDarwin::AddModulesUsingImageInfos( sections->FindSectionByName(commpage_dbstr).get(); if (commpage_section) { ModuleSpec module_spec(objfile->GetFileSpec(), - image_infos[idx].GetArchitecture()); + image_info.GetArchitecture()); module_spec.GetObjectName() = commpage_dbstr; ModuleSP commpage_image_module_sp( target_images.FindFirstModule(module_spec)); @@ -686,17 +728,17 @@ bool DynamicLoaderDarwin::AddModulesUsingImageInfos( if (!commpage_image_module_sp || commpage_image_module_sp->GetObjectFile() == nullptr) { commpage_image_module_sp = m_process->ReadModuleFromMemory( - image_infos[idx].file_spec, image_infos[idx].address); + image_info.file_spec, image_info.address); // Always load a memory image right away in the target in case // we end up trying to read the symbol table from memory... The // __LINKEDIT will need to be mapped so we can figure out where // the symbol table bits are... bool changed = false; UpdateImageLoadAddress(commpage_image_module_sp.get(), - image_infos[idx]); + image_info); target.GetImages().Append(commpage_image_module_sp); if (changed) { - image_infos[idx].load_stop_id = m_process->GetStopID(); + image_info.load_stop_id = m_process->GetStopID(); loaded_module_list.AppendIfNeeded(commpage_image_module_sp); } } @@ -709,14 +751,14 @@ bool DynamicLoaderDarwin::AddModulesUsingImageInfos( // address. We need to check this so we don't mention that all loaded // shared libraries are newly loaded each time we hit out dyld breakpoint // since dyld will list all shared libraries each time. - if (UpdateImageLoadAddress(image_module_sp.get(), image_infos[idx])) { + if (UpdateImageLoadAddress(image_module_sp.get(), image_info)) { target_images.AppendIfNeeded(image_module_sp); loaded_module_list.AppendIfNeeded(image_module_sp); } // To support macCatalyst and legacy iOS simulator, // update the module's platform with the DYLD info. 
- ArchSpec dyld_spec = image_infos[idx].GetArchitecture(); + ArchSpec dyld_spec = image_info.GetArchitecture(); auto &dyld_triple = dyld_spec.GetTriple(); if ((dyld_triple.getEnvironment() == llvm::Triple::MacABI && dyld_triple.getOS() == llvm::Triple::IOS) || diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h index 45c693163f81..bc5464d76b95 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwin.h @@ -58,6 +58,8 @@ public: std::optional GetStartAddress() override; + static void CreateSettings(lldb_private::Debugger &debugger); + protected: void PrivateInitialize(lldb_private::Process *process); @@ -174,7 +176,7 @@ protected: bool UnloadModuleSections(lldb_private::Module *module, ImageInfo &info); - lldb::ModuleSP FindTargetModuleForImageInfo(ImageInfo &image_info, + lldb::ModuleSP FindTargetModuleForImageInfo(const ImageInfo &image_info, bool can_create, bool *did_create_ptr); @@ -201,11 +203,18 @@ protected: lldb_private::StructuredData::ObjectSP image_details, ImageInfo::collection &image_infos); - // If image_infos contains / may contain dyld or executable image, call this - // method - // to keep our internal record keeping of the special binaries up-to-date. - void - UpdateSpecialBinariesFromNewImageInfos(ImageInfo::collection &image_infos); + // Finds/loads modules for a given `image_infos` and returns pairs + // (ImageInfo, ModuleSP). + // Prefer using this method rather than calling `FindTargetModuleForImageInfo` + // directly as this method may load the modules in parallel. + std::vector> + PreloadModulesFromImageInfos(const ImageInfo::collection &image_infos); + + // If `images` contains / may contain dyld or executable image, call this + // method to keep our internal record keeping of the special binaries + // up-to-date. + void UpdateSpecialBinariesFromPreloadedModules( + std::vector> &images); // if image_info is a dyld binary, call this method bool UpdateDYLDImageInfoFromNewImageInfo(ImageInfo &image_info); @@ -215,6 +224,8 @@ protected: void AddExecutableModuleIfInImageInfos(ImageInfo::collection &image_infos); bool AddModulesUsingImageInfos(ImageInfo::collection &image_infos); + bool AddModulesUsingPreloadedModules( + std::vector> &images); // Whether we should use the new dyld SPI to get shared library information, // or read diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.cpp new file mode 100644 index 000000000000..f4d8a071e6d5 --- /dev/null +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.cpp @@ -0,0 +1,53 @@ +//===-- DynamicLoaderDarwinProperties.cpp ---------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "DynamicLoaderDarwinProperties.h" + +using namespace lldb_private; + +#define LLDB_PROPERTIES_dynamicloaderdarwin_experimental +#include "DynamicLoaderDarwinProperties.inc" + +enum { +#define LLDB_PROPERTIES_dynamicloaderdarwin_experimental +#include "DynamicLoaderDarwinPropertiesEnum.inc" +}; + +llvm::StringRef DynamicLoaderDarwinProperties::GetSettingName() { + static constexpr llvm::StringLiteral g_setting_name("darwin"); + return g_setting_name; +} + +DynamicLoaderDarwinProperties::ExperimentalProperties::ExperimentalProperties() + : Properties(std::make_shared( + GetExperimentalSettingsName())) { + m_collection_sp->Initialize(g_dynamicloaderdarwin_experimental_properties); +} + +DynamicLoaderDarwinProperties::DynamicLoaderDarwinProperties() + : Properties(std::make_shared(GetSettingName())), + m_experimental_properties(std::make_unique()) { + m_collection_sp->AppendProperty( + Properties::GetExperimentalSettingsName(), + "Experimental settings - setting these won't produce errors if the " + "setting is not present.", + true, m_experimental_properties->GetValueProperties()); +} + +bool DynamicLoaderDarwinProperties::GetEnableParallelImageLoad() const { + return m_experimental_properties->GetPropertyAtIndexAs( + ePropertyEnableParallelImageLoad, + g_dynamicloaderdarwin_experimental_properties + [ePropertyEnableParallelImageLoad] + .default_uint_value != 0); +} + +DynamicLoaderDarwinProperties &DynamicLoaderDarwinProperties::GetGlobal() { + static DynamicLoaderDarwinProperties g_settings; + return g_settings; +} diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.h b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.h new file mode 100644 index 000000000000..4c5e800c4f3e --- /dev/null +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.h @@ -0,0 +1,34 @@ +//===-- DynamicLoaderDarwinProperties.h -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLDB_SOURCE_PLUGINS_DYNAMICLOADER_MACOSX_DYLD_DYNAMICLOADERDARWINPROPERTIES_H +#define LLDB_SOURCE_PLUGINS_DYNAMICLOADER_MACOSX_DYLD_DYNAMICLOADERDARWINPROPERTIES_H + +#include "lldb/Core/UserSettingsController.h" + +namespace lldb_private { + +class DynamicLoaderDarwinProperties : public Properties { +public: + class ExperimentalProperties : public Properties { + public: + ExperimentalProperties(); + }; + static llvm::StringRef GetSettingName(); + static DynamicLoaderDarwinProperties &GetGlobal(); + DynamicLoaderDarwinProperties(); + ~DynamicLoaderDarwinProperties() override = default; + bool GetEnableParallelImageLoad() const; + +private: + std::unique_ptr m_experimental_properties; +}; + +} // namespace lldb_private + +#endif // LLDB_SOURCE_PLUGINS_DYNAMICLOADER_MACOSX_DYLD_DYNAMICLOADERDARWINPROPERTIES_H diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td new file mode 100644 index 000000000000..c54580ce3472 --- /dev/null +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td @@ -0,0 +1,8 @@ +include "../../../../include/lldb/Core/PropertiesBase.td" + +let Definition = "dynamicloaderdarwin_experimental" in { + def EnableParallelImageLoad: Property<"enable-parallel-image-load", "Boolean">, + Global, + DefaultTrue, + Desc<"Load images in parallel.">; +} diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp index a038b65d4728..82555d1e028b 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOS.cpp @@ -215,8 +215,9 @@ void DynamicLoaderMacOS::DoInitialImageFetch() { LLDB_LOGF(log, "Initial module fetch: Adding %" PRId64 " modules.\n", (uint64_t)image_infos.size()); - UpdateSpecialBinariesFromNewImageInfos(image_infos); - AddModulesUsingImageInfos(image_infos); + auto images = PreloadModulesFromImageInfos(image_infos); + UpdateSpecialBinariesFromPreloadedModules(images); + AddModulesUsingPreloadedModules(images); } } @@ -425,8 +426,9 @@ void DynamicLoaderMacOS::AddBinaries( ->GetAsArray() ->GetSize() == load_addresses.size()) { if (JSONImageInformationIntoImageInfo(binaries_info_sp, image_infos)) { - UpdateSpecialBinariesFromNewImageInfos(image_infos); - AddModulesUsingImageInfos(image_infos); + auto images = PreloadModulesFromImageInfos(image_infos); + UpdateSpecialBinariesFromPreloadedModules(images); + AddModulesUsingPreloadedModules(images); } m_dyld_image_infos_stop_id = m_process->GetStopID(); } diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp index debd0f6ee83f..8fc77cbe1170 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.cpp @@ -572,8 +572,9 @@ bool DynamicLoaderMacOSXDYLD::AddModulesUsingImageInfosAddress( ->GetSize() == image_infos_count) { bool return_value = false; if (JSONImageInformationIntoImageInfo(image_infos_json_sp, image_infos)) { - UpdateSpecialBinariesFromNewImageInfos(image_infos); - return_value = AddModulesUsingImageInfos(image_infos); + auto 
images = PreloadModulesFromImageInfos(image_infos); + UpdateSpecialBinariesFromPreloadedModules(images); + return_value = AddModulesUsingPreloadedModules(images); } m_dyld_image_infos_stop_id = m_process->GetStopID(); return return_value; @@ -1147,7 +1148,8 @@ bool DynamicLoaderMacOSXDYLD::IsFullyInitialized() { void DynamicLoaderMacOSXDYLD::Initialize() { PluginManager::RegisterPlugin(GetPluginNameStatic(), - GetPluginDescriptionStatic(), CreateInstance); + GetPluginDescriptionStatic(), CreateInstance, + DebuggerInitialize); DynamicLoaderMacOS::Initialize(); } @@ -1156,6 +1158,11 @@ void DynamicLoaderMacOSXDYLD::Terminate() { PluginManager::UnregisterPlugin(CreateInstance); } +void DynamicLoaderMacOSXDYLD::DebuggerInitialize( + lldb_private::Debugger &debugger) { + CreateSettings(debugger); +} + llvm::StringRef DynamicLoaderMacOSXDYLD::GetPluginDescriptionStatic() { return "Dynamic loader plug-in that watches for shared library loads/unloads " "in MacOSX user processes."; diff --git a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h index ae7451722a8d..924e2fc10774 100644 --- a/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h +++ b/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/DynamicLoaderMacOSXDYLD.h @@ -50,6 +50,8 @@ public: static lldb_private::DynamicLoader * CreateInstance(lldb_private::Process *process, bool force); + static void DebuggerInitialize(lldb_private::Debugger &debugger); + /// Called after attaching a process. /// /// Allow DynamicLoader plug-ins to execute some code after -- GitLab From 97da5e670099848f7d136a6988afd6ea638e2210 Mon Sep 17 00:00:00 2001 From: David Stenberg Date: Tue, 15 Oct 2024 22:34:27 +0200 Subject: [PATCH 025/329] [GSYM] Remove redundant getInliningInfoForAddress call (#111136) In DwarfTransformer::verify(), line number information is retrieved for each address using: auto DwarfInlineInfos = DICtx.getInliningInfoForAddress(SectAddr, DLIS); Further down in the loop, another such invocation was made before: Gsym->dump(Log, *FI); There is a continue after that; DwarfInlineInfos does not affect the dump() invocation, I am not aware of any other side effects that are needed from the extra getInliningInfoForAddress() invocation, and tests pass without it, so just remove it. --- llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp index 894abf5777f1..3f5604e6aa4b 100644 --- a/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp +++ b/llvm/lib/DebugInfo/GSYM/DwarfTransformer.cpp @@ -699,7 +699,6 @@ llvm::Error DwarfTransformer::verify(StringRef GsymPath, Log << " [" << Idx << "]: " << gii.Name << " @ " << gii.Dir << '/' << gii.Base << ':' << gii.Line << '\n'; } - DwarfInlineInfos = DICtx.getInliningInfoForAddress(SectAddr, DLIS); Gsym->dump(Log, *FI); } continue; -- GitLab From 3b4512074e8d2790794a49ea675f8a4af5817e60 Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Tue, 15 Oct 2024 13:38:15 -0700 Subject: [PATCH 026/329] [HLSL] Make HLSLAttributedResourceType canonical and add code paths to convert HLSL types to DirectX target types (#110327) Translates `RWBuffer` and `StructuredBuffer` resource buffer types to DirectX target types `dx.TypedBuffer` and `dx.RawBuffer`. Includes a change of `HLSLAttributedResourceType` from a 'sugar' type to a full canonical type.
This is required for codegen and other clang infrastructure to work properly on HLSL resource types. Fixes #95952 (part 2/2) --- clang/include/clang/AST/Type.h | 34 +++++++---- clang/include/clang/Basic/TypeNodes.td | 2 +- clang/lib/AST/ASTContext.cpp | 25 ++++++++- clang/lib/AST/ASTStructuralEquivalence.cpp | 15 +---- clang/lib/AST/DeclCXX.cpp | 6 +- clang/lib/AST/ExprConstant.cpp | 1 + clang/lib/AST/ItaniumMangle.cpp | 32 +++++++++++ clang/lib/AST/MicrosoftMangle.cpp | 5 ++ clang/lib/AST/Type.cpp | 5 ++ clang/lib/CodeGen/CodeGenFunction.cpp | 1 + clang/lib/CodeGen/CodeGenTypes.cpp | 3 + clang/lib/CodeGen/ItaniumCXXABI.cpp | 6 ++ clang/lib/CodeGen/Targets/DirectX.cpp | 41 ++++++++++---- clang/lib/Sema/HLSLExternalSemaSource.cpp | 2 - clang/lib/Sema/SemaLookup.cpp | 3 + clang/lib/Sema/SemaOverload.cpp | 17 ++++++ clang/lib/Sema/SemaTemplate.cpp | 7 +++ clang/lib/Sema/SemaTemplateDeduction.cpp | 11 ++++ clang/test/AST/HLSL/RWBuffer-AST.hlsl | 2 - clang/test/AST/HLSL/StructuredBuffer-AST.hlsl | 2 - .../builtins/RWBuffer-elementtype.hlsl | 18 ++++++ .../StructuredBuffer-elementtype.hlsl | 18 ++++++ .../CodeGenHLSL/builtins/hlsl_resource_t.hlsl | 56 +++++++++++++++++-- .../ParserHLSL/hlsl_contained_type_attr.hlsl | 13 ++--- clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl | 9 +-- .../test/ParserHLSL/hlsl_raw_buffer_attr.hlsl | 9 +-- .../ParserHLSL/hlsl_resource_class_attr.hlsl | 17 ++---- .../hlsl_resource_handle_attrs.hlsl | 2 - 28 files changed, 277 insertions(+), 85 deletions(-) diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h index 8ff04cf89a6b..deda5b3f70f3 100644 --- a/clang/include/clang/AST/Type.h +++ b/clang/include/clang/AST/Type.h @@ -2662,6 +2662,7 @@ public: #include "clang/Basic/HLSLIntangibleTypes.def" bool isHLSLSpecificType() const; // Any HLSL specific type bool isHLSLIntangibleType() const; // Any HLSL intangible type + bool isHLSLAttributedResourceType() const; /// Determines if this type, which must satisfy /// isObjCLifetimeType(), is implicitly __unsafe_unretained rather @@ -6270,6 +6271,14 @@ public: : ResourceClass(ResourceClass), IsROV(IsROV), RawBuffer(RawBuffer) {} Attributes() : Attributes(llvm::dxil::ResourceClass::UAV, false, false) {} + + friend bool operator==(const Attributes &LHS, const Attributes &RHS) { + return std::tie(LHS.ResourceClass, LHS.IsROV, LHS.RawBuffer) == + std::tie(RHS.ResourceClass, RHS.IsROV, RHS.RawBuffer); + } + friend bool operator!=(const Attributes &LHS, const Attributes &RHS) { + return !(LHS == RHS); + } }; private: @@ -6279,9 +6288,9 @@ private: QualType ContainedType; const Attributes Attrs; - HLSLAttributedResourceType(QualType Canon, QualType Wrapped, - QualType Contained, const Attributes &Attrs) - : Type(HLSLAttributedResource, Canon, + HLSLAttributedResourceType(QualType Wrapped, QualType Contained, + const Attributes &Attrs) + : Type(HLSLAttributedResource, QualType(), Contained.isNull() ?
TypeDependence::None : Contained->getDependence()), WrappedType(Wrapped), ContainedType(Contained), Attrs(Attrs) {} @@ -6289,10 +6298,11 @@ private: public: QualType getWrappedType() const { return WrappedType; } QualType getContainedType() const { return ContainedType; } + bool hasContainedType() const { return !ContainedType.isNull(); } const Attributes &getAttrs() const { return Attrs; } - bool isSugared() const { return true; } - QualType desugar() const { return getWrappedType(); } + bool isSugared() const { return false; } + QualType desugar() const { return QualType(this, 0); } void Profile(llvm::FoldingSetNodeID &ID) { Profile(ID, WrappedType, ContainedType, Attrs); @@ -8436,17 +8446,19 @@ inline bool Type::isOpenCLSpecificType() const { } #include "clang/Basic/HLSLIntangibleTypes.def" -inline bool Type::isHLSLSpecificType() const { +inline bool Type::isHLSLIntangibleType() const { #define HLSL_INTANGIBLE_TYPE(Name, Id, SingletonId) is##Id##Type() || return #include "clang/Basic/HLSLIntangibleTypes.def" - false; // end boolean or operation + isHLSLAttributedResourceType(); } -inline bool Type::isHLSLIntangibleType() const { - // All HLSL specific types are currently intangible type as well, but that - // might change in the future. - return isHLSLSpecificType(); +inline bool Type::isHLSLSpecificType() const { + return isHLSLIntangibleType() || isa(this); +} + +inline bool Type::isHLSLAttributedResourceType() const { + return isa(this); } inline bool Type::isTemplateTypeParmType() const { diff --git a/clang/include/clang/Basic/TypeNodes.td b/clang/include/clang/Basic/TypeNodes.td index 8cca392cddc1..7e550ca2992f 100644 --- a/clang/include/clang/Basic/TypeNodes.td +++ b/clang/include/clang/Basic/TypeNodes.td @@ -93,7 +93,7 @@ def EnumType : TypeNode, LeafType; def ElaboratedType : TypeNode, NeverCanonical; def AttributedType : TypeNode, NeverCanonical; def BTFTagAttributedType : TypeNode, NeverCanonical; -def HLSLAttributedResourceType : TypeNode, NeverCanonical; +def HLSLAttributedResourceType : TypeNode; def TemplateTypeParmType : TypeNode, AlwaysDependent, LeafType; def SubstTemplateTypeParmType : TypeNode, NeverCanonical; def SubstTemplateTypeParmPackType : TypeNode, AlwaysDependent; diff --git a/clang/lib/AST/ASTContext.cpp b/clang/lib/AST/ASTContext.cpp index 034fbbe0bc78..4bf8ddd762e9 100644 --- a/clang/lib/AST/ASTContext.cpp +++ b/clang/lib/AST/ASTContext.cpp @@ -3437,6 +3437,9 @@ static void encodeTypeForFunctionPointerAuth(const ASTContext &Ctx, OS << II->getLength() << II->getName(); return; } + case Type::HLSLAttributedResource: + llvm_unreachable("should never get here"); + break; case Type::DeducedTemplateSpecialization: case Type::Auto: #define NON_CANONICAL_TYPE(Class, Base) case Type::Class: @@ -4108,6 +4111,7 @@ QualType ASTContext::getVariableArrayDecayedType(QualType type) const { case Type::BitInt: case Type::DependentBitInt: case Type::ArrayParameter: + case Type::HLSLAttributedResource: llvm_unreachable("type should never be variably-modified"); // These types can be variably-modified but should never need to @@ -5233,9 +5237,8 @@ QualType ASTContext::getHLSLAttributedResourceType( if (Ty) return QualType(Ty, 0); - QualType Canon = getCanonicalType(Wrapped); Ty = new (*this, alignof(HLSLAttributedResourceType)) - HLSLAttributedResourceType(Canon, Wrapped, Contained, Attrs); + HLSLAttributedResourceType(Wrapped, Contained, Attrs); Types.push_back(Ty); HLSLAttributedResourceTypes.InsertNode(Ty, InsertPos); @@ -9106,6 +9109,9 @@ void 
ASTContext::getObjCEncodingForTypeImpl(QualType T, std::string &S, case Type::DeducedTemplateSpecialization: return; + case Type::HLSLAttributedResource: + llvm_unreachable("unexpected type"); + case Type::ArrayParameter: case Type::Pipe: #define ABSTRACT_TYPE(KIND, BASE) @@ -11533,6 +11539,20 @@ QualType ASTContext::mergeTypes(QualType LHS, QualType RHS, bool OfBlockPointer, return {}; return LHS; } + case Type::HLSLAttributedResource: { + const HLSLAttributedResourceType *LHSTy = + LHS->castAs(); + const HLSLAttributedResourceType *RHSTy = + RHS->castAs(); + assert(LHSTy->getWrappedType() == RHSTy->getWrappedType() && + LHSTy->getWrappedType()->isHLSLResourceType() && + "HLSLAttributedResourceType should always wrap __hlsl_resource_t"); + + if (LHSTy->getAttrs() == RHSTy->getAttrs() && + LHSTy->getContainedType() == RHSTy->getContainedType()) + return LHS; + return {}; + } } llvm_unreachable("Invalid Type::Class!"); @@ -13368,6 +13388,7 @@ static QualType getCommonNonSugarTypeNode(ASTContext &Ctx, const Type *X, SUGAR_FREE_TYPE(Record) SUGAR_FREE_TYPE(SubstTemplateTypeParmPack) SUGAR_FREE_TYPE(UnresolvedUsing) + SUGAR_FREE_TYPE(HLSLAttributedResource) #undef SUGAR_FREE_TYPE #define NON_UNIQUE_TYPE(Class) UNEXPECTED_TYPE(Class, "non-unique") NON_UNIQUE_TYPE(TypeOfExpr) diff --git a/clang/lib/AST/ASTStructuralEquivalence.cpp b/clang/lib/AST/ASTStructuralEquivalence.cpp index 21f0562f9d72..120ddc0f26c0 100644 --- a/clang/lib/AST/ASTStructuralEquivalence.cpp +++ b/clang/lib/AST/ASTStructuralEquivalence.cpp @@ -802,16 +802,6 @@ static bool IsEquivalentExceptionSpec(StructuralEquivalenceContext &Context, return true; } -// Determine structural equivalence of two instances of -// HLSLAttributedResourceType::Attributes -static bool -IsStructurallyEquivalent(StructuralEquivalenceContext &Context, - const HLSLAttributedResourceType::Attributes &Attrs1, - const HLSLAttributedResourceType::Attributes &Attrs2) { - return std::tie(Attrs1.ResourceClass, Attrs1.IsROV, Attrs1.RawBuffer) == - std::tie(Attrs2.ResourceClass, Attrs2.IsROV, Attrs2.RawBuffer); -} - /// Determine structural equivalence of two types. 
static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, QualType T1, QualType T2) { @@ -1115,9 +1105,8 @@ static bool IsStructurallyEquivalent(StructuralEquivalenceContext &Context, Context, cast(T1)->getContainedType(), cast(T2)->getContainedType())) return false; - if (!IsStructurallyEquivalent( - Context, cast(T1)->getAttrs(), - cast(T2)->getAttrs())) + if (cast(T1)->getAttrs() != + cast(T2)->getAttrs()) return false; break; diff --git a/clang/lib/AST/DeclCXX.cpp b/clang/lib/AST/DeclCXX.cpp index 407ec14bbc00..08615d4393f5 100644 --- a/clang/lib/AST/DeclCXX.cpp +++ b/clang/lib/AST/DeclCXX.cpp @@ -1411,10 +1411,10 @@ void CXXRecordDecl::addedMember(Decl *D) { Ty = Ty->getArrayElementTypeNoTypeQual(); Ty = Ty->getUnqualifiedDesugaredType(); - if (Ty->isBuiltinType()) - data().IsHLSLIntangible |= Ty->isHLSLIntangibleType(); - else if (const RecordType *RT = dyn_cast(Ty)) + if (const RecordType *RT = dyn_cast(Ty)) data().IsHLSLIntangible |= RT->getAsCXXRecordDecl()->isHLSLIntangible(); + else + data().IsHLSLIntangible |= Ty->isHLSLIntangibleType(); } } diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp index 51956c631786..52a7f5778ce6 100644 --- a/clang/lib/AST/ExprConstant.cpp +++ b/clang/lib/AST/ExprConstant.cpp @@ -12167,6 +12167,7 @@ GCCTypeClass EvaluateBuiltinClassifyType(QualType T, case Type::ObjCInterface: case Type::ObjCObjectPointer: case Type::Pipe: + case Type::HLSLAttributedResource: // Classify all other types that don't fit into the regular // classification the same way. return GCCTypeClass::None; diff --git a/clang/lib/AST/ItaniumMangle.cpp b/clang/lib/AST/ItaniumMangle.cpp index 777cdca1a0c0..d3ed35deb2b1 100644 --- a/clang/lib/AST/ItaniumMangle.cpp +++ b/clang/lib/AST/ItaniumMangle.cpp @@ -4512,6 +4512,38 @@ void CXXNameMangler::mangleType(const ArrayParameterType *T) { mangleType(cast(T)); } +void CXXNameMangler::mangleType(const HLSLAttributedResourceType *T) { + llvm::SmallString<64> Str("_Res"); + const HLSLAttributedResourceType::Attributes &Attrs = T->getAttrs(); + // map resource class to HLSL virtual register letter + switch (Attrs.ResourceClass) { + case llvm::dxil::ResourceClass::UAV: + Str += "_u"; + break; + case llvm::dxil::ResourceClass::SRV: + Str += "_t"; + break; + case llvm::dxil::ResourceClass::CBuffer: + Str += "_b"; + break; + case llvm::dxil::ResourceClass::Sampler: + Str += "_s"; + break; + } + if (Attrs.IsROV) + Str += "_ROV"; + if (Attrs.RawBuffer) + Str += "_Raw"; + if (T->hasContainedType()) + Str += "_CT"; + mangleVendorQualifier(Str); + + if (T->hasContainedType()) { + mangleType(T->getContainedType()); + } + mangleType(T->getWrappedType()); +} + void CXXNameMangler::mangleIntegerLiteral(QualType T, const llvm::APSInt &Value) { // ::= L E # integer literal diff --git a/clang/lib/AST/MicrosoftMangle.cpp b/clang/lib/AST/MicrosoftMangle.cpp index 4ccf3f76bf0c..3931fcaa3529 100644 --- a/clang/lib/AST/MicrosoftMangle.cpp +++ b/clang/lib/AST/MicrosoftMangle.cpp @@ -3754,6 +3754,11 @@ void MicrosoftCXXNameMangler::mangleType(const DependentBitIntType *T, Error(Range.getBegin(), "DependentBitInt type") << Range; } +void MicrosoftCXXNameMangler::mangleType(const HLSLAttributedResourceType *T, + Qualifiers, SourceRange Range) { + llvm_unreachable("HLSL uses Itanium name mangling"); +} + // ::= | | // // ::= A # private near diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index 6f4958801cfe..f013ed11d129 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -4575,6 +4575,8 @@ 
static CachedProperties computeCachedProperties(const Type *T) { return Cache::get(cast(T)->getValueType()); case Type::Pipe: return Cache::get(cast(T)->getElementType()); + case Type::HLSLAttributedResource: + return Cache::get(cast(T)->getWrappedType()); } llvm_unreachable("unhandled type class"); @@ -4664,6 +4666,8 @@ LinkageInfo LinkageComputer::computeTypeLinkageInfo(const Type *T) { return computeTypeLinkageInfo(cast(T)->getValueType()); case Type::Pipe: return computeTypeLinkageInfo(cast(T)->getElementType()); + case Type::HLSLAttributedResource: + llvm_unreachable("not yet implemented"); } llvm_unreachable("unhandled type class"); @@ -4846,6 +4850,7 @@ bool Type::canHaveNullability(bool ResultIfUnknown) const { case Type::BitInt: case Type::DependentBitInt: case Type::ArrayParameter: + case Type::HLSLAttributedResource: return false; } llvm_unreachable("bad type kind!"); diff --git a/clang/lib/CodeGen/CodeGenFunction.cpp b/clang/lib/CodeGen/CodeGenFunction.cpp index f3023c7a20c4..2306043c90f4 100644 --- a/clang/lib/CodeGen/CodeGenFunction.cpp +++ b/clang/lib/CodeGen/CodeGenFunction.cpp @@ -282,6 +282,7 @@ TypeEvaluationKind CodeGenFunction::getEvaluationKind(QualType type) { case Type::ObjCObjectPointer: case Type::Pipe: case Type::BitInt: + case Type::HLSLAttributedResource: return TEK_Scalar; // Complexes. diff --git a/clang/lib/CodeGen/CodeGenTypes.cpp b/clang/lib/CodeGen/CodeGenTypes.cpp index 339632090a5b..54aa1d59d351 100644 --- a/clang/lib/CodeGen/CodeGenTypes.cpp +++ b/clang/lib/CodeGen/CodeGenTypes.cpp @@ -743,6 +743,9 @@ llvm::Type *CodeGenTypes::ConvertType(QualType T) { ResultType = llvm::Type::getIntNTy(getLLVMContext(), EIT->getNumBits()); break; } + case Type::HLSLAttributedResource: + ResultType = CGM.getHLSLRuntime().convertHLSLSpecificType(Ty); + break; } assert(ResultType && "Didn't convert a type?"); diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index 75dab596e1b2..3cc17ebaacd9 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -3947,6 +3947,9 @@ void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty) { // abi::__pointer_to_member_type_info. VTableName = "_ZTVN10__cxxabiv129__pointer_to_member_type_infoE"; break; + + case Type::HLSLAttributedResource: + llvm_unreachable("HLSL doesn't support virtual functions"); } llvm::Constant *VTable = nullptr; @@ -4209,6 +4212,9 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo( case Type::Atomic: // No fields, at least for the moment. 
break; + + case Type::HLSLAttributedResource: + llvm_unreachable("HLSL doesn't support RTTI"); } llvm::Constant *Init = llvm::ConstantStruct::getAnon(Fields); diff --git a/clang/lib/CodeGen/Targets/DirectX.cpp b/clang/lib/CodeGen/Targets/DirectX.cpp index 13da2c630629..303a4309d62f 100644 --- a/clang/lib/CodeGen/Targets/DirectX.cpp +++ b/clang/lib/CodeGen/Targets/DirectX.cpp @@ -29,19 +29,40 @@ public: llvm::Type *DirectXTargetCodeGenInfo::getHLSLType(CodeGenModule &CGM, const Type *Ty) const { - auto *BuiltinTy = dyn_cast(Ty); - if (!BuiltinTy || BuiltinTy->getKind() != BuiltinType::HLSLResource) + auto *ResType = dyn_cast(Ty); + if (!ResType) return nullptr; llvm::LLVMContext &Ctx = CGM.getLLVMContext(); - // FIXME: translate __hlsl_resource_t to target("dx.TypedBuffer", <4 x float>, - // 1, 0, 0) only for now (RWBuffer); more work us needed to determine - // the target ext type and its parameters based on the handle type - // attributes (not yet implemented) - llvm::FixedVectorType *ElemType = - llvm::FixedVectorType::get(llvm::Type::getFloatTy(Ctx), 4); - unsigned Flags[] = {/*IsWriteable*/ 1, /*IsROV*/ 0, /*IsSigned*/ 0}; - return llvm::TargetExtType::get(Ctx, "dx.TypedBuffer", {ElemType}, Flags); + const HLSLAttributedResourceType::Attributes &ResAttrs = ResType->getAttrs(); + switch (ResAttrs.ResourceClass) { + case llvm::dxil::ResourceClass::UAV: + case llvm::dxil::ResourceClass::SRV: { + // TypedBuffer and RawBuffer both need element type + QualType ContainedTy = ResType->getContainedType(); + if (ContainedTy.isNull()) + return nullptr; + + // convert element type + llvm::Type *ElemType = CGM.getTypes().ConvertType(ContainedTy); + + llvm::StringRef TypeName = + ResAttrs.RawBuffer ? "dx.RawBuffer" : "dx.TypedBuffer"; + SmallVector Ints = {/*IsWriteable*/ ResAttrs.ResourceClass == + llvm::dxil::ResourceClass::UAV, + /*IsROV*/ ResAttrs.IsROV}; + if (!ResAttrs.RawBuffer) + Ints.push_back(/*IsSigned*/ ContainedTy->isSignedIntegerType()); + + return llvm::TargetExtType::get(Ctx, TypeName, {ElemType}, Ints); + } + case llvm::dxil::ResourceClass::CBuffer: + llvm_unreachable("dx.CBuffer handles are not implemented yet"); + break; + case llvm::dxil::ResourceClass::Sampler: + llvm_unreachable("dx.Sampler handles are not implemented yet"); + break; + } } } // namespace diff --git a/clang/lib/Sema/HLSLExternalSemaSource.cpp b/clang/lib/Sema/HLSLExternalSemaSource.cpp index 2913d16fca48..5f51047b4d7b 100644 --- a/clang/lib/Sema/HLSLExternalSemaSource.cpp +++ b/clang/lib/Sema/HLSLExternalSemaSource.cpp @@ -208,8 +208,6 @@ struct BuiltinTypeDeclBuilder { BuiltinTypeDeclBuilder &addArraySubscriptOperator(bool IsConst) { if (Record->isCompleteDefinition()) return *this; - assert(Fields.count("h") > 0 && - "Subscript operator must be added after the handle."); ASTContext &AST = Record->getASTContext(); QualType ElemTy = AST.Char8Ty; diff --git a/clang/lib/Sema/SemaLookup.cpp b/clang/lib/Sema/SemaLookup.cpp index f3f62474d064..e5db11369221 100644 --- a/clang/lib/Sema/SemaLookup.cpp +++ b/clang/lib/Sema/SemaLookup.cpp @@ -3215,6 +3215,9 @@ addAssociatedClassesAndNamespaces(AssociatedLookup &Result, QualType Ty) { // Array parameter types are treated as fundamental types. 
case Type::ArrayParameter: break; + + case Type::HLSLAttributedResource: + T = cast(T)->getWrappedType().getTypePtr(); } if (Queue.empty()) diff --git a/clang/lib/Sema/SemaOverload.cpp b/clang/lib/Sema/SemaOverload.cpp index 1205e85b4e6f..7b86299561a3 100644 --- a/clang/lib/Sema/SemaOverload.cpp +++ b/clang/lib/Sema/SemaOverload.cpp @@ -1798,6 +1798,23 @@ TryImplicitConversion(Sema &S, Expr *From, QualType ToType, return ICS; } + if (S.getLangOpts().HLSL && ToType->isHLSLAttributedResourceType() && + FromType->isHLSLAttributedResourceType()) { + auto *ToResType = cast(ToType); + auto *FromResType = cast(FromType); + if (S.Context.hasSameUnqualifiedType(ToResType->getWrappedType(), + FromResType->getWrappedType()) && + S.Context.hasSameUnqualifiedType(ToResType->getContainedType(), + FromResType->getContainedType()) && + ToResType->getAttrs() == FromResType->getAttrs()) { + ICS.setStandard(); + ICS.Standard.setAsIdentityConversion(); + ICS.Standard.setFromType(FromType); + ICS.Standard.setAllToTypes(ToType); + return ICS; + } + } + return TryUserDefinedConversion(S, From, ToType, SuppressUserConversions, AllowExplicit, InOverloadResolution, CStyle, AllowObjCWritebackConversion, diff --git a/clang/lib/Sema/SemaTemplate.cpp b/clang/lib/Sema/SemaTemplate.cpp index 294eb8e3353c..62f13610b528 100644 --- a/clang/lib/Sema/SemaTemplate.cpp +++ b/clang/lib/Sema/SemaTemplate.cpp @@ -6074,6 +6074,13 @@ bool UnnamedLocalNoLinkageFinder::VisitNestedNameSpecifier( llvm_unreachable("Invalid NestedNameSpecifier::Kind!"); } +bool UnnamedLocalNoLinkageFinder::VisitHLSLAttributedResourceType( + const HLSLAttributedResourceType *T) { + if (T->hasContainedType() && Visit(T->getContainedType())) + return true; + return Visit(T->getWrappedType()); +} + bool Sema::CheckTemplateArgument(TypeSourceInfo *ArgInfo) { assert(ArgInfo && "invalid TypeSourceInfo"); QualType Arg = ArgInfo->getType(); diff --git a/clang/lib/Sema/SemaTemplateDeduction.cpp b/clang/lib/Sema/SemaTemplateDeduction.cpp index 7cfb8d687c79..db1d7fa23713 100644 --- a/clang/lib/Sema/SemaTemplateDeduction.cpp +++ b/clang/lib/Sema/SemaTemplateDeduction.cpp @@ -2449,6 +2449,7 @@ static TemplateDeductionResult DeduceTemplateArgumentsByTypeMatch( case Type::PackExpansion: case Type::Pipe: case Type::ArrayParameter: + case Type::HLSLAttributedResource: // No template argument deduction for these types return TemplateDeductionResult::Success; @@ -6844,6 +6845,16 @@ MarkUsedTemplateParameters(ASTContext &Ctx, QualType T, OnlyDeduced, Depth, Used); break; + case Type::HLSLAttributedResource: + MarkUsedTemplateParameters( + Ctx, cast(T)->getWrappedType(), OnlyDeduced, + Depth, Used); + if (cast(T)->hasContainedType()) + MarkUsedTemplateParameters( + Ctx, cast(T)->getContainedType(), + OnlyDeduced, Depth, Used); + break; + // None of these types have any template parameters in them. 
case Type::Builtin: case Type::VariableArray: diff --git a/clang/test/AST/HLSL/RWBuffer-AST.hlsl b/clang/test/AST/HLSL/RWBuffer-AST.hlsl index 55c0dfa2eaa5..e6ce73dbd962 100644 --- a/clang/test/AST/HLSL/RWBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/RWBuffer-AST.hlsl @@ -32,7 +32,6 @@ RWBuffer Buffer; // CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'__hlsl_resource_t' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer // CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' @@ -59,5 +58,4 @@ RWBuffer Buffer; // CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK-SAME: ':'__hlsl_resource_t' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl index b31db8ce59f2..030fcfc31691 100644 --- a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl @@ -34,7 +34,6 @@ StructuredBuffer Buffer; // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-SAME: ':'__hlsl_resource_t' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer // CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' @@ -62,5 +61,4 @@ StructuredBuffer Buffer; // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK-SAME: ':'__hlsl_resource_t' // CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl index eca4f1598fd6..fa81b53fd9bd 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-elementtype.hlsl @@ -1,5 +1,23 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s +// NOTE: The number in the type name and whether the struct is packed or not will most +// likely change once subscript operators are properly implemented (llvm/llvm-project#95956) +// and the interim field of the contained type is removed.
+ + // CHECK: %"class.hlsl::RWBuffer" = type <{ target("dx.TypedBuffer", i16, 1, 0, 1) // CHECK: %"class.hlsl::RWBuffer.0" = type <{ target("dx.TypedBuffer", i16, 1, 0, 0) // CHECK: %"class.hlsl::RWBuffer.2" = type { target("dx.TypedBuffer", i32, 1, 0, 1) // CHECK: %"class.hlsl::RWBuffer.3" = type { target("dx.TypedBuffer", i32, 1, 0, 0) // CHECK: %"class.hlsl::RWBuffer.4" = type { target("dx.TypedBuffer", i64, 1, 0, 1) // CHECK: %"class.hlsl::RWBuffer.5" = type { target("dx.TypedBuffer", i64, 1, 0, 0) // CHECK: %"class.hlsl::RWBuffer.6" = type <{ target("dx.TypedBuffer", half, 1, 0, 0) // CHECK: %"class.hlsl::RWBuffer.8" = type { target("dx.TypedBuffer", float, 1, 0, 0) // CHECK: %"class.hlsl::RWBuffer.9" = type { target("dx.TypedBuffer", double, 1, 0, 0) // CHECK: %"class.hlsl::RWBuffer.10" = type { target("dx.TypedBuffer", <4 x i16>, 1, 0, 0) // CHECK: %"class.hlsl::RWBuffer.11" = type { target("dx.TypedBuffer", <3 x i32>, 1, 0, 0) // CHECK: %"class.hlsl::RWBuffer.12" = type { target("dx.TypedBuffer", <2 x half>, 1, 0, 0) // CHECK: %"class.hlsl::RWBuffer.13" = type { target("dx.TypedBuffer", <3 x float>, 1, 0, 0) + RWBuffer BufI16; RWBuffer BufU16; RWBuffer BufI32; diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl index 326885efbeea..a99c7f98a1af 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl @@ -1,5 +1,23 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s +// NOTE: The number in the type name and whether the struct is packed or not will most +// likely change once subscript operators are properly implemented (llvm/llvm-project#95956) +// and the interim field of the contained type is removed.
+ +// CHECK: %"class.hlsl::StructuredBuffer" = type <{ target("dx.RawBuffer", i16, 1, 0) +// CHECK: %"class.hlsl::StructuredBuffer.0" = type <{ target("dx.RawBuffer", i16, 1, 0) +// CHECK: %"class.hlsl::StructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 0) +// CHECK: %"class.hlsl::StructuredBuffer.3" = type { target("dx.RawBuffer", i32, 1, 0) +// CHECK: %"class.hlsl::StructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 0) +// CHECK: %"class.hlsl::StructuredBuffer.5" = type { target("dx.RawBuffer", i64, 1, 0) +// CHECK: %"class.hlsl::StructuredBuffer.6" = type <{ target("dx.RawBuffer", half, 1, 0) +// CHECK: %"class.hlsl::StructuredBuffer.8" = type { target("dx.RawBuffer", float, 1, 0) +// CHECK: %"class.hlsl::StructuredBuffer.9" = type { target("dx.RawBuffer", double, 1, 0) +// CHECK: %"class.hlsl::StructuredBuffer.10" = type { target("dx.RawBuffer", <4 x i16>, 1, 0) +// CHECK: %"class.hlsl::StructuredBuffer.11" = type { target("dx.RawBuffer", <3 x i32>, 1, 0) +// CHECK: %"class.hlsl::StructuredBuffer.12" = type { target("dx.RawBuffer", <2 x half>, 1, 0) +// CHECK: %"class.hlsl::StructuredBuffer.13" = type { target("dx.RawBuffer", <3 x float>, 1, 0) + StructuredBuffer BufI16; StructuredBuffer BufU16; StructuredBuffer BufI32; diff --git a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl index e735a85b589f..6751cf2703ce 100644 --- a/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl +++ b/clang/test/CodeGenHLSL/builtins/hlsl_resource_t.hlsl @@ -1,9 +1,53 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -O1 -o - %s | FileCheck %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -finclude-default-header -x hlsl -emit-llvm -o - %s | FileCheck %s -void foo(__hlsl_resource_t res); +using handle_float_t = __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]]; -// CHECK: define void @_Z3baru17__hlsl_resource_t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %[[PARAM:[a-zA-Z0-9]+]]) -// CHECK: call void @_Z3foou17__hlsl_resource_t(target("dx.TypedBuffer", <4 x float>, 1, 0, 0) %[[PARAM]]) -void bar(__hlsl_resource_t a) { - foo(a); +// CHECK: %"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", <4 x float>, 1, 0, 0) +// CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", %struct.MyStruct = type { <4 x float>, <2 x i32>, [8 x i8] }, 1, 0) + +// CHECK: define void @_Z2faU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a) +// CHECK: call void @_Z4foo1U9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %0) +// CHECK: declare void @_Z4foo1U9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0)) + +void foo1(handle_float_t res); + +void fa(handle_float_t a) { + foo1(a); +} + +// CHECK: define void @_Z2fbU9_Res_u_CTfu17__hlsl_resource_t(target("dx.TypedBuffer", float, 1, 0, 0) %a) +void fb(handle_float_t a) { + handle_float_t b = a; } + +// CHECK: define void @_Z2fcN4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 16 %a) +// CHECK: call void @_Z4foo2N4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 16 %agg.tmp) +// CHECK: declare void @_Z4foo2N4hlsl8RWBufferIDv4_fEE(ptr noundef byval(%"class.hlsl::RWBuffer") align 16) +void foo2(RWBuffer buf); + +void fc(RWBuffer a) { + foo2(a); +} + +void fd(RWBuffer a) { + RWBuffer b = a; +} + +struct MyStruct { + float4 f; + int2 i; +}; + +// CHECK: define void 
@_Z2feN4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 16 %a) +// CHECK: call void @_Z4foo3N4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 16 %agg.tmp) +// CHECK: declare void @_Z4foo3N4hlsl16StructuredBufferI8MyStructEE(ptr noundef byval(%"class.hlsl::StructuredBuffer") align 16) +void foo3(StructuredBuffer buf); + +void fe(StructuredBuffer a) { + foo3(a); +} + +void ff(StructuredBuffer a) { + StructuredBuffer b = a; +} + diff --git a/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl b/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl index 2b7bd3102322..5a72aa242e58 100644 --- a/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl @@ -2,27 +2,24 @@ typedef vector float4; -// CHECK: -TypeAliasDecl 0x{{[0-9a-f]+}} +// CHECK: -TypeAliasDecl 0x{{[0-9a-f]+}} // CHECK: -HLSLAttributedResourceType 0x{{[0-9a-f]+}} '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]] -// CHECK-SAME: ' sugar using ResourceIntAliasT = __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(int)]]; ResourceIntAliasT h1; -// CHECK: -VarDecl 0x{{[0-9a-f]+}} col:82 h2 '__hlsl_resource_t +// CHECK: -VarDecl 0x{{[0-9a-f]+}} col:82 h2 '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float4)]] -// CHECK-SAME: ':'__hlsl_resource_t' __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float4)]] h2; -// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 7]]:30 S +// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 6]]:30 S // CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:20 referenced typename depth 0 index 0 T -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 5]]:30 struct S definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:79 h '__hlsl_resource_t +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:30 struct S definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:79 h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(T)]] -// CHECK-SAME: ':'__hlsl_resource_t' template struct S { __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(T)]] h; }; diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl index fdf2aacf4a4d..836d129c8d00 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl @@ -1,25 +1,22 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::is_rov]] -// CHECK-SAME: ':'__hlsl_resource_t' struct MyBuffer { __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] h; }; -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] // CHECK-SAME{LITERAL}: [[hlsl::is_rov]] -// CHECK-SAME: ':'__hlsl_resource_t' __hlsl_resource_t [[hlsl::is_rov]] [[hlsl::resource_class(SRV)]] res; -// CHECK: FunctionDecl 0x{{[0-9a-f]+}} 
line:[[# @LINE + 5]]:6 f 'void () +// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void () // CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]] // CHECK-SAME{LITERAL}: [[hlsl::is_rov]] -// CHECK-SAME: ':'__hlsl_resource_t' void f() { __hlsl_resource_t [[hlsl::resource_class(Sampler)]] [[hlsl::is_rov]] r; } diff --git a/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl b/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl index 71bf300ee7ae..84c924eec24e 100644 --- a/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl @@ -1,25 +1,22 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:72 h1 '__hlsl_resource_t +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:72 h1 '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME: ':'__hlsl_resource_t' struct MyBuffer { __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] h1; }; -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:70 h2 '__hlsl_resource_t +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:70 h2 '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME: ':'__hlsl_resource_t' __hlsl_resource_t [[hlsl::raw_buffer]] [[hlsl::resource_class(SRV)]] h2; -// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 5]]:6 f 'void () +// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void () // CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 h3 '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME: ':'__hlsl_resource_t' void f() { __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] h3; } diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl index 305fd95ab1eb..fbada8b4b99f 100644 --- a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl @@ -1,33 +1,29 @@ // RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME: ':'__hlsl_resource_t' struct MyBuffer { __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; }; -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:49 res '__hlsl_resource_t +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:49 res '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] -// CHECK-SAME: ':'__hlsl_resource_t' __hlsl_resource_t [[hlsl::resource_class(SRV)]] res; -// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void () +// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 3]]:6 f 'void () // CHECK: VarDecl 0x{{[0-9a-f]+}} col:55 r '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]] -// CHECK-SAME: ':'__hlsl_resource_t' void f() { __hlsl_resource_t [[hlsl::resource_class(Sampler)]] r; } -// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 7]]:29 MyBuffer2 +// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 6]]:29 MyBuffer2 // CHECK: 
TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:19 typename depth 0 index 0 T -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 5]]:29 struct MyBuffer2 definition +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:29 struct MyBuffer2 definition // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME: ':'__hlsl_resource_t' template struct MyBuffer2 { __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; }; @@ -38,5 +34,4 @@ template struct MyBuffer2 { // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME: ':'__hlsl_resource_t' MyBuffer2 myBuffer2; diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl index e7d19c3da721..38d27bc21e4a 100644 --- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl @@ -6,7 +6,6 @@ // CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit h '__hlsl_resource_t // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK-SAME: ':'__hlsl_resource_t' // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer RWBuffer Buffer1; @@ -18,6 +17,5 @@ RWBuffer Buffer1; // CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)] // CHECK-SAME{LITERAL}: [[hlsl::is_rov]] // CHECK-SAME{LITERAL}: [[hlsl::contained_type(vector)]] -// CHECK-SAME: ':'__hlsl_resource_t' // CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer RasterizerOrderedBuffer > BufferArray3[4]; -- GitLab From dd47920ce97e7db1ddeec34acdd9cb0ca7dcd7c4 Mon Sep 17 00:00:00 2001 From: higher-performance Date: Tue, 15 Oct 2024 16:41:52 -0400 Subject: [PATCH 027/329] Make [[clang::lifetimebound]] work for expressions coming from default arguments (#112047) Fixes #68596. --- .../clang/Basic/DiagnosticSemaKinds.td | 2 ++ clang/lib/Sema/CheckExprLifetime.cpp | 29 ++++++++++++++--- clang/test/SemaCXX/attr-lifetimebound.cpp | 31 +++++++++++++++++++ 3 files changed, 58 insertions(+), 4 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index e78acc8dc8c5..e9111394bcd3 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -10124,6 +10124,8 @@ def note_lambda_capture_initializer : Note< " via initialization of lambda capture %0}1">; def note_init_with_default_member_initializer : Note< "initializing field %0 with default member initializer">; +def note_init_with_default_argument : Note< + "initializing parameter %0 with default argument">; // Check for initializing a member variable with the address or a reference to // a constructor parameter. 
diff --git a/clang/lib/Sema/CheckExprLifetime.cpp b/clang/lib/Sema/CheckExprLifetime.cpp index 9b3894767d86..8caeae5fcf9f 100644 --- a/clang/lib/Sema/CheckExprLifetime.cpp +++ b/clang/lib/Sema/CheckExprLifetime.cpp @@ -198,6 +198,7 @@ struct IndirectLocalPathEntry { GslReferenceInit, GslPointerInit, GslPointerAssignment, + DefaultArg, } Kind; Expr *E; union { @@ -609,15 +610,22 @@ static void visitFunctionCallArguments(IndirectLocalPath &Path, Expr *Call, for (unsigned I = 0, N = std::min(Callee->getNumParams(), Args.size()); I != N; ++I) { + Expr *Arg = Args[I]; + RevertToOldSizeRAII RAII(Path); + if (auto *DAE = dyn_cast(Arg)) { + Path.push_back( + {IndirectLocalPathEntry::DefaultArg, DAE, DAE->getParam()}); + Arg = DAE->getExpr(); + } if (CheckCoroCall || Callee->getParamDecl(I)->hasAttr()) - VisitLifetimeBoundArg(Callee->getParamDecl(I), Args[I]); + VisitLifetimeBoundArg(Callee->getParamDecl(I), Arg); else if (EnableGSLAnalysis && I == 0) { // Perform GSL analysis for the first argument if (shouldTrackFirstArgument(Callee)) { - VisitGSLPointerArg(Callee, Args[0]); + VisitGSLPointerArg(Callee, Arg); } else if (auto *Ctor = dyn_cast(Call); Ctor && shouldTrackFirstArgumentForConstructor(Ctor)) { - VisitGSLPointerArg(Ctor->getConstructor(), Args[0]); + VisitGSLPointerArg(Ctor->getConstructor(), Arg); } } } @@ -1060,6 +1068,9 @@ static SourceRange nextPathEntryRange(const IndirectLocalPath &Path, unsigned I, if (!Path[I].Capture->capturesVariable()) continue; return Path[I].E->getSourceRange(); + + case IndirectLocalPathEntry::DefaultArg: + return cast(Path[I].E)->getUsedLocation(); } } return E->getSourceRange(); @@ -1370,7 +1381,7 @@ static void checkExprLifetimeImpl(Sema &SemaRef, break; } - case IndirectLocalPathEntry::LambdaCaptureInit: + case IndirectLocalPathEntry::LambdaCaptureInit: { if (!Elem.Capture->capturesVariable()) break; // FIXME: We can't easily tell apart an init-capture from a nested @@ -1383,6 +1394,16 @@ static void checkExprLifetimeImpl(Sema &SemaRef, << nextPathEntryRange(Path, I + 1, L); break; } + + case IndirectLocalPathEntry::DefaultArg: { + const auto *DAE = cast(Elem.E); + const ParmVarDecl *Param = DAE->getParam(); + SemaRef.Diag(Param->getDefaultArgRange().getBegin(), + diag::note_init_with_default_argument) + << Param << nextPathEntryRange(Path, I + 1, L); + break; + } + } } // We didn't lifetime-extend, so don't go any further; we don't need more diff --git a/clang/test/SemaCXX/attr-lifetimebound.cpp b/clang/test/SemaCXX/attr-lifetimebound.cpp index bdc581719173..d04bbb32433f 100644 --- a/clang/test/SemaCXX/attr-lifetimebound.cpp +++ b/clang/test/SemaCXX/attr-lifetimebound.cpp @@ -107,6 +107,37 @@ namespace std { using std::operator""s; using std::operator""sv; +namespace default_args { + using IntArray = int[]; + const int *defaultparam1(const int &def1 [[clang::lifetimebound]] = 0); // #def1 + const int &defaultparam_array([[clang::lifetimebound]] const int *p = IntArray{1, 2, 3}); // #def2 + struct A { + A(const char *, const int &def3 [[clang::lifetimebound]] = 0); // #def3 + }; + const int &defaultparam2(const int &def4 [[clang::lifetimebound]] = 0); // #def4 + const int &defaultparam3(const int &def5 [[clang::lifetimebound]] = defaultparam2(), const int &def6 [[clang::lifetimebound]] = 0); // #def5 #def6 + std::string_view defaultparam4(std::string_view s [[clang::lifetimebound]] = std::string()); // #def7 + + const int *test_default_args() { + const int *c = defaultparam1(); // expected-warning {{temporary whose address is used as value of local 
variable 'c' will be destroyed at the end of the full-expression}} expected-note@#def1 {{initializing parameter 'def1' with default argument}}
+    A a = A(""); // expected-warning {{temporary whose address is used as value of local variable 'a' will be destroyed at the end of the full-expression}} expected-note@#def3 {{initializing parameter 'def3' with default argument}}
+    const int &s = defaultparam2(); // expected-warning {{temporary bound to local reference 's' will be destroyed at the end of the full-expression}} expected-note@#def4 {{initializing parameter 'def4' with default argument}}
+    const int &t = defaultparam3(); // expected-warning {{temporary bound to local reference 't' will be destroyed at the end of the full-expression}} expected-note@#def4 {{initializing parameter 'def4' with default argument}} expected-note@#def5 {{initializing parameter 'def5' with default argument}} expected-warning {{temporary bound to local reference 't' will be destroyed at the end of the full-expression}} expected-note@#def6 {{initializing parameter 'def6' with default argument}}
+    const int &u = defaultparam_array(); // expected-warning {{temporary bound to local reference 'u' will be destroyed at the end of the full-expression}} expected-note@#def2 {{initializing parameter 'p' with default argument}}
+    int local;
+    const int &v = defaultparam2(local); // no warning
+    const int &w = defaultparam2(1); // expected-warning {{temporary bound to local reference 'w' will be destroyed at the end of the full-expression}}
+    if (false) {
+      return &defaultparam2(); // expected-warning {{returning address of local temporary object}}
+    }
+    if (false) {
+      return &defaultparam2(0); // expected-warning {{returning address of local temporary object}} expected-note@#def4 {{initializing parameter 'def4' with default argument}}
+    }
+    std::string_view sv = defaultparam4(); // expected-warning {{temporary whose address is used as value of local variable 'sv' will be destroyed at the end of the full-expression}} expected-note@#def7 {{initializing parameter 's' with default argument}}
+    return nullptr;
+  }
+} // namespace default_args
+
 namespace p0936r0_examples {
   std::string_view s = "foo"s; // expected-warning {{temporary}}
-- 
GitLab

From 34cdd67c854ba5a7ec557291861f948ef674375f Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Tue, 15 Oct 2024 21:48:15 +0100
Subject: [PATCH 028/329] [VPlan] Use VPWidenIntrinsicRecipe for vp.select.
 (#110489)

Use VPWidenIntrinsicRecipe (https://github.com/llvm/llvm-project/pull/110486)
to create vp.select intrinsics. This potentially offers an alternative to
duplicating EVL recipes for all existing recipes.

There are some recipes that will need duplicates (at least at the moment), due
to extra code-gen needs (e.g. widening loads and stores). But in cases where
the intrinsic can be used directly, creating the widened intrinsic directly
reduces the need to duplicate some recipes.
PR: https://github.com/llvm/llvm-project/pull/110489
---
 llvm/lib/Transforms/Vectorize/VPlan.h         | 12 ++++
 .../Transforms/Vectorize/VPlanAnalysis.cpp    |  2 +
 .../lib/Transforms/Vectorize/VPlanRecipes.cpp |  9 ++-
 .../Transforms/Vectorize/VPlanTransforms.cpp  |  9 +++
 .../Transforms/Vectorize/VPlanVerifier.cpp    |  4 ++
 ...rize-force-tail-with-evl-cond-reduction.ll |  2 +-
 .../RISCV/vplan-vp-select-intrinsics.ll       | 65 +++++++++++++++++++
 7 files changed, 101 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index ec91d08db5ba..4cef47e69f0e 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1669,6 +1669,16 @@ public:
         MayWriteToMemory(CI.mayWriteToMemory()),
         MayHaveSideEffects(CI.mayHaveSideEffects()) {}

+  VPWidenIntrinsicRecipe(Intrinsic::ID VectorIntrinsicID,
+                         ArrayRef<VPValue *> CallArguments, Type *Ty,
+                         bool MayReadFromMemory, bool MayWriteToMemory,
+                         bool MayHaveSideEffects, DebugLoc DL = {})
+      : VPRecipeWithIRFlags(VPDef::VPWidenIntrinsicSC, CallArguments),
+        VectorIntrinsicID(VectorIntrinsicID), ResultTy(Ty),
+        MayReadFromMemory(MayReadFromMemory),
+        MayWriteToMemory(MayWriteToMemory),
+        MayHaveSideEffects(MayHaveSideEffects) {}
+
   ~VPWidenIntrinsicRecipe() override = default;

   VPWidenIntrinsicRecipe *clone() override {
@@ -1706,6 +1716,8 @@ public:
   void print(raw_ostream &O, const Twine &Indent,
              VPSlotTracker &SlotTracker) const override;
 #endif
+
+  bool onlyFirstLaneUsed(const VPValue *Op) const override;
 };

 /// A recipe for widening Call instructions using library calls.
diff --git a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
index 5a5b3ac19c46..3eb5f3f40f84 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanAnalysis.cpp
@@ -61,6 +61,8 @@ Type *VPTypeAnalysis::inferScalarTypeForRecipe(const VPInstruction *R) {
   case Instruction::ICmp:
   case VPInstruction::ActiveLaneMask:
     return inferScalarType(R->getOperand(1));
+  case VPInstruction::ExplicitVectorLength:
+    return Type::getIntNTy(Ctx, 32);
   case VPInstruction::FirstOrderRecurrenceSplice:
   case VPInstruction::Not:
     return SetResultTyFromOp();
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
index b3befce6c92c..0f90166528a4 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp
@@ -79,7 +79,6 @@ bool VPRecipeBase::mayWriteToMemory() const {
     return !cast<VPWidenCallRecipe>(this)
                 ->getCalledScalarFunction()
                 ->onlyReadsMemory();
-
   case VPWidenIntrinsicSC:
     return cast<VPWidenIntrinsicRecipe>(this)->mayWriteToMemory();
   case VPBranchOnMaskSC:
   case VPScalarIVStepsSC:
@@ -1042,6 +1041,14 @@ StringRef VPWidenIntrinsicRecipe::getIntrinsicName() const {
   return Intrinsic::getBaseName(VectorIntrinsicID);
 }

+bool VPWidenIntrinsicRecipe::onlyFirstLaneUsed(const VPValue *Op) const {
+  assert(is_contained(operands(), Op) && "Op must be an operand of the recipe");
+  // Vector predication intrinsics only demand the first lane of the last
+  // operand (the EVL operand).
+ return VPIntrinsic::isVPIntrinsic(VectorIntrinsicID) && + Op == getOperand(getNumOperands() - 1); +} + #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) void VPWidenIntrinsicRecipe::print(raw_ostream &O, const Twine &Indent, VPSlotTracker &SlotTracker) const { diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp index 379bfc0a4394..4443a7be4ad4 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp @@ -1353,6 +1353,7 @@ void VPlanTransforms::addActiveLaneMask( /// Replace recipes with their EVL variants. static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { SmallVector HeaderMasks = collectAllHeaderMasks(Plan); + VPTypeAnalysis TypeInfo(Plan.getCanonicalIV()->getScalarType()); for (VPValue *HeaderMask : collectAllHeaderMasks(Plan)) { for (VPUser *U : collectUsersRecursively(HeaderMask)) { auto *CurRecipe = dyn_cast(U); @@ -1384,6 +1385,14 @@ static void transformRecipestoEVLRecipes(VPlan &Plan, VPValue &EVL) { VPValue *NewMask = GetNewMask(Red->getCondOp()); return new VPReductionEVLRecipe(*Red, EVL, NewMask); }) + .Case([&](VPWidenSelectRecipe *Sel) { + SmallVector Ops(Sel->operands()); + Ops.push_back(&EVL); + return new VPWidenIntrinsicRecipe(Intrinsic::vp_select, Ops, + TypeInfo.inferScalarType(Sel), + false, false, false); + }) + .Default([&](VPRecipeBase *R) { return nullptr; }); if (!NewRecipe) diff --git a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp index 99bc4c38a3c3..7ea5ee341cc5 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanVerifier.cpp @@ -138,6 +138,10 @@ bool VPlanVerifier::verifyEVLRecipe(const VPInstruction &EVL) const { }; for (const VPUser *U : EVL.users()) { if (!TypeSwitch(U) + .Case( + [&](const VPWidenIntrinsicRecipe *S) { + return VerifyEVLUse(*S, S->getNumOperands() - 1); + }) .Case([&](const VPWidenStoreEVLRecipe *S) { return VerifyEVLUse(*S, 2); }) diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll index 41796e848632..fc12dd54f88d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vectorize-force-tail-with-evl-cond-reduction.ll @@ -70,7 +70,7 @@ define i32 @cond_add(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: [[TMP18:%.*]] = getelementptr inbounds i32, ptr [[TMP17]], i32 0 ; IF-EVL-INLOOP-NEXT: [[VP_OP_LOAD:%.*]] = call @llvm.vp.load.nxv4i32.p0(ptr align 4 [[TMP18]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-INLOOP-NEXT: [[TMP19:%.*]] = icmp sgt [[VP_OP_LOAD]], shufflevector ( insertelement ( poison, i32 3, i64 0), poison, zeroinitializer) -; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = select [[TMP19]], [[VP_OP_LOAD]], zeroinitializer +; IF-EVL-INLOOP-NEXT: [[TMP20:%.*]] = call @llvm.vp.select.nxv4i32( [[TMP19]], [[VP_OP_LOAD]], zeroinitializer, i32 [[TMP12]]) ; IF-EVL-INLOOP-NEXT: [[TMP21:%.*]] = call i32 @llvm.vp.reduce.add.nxv4i32(i32 0, [[TMP20]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer), i32 [[TMP12]]) ; IF-EVL-INLOOP-NEXT: [[TMP22]] = add i32 [[TMP21]], [[VEC_PHI]] ; IF-EVL-INLOOP-NEXT: [[TMP23:%.*]] = zext i32 [[TMP12]] to i64 diff --git 
a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll new file mode 100644 index 000000000000..c26ab2017280 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll @@ -0,0 +1,65 @@ +; REQUIRES: asserts + + ; RUN: opt -passes=loop-vectorize -debug-only=loop-vectorize \ + ; RUN: -force-tail-folding-style=data-with-evl \ + ; RUN: -prefer-predicate-over-epilogue=predicate-dont-vectorize \ + ; RUN: -mtriple=riscv64 -mattr=+v -riscv-v-vector-bits-max=128 -disable-output < %s 2>&1 | FileCheck --check-prefix=IF-EVL %s + + define void @vp_select(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { + ; IF-EVL: VPlan 'Final VPlan for VF={vscale x 1,vscale x 2,vscale x 4},UF={1}' { + ; IF-EVL-NEXT: Live-in vp<[[VFUF:%[0-9]+]]> = VF * UF + ; IF-EVL-NEXT: Live-in vp<[[VTC:%[0-9]+]]> = vector-trip-count + ; IF-EVL-NEXT: Live-in ir<%N> = original trip-count + + ; IF-EVL: vector.ph: + ; IF-EVL-NEXT: Successor(s): vector loop + + ; IF-EVL: vector loop: { + ; IF-EVL-NEXT: vector.body: + ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION + ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]> + ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> + ; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> + ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> + ; IF-EVL-NEXT: CLONE ir<[[GEP1:%.+]]> = getelementptr inbounds ir<%b>, vp<[[ST]]> + ; IF-EVL-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> + ; IF-EVL-NEXT: WIDEN ir<[[LD1:%.+]]> = vp.load vp<[[PTR1]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr inbounds ir<%c>, vp<[[ST]]> + ; IF-EVL-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> + ; IF-EVL-NEXT: WIDEN ir<[[LD2:%.+]]> = vp.load vp<[[PTR2]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: WIDEN ir<[[CMP:%.+]]> = icmp sgt ir<[[LD1]]>, ir<[[LD2]]> + ; IF-EVL-NEXT: WIDEN ir<[[SUB:%.+]]> = vp.sub ir<0>, ir<[[LD2]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: WIDEN-INTRINSIC vp<[[SELECT:%.+]]> = call llvm.vp.select(ir<[[CMP]]>, ir<%1>, ir<%2>, vp<[[EVL]]>) + ; IF-EVL-NEXT: WIDEN ir<[[ADD:%.+]]> = vp.add vp<[[SELECT]]>, ir<[[LD1]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> + ; IF-EVL-NEXT: vp<[[PTR3:%.+]]> = vector-pointer ir<[[GEP3]]> + ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]> + ; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 + ; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> + ; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> + ; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> + ; IF-EVL-NEXT: No successors + ; IF-EVL-NEXT: } + + entry: + br label %for.body + + for.body: + %indvars.iv = phi i64 [ %indvars.iv.next, %for.body ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %b, i64 %indvars.iv + %0 = load i32, ptr %arrayidx, align 4 + %arrayidx3 = getelementptr inbounds i32, ptr %c, i64 %indvars.iv + %1 = load i32, ptr %arrayidx3, align 4 + %cmp4 = icmp sgt i32 %0, %1 + %2 = sub i32 0, %1 + %cond.p = select i1 %cmp4, i32 %1, i32 %2 + %cond = add i32 %cond.p, %0 + %arrayidx15 = getelementptr inbounds i32, ptr %a, i64 %indvars.iv + store i32 %cond, ptr %arrayidx15, align 4 + %indvars.iv.next = add nuw nsw i64 %indvars.iv, 1 + %exitcond.not = icmp eq i64 
%indvars.iv.next, %N
+    br i1 %exitcond.not, label %exit, label %for.body
+
+  exit:
+    ret void
+  }
-- 
GitLab

From ba898dba48592b388150a19147ec5c36c0204ae2 Mon Sep 17 00:00:00 2001
From: Jacek Caban
Date: Tue, 15 Oct 2024 22:58:13 +0200
Subject: [PATCH 029/329] [LLD][COFF] Fix handling of weak aliases referencing
 lazy symbols (#112243)

The assumption that a symbol is either `Defined` or `Undefined` does not
always hold. For example, `mangleMaybe` may create a weak alias to a lazy
archive symbol.
---
 lld/COFF/Driver.cpp       |  4 ++--
 lld/COFF/Symbols.cpp      |  8 ++++----
 lld/COFF/Symbols.h        |  5 ++++-
 lld/test/COFF/weak-lazy.s | 18 ++++++++++++++++++
 4 files changed, 28 insertions(+), 7 deletions(-)
 create mode 100644 lld/test/COFF/weak-lazy.s

diff --git a/lld/COFF/Driver.cpp b/lld/COFF/Driver.cpp
index 85a58a367718..12e1ae628112 100644
--- a/lld/COFF/Driver.cpp
+++ b/lld/COFF/Driver.cpp
@@ -1340,7 +1340,7 @@ void LinkerDriver::maybeCreateECExportThunk(StringRef name, Symbol *&sym) {
   if (!sym)
     return;
   if (auto undef = dyn_cast<Undefined>(sym))
-    def = undef->getWeakAlias();
+    def = undef->getDefinedWeakAlias();
   else
     def = dyn_cast<Defined>(sym);
   if (!def)
@@ -1376,7 +1376,7 @@ void LinkerDriver::createECExportThunks() {
       continue;
     Defined *targetSym;
     if (auto undef = dyn_cast<Undefined>(sym))
-      targetSym = undef->getWeakAlias();
+      targetSym = undef->getDefinedWeakAlias();
     else
       targetSym = dyn_cast<Defined>(sym);
     if (!targetSym)
diff --git a/lld/COFF/Symbols.cpp b/lld/COFF/Symbols.cpp
index 567c2b93776c..89f2da02bdcf 100644
--- a/lld/COFF/Symbols.cpp
+++ b/lld/COFF/Symbols.cpp
@@ -112,12 +112,12 @@ DefinedImportThunk::DefinedImportThunk(COFFLinkerContext &ctx, StringRef name,
                                        ImportThunkChunk *chunk)
     : Defined(DefinedImportThunkKind, name), wrappedSym(s), data(chunk) {}

-Defined *Undefined::getWeakAlias() {
+Symbol *Undefined::getWeakAlias() {
   // A weak alias may be a weak alias to another symbol, so check recursively.
   DenseSet<Symbol *> weakChain;
   for (Symbol *a = weakAlias; a; a = cast<Undefined>(a)->weakAlias) {
-    if (auto *d = dyn_cast<Defined>(a))
-      return d;
+    if (!isa<Undefined>(a))
+      return a;
     if (!weakChain.insert(a).second)
       break; // We have a cycle.
   }
@@ -125,7 +125,7 @@
 }

 bool Undefined::resolveWeakAlias() {
-  Defined *d = getWeakAlias();
+  Defined *d = getDefinedWeakAlias();
   if (!d)
     return false;
diff --git a/lld/COFF/Symbols.h b/lld/COFF/Symbols.h
index 9b21e09bf83a..a898ebf05fd8 100644
--- a/lld/COFF/Symbols.h
+++ b/lld/COFF/Symbols.h
@@ -340,7 +340,10 @@ public:
   // If this symbol is external weak, try to resolve it to a defined
   // symbol by searching the chain of fallback symbols. Returns the symbol if
   // successful, otherwise returns null.
-  Defined *getWeakAlias();
+  Symbol *getWeakAlias();
+  Defined *getDefinedWeakAlias() {
+    return dyn_cast_or_null<Defined>(getWeakAlias());
+  }

   // If this symbol is external weak, replace this object with aliased symbol.
   bool resolveWeakAlias();
diff --git a/lld/test/COFF/weak-lazy.s b/lld/test/COFF/weak-lazy.s
new file mode 100644
index 000000000000..2812ba7af8b5
--- /dev/null
+++ b/lld/test/COFF/weak-lazy.s
@@ -0,0 +1,18 @@
+# REQUIRES: x86
+
+# RUN: llvm-mc -filetype=obj -triple=i686-windows %s -o %t.obj
+# RUN: llvm-lib -machine:x86 -out:%t-func.lib %t.obj
+
+# -export:func creates a weak alias to a lazy symbol. Make sure we can handle that when processing -export:func2=func.
+# RUN: lld-link -dll -noentry -machine:x86 -out:%t.dll %t-func.lib -export:func -export:func2=func + + .text + .def @feat.00; + .scl 3; + .type 0; + .endef + .globl @feat.00 +.set @feat.00, 1 + .globl _func@0 +_func@0: + retl -- GitLab From 23da16933b8ad48a967905369f576e5ec45b985f Mon Sep 17 00:00:00 2001 From: Lei Huang Date: Tue, 15 Oct 2024 16:58:36 -0400 Subject: [PATCH 030/329] [NFC][PowerPC] Use tablegen's MatchRegisterName() (#111553) Use PPC `MatchRegisterName()` that is auto generated by table gen. --- .../Target/PowerPC/AsmParser/PPCAsmParser.cpp | 54 +++++-------------- llvm/lib/Target/PowerPC/PPC.td | 3 +- 2 files changed, 15 insertions(+), 42 deletions(-) diff --git a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp index cc20ad7822df..bf512481cf64 100644 --- a/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp +++ b/llvm/lib/Target/PowerPC/AsmParser/PPCAsmParser.cpp @@ -1291,6 +1291,9 @@ bool PPCAsmParser::matchAndEmitInstruction(SMLoc IDLoc, unsigned &Opcode, llvm_unreachable("Implement any new match types added!"); } +#define GET_REGISTER_MATCHER +#include "PPCGenAsmMatcher.inc" + MCRegister PPCAsmParser::matchRegisterName(int64_t &IntVal) { if (getParser().getTok().is(AsmToken::Percent)) getParser().Lex(); // Eat the '%'. @@ -1298,55 +1301,25 @@ MCRegister PPCAsmParser::matchRegisterName(int64_t &IntVal) { if (!getParser().getTok().is(AsmToken::Identifier)) return MCRegister(); - MCRegister RegNo; StringRef Name = getParser().getTok().getString(); + MCRegister RegNo = MatchRegisterName(Name); + if (!RegNo) + return RegNo; + + Name.substr(Name.find_first_of("1234567890")).getAsInteger(10, IntVal); + + // MatchRegisterName doesn't seem to have special handling for 64bit vs 32bit + // register types. if (Name.equals_insensitive("lr")) { RegNo = isPPC64() ? PPC::LR8 : PPC::LR; IntVal = 8; } else if (Name.equals_insensitive("ctr")) { RegNo = isPPC64() ? PPC::CTR8 : PPC::CTR; IntVal = 9; - } else if (Name.equals_insensitive("vrsave")) { - RegNo = PPC::VRSAVE; + } else if (Name.equals_insensitive("vrsave")) IntVal = 256; - } else if (Name.starts_with_insensitive("r") && - !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { + else if (Name.starts_with_insensitive("r")) RegNo = isPPC64() ? 
XRegs[IntVal] : RRegs[IntVal]; - } else if (Name.starts_with_insensitive("f") && - !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { - RegNo = FRegs[IntVal]; - } else if (Name.starts_with_insensitive("vs") && - !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 64) { - RegNo = VSRegs[IntVal]; - } else if (Name.starts_with_insensitive("v") && - !Name.substr(1).getAsInteger(10, IntVal) && IntVal < 32) { - RegNo = VRegs[IntVal]; - } else if (Name.starts_with_insensitive("cr") && - !Name.substr(2).getAsInteger(10, IntVal) && IntVal < 8) { - RegNo = CRRegs[IntVal]; - } else if (Name.starts_with_insensitive("acc") && - !Name.substr(3).getAsInteger(10, IntVal) && IntVal < 8) { - RegNo = ACCRegs[IntVal]; - } else if (Name.starts_with_insensitive("wacc_hi") && - !Name.substr(7).getAsInteger(10, IntVal) && IntVal < 8) { - RegNo = ACCRegs[IntVal]; - } else if (Name.starts_with_insensitive("wacc") && - !Name.substr(4).getAsInteger(10, IntVal) && IntVal < 8) { - RegNo = WACCRegs[IntVal]; - } else if (Name.starts_with_insensitive("dmrrowp") && - !Name.substr(7).getAsInteger(10, IntVal) && IntVal < 32) { - RegNo = DMRROWpRegs[IntVal]; - } else if (Name.starts_with_insensitive("dmrrow") && - !Name.substr(6).getAsInteger(10, IntVal) && IntVal < 64) { - RegNo = DMRROWRegs[IntVal]; - } else if (Name.starts_with_insensitive("dmrp") && - !Name.substr(4).getAsInteger(10, IntVal) && IntVal < 4) { - RegNo = DMRROWpRegs[IntVal]; - } else if (Name.starts_with_insensitive("dmr") && - !Name.substr(3).getAsInteger(10, IntVal) && IntVal < 8) { - RegNo = DMRRegs[IntVal]; - } else - return MCRegister(); getParser().Lex(); return RegNo; @@ -1874,7 +1847,6 @@ extern "C" LLVM_EXTERNAL_VISIBILITY void LLVMInitializePowerPCAsmParser() { RegisterMCAsmParser D(getThePPC64LETarget()); } -#define GET_REGISTER_MATCHER #define GET_MATCHER_IMPLEMENTATION #define GET_MNEMONIC_SPELL_CHECKER #include "PPCGenAsmMatcher.inc" diff --git a/llvm/lib/Target/PowerPC/PPC.td b/llvm/lib/Target/PowerPC/PPC.td index da31a993b9c6..72c5909f10c3 100644 --- a/llvm/lib/Target/PowerPC/PPC.td +++ b/llvm/lib/Target/PowerPC/PPC.td @@ -719,7 +719,8 @@ def PPCAsmWriter : AsmWriter { } def PPCAsmParser : AsmParser { - let ShouldEmitMatchRegisterName = 0; + let ShouldEmitMatchRegisterName = 1; + let AllowDuplicateRegisterNames = 1; } def PPCAsmParserVariant : AsmParserVariant { -- GitLab From 85880140be35cdcdcad53cbb7255a85d5634af88 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Tue, 15 Oct 2024 14:18:37 -0700 Subject: [PATCH 031/329] [flang][cuda] Add kernel registration in CUF constructor (#112416) Update the CUF constructor with the cuf.register_kernel operations. 
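For context, a minimal CUDA Fortran sketch of the kind of kernels this now
registers (only the kernel names come from the updated test; the rest is
illustrative, not taken from this patch):

    attributes(global) subroutine sub_device1()
    end subroutine
    attributes(global) subroutine sub_device2()
    end subroutine

Each gpu.func marked `kernel` inside the `cuda_device_mod` GPU module gets a
corresponding cuf.register_kernel operation in `__cudaFortranConstructor`,
right after the allocator-registration call.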
---
 .../flang/Optimizer/Transforms/Passes.td      |  2 +-
 flang/lib/Optimizer/Transforms/CMakeLists.txt |  1 +
 .../Transforms/CUFAddConstructor.cpp          | 20 +++++++++++++++++--
 flang/test/Fir/CUDA/cuda-register-func.fir    |  8 ++------
 4 files changed, 22 insertions(+), 9 deletions(-)

diff --git a/flang/include/flang/Optimizer/Transforms/Passes.td b/flang/include/flang/Optimizer/Transforms/Passes.td
index bf75123e8537..af6bd41cbb71 100644
--- a/flang/include/flang/Optimizer/Transforms/Passes.td
+++ b/flang/include/flang/Optimizer/Transforms/Passes.td
@@ -439,7 +439,7 @@ def CufImplicitDeviceGlobal :
 def CUFAddConstructor : Pass<"cuf-add-constructor", "mlir::ModuleOp"> {
   let summary = "Add constructor to register CUDA Fortran allocators";
   let dependentDialects = [
-    "mlir::func::FuncDialect"
+    "cuf::CUFDialect", "mlir::func::FuncDialect"
   ];
 }

diff --git a/flang/lib/Optimizer/Transforms/CMakeLists.txt b/flang/lib/Optimizer/Transforms/CMakeLists.txt
index 5e1a0293e63c..352fe4cbe09e 100644
--- a/flang/lib/Optimizer/Transforms/CMakeLists.txt
+++ b/flang/lib/Optimizer/Transforms/CMakeLists.txt
@@ -49,6 +49,7 @@ add_flang_library(FIRTransforms
   HLFIRDialect
   MLIRAffineUtils
   MLIRFuncDialect
+  MLIRGPUDialect
   MLIRLLVMDialect
   MLIRLLVMCommonConversion
   MLIRMathTransforms
diff --git a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
index 48620fbc5858..3db24226e750 100644
--- a/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
+++ b/flang/lib/Optimizer/Transforms/CUFAddConstructor.cpp
@@ -12,6 +12,7 @@
 #include "flang/Optimizer/Dialect/FIRDialect.h"
 #include "flang/Optimizer/Dialect/FIROpsSupport.h"
 #include "flang/Runtime/entry-names.h"
+#include "mlir/Dialect/GPU/IR/GPUDialect.h"
 #include "mlir/Dialect/LLVMIR/LLVMDialect.h"
 #include "mlir/Pass/Pass.h"
 #include "llvm/ADT/SmallVector.h"
@@ -23,6 +24,8 @@ namespace fir {

 namespace {

+static constexpr llvm::StringRef cudaModName{"cuda_device_mod"};
+
 static constexpr llvm::StringRef cudaFortranCtorName{
     "__cudaFortranConstructor"};

@@ -31,6 +34,7 @@ struct CUFAddConstructor

   void runOnOperation() override {
     mlir::ModuleOp mod = getOperation();
+    mlir::SymbolTable symTab(mod);
     mlir::OpBuilder builder{mod.getBodyRegion()};
     builder.setInsertionPointToEnd(mod.getBody());
     mlir::Location loc = mod.getLoc();
@@ -48,13 +52,25 @@
         mod.getContext(), RTNAME_STRING(CUFRegisterAllocator));
     builder.setInsertionPointToEnd(mod.getBody());

-    // Create the constructor function that cal CUFRegisterAllocator.
-    builder.setInsertionPointToEnd(mod.getBody());
+    // Create the constructor function that calls CUFRegisterAllocator.
     auto func = builder.create<mlir::LLVM::LLVMFuncOp>(loc, cudaFortranCtorName,
                                                        funcTy);
     func.setLinkage(mlir::LLVM::Linkage::Internal);
     builder.setInsertionPointToStart(func.addEntryBlock(builder));
     builder.create<mlir::LLVM::CallOp>(loc, funcTy, cufRegisterAllocatorRef);
+
+    // Register kernels
+    auto gpuMod = symTab.lookup<mlir::gpu::GPUModuleOp>(cudaModName);
+    if (gpuMod) {
+      for (auto func : gpuMod.getOps<mlir::gpu::GPUFuncOp>()) {
+        if (func.isKernel()) {
+          auto kernelName = mlir::SymbolRefAttr::get(
+              builder.getStringAttr(cudaModName),
+              {mlir::SymbolRefAttr::get(builder.getContext(), func.getName())});
+          builder.create<cuf::RegisterKernelOp>(loc, kernelName);
+        }
+      }
+    }
     builder.create<mlir::LLVM::ReturnOp>(loc, mlir::ValueRange{});

     // Create the llvm.global_ctor with the function.
diff --git a/flang/test/Fir/CUDA/cuda-register-func.fir b/flang/test/Fir/CUDA/cuda-register-func.fir
index a428f68eb3bf..277475f0883d 100644
--- a/flang/test/Fir/CUDA/cuda-register-func.fir
+++ b/flang/test/Fir/CUDA/cuda-register-func.fir
@@ -1,4 +1,4 @@
-// RUN: fir-opt %s | FileCheck %s
+// RUN: fir-opt --cuf-add-constructor %s | FileCheck %s

 module attributes {gpu.container_module} {
   gpu.module @cuda_device_mod {
@@ -9,12 +9,8 @@ module attributes {gpu.container_module} {
       gpu.return
     }
   }
-  llvm.func internal @__cudaFortranConstructor() {
-    cuf.register_kernel @cuda_device_mod::@_QPsub_device1
-    cuf.register_kernel @cuda_device_mod::@_QPsub_device2
-    llvm.return
-  }
 }
+// CHECK-LABEL: llvm.func internal @__cudaFortranConstructor()
 // CHECK: cuf.register_kernel @cuda_device_mod::@_QPsub_device1
 // CHECK: cuf.register_kernel @cuda_device_mod::@_QPsub_device2
-- 
GitLab

From 5a9d6841ecaf7863809a8e2f67af55a45f374d36 Mon Sep 17 00:00:00 2001
From: Peter Klausler
Date: Tue, 15 Oct 2024 14:20:48 -0700
Subject: [PATCH 032/329] [flang] Split interoperability warnings, disable
 some by default (#111922)

Type interoperability warnings currently issue for intrinsic types when their
type, kind, or length does not meet the requirements for C interoperability.
This turns out to be too noisy for the case of one-byte characters with
lengths other than one when creating C pointers from C_LOC or C_F_POINTER --
it is not uncommon for programs to use pointers to longer character objects.

So split the interoperability warning so that the case of a known bad
character length for an otherwise interoperable type is controlled by its own
UsageWarning enumerator, and leave that usage warning off by default. This
will better fit expectations in the default case while still showing a warning
under -pedantic.
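A sketch of the new distinction, distilled from the updated c_loc01.f90 test
(declarations are illustrative):

    use iso_c_binding
    character(2), target :: ch        ! default kind, length /= 1
    character(1,4), target :: unicode ! non-interoperable kind
    type(c_ptr) :: cp
    cp = c_loc(ch)      ! now quiet by default; -pedantic still warns
    cp = c_loc(unicode) ! still warned about by default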
--- flang/include/flang/Common/Fortran-features.h | 6 +-- flang/lib/Common/Fortran-features.cpp | 1 + flang/lib/Evaluate/intrinsics.cpp | 45 ++++++++++++++----- flang/test/Semantics/c_f_pointer.f90 | 5 ++- flang/test/Semantics/c_loc01.f90 | 5 ++- 5 files changed, 45 insertions(+), 17 deletions(-) diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h index 3942a7926286..e021df13fe77 100644 --- a/flang/include/flang/Common/Fortran-features.h +++ b/flang/include/flang/Common/Fortran-features.h @@ -63,9 +63,9 @@ ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, F202XAllocatableBreakingChange, OptionalMustBePresent, CommonBlockPadding, LogicalVsCBool, BindCCharLength, ProcDummyArgShapes, ExternalNameConflict, FoldingException, FoldingAvoidsRuntimeCrash, FoldingValueChecks, - FoldingFailure, FoldingLimit, Interoperability, Bounds, Preprocessing, - Scanning, OpenAccUsage, ProcPointerCompatibility, VoidMold, - KnownBadImplicitInterface, EmptyCase, CaseOverflow, CUDAUsage, + FoldingFailure, FoldingLimit, Interoperability, CharacterInteroperability, + Bounds, Preprocessing, Scanning, OpenAccUsage, ProcPointerCompatibility, + VoidMold, KnownBadImplicitInterface, EmptyCase, CaseOverflow, CUDAUsage, IgnoreTKRUsage, ExternalInterfaceMismatch, DefinedOperatorArgs, Final, ZeroDoStep, UnusedForallIndex, OpenMPUsage, ModuleFile, DataLength, IgnoredDirective, HomonymousSpecific, HomonymousResult, diff --git a/flang/lib/Common/Fortran-features.cpp b/flang/lib/Common/Fortran-features.cpp index 59f570e6ab6e..a53f32d74dc3 100644 --- a/flang/lib/Common/Fortran-features.cpp +++ b/flang/lib/Common/Fortran-features.cpp @@ -48,6 +48,7 @@ LanguageFeatureControl::LanguageFeatureControl() { warnUsage_.set(UsageWarning::FoldingFailure); warnUsage_.set(UsageWarning::FoldingLimit); warnUsage_.set(UsageWarning::Interoperability); + // CharacterInteroperability warnings about length are off by default warnUsage_.set(UsageWarning::Bounds); warnUsage_.set(UsageWarning::Preprocessing); warnUsage_.set(UsageWarning::Scanning); diff --git a/flang/lib/Evaluate/intrinsics.cpp b/flang/lib/Evaluate/intrinsics.cpp index 1f48fc21662e..4271faa0db12 100644 --- a/flang/lib/Evaluate/intrinsics.cpp +++ b/flang/lib/Evaluate/intrinsics.cpp @@ -2861,12 +2861,22 @@ IntrinsicProcTable::Implementation::HandleC_F_Pointer( } } else if (!IsInteroperableIntrinsicType( *type, &context.languageFeatures()) - .value_or(true) && - context.languageFeatures().ShouldWarn( - common::UsageWarning::Interoperability)) { - context.messages().Say(common::UsageWarning::Interoperability, at, - "FPTR= argument to C_F_POINTER() should not have the non-interoperable intrinsic type %s"_warn_en_US, - type->AsFortran()); + .value_or(true)) { + if (type->category() == TypeCategory::Character && + type->kind() == 1) { + if (context.languageFeatures().ShouldWarn( + common::UsageWarning::CharacterInteroperability)) { + context.messages().Say( + common::UsageWarning::CharacterInteroperability, at, + "FPTR= argument to C_F_POINTER() should not have the non-interoperable character length %s"_warn_en_US, + type->AsFortran()); + } + } else if (context.languageFeatures().ShouldWarn( + common::UsageWarning::Interoperability)) { + context.messages().Say(common::UsageWarning::Interoperability, at, + "FPTR= argument to C_F_POINTER() should not have the non-interoperable intrinsic type or kind %s"_warn_en_US, + type->AsFortran()); + } } if (ExtractCoarrayRef(*expr)) { context.messages().Say(at, @@ -2963,12 +2973,23 @@ 
std::optional IntrinsicProcTable::Implementation::HandleC_Loc( context.messages().Say(arguments[0]->sourceLocation(), "C_LOC() argument may not be zero-length character"_err_en_US); } else if (typeAndShape->type().category() != TypeCategory::Derived && - !IsInteroperableIntrinsicType(typeAndShape->type()).value_or(true) && - context.languageFeatures().ShouldWarn( - common::UsageWarning::Interoperability)) { - context.messages().Say(common::UsageWarning::Interoperability, - arguments[0]->sourceLocation(), - "C_LOC() argument has non-interoperable intrinsic type, kind, or length"_warn_en_US); + !IsInteroperableIntrinsicType(typeAndShape->type()).value_or(true)) { + if (typeAndShape->type().category() == TypeCategory::Character && + typeAndShape->type().kind() == 1) { + // Default character kind, but length is not known to be 1 + if (context.languageFeatures().ShouldWarn( + common::UsageWarning::CharacterInteroperability)) { + context.messages().Say( + common::UsageWarning::CharacterInteroperability, + arguments[0]->sourceLocation(), + "C_LOC() argument has non-interoperable character length"_warn_en_US); + } + } else if (context.languageFeatures().ShouldWarn( + common::UsageWarning::Interoperability)) { + context.messages().Say(common::UsageWarning::Interoperability, + arguments[0]->sourceLocation(), + "C_LOC() argument has non-interoperable intrinsic type or kind"_warn_en_US); + } } characteristics::DummyDataObject ddo{std::move(*typeAndShape)}; diff --git a/flang/test/Semantics/c_f_pointer.f90 b/flang/test/Semantics/c_f_pointer.f90 index c2529201ee26..0cd0161b1fb0 100644 --- a/flang/test/Semantics/c_f_pointer.f90 +++ b/flang/test/Semantics/c_f_pointer.f90 @@ -18,6 +18,7 @@ program test end type type(notBindCType), pointer :: notBindC character(2), pointer :: c2ptr + character(1,4), pointer :: unicodePtr rankTwoArray = reshape([1, 2, 3, 4], shape(rankTwoArray)) call c_f_pointer(scalarC, scalarIntF) ! ok call c_f_pointer(scalarC, arrayIntF, [1_8]) ! ok @@ -48,6 +49,8 @@ program test call c_f_pointer(scalarC, unlimited) !WARNING: FPTR= argument to C_F_POINTER() should not have a derived type that is not BIND(C) call c_f_pointer(scalarC, notBindC) - !WARNING: FPTR= argument to C_F_POINTER() should not have the non-interoperable intrinsic type CHARACTER(KIND=1,LEN=2_8) + !WARNING: FPTR= argument to C_F_POINTER() should not have the non-interoperable character length CHARACTER(KIND=1,LEN=2_8) call c_f_pointer(scalarC, c2ptr) + !WARNING: FPTR= argument to C_F_POINTER() should not have the non-interoperable intrinsic type or kind CHARACTER(KIND=4,LEN=1_8) + call c_f_pointer(scalarC, unicodePtr) end program diff --git a/flang/test/Semantics/c_loc01.f90 b/flang/test/Semantics/c_loc01.f90 index 9155ff4f4735..abae1e263e2e 100644 --- a/flang/test/Semantics/c_loc01.f90 +++ b/flang/test/Semantics/c_loc01.f90 @@ -21,6 +21,7 @@ module m type(hasLen(*)), target :: nclen integer, intent(in) :: n character(2), target :: ch + character(1,4), target :: unicode real :: arr1(purefun1(c_loc(targ))) ! ok real :: arr2(purefun2(c_funloc(subr))) ! ok character(:), allocatable, target :: deferred @@ -40,8 +41,10 @@ module m cp = c_loc(nclen) !ERROR: C_LOC() argument may not be zero-length character cp = c_loc(ch(2:1)) - !WARNING: C_LOC() argument has non-interoperable intrinsic type, kind, or length + !WARNING: C_LOC() argument has non-interoperable character length cp = c_loc(ch) + !WARNING: C_LOC() argument has non-interoperable intrinsic type or kind + cp = c_loc(unicode) cp = c_loc(ch(1:1)) ! 
ok cp = c_loc(deferred) ! ok cp = c_loc(p2ch) ! ok -- GitLab From a70ffe784da990a791da1e70e86cd877af3924bc Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 15 Oct 2024 14:22:22 -0700 Subject: [PATCH 033/329] [flang] Support "PRINT namelistname" (#112024) Nearly every Fortran compiler supports "PRINT namelistname" as a synonym for "WRITE (*, NML=namelistname)". Implement this extension via parse tree rewriting. Fixes https://github.com/llvm/llvm-project/issues/111738. --- flang/docs/Extensions.md | 2 ++ flang/include/flang/Common/Fortran-features.h | 2 +- flang/lib/Semantics/rewrite-parse-tree.cpp | 27 ++++++++++++++++++- flang/test/Semantics/rewrite02.f90 | 8 ++++++ 4 files changed, 37 insertions(+), 2 deletions(-) create mode 100644 flang/test/Semantics/rewrite02.f90 diff --git a/flang/docs/Extensions.md b/flang/docs/Extensions.md index 3ffd2949e45b..f85a3eb39ed1 100644 --- a/flang/docs/Extensions.md +++ b/flang/docs/Extensions.md @@ -389,6 +389,8 @@ end * A local data object may appear in a specification expression, even when it is not a dummy argument or in COMMON, so long as it is has the SAVE attribute and was initialized. +* `PRINT namelistname` is accepted and interpreted as + `WRITE(*,NML=namelistname)`, a near-universal extension. ### Extensions supported when enabled by options diff --git a/flang/include/flang/Common/Fortran-features.h b/flang/include/flang/Common/Fortran-features.h index e021df13fe77..2b57c7ae5064 100644 --- a/flang/include/flang/Common/Fortran-features.h +++ b/flang/include/flang/Common/Fortran-features.h @@ -53,7 +53,7 @@ ENUM_CLASS(LanguageFeature, BackslashEscapes, OldDebugLines, NonBindCInteroperability, CudaManaged, CudaUnified, PolymorphicActualAllocatableOrPointerToMonomorphicDummy, RelaxedPureDummy, UndefinableAsynchronousOrVolatileActual, AutomaticInMainProgram, PrintCptr, - SavedLocalInSpecExpr) + SavedLocalInSpecExpr, PrintNamelist) // Portability and suspicious usage warnings ENUM_CLASS(UsageWarning, Portability, PointerToUndefinable, diff --git a/flang/lib/Semantics/rewrite-parse-tree.cpp b/flang/lib/Semantics/rewrite-parse-tree.cpp index b4fb72ce2130..c90ae6634284 100644 --- a/flang/lib/Semantics/rewrite-parse-tree.cpp +++ b/flang/lib/Semantics/rewrite-parse-tree.cpp @@ -32,7 +32,7 @@ using namespace parser::literals; class RewriteMutator { public: RewriteMutator(SemanticsContext &context) - : errorOnUnresolvedName_{!context.AnyFatalError()}, + : context_{context}, errorOnUnresolvedName_{!context.AnyFatalError()}, messages_{context.messages()} {} // Default action for a parse tree node is to visit children. 
@@ -42,6 +42,7 @@ public: void Post(parser::Name &); void Post(parser::SpecificationPart &); bool Pre(parser::ExecutionPart &); + bool Pre(parser::ActionStmt &); void Post(parser::ReadStmt &); void Post(parser::WriteStmt &); @@ -66,6 +67,7 @@ public: private: using stmtFuncType = parser::Statement>; + SemanticsContext &context_; bool errorOnUnresolvedName_{true}; parser::Messages &messages_; std::list stmtFuncsToConvert_; @@ -130,6 +132,29 @@ bool RewriteMutator::Pre(parser::ExecutionPart &x) { return true; } +// Rewrite PRINT NML -> WRITE(*,NML=NML) +bool RewriteMutator::Pre(parser::ActionStmt &x) { + if (auto *print{std::get_if>(&x.u)}; + print && + std::get>(print->value().t).empty()) { + auto &format{std::get(print->value().t)}; + if (std::holds_alternative(format.u)) { + if (auto *name{parser::Unwrap(format)}; name && + name->symbol && name->symbol->GetUltimate().has() && + context_.IsEnabled(common::LanguageFeature::PrintNamelist)) { + context_.Warn(common::LanguageFeature::PrintNamelist, name->source, + "nonstandard: namelist in PRINT statement"_port_en_US); + std::list controls; + controls.emplace_back(std::move(*name)); + x.u = common::Indirection::Make( + parser::IoUnit{parser::Star{}}, std::optional{}, + std::move(controls), std::list{}); + } + } + } + return true; +} + // When a namelist group name appears (without NML=) in a READ or WRITE // statement in such a way that it can be misparsed as a format expression, // rewrite the I/O statement's parse tree node as if the namelist group diff --git a/flang/test/Semantics/rewrite02.f90 b/flang/test/Semantics/rewrite02.f90 new file mode 100644 index 000000000000..2393498e65d2 --- /dev/null +++ b/flang/test/Semantics/rewrite02.f90 @@ -0,0 +1,8 @@ +!RUN: %flang_fc1 -fdebug-unparse -pedantic %s 2>&1 | FileCheck %s +!Test rewrite of "PRINT namelistname" into "WRITE(*,NML=namelistname)" +!CHECK: nonstandard: namelist in PRINT statement +namelist /nml/x +x = 123. +!CHECK: WRITE (*, NML=nml) +print nml +end -- GitLab From 35e86245196df1e6a1cf3b023f13f075e2ac2794 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 15 Oct 2024 14:22:48 -0700 Subject: [PATCH 034/329] [flang] Silence impossible error about SMP interface incompatibility (#112054) It is possible for the compiler to emit an impossible error message about dummy argument character length incompatibility in the case of a MODULE SUBROUTINE or FUNCTION defined later in a submodule with MODULE PROCEDURE, when the character length is defined by USE association in its interface. The checking for separate module procedure interface compatibility needs to use a more flexible check than just operator== on a semantics::ParamValue. --- flang/include/flang/Evaluate/tools.h | 6 +++++- flang/include/flang/Semantics/type.h | 1 + flang/lib/Evaluate/tools.cpp | 16 ++++++++++++++-- flang/lib/Evaluate/type.cpp | 5 ++++- flang/lib/Semantics/type.cpp | 6 ++++++ flang/test/Semantics/smp-def01.f90 | 23 +++++++++++++++++++++++ 6 files changed, 53 insertions(+), 4 deletions(-) create mode 100644 flang/test/Semantics/smp-def01.f90 diff --git a/flang/include/flang/Evaluate/tools.h b/flang/include/flang/Evaluate/tools.h index d2887b69cc6d..f547138f5a11 100644 --- a/flang/include/flang/Evaluate/tools.h +++ b/flang/include/flang/Evaluate/tools.h @@ -1252,8 +1252,12 @@ private: // Predicate: should two expressions be considered identical for the purposes // of determining whether two procedure interfaces are compatible, modulo // naming of corresponding dummy arguments? 
-std::optional AreEquivalentInInterface( +template +std::optional AreEquivalentInInterface(const Expr &, const Expr &); +extern template std::optional AreEquivalentInInterface( const Expr &, const Expr &); +extern template std::optional AreEquivalentInInterface( + const Expr &, const Expr &); bool CheckForCoindexedObject(parser::ContextualMessages &, const std::optional &, const std::string &procName, diff --git a/flang/include/flang/Semantics/type.h b/flang/include/flang/Semantics/type.h index e2131e7e160c..1292c381b65f 100644 --- a/flang/include/flang/Semantics/type.h +++ b/flang/include/flang/Semantics/type.h @@ -110,6 +110,7 @@ public: return category_ == that.category_ && expr_ == that.expr_; } bool operator!=(const ParamValue &that) const { return !(*this == that); } + bool IsEquivalentInInterface(const ParamValue &) const; std::string AsFortran() const; private: diff --git a/flang/lib/Evaluate/tools.cpp b/flang/lib/Evaluate/tools.cpp index c2545a870994..4d98220a7065 100644 --- a/flang/lib/Evaluate/tools.cpp +++ b/flang/lib/Evaluate/tools.cpp @@ -1320,8 +1320,10 @@ std::optional> HollerithToBOZ(FoldingContext &context, // Extracts a whole symbol being used as a bound of a dummy argument, // possibly wrapped with parentheses or MAX(0, ...). +// Works with any integer expression. +template const Symbol *GetBoundSymbol(const Expr &); template -static const Symbol *GetBoundSymbol( +const Symbol *GetBoundSymbol( const Expr> &expr) { using T = Type; return common::visit( @@ -1358,9 +1360,15 @@ static const Symbol *GetBoundSymbol( }, expr.u); } +template <> +const Symbol *GetBoundSymbol(const Expr &expr) { + return common::visit( + [](const auto &kindExpr) { return GetBoundSymbol(kindExpr); }, expr.u); +} +template std::optional AreEquivalentInInterface( - const Expr &x, const Expr &y) { + const Expr &x, const Expr &y) { auto xVal{ToInt64(x)}; auto yVal{ToInt64(y)}; if (xVal && yVal) { @@ -1394,6 +1402,10 @@ std::optional AreEquivalentInInterface( return std::nullopt; // not sure } } +template std::optional AreEquivalentInInterface( + const Expr &, const Expr &); +template std::optional AreEquivalentInInterface( + const Expr &, const Expr &); bool CheckForCoindexedObject(parser::ContextualMessages &messages, const std::optional &arg, const std::string &procName, diff --git a/flang/lib/Evaluate/type.cpp b/flang/lib/Evaluate/type.cpp index a1df40667471..c00688853cd0 100644 --- a/flang/lib/Evaluate/type.cpp +++ b/flang/lib/Evaluate/type.cpp @@ -518,7 +518,10 @@ static bool AreSameDerivedType( bool DynamicType::IsEquivalentTo(const DynamicType &that) const { return category_ == that.category_ && kind_ == that.kind_ && - PointeeComparison(charLengthParamValue_, that.charLengthParamValue_) && + (charLengthParamValue_ == that.charLengthParamValue_ || + (charLengthParamValue_ && that.charLengthParamValue_ && + charLengthParamValue_->IsEquivalentInInterface( + *that.charLengthParamValue_))) && knownLength().has_value() == that.knownLength().has_value() && (!knownLength() || *knownLength() == *that.knownLength()) && AreSameDerivedType(derived_, that.derived_); diff --git a/flang/lib/Semantics/type.cpp b/flang/lib/Semantics/type.cpp index e867d7ad6e25..7f5f4e98a7d6 100644 --- a/flang/lib/Semantics/type.cpp +++ b/flang/lib/Semantics/type.cpp @@ -758,6 +758,12 @@ void ParamValue::SetExplicit(SomeIntExpr &&x) { expr_ = std::move(x); } +bool ParamValue::IsEquivalentInInterface(const ParamValue &that) const { + return (category_ == that.category_ && + expr_.has_value() == that.expr_.has_value() && + 
(!expr_ || evaluate::AreEquivalentInInterface(*expr_, *that.expr_))); +} + std::string ParamValue::AsFortran() const { switch (category_) { SWITCH_COVERS_ALL_CASES diff --git a/flang/test/Semantics/smp-def01.f90 b/flang/test/Semantics/smp-def01.f90 new file mode 100644 index 000000000000..7169bba45099 --- /dev/null +++ b/flang/test/Semantics/smp-def01.f90 @@ -0,0 +1,23 @@ +!RUN: %flang -fsyntax-only %s 2>&1 | FileCheck --allow-empty %s +!Ensure no bogus error message about incompatible character length +!CHECK-NOT: error + +module m1 + integer :: n = 1 +end + +module m2 + interface + module subroutine s(a,b) + use m1 + character(n) :: a + character(n) :: b + end + end interface +end + +submodule(m2) m2s1 + contains + module procedure s + end +end -- GitLab From 38b9dd7a7f393f990251c6cc204cfbea05930a0e Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 15 Oct 2024 14:23:15 -0700 Subject: [PATCH 035/329] [flang] Fold ERFC_SCALED (#112287) Move the ErfcScaled template function from the runtime into a new header file in flang/include/Common, then use it in constant folding to implement folding for the erfc_scaled() intrinsic function. --- flang/include/flang/Common/erfc-scaled.h | 116 ++++++++++++++++++++++ flang/lib/Evaluate/intrinsics-library.cpp | 2 + flang/runtime/numeric-templates.h | 102 +------------------ flang/test/Evaluate/fold-erfc-scaled.f90 | 7 ++ 4 files changed, 127 insertions(+), 100 deletions(-) create mode 100644 flang/include/flang/Common/erfc-scaled.h create mode 100644 flang/test/Evaluate/fold-erfc-scaled.f90 diff --git a/flang/include/flang/Common/erfc-scaled.h b/flang/include/flang/Common/erfc-scaled.h new file mode 100644 index 000000000000..a1bf3ea0f092 --- /dev/null +++ b/flang/include/flang/Common/erfc-scaled.h @@ -0,0 +1,116 @@ +//===-- include/flang/Common/erfc-scaled.h-----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef FORTRAN_COMMON_ERFC_SCALED_H_ +#define FORTRAN_COMMON_ERFC_SCALED_H_ + +namespace Fortran::common { +template inline T ErfcScaled(T arg) { + // Coefficients for approximation to erfc in the first interval. + static const T a[5] = {3.16112374387056560e00, 1.13864154151050156e02, + 3.77485237685302021e02, 3.20937758913846947e03, 1.85777706184603153e-1}; + static const T b[4] = {2.36012909523441209e01, 2.44024637934444173e02, + 1.28261652607737228e03, 2.84423683343917062e03}; + + // Coefficients for approximation to erfc in the second interval. + static const T c[9] = {5.64188496988670089e-1, 8.88314979438837594e00, + 6.61191906371416295e01, 2.98635138197400131e02, 8.81952221241769090e02, + 1.71204761263407058e03, 2.05107837782607147e03, 1.23033935479799725e03, + 2.15311535474403846e-8}; + static const T d[8] = {1.57449261107098347e01, 1.17693950891312499e02, + 5.37181101862009858e02, 1.62138957456669019e03, 3.29079923573345963e03, + 4.36261909014324716e03, 3.43936767414372164e03, 1.23033935480374942e03}; + + // Coefficients for approximation to erfc in the third interval. 
+ static const T p[6] = {3.05326634961232344e-1, 3.60344899949804439e-1, + 1.25781726111229246e-1, 1.60837851487422766e-2, 6.58749161529837803e-4, + 1.63153871373020978e-2}; + static const T q[5] = {2.56852019228982242e00, 1.87295284992346047e00, + 5.27905102951428412e-1, 6.05183413124413191e-2, 2.33520497626869185e-3}; + + constexpr T sqrtpi{1.7724538509078120380404576221783883301349L}; + constexpr T rsqrtpi{0.5641895835477562869480794515607725858440L}; + constexpr T epsilonby2{std::numeric_limits::epsilon() * 0.5}; + constexpr T xneg{-26.628e0}; + constexpr T xhuge{6.71e7}; + constexpr T thresh{0.46875e0}; + constexpr T zero{0.0}; + constexpr T one{1.0}; + constexpr T four{4.0}; + constexpr T sixteen{16.0}; + constexpr T xmax{1.0 / (sqrtpi * std::numeric_limits::min())}; + static_assert(xmax > xhuge, "xmax must be greater than xhuge"); + + T ysq; + T xnum; + T xden; + T del; + T result; + + auto x{arg}; + auto y{std::fabs(x)}; + + if (y <= thresh) { + // evaluate erf for |x| <= 0.46875 + ysq = zero; + if (y > epsilonby2) { + ysq = y * y; + } + xnum = a[4] * ysq; + xden = ysq; + for (int i{0}; i < 3; i++) { + xnum = (xnum + a[i]) * ysq; + xden = (xden + b[i]) * ysq; + } + result = x * (xnum + a[3]) / (xden + b[3]); + result = one - result; + result = std::exp(ysq) * result; + return result; + } else if (y <= four) { + // evaluate erfc for 0.46875 < |x| <= 4.0 + xnum = c[8] * y; + xden = y; + for (int i{0}; i < 7; ++i) { + xnum = (xnum + c[i]) * y; + xden = (xden + d[i]) * y; + } + result = (xnum + c[7]) / (xden + d[7]); + } else { + // evaluate erfc for |x| > 4.0 + result = zero; + if (y >= xhuge) { + if (y < xmax) { + result = rsqrtpi / y; + } + } else { + ysq = one / (y * y); + xnum = p[5] * ysq; + xden = ysq; + for (int i{0}; i < 4; ++i) { + xnum = (xnum + p[i]) * ysq; + xden = (xden + q[i]) * ysq; + } + result = ysq * (xnum + p[4]) / (xden + q[4]); + result = (rsqrtpi - result) / y; + } + } + // fix up for negative argument, erf, etc. 
+ if (x < zero) { + if (x < xneg) { + result = std::numeric_limits::max(); + } else { + ysq = trunc(x * sixteen) / sixteen; + del = (x - ysq) * (x + ysq); + y = std::exp((ysq * ysq)) * std::exp((del)); + result = (y + y) - result; + } + } + return result; +} +} // namespace Fortran::common +#endif // FORTRAN_COMMON_ERFC_SCALED_H_ diff --git a/flang/lib/Evaluate/intrinsics-library.cpp b/flang/lib/Evaluate/intrinsics-library.cpp index ce9dd6b7b3df..ee4df2dbd113 100644 --- a/flang/lib/Evaluate/intrinsics-library.cpp +++ b/flang/lib/Evaluate/intrinsics-library.cpp @@ -14,6 +14,7 @@ #include "flang/Evaluate/intrinsics-library.h" #include "fold-implementation.h" #include "host.h" +#include "flang/Common/erfc-scaled.h" #include "flang/Common/static-multimap-view.h" #include "flang/Evaluate/expression.h" #include @@ -231,6 +232,7 @@ struct HostRuntimeLibrary { FolderFactory::Create("cosh"), FolderFactory::Create("erf"), FolderFactory::Create("erfc"), + FolderFactory::Create("erfc_scaled"), FolderFactory::Create("exp"), FolderFactory::Create("gamma"), FolderFactory::Create("log"), diff --git a/flang/runtime/numeric-templates.h b/flang/runtime/numeric-templates.h index 0b00bbb94ddd..fbb371bffc27 100644 --- a/flang/runtime/numeric-templates.h +++ b/flang/runtime/numeric-templates.h @@ -21,6 +21,7 @@ #include "terminator.h" #include "tools.h" #include "flang/Common/api-attrs.h" +#include "flang/Common/erfc-scaled.h" #include "flang/Common/float128.h" #include #include @@ -362,106 +363,7 @@ template inline RT_API_ATTRS T Spacing(T x) { // ERFC_SCALED (16.9.71) template inline RT_API_ATTRS T ErfcScaled(T arg) { - // Coefficients for approximation to erfc in the first interval. - static const T a[5] = {3.16112374387056560e00, 1.13864154151050156e02, - 3.77485237685302021e02, 3.20937758913846947e03, 1.85777706184603153e-1}; - static const T b[4] = {2.36012909523441209e01, 2.44024637934444173e02, - 1.28261652607737228e03, 2.84423683343917062e03}; - - // Coefficients for approximation to erfc in the second interval. - static const T c[9] = {5.64188496988670089e-1, 8.88314979438837594e00, - 6.61191906371416295e01, 2.98635138197400131e02, 8.81952221241769090e02, - 1.71204761263407058e03, 2.05107837782607147e03, 1.23033935479799725e03, - 2.15311535474403846e-8}; - static const T d[8] = {1.57449261107098347e01, 1.17693950891312499e02, - 5.37181101862009858e02, 1.62138957456669019e03, 3.29079923573345963e03, - 4.36261909014324716e03, 3.43936767414372164e03, 1.23033935480374942e03}; - - // Coefficients for approximation to erfc in the third interval. 
- static const T p[6] = {3.05326634961232344e-1, 3.60344899949804439e-1, - 1.25781726111229246e-1, 1.60837851487422766e-2, 6.58749161529837803e-4, - 1.63153871373020978e-2}; - static const T q[5] = {2.56852019228982242e00, 1.87295284992346047e00, - 5.27905102951428412e-1, 6.05183413124413191e-2, 2.33520497626869185e-3}; - - constexpr T sqrtpi{1.7724538509078120380404576221783883301349L}; - constexpr T rsqrtpi{0.5641895835477562869480794515607725858440L}; - constexpr T epsilonby2{std::numeric_limits::epsilon() * 0.5}; - constexpr T xneg{-26.628e0}; - constexpr T xhuge{6.71e7}; - constexpr T thresh{0.46875e0}; - constexpr T zero{0.0}; - constexpr T one{1.0}; - constexpr T four{4.0}; - constexpr T sixteen{16.0}; - constexpr T xmax{1.0 / (sqrtpi * std::numeric_limits::min())}; - static_assert(xmax > xhuge, "xmax must be greater than xhuge"); - - T ysq; - T xnum; - T xden; - T del; - T result; - - auto x{arg}; - auto y{std::fabs(x)}; - - if (y <= thresh) { - // evaluate erf for |x| <= 0.46875 - ysq = zero; - if (y > epsilonby2) { - ysq = y * y; - } - xnum = a[4] * ysq; - xden = ysq; - for (int i{0}; i < 3; i++) { - xnum = (xnum + a[i]) * ysq; - xden = (xden + b[i]) * ysq; - } - result = x * (xnum + a[3]) / (xden + b[3]); - result = one - result; - result = std::exp(ysq) * result; - return result; - } else if (y <= four) { - // evaluate erfc for 0.46875 < |x| <= 4.0 - xnum = c[8] * y; - xden = y; - for (int i{0}; i < 7; ++i) { - xnum = (xnum + c[i]) * y; - xden = (xden + d[i]) * y; - } - result = (xnum + c[7]) / (xden + d[7]); - } else { - // evaluate erfc for |x| > 4.0 - result = zero; - if (y >= xhuge) { - if (y < xmax) { - result = rsqrtpi / y; - } - } else { - ysq = one / (y * y); - xnum = p[5] * ysq; - xden = ysq; - for (int i{0}; i < 4; ++i) { - xnum = (xnum + p[i]) * ysq; - xden = (xden + q[i]) * ysq; - } - result = ysq * (xnum + p[4]) / (xden + q[4]); - result = (rsqrtpi - result) / y; - } - } - // fix up for negative argument, erf, etc. - if (x < zero) { - if (x < xneg) { - result = std::numeric_limits::max(); - } else { - ysq = trunc(x * sixteen) / sixteen; - del = (x - ysq) * (x + ysq); - y = std::exp((ysq * ysq)) * std::exp((del)); - result = (y + y) - result; - } - } - return result; + return common::ErfcScaled(arg); } } // namespace Fortran::runtime diff --git a/flang/test/Evaluate/fold-erfc-scaled.f90 b/flang/test/Evaluate/fold-erfc-scaled.f90 new file mode 100644 index 000000000000..b38cd0157d0b --- /dev/null +++ b/flang/test/Evaluate/fold-erfc-scaled.f90 @@ -0,0 +1,7 @@ +! RUN: %python %S/test_folding.py %s %flang_fc1 +module m + real(4), parameter :: x20_4 = erfc_scaled(20._4) + logical, parameter :: t20_4 = x20_4 == 0.02817435003817081451416015625_4 + real(8), parameter :: x20_8 = erfc_scaled(20._8) + logical, parameter :: t20_8 = x20_8 == 0.0281743487410513193669459042212110944092273712158203125_8 +end -- GitLab From 9fb2db1e1f42ae10a9d8c1d9410b5f4e719fdac0 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 15 Oct 2024 14:23:45 -0700 Subject: [PATCH 036/329] [flang] Retain spaces when preprocessing fixed-form source (#112417) When running fixed-form source through the compiler under -E, don't aggressively remove space characters, since the parser won't be parsing the result and some tools might need to see the spaces in the -E preprocessed output. Fixes https://github.com/llvm/llvm-project/issues/112279. 
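As a rough before/after sketch, adapted from the updated continuation test
(which uses an OpenMP conditional continuation line):

          k01=1+
    !$   & 1

flang -E previously collapsed the spacing and printed `k01=1+1`; it now keeps
the source spacing and prints `k01=1+ 1`.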
--- flang/lib/Parser/parsing.cpp | 1 + flang/lib/Parser/prescan.cpp | 8 +++++--- flang/lib/Parser/prescan.h | 5 +++++ .../test/Parser/continuation-in-conditional-compilation.f | 2 +- flang/test/Preprocessing/pp029.F | 2 +- flang/test/Preprocessing/pp031.F | 4 ++-- flang/test/Preprocessing/pp041.F | 2 +- flang/test/Preprocessing/renaming.F | 2 +- 8 files changed, 17 insertions(+), 9 deletions(-) diff --git a/flang/lib/Parser/parsing.cpp b/flang/lib/Parser/parsing.cpp index d8448e4c527a..e2381a6b8ffa 100644 --- a/flang/lib/Parser/parsing.cpp +++ b/flang/lib/Parser/parsing.cpp @@ -75,6 +75,7 @@ const SourceFile *Parsing::Prescan(const std::string &path, Options options) { messages_, *currentCooked_, preprocessor_, options.features}; prescanner.set_fixedForm(options.isFixedForm) .set_fixedFormColumnLimit(options.fixedFormColumns) + .set_preprocessingOnly(options.prescanAndReformat) .set_expandIncludeLines(!options.prescanAndReformat || options.expandIncludeLinesInPreprocessedOutput) .AddCompilerDirectiveSentinel("dir$"); diff --git a/flang/lib/Parser/prescan.cpp b/flang/lib/Parser/prescan.cpp index 47260c068046..1d2f1e976687 100644 --- a/flang/lib/Parser/prescan.cpp +++ b/flang/lib/Parser/prescan.cpp @@ -36,6 +36,8 @@ Prescanner::Prescanner(const Prescanner &that, Preprocessor &prepro, bool isNestedInIncludeDirective) : messages_{that.messages_}, cooked_{that.cooked_}, preprocessor_{prepro}, allSources_{that.allSources_}, features_{that.features_}, + preprocessingOnly_{that.preprocessingOnly_}, + expandIncludeLines_{that.expandIncludeLines_}, isNestedInIncludeDirective_{isNestedInIncludeDirective}, backslashFreeFormContinuation_{that.backslashFreeFormContinuation_}, inFixedForm_{that.inFixedForm_}, @@ -288,8 +290,8 @@ void Prescanner::Statement() { break; case LineClassification::Kind::Source: if (inFixedForm_) { - if (preprocessed->HasBlanks(/*after column*/ 6)) { - preprocessed->RemoveBlanks(/*after column*/ 6); + if (!preprocessingOnly_ && preprocessed->HasBlanks()) { + preprocessed->RemoveBlanks(); } } else { while (SourceLineContinuation(*preprocessed)) { @@ -622,7 +624,7 @@ const char *Prescanner::SkipCComment(const char *p) const { bool Prescanner::NextToken(TokenSequence &tokens) { CHECK(at_ >= start_ && at_ < limit_); - if (InFixedFormSource()) { + if (InFixedFormSource() && !preprocessingOnly_) { SkipSpaces(); } else { if (*at_ == '/' && IsCComment(at_)) { diff --git a/flang/lib/Parser/prescan.h b/flang/lib/Parser/prescan.h index c50bf231e3c7..08041f93b14b 100644 --- a/flang/lib/Parser/prescan.h +++ b/flang/lib/Parser/prescan.h @@ -48,6 +48,10 @@ public: Preprocessor &preprocessor() { return preprocessor_; } common::LanguageFeatureControl &features() { return features_; } + Prescanner &set_preprocessingOnly(bool yes) { + preprocessingOnly_ = yes; + return *this; + } Prescanner &set_expandIncludeLines(bool yes) { expandIncludeLines_ = yes; return *this; @@ -213,6 +217,7 @@ private: Preprocessor &preprocessor_; AllSources &allSources_; common::LanguageFeatureControl features_; + bool preprocessingOnly_{false}; bool expandIncludeLines_{true}; bool isNestedInIncludeDirective_{false}; bool backslashFreeFormContinuation_{false}; diff --git a/flang/test/Parser/continuation-in-conditional-compilation.f b/flang/test/Parser/continuation-in-conditional-compilation.f index 35eecbc0f16e..987112301e33 100644 --- a/flang/test/Parser/continuation-in-conditional-compilation.f +++ b/flang/test/Parser/continuation-in-conditional-compilation.f @@ -1,6 +1,6 @@ ! 
RUN: %flang_fc1 -fopenmp -fopenacc -E %s 2>&1 | FileCheck %s
       program main
-! CHECK: k01=1+1
+! CHECK: k01=1+ 1
       k01=1+
!$   & 1
diff --git a/flang/test/Preprocessing/pp029.F b/flang/test/Preprocessing/pp029.F
index 4ca87dd20f15..1f8533ab08cd 100644
--- a/flang/test/Preprocessing/pp029.F
+++ b/flang/test/Preprocessing/pp029.F
@@ -1,5 +1,5 @@
 ! RUN: %flang -E %s 2>&1 | FileCheck %s
-! CHECK: if (777 .eq. 777) then
+! CHECK: if (77 7.eq. 777) then
 * \ newline allowed in #define
       integer, parameter :: KWM = 666
 #define KWM 77\
diff --git a/flang/test/Preprocessing/pp031.F b/flang/test/Preprocessing/pp031.F
index 4813c40208a9..3ad0bde9e50c 100644
--- a/flang/test/Preprocessing/pp031.F
+++ b/flang/test/Preprocessing/pp031.F
@@ -1,6 +1,6 @@
 ! RUN: %flang -E %s 2>&1 | FileCheck %s
-! CHECK: if (777//Ccomment.eq.777)then
-! CHECK: print *, 'pp031.F no: ', 777//Ccomment
+! CHECK: if (777 // C comment.eq. 777) then
+! CHECK: print *, 'pp031.F no: ', 777 // C comment
 * // C++ comment NOT erased from #define
       integer, parameter :: KWM = 666
 #define KWM 777 // C comment
diff --git a/flang/test/Preprocessing/pp041.F b/flang/test/Preprocessing/pp041.F
index 3f1f3c6a2aeb..cee3c5d3e490 100644
--- a/flang/test/Preprocessing/pp041.F
+++ b/flang/test/Preprocessing/pp041.F
@@ -1,5 +1,5 @@
 ! RUN: %flang -E %s 2>&1 | FileCheck %s
-! CHECK: j = 666WMj=j+1WM211
+! CHECK: j = 666WMj= j+ 1WM211
 * use KWM expansion as continuation indicators
 #define KWM 0
 #define KWM2 1
diff --git a/flang/test/Preprocessing/renaming.F b/flang/test/Preprocessing/renaming.F
index 1bef18116901..c39ab6fb029a 100644
--- a/flang/test/Preprocessing/renaming.F
+++ b/flang/test/Preprocessing/renaming.F
@@ -1,5 +1,5 @@
 ! RUN: %flang -E %s | FileCheck %s
-! CHECK: ((1)*10000+(11)*100)
+! CHECK: ((1) * 10000 + (11) * 100)
 ! Ensure that a keyword-like macro can be used to rename a
 ! function-like macro.
 #define TO_VERSION2(MAJOR, MINOR) ((MAJOR) * 10000 + (MINOR) * 100)
--
GitLab

From e12fbdf8775f54580b6a9a77b53c31faddece841 Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Tue, 15 Oct 2024 14:37:48 -0700
Subject: [PATCH 037/329] [NFC] Remove unnecessary check from test (#112438)

The important part of the test is having the correct
`ThreadDescriptorSize` after `InitTlsSize()`. It's not a problem if
another test called `InitTlsSize()` before.

Fixes #112399.
---
 compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp | 1 -
 1 file changed, 1 deletion(-)

diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp
index ce4a40444cd4..70669ab81691 100644
--- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp
+++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_linux_test.cpp
@@ -205,7 +205,6 @@ TEST(SanitizerLinux, ThreadDescriptorSize) {
   void *result;
   ASSERT_EQ(0, pthread_create(&tid, 0, thread_descriptor_size_test_func, 0));
   ASSERT_EQ(0, pthread_join(tid, &result));
-  EXPECT_EQ(0u, ThreadDescriptorSize());
   InitTlsSize();
   EXPECT_EQ((uptr)result, ThreadDescriptorSize());
 }
--
GitLab

From 01b78b220b665b683b97dde6159c2f515afea8b8 Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Tue, 15 Oct 2024 14:40:06 -0700
Subject: [PATCH 038/329] [NFC] Fix flakiness in test if run unsharded (#112439)

If we run all tests in a single process, there is a high probability
that `99` is already claimed.
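For intuition, a minimal sketch of the collision, under stated assumptions:
`MiniDepot` is an invented stand-in that models only one property of the real
depot, sequential id handout; the actual `ChainedOriginDepot` interface and
storage differ.

    #include <cassert>
    #include <map>

    // Invented stand-in: hands out sequential ids from Put() and returns 0
    // from Get() for ids that were never handed out.
    struct MiniDepot {
      std::map<unsigned, unsigned> prev; // id -> previous id in the chain
      unsigned next{1};
      unsigned Put(unsigned prev_id) { prev[next] = prev_id; return next++; }
      unsigned Get(unsigned id, unsigned *prev_id) {
        auto it = prev.find(id);
        *prev_id = (it == prev.end()) ? 0u : it->second;
        return (it == prev.end()) ? 0u : id;
      }
    };

    int main() {
      MiniDepot depot;
      // Earlier tests in the same process may already have stored 99+ chains.
      for (unsigned i = 0; i < 150; ++i) {
        depot.Put(0);
      }
      unsigned prev_id;
      assert(depot.Get(99, &prev_id) != 0);     // the old probe now collides
      assert(depot.Get(123456, &prev_id) == 0); // the new probe stays absent
      return 0;
    }

Probing an id far beyond anything the suite allocates removes the ordering
dependence without weakening the check.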
--- .../tests/sanitizer_chained_origin_depot_test.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_chained_origin_depot_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_chained_origin_depot_test.cpp index a557c4645ba0..61171019a570 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_chained_origin_depot_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_chained_origin_depot_test.cpp @@ -31,7 +31,7 @@ TEST(SanitizerCommon, ChainedOriginDepotBasic) { TEST(SanitizerCommon, ChainedOriginDepotAbsent) { u32 prev_id; - EXPECT_EQ(0U, chainedOriginDepot.Get(99, &prev_id)); + EXPECT_EQ(0U, chainedOriginDepot.Get(123456, &prev_id)); EXPECT_EQ(0U, prev_id); } -- GitLab From f6c23222a4fe7291a7464460216aaad8f778947b Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 15 Oct 2024 22:49:05 +0100 Subject: [PATCH 039/329] [RISCV] Promote fixed-length bf16 arith vector ops with zvfbfmin (#112393) The aim is to have the same set of promotions on fixed-length bf16 vectors as on fixed-length f16 vectors, and then deduplicate them similarly to what was done for scalable vectors. It looks like fneg/fabs/fcopysign end up getting expanded because fsub is now legal, and the default operation action must be expand. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 9 +- .../CodeGen/RISCV/rvv/fixed-vectors-fp.ll | 1731 ++++++++++++++++- 2 files changed, 1656 insertions(+), 84 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index cde690793f07..bf333b7b7901 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1379,7 +1379,14 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction( {ISD::VP_MERGE, ISD::VP_SELECT, ISD::VSELECT, ISD::SELECT}, VT, Custom); - // TODO: Promote to fp32. + MVT F32VecVT = MVT::getVectorVT(MVT::f32, VT.getVectorElementCount()); + // Don't promote f16 vector operations to f32 if f32 vector type is + // not legal. + // TODO: could split the f16 vector into two vectors and do promotion. + if (!isTypeLegal(F32VecVT)) + continue; + setOperationPromotedToType(ZvfhminZvfbfminPromoteOps, VT, F32VecVT); + // TODO: Promote VP ops to fp32. 
continue; } diff --git a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll index 7ecf8af54c8d..c24ade1e6d8e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll +++ b/llvm/test/CodeGen/RISCV/rvv/fixed-vectors-fp.ll @@ -1,8 +1,52 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH -; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN -; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfh,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: llc -mtriple=riscv32 -target-abi=ilp32d -mattr=+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN +; RUN: llc -mtriple=riscv64 -target-abi=lp64d -mattr=+v,+zvfhmin,+zvfbfmin -verify-machineinstrs < %s | FileCheck %s --check-prefixes=CHECK,ZVFHMIN + + +define void @fadd_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fadd_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = fadd <8 x bfloat> %a, %b + store <8 x bfloat> %c, ptr %x + ret void +} + +define void @fadd_v6bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fadd_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = fadd <6 x bfloat> %a, %b + store <6 x bfloat> %c, ptr %x + ret void +} define void @fadd_v8f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fadd_v8f16: @@ -97,6 +141,49 @@ define void @fadd_v2f64(ptr %x, ptr %y) { ret void } +define void @fsub_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fsub_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, 
ptr %x + %b = load <8 x bfloat>, ptr %y + %c = fsub <8 x bfloat> %a, %b + store <8 x bfloat> %c, ptr %x + ret void +} + +define void @fsub_v6bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fsub_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = fsub <6 x bfloat> %a, %b + store <6 x bfloat> %c, ptr %x + ret void +} + define void @fsub_v8f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fsub_v8f16: ; ZVFH: # %bb.0: @@ -190,6 +277,49 @@ define void @fsub_v2f64(ptr %x, ptr %y) { ret void } +define void @fmul_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fmul_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = fmul <8 x bfloat> %a, %b + store <8 x bfloat> %c, ptr %x + ret void +} + +define void @fmul_v6bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fmul_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = fmul <6 x bfloat> %a, %b + store <6 x bfloat> %c, ptr %x + ret void +} + define void @fmul_v8f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fmul_v8f16: ; ZVFH: # %bb.0: @@ -283,6 +413,49 @@ define void @fmul_v2f64(ptr %x, ptr %y) { ret void } +define void @fdiv_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fdiv_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = fdiv <8 x bfloat> %a, %b + store <8 x bfloat> %c, ptr %x + ret void +} + +define void @fdiv_v6bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fdiv_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, 
m2, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = fdiv <6 x bfloat> %a, %b + store <6 x bfloat> %c, ptr %x + ret void +} + define void @fdiv_v8f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fdiv_v8f16: ; ZVFH: # %bb.0: @@ -376,6 +549,36 @@ define void @fdiv_v2f64(ptr %x, ptr %y) { ret void } +define void @fneg_v8bf16(ptr %x) { +; CHECK-LABEL: fneg_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = fneg <8 x bfloat> %a + store <8 x bfloat> %b, ptr %x + ret void +} + +define void @fneg_v6bf16(ptr %x) { +; CHECK-LABEL: fneg_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = fneg <6 x bfloat> %a + store <6 x bfloat> %b, ptr %x + ret void +} + define void @fneg_v8f16(ptr %x) { ; ZVFH-LABEL: fneg_v8f16: ; ZVFH: # %bb.0: @@ -450,6 +653,38 @@ define void @fneg_v2f64(ptr %x) { ret void } +define void @fabs_v8bf16(ptr %x) { +; CHECK-LABEL: fabs_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.fabs.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} + +define void @fabs_v6bf16(ptr %x) { +; CHECK-LABEL: fabs_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = call <6 x bfloat> @llvm.fabs.v6bf16(<6 x bfloat> %a) + store <6 x bfloat> %b, ptr %x + ret void +} + define void @fabs_v8f16(ptr %x) { ; ZVFH-LABEL: fabs_v8f16: ; ZVFH: # %bb.0: @@ -473,7 +708,6 @@ define void @fabs_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.fabs.v8f16(<8 x half>) define void @fabs_v6f16(ptr %x) { ; ZVFH-LABEL: fabs_v6f16: @@ -498,7 +732,6 @@ define void @fabs_v6f16(ptr %x) { store <6 x half> %b, ptr %x ret void } -declare <6 x half> @llvm.fabs.v6f16(<6 x half>) define void @fabs_v4f32(ptr %x) { ; CHECK-LABEL: fabs_v4f32: @@ -513,7 +746,6 @@ define void @fabs_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.fabs.v4f32(<4 x float>) define void @fabs_v2f64(ptr %x) { ; CHECK-LABEL: fabs_v2f64: @@ -528,7 +760,48 @@ define void @fabs_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.fabs.v2f64(<2 x double>) + +define void @copysign_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: copysign_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: 
ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b) + store <8 x bfloat> %c, ptr %x + ret void +} + +define void @copysign_v6bf16(ptr %x, ptr %y) { +; CHECK-LABEL: copysign_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: addi a1, a1, -1 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = call <6 x bfloat> @llvm.copysign.v6bf16(<6 x bfloat> %a, <6 x bfloat> %b) + store <6 x bfloat> %c, ptr %x + ret void +} define void @copysign_v8f16(ptr %x, ptr %y) { ; ZVFH-LABEL: copysign_v8f16: @@ -558,7 +831,6 @@ define void @copysign_v8f16(ptr %x, ptr %y) { store <8 x half> %c, ptr %x ret void } -declare <8 x half> @llvm.copysign.v8f16(<8 x half>, <8 x half>) define void @copysign_v6f16(ptr %x, ptr %y) { ; ZVFH-LABEL: copysign_v6f16: @@ -590,7 +862,6 @@ define void @copysign_v6f16(ptr %x, ptr %y) { store <6 x half> %c, ptr %x ret void } -declare <6 x half> @llvm.copysign.v6f16(<6 x half>, <6 x half>) define void @copysign_v4f32(ptr %x, ptr %y) { ; CHECK-LABEL: copysign_v4f32: @@ -607,7 +878,6 @@ define void @copysign_v4f32(ptr %x, ptr %y) { store <4 x float> %c, ptr %x ret void } -declare <4 x float> @llvm.copysign.v4f32(<4 x float>, <4 x float>) define void @copysign_v2f64(ptr %x, ptr %y) { ; CHECK-LABEL: copysign_v2f64: @@ -624,7 +894,52 @@ define void @copysign_v2f64(ptr %x, ptr %y) { store <2 x double> %c, ptr %x ret void } -declare <2 x double> @llvm.copysign.v2f64(<2 x double>, <2 x double>) + +define void @copysign_vf_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: copysign_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %a, <8 x bfloat> %c) + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @copysign_vf_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: copysign_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = call <6 x bfloat> @llvm.copysign.v6bf16(<6 x bfloat> %a, <6 x bfloat> %c) + store <6 x bfloat> %d, ptr %x + ret void +} define void 
@copysign_vf_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: copysign_vf_v8f16: @@ -720,6 +1035,52 @@ define void @copysign_vf_v2f64(ptr %x, double %y) { ret void } +define void @copysign_neg_v8bf16(ptr %x, ptr %y) { +; CHECK-LABEL: copysign_neg_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vand.vx v9, v9, a2 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = fneg <8 x bfloat> %b + %d = call <8 x bfloat> @llvm.copysign.v8bf16(<8 x bfloat> %a, <8 x bfloat> %c) + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @copysign_neg_v6bf16(ptr %x, ptr %y) { +; CHECK-LABEL: copysign_neg_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vand.vx v9, v9, a2 +; CHECK-NEXT: vand.vx v8, v8, a1 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vor.vv v8, v9, v8 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = fneg <6 x bfloat> %b + %d = call <6 x bfloat> @llvm.copysign.v6bf16(<6 x bfloat> %a, <6 x bfloat> %c) + store <6 x bfloat> %d, ptr %x + ret void +} + define void @copysign_neg_v8f16(ptr %x, ptr %y) { ; ZVFH-LABEL: copysign_neg_v8f16: ; ZVFH: # %bb.0: @@ -818,6 +1179,56 @@ define void @copysign_neg_v2f64(ptr %x, ptr %y) { ret void } +define void @copysign_neg_trunc_v4bf16_v4f32(ptr %x, ptr %y) { +; CHECK-LABEL: copysign_neg_trunc_v4bf16_v4f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v9 +; CHECK-NEXT: vxor.vx v9, v10, a1 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <4 x bfloat>, ptr %x + %b = load <4 x float>, ptr %y + %c = fneg <4 x float> %b + %d = fptrunc <4 x float> %c to <4 x bfloat> + %e = call <4 x bfloat> @llvm.copysign.v4bf16(<4 x bfloat> %a, <4 x bfloat> %d) + store <4 x bfloat> %e, ptr %x + ret void +} + +define void @copysign_neg_trunc_v3bf16_v3f32(ptr %x, ptr %y) { +; CHECK-LABEL: copysign_neg_trunc_v3bf16_v3f32: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 3, e16, mf2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vle32.v v9, (a1) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: addi a2, a1, -1 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vand.vx v8, v8, a2 +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v9 +; CHECK-NEXT: vxor.vx v9, v10, a1 +; CHECK-NEXT: vand.vx v9, v9, a1 +; CHECK-NEXT: vsetivli zero, 3, e16, mf2, ta, ma +; CHECK-NEXT: vor.vv v8, v8, v9 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <3 x bfloat>, ptr %x + %b = load <3 x float>, ptr %y + %c = fneg <3 x float> %b + %d = fptrunc <3 x float> %c to <3 x bfloat> + %e = call <3 x bfloat> @llvm.copysign.v3bf16(<3 x bfloat> %a, <3 x bfloat> %d) + store <3 x bfloat> %e, ptr %x + ret void +} + define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, 
ptr %y) { ; ZVFH-LABEL: copysign_neg_trunc_v4f16_v4f32: ; ZVFH: # %bb.0: @@ -851,7 +1262,6 @@ define void @copysign_neg_trunc_v4f16_v4f32(ptr %x, ptr %y) { store <4 x half> %e, ptr %x ret void } -declare <4 x half> @llvm.copysign.v4f16(<4 x half>, <4 x half>) define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { ; ZVFH-LABEL: copysign_neg_trunc_v3f16_v3f32: @@ -890,7 +1300,6 @@ define void @copysign_neg_trunc_v3f16_v3f32(ptr %x, ptr %y) { store <3 x half> %e, ptr %x ret void } -declare <3 x half> @llvm.copysign.v3f16(<3 x half>, <3 x half>) define void @copysign_neg_ext_v2f64_v2f32(ptr %x, ptr %y) { ; CHECK-LABEL: copysign_neg_ext_v2f64_v2f32: @@ -912,6 +1321,43 @@ define void @copysign_neg_ext_v2f64_v2f32(ptr %x, ptr %y) { ret void } +define void @sqrt_v8bf16(ptr %x) { +; CHECK-LABEL: sqrt_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsqrt.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} + +define void @sqrt_v6bf16(ptr %x) { +; CHECK-LABEL: sqrt_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsqrt.v v8, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = call <6 x bfloat> @llvm.sqrt.v6bf16(<6 x bfloat> %a) + store <6 x bfloat> %b, ptr %x + ret void +} + define void @sqrt_v8f16(ptr %x) { ; ZVFH-LABEL: sqrt_v8f16: ; ZVFH: # %bb.0: @@ -937,7 +1383,6 @@ define void @sqrt_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.sqrt.v8f16(<8 x half>) define void @sqrt_v6f16(ptr %x) { ; ZVFH-LABEL: sqrt_v6f16: @@ -965,7 +1410,6 @@ define void @sqrt_v6f16(ptr %x) { store <6 x half> %b, ptr %x ret void } -declare <6 x half> @llvm.sqrt.v6f16(<6 x half>) define void @sqrt_v4f32(ptr %x) { ; CHECK-LABEL: sqrt_v4f32: @@ -980,7 +1424,6 @@ define void @sqrt_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) define void @sqrt_v2f64(ptr %x) { ; CHECK-LABEL: sqrt_v2f64: @@ -995,7 +1438,55 @@ define void @sqrt_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) + +define void @fma_v8bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fma_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = load <8 x bfloat>, ptr %z + %d = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> 
%b, <8 x bfloat> %c) + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fma_v6bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fma_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = load <6 x bfloat>, ptr %z + %d = call <6 x bfloat> @llvm.fma.v6bf16(<6 x bfloat> %a, <6 x bfloat> %b, <6 x bfloat> %c) + store <6 x bfloat> %d, ptr %x + ret void +} define void @fma_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fma_v8f16: @@ -1030,7 +1521,6 @@ define void @fma_v8f16(ptr %x, ptr %y, ptr %z) { store <8 x half> %d, ptr %x ret void } -declare <8 x half> @llvm.fma.v8f16(<8 x half>, <8 x half>, <8 x half>) define void @fma_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fma_v6f16: @@ -1066,7 +1556,6 @@ define void @fma_v6f16(ptr %x, ptr %y, ptr %z) { store <6 x half> %d, ptr %x ret void } -declare <6 x half> @llvm.fma.v6f16(<6 x half>, <6 x half>, <6 x half>) define void @fma_v4f32(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fma_v4f32: @@ -1085,7 +1574,6 @@ define void @fma_v4f32(ptr %x, ptr %y, ptr %z) { store <4 x float> %d, ptr %x ret void } -declare <4 x float> @llvm.fma.v4f32(<4 x float>, <4 x float>, <4 x float>) define void @fma_v2f64(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fma_v2f64: @@ -1104,7 +1592,61 @@ define void @fma_v2f64(ptr %x, ptr %y, ptr %z) { store <2 x double> %d, ptr %x ret void } -declare <2 x double> @llvm.fma.v2f64(<2 x double>, <2 x double>, <2 x double>) + +define void @fmsub_v8bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fmsub_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = load <8 x bfloat>, ptr %z + %neg = fneg <8 x bfloat> %c + %d = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %neg) + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fmsub_v6bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fmsub_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a1) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: 
vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = load <6 x bfloat>, ptr %z + %neg = fneg <6 x bfloat> %c + %d = call <6 x bfloat> @llvm.fma.v6bf16(<6 x bfloat> %a, <6 x bfloat> %b, <6 x bfloat> %neg) + store <6 x bfloat> %d, ptr %x + ret void +} define void @fmsub_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fmsub_v8f16: @@ -1220,6 +1762,27 @@ define void @fnmadd_v2f64(ptr %x, ptr %y, ptr %z) { ret void } +define void @fadd_v16bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fadd_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfadd.vv v8, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v8 +; CHECK-NEXT: vse16.v v12, (a0) +; CHECK-NEXT: ret + %a = load <16 x bfloat>, ptr %x + %b = load <16 x bfloat>, ptr %y + %c = fadd <16 x bfloat> %a, %b + store <16 x bfloat> %c, ptr %x + ret void +} + define void @fadd_v16f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fadd_v16f16: ; ZVFH: # %bb.0: @@ -1282,6 +1845,27 @@ define void @fadd_v4f64(ptr %x, ptr %y) { ret void } +define void @fsub_v16bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fsub_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfsub.vv v8, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v8 +; CHECK-NEXT: vse16.v v12, (a0) +; CHECK-NEXT: ret + %a = load <16 x bfloat>, ptr %x + %b = load <16 x bfloat>, ptr %y + %c = fsub <16 x bfloat> %a, %b + store <16 x bfloat> %c, ptr %x + ret void +} + define void @fsub_v16f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fsub_v16f16: ; ZVFH: # %bb.0: @@ -1344,6 +1928,27 @@ define void @fsub_v4f64(ptr %x, ptr %y) { ret void } +define void @fmul_v16bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fmul_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmul.vv v8, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v8 +; CHECK-NEXT: vse16.v v12, (a0) +; CHECK-NEXT: ret + %a = load <16 x bfloat>, ptr %x + %b = load <16 x bfloat>, ptr %y + %c = fmul <16 x bfloat> %a, %b + store <16 x bfloat> %c, ptr %x + ret void +} + define void @fmul_v16f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fmul_v16f16: ; ZVFH: # %bb.0: @@ -1406,6 +2011,27 @@ define void @fmul_v4f64(ptr %x, ptr %y) { ret void } +define void @fdiv_v16bf16(ptr %x, ptr %y) { +; CHECK-LABEL: fdiv_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v16, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v8 +; CHECK-NEXT: vse16.v v12, (a0) +; CHECK-NEXT: ret + %a = load <16 x 
bfloat>, ptr %x + %b = load <16 x bfloat>, ptr %y + %c = fdiv <16 x bfloat> %a, %b + store <16 x bfloat> %c, ptr %x + ret void +} + define void @fdiv_v16f16(ptr %x, ptr %y) { ; ZVFH-LABEL: fdiv_v16f16: ; ZVFH: # %bb.0: @@ -1468,6 +2094,21 @@ define void @fdiv_v4f64(ptr %x, ptr %y) { ret void } +define void @fneg_v16bf16(ptr %x) { +; CHECK-LABEL: fneg_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <16 x bfloat>, ptr %x + %b = fneg <16 x bfloat> %a + store <16 x bfloat> %b, ptr %x + ret void +} + define void @fneg_v16f16(ptr %x) { ; ZVFH-LABEL: fneg_v16f16: ; ZVFH: # %bb.0: @@ -1519,6 +2160,30 @@ define void @fneg_v4f64(ptr %x) { ret void } +define void @fma_v16bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fma_v16bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vle16.v v8, (a2) +; CHECK-NEXT: vle16.v v10, (a0) +; CHECK-NEXT: vle16.v v12, (a1) +; CHECK-NEXT: vfwcvtbf16.f.f.v v16, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v20, v10 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e32, m4, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v20, v16 +; CHECK-NEXT: vsetvli zero, zero, e16, m2, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v12, v8 +; CHECK-NEXT: vse16.v v12, (a0) +; CHECK-NEXT: ret + %a = load <16 x bfloat>, ptr %x + %b = load <16 x bfloat>, ptr %y + %c = load <16 x bfloat>, ptr %z + %d = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> %a, <16 x bfloat> %b, <16 x bfloat> %c) + store <16 x bfloat> %d, ptr %x + ret void +} + define void @fma_v16f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fma_v16f16: ; ZVFH: # %bb.0: @@ -1552,7 +2217,6 @@ define void @fma_v16f16(ptr %x, ptr %y, ptr %z) { store <16 x half> %d, ptr %x ret void } -declare <16 x half> @llvm.fma.v16f16(<16 x half>, <16 x half>, <16 x half>) define void @fma_v8f32(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fma_v8f32: @@ -1571,7 +2235,6 @@ define void @fma_v8f32(ptr %x, ptr %y, ptr %z) { store <8 x float> %d, ptr %x ret void } -declare <8 x float> @llvm.fma.v8f32(<8 x float>, <8 x float>, <8 x float>) define void @fma_v4f64(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fma_v4f64: @@ -1590,7 +2253,53 @@ define void @fma_v4f64(ptr %x, ptr %y, ptr %z) { store <4 x double> %d, ptr %x ret void } -declare <4 x double> @llvm.fma.v4f64(<4 x double>, <4 x double>, <4 x double>) + +define void @fadd_vf_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fadd_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fadd <8 x bfloat> %a, %c + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fadd_vf_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fadd_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, 
e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v10, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fadd <6 x bfloat> %a, %c + store <6 x bfloat> %d, ptr %x + ret void +} define void @fadd_vf_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fadd_vf_v8f16: @@ -1687,6 +2396,53 @@ define void @fadd_vf_v2f64(ptr %x, double %y) { ret void } +define void @fadd_fv_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fadd_fv_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fadd <8 x bfloat> %c, %a + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fadd_fv_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fadd_fv_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fadd <6 x bfloat> %c, %a + store <6 x bfloat> %d, ptr %x + ret void +} + define void @fadd_fv_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fadd_fv_v8f16: ; ZVFH: # %bb.0: @@ -1782,6 +2538,53 @@ define void @fadd_fv_v2f64(ptr %x, double %y) { ret void } +define void @fsub_vf_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fsub_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fsub <8 x bfloat> %a, %c + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fsub_vf_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fsub_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, 
e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v10, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fsub <6 x bfloat> %a, %c + store <6 x bfloat> %d, ptr %x + ret void +} + define void @fsub_vf_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fsub_vf_v8f16: ; ZVFH: # %bb.0: @@ -1877,6 +2680,53 @@ define void @fsub_vf_v2f64(ptr %x, double %y) { ret void } +define void @fsub_fv_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fsub_fv_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fsub <8 x bfloat> %c, %a + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fsub_fv_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fsub_fv_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fsub <6 x bfloat> %c, %a + store <6 x bfloat> %d, ptr %x + ret void +} + define void @fsub_fv_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fsub_fv_v8f16: ; ZVFH: # %bb.0: @@ -1972,6 +2822,53 @@ define void @fsub_fv_v2f64(ptr %x, double %y) { ret void } +define void @fmul_vf_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fmul_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fmul <8 x bfloat> %a, %c + store <8 x bfloat> %d, ptr %x + ret void +} + 
+define void @fmul_vf_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fmul_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v10, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fmul <6 x bfloat> %a, %c + store <6 x bfloat> %d, ptr %x + ret void +} + define void @fmul_vf_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fmul_vf_v8f16: ; ZVFH: # %bb.0: @@ -2067,6 +2964,53 @@ define void @fmul_vf_v2f64(ptr %x, double %y) { ret void } +define void @fmul_fv_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fmul_fv_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fmul <8 x bfloat> %c, %a + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fmul_fv_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fmul_fv_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fmul <6 x bfloat> %c, %a + store <6 x bfloat> %d, ptr %x + ret void +} + define void @fmul_fv_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fmul_fv_v8f16: ; ZVFH: # %bb.0: @@ -2162,6 +3106,53 @@ define void @fmul_fv_v2f64(ptr %x, double %y) { ret void } +define void @fdiv_vf_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fdiv_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v10, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> 
%b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fdiv <8 x bfloat> %a, %c + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fdiv_vf_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fdiv_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v10, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fdiv <6 x bfloat> %a, %c + store <6 x bfloat> %d, ptr %x + ret void +} + define void @fdiv_vf_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fdiv_vf_v8f16: ; ZVFH: # %bb.0: @@ -2257,6 +3248,53 @@ define void @fdiv_vf_v2f64(ptr %x, double %y) { ret void } +define void @fdiv_fv_v8bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fdiv_fv_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = insertelement <8 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <8 x bfloat> %b, <8 x bfloat> poison, <8 x i32> zeroinitializer + %d = fdiv <8 x bfloat> %c, %a + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fdiv_fv_v6bf16(ptr %x, bfloat %y) { +; CHECK-LABEL: fdiv_fv_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v9, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfdiv.vv v8, v12, v10 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = insertelement <6 x bfloat> poison, bfloat %y, i32 0 + %c = shufflevector <6 x bfloat> %b, <6 x bfloat> poison, <6 x i32> zeroinitializer + %d = fdiv <6 x bfloat> %c, %a + store <6 x bfloat> %d, ptr %x + ret void +} + define void @fdiv_fv_v8f16(ptr %x, half %y) { ; ZVFH-LABEL: fdiv_fv_v8f16: ; ZVFH: # %bb.0: @@ -2352,6 +3390,59 @@ define void @fdiv_fv_v2f64(ptr %x, double %y) { ret void } +define void @fma_vf_v8bf16(ptr %x, ptr %y, bfloat %z) { +; CHECK-LABEL: fma_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; 
CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = insertelement <8 x bfloat> poison, bfloat %z, i32 0 + %d = shufflevector <8 x bfloat> %c, <8 x bfloat> poison, <8 x i32> zeroinitializer + %e = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %d, <8 x bfloat> %b) + store <8 x bfloat> %e, ptr %x + ret void +} + +define void @fma_vf_v6bf16(ptr %x, ptr %y, bfloat %z) { +; CHECK-LABEL: fma_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = insertelement <6 x bfloat> poison, bfloat %z, i32 0 + %d = shufflevector <6 x bfloat> %c, <6 x bfloat> poison, <6 x i32> zeroinitializer + %e = call <6 x bfloat> @llvm.fma.v6bf16(<6 x bfloat> %a, <6 x bfloat> %d, <6 x bfloat> %b) + store <6 x bfloat> %e, ptr %x + ret void +} + define void @fma_vf_v8f16(ptr %x, ptr %y, half %z) { ; ZVFH-LABEL: fma_vf_v8f16: ; ZVFH: # %bb.0: @@ -2459,6 +3550,59 @@ define void @fma_vf_v2f64(ptr %x, ptr %y, double %z) { ret void } +define void @fma_fv_v8bf16(ptr %x, ptr %y, bfloat %z) { +; CHECK-LABEL: fma_fv_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = insertelement <8 x bfloat> poison, bfloat %z, i32 0 + %d = shufflevector <8 x bfloat> %c, <8 x bfloat> poison, <8 x i32> zeroinitializer + %e = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %d, <8 x bfloat> %a, <8 x bfloat> %b) + store <8 x bfloat> %e, ptr %x + ret void +} + +define void @fma_fv_v6bf16(ptr %x, ptr %y, bfloat %z) { +; CHECK-LABEL: fma_fv_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: fmv.x.w a1, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = insertelement <6 x bfloat> poison, bfloat %z, i32 0 + %d = shufflevector <6 x bfloat> %c, <6 x bfloat> poison, <6 x i32> zeroinitializer + %e = call <6 x 
bfloat> @llvm.fma.v6bf16(<6 x bfloat> %d, <6 x bfloat> %a, <6 x bfloat> %b) + store <6 x bfloat> %e, ptr %x + ret void +} + define void @fma_fv_v8f16(ptr %x, ptr %y, half %z) { ; ZVFH-LABEL: fma_fv_v8f16: ; ZVFH: # %bb.0: @@ -2566,6 +3710,65 @@ define void @fma_fv_v2f64(ptr %x, ptr %y, double %z) { ret void } +define void @fmsub_vf_v8bf16(ptr %x, ptr %y, bfloat %z) { +; CHECK-LABEL: fmsub_vf_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a2, fa0 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vmv.v.x v10, a2 +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = insertelement <8 x bfloat> poison, bfloat %z, i32 0 + %d = shufflevector <8 x bfloat> %c, <8 x bfloat> poison, <8 x i32> zeroinitializer + %neg = fneg <8 x bfloat> %b + %e = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> %a, <8 x bfloat> %d, <8 x bfloat> %neg) + store <8 x bfloat> %e, ptr %x + ret void +} + +define void @fmsub_vf_v6bf16(ptr %x, ptr %y, bfloat %z) { +; CHECK-LABEL: fmsub_vf_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: fmv.x.w a2, fa0 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vmv.v.x v10, a2 +; CHECK-NEXT: lui a1, 8 +; CHECK-NEXT: vxor.vx v8, v8, a1 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmadd.vv v8, v14, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = insertelement <6 x bfloat> poison, bfloat %z, i32 0 + %d = shufflevector <6 x bfloat> %c, <6 x bfloat> poison, <6 x i32> zeroinitializer + %neg = fneg <6 x bfloat> %b + %e = call <6 x bfloat> @llvm.fma.v6bf16(<6 x bfloat> %a, <6 x bfloat> %d, <6 x bfloat> %neg) + store <6 x bfloat> %e, ptr %x + ret void +} + define void @fmsub_vf_v8f16(ptr %x, ptr %y, half %z) { ; ZVFH-LABEL: fmsub_vf_v8f16: ; ZVFH: # %bb.0: @@ -2721,13 +3924,64 @@ define void @fnmadd_fv_v2f64(ptr %x, ptr %y, double %z) { ret void } +define void @trunc_v8bf16(ptr %x) { +; CHECK-LABEL: trunc_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.trunc.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret 
void +} + +define void @trunc_v6bf16(ptr %x) { +; CHECK-LABEL: trunc_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.rtz.x.f.v v8, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = call <6 x bfloat> @llvm.trunc.v6bf16(<6 x bfloat> %a) + store <6 x bfloat> %b, ptr %x + ret void +} + define void @trunc_v8f16(ptr %x) { ; ZVFH-LABEL: trunc_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI115_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI115_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI171_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI171_0)(a1) ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -2760,15 +4014,14 @@ define void @trunc_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.trunc.v8f16(<8 x half>) define void @trunc_v6f16(ptr %x) { ; ZVFH-LABEL: trunc_v6f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI116_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI116_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI172_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI172_0)(a1) ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -2803,7 +4056,6 @@ define void @trunc_v6f16(ptr %x) { store <6 x half> %b, ptr %x ret void } -declare <6 x half> @llvm.trunc.v6f16(<6 x half>) define void @trunc_v4f32(ptr %x) { ; CHECK-LABEL: trunc_v4f32: @@ -2825,15 +4077,14 @@ define void @trunc_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.trunc.v4f32(<4 x float>) define void @trunc_v2f64(ptr %x) { ; CHECK-LABEL: trunc_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI118_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI118_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI174_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI174_0)(a1) ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.rtz.x.f.v v9, v8, v0.t @@ -2847,15 +4098,69 @@ define void @trunc_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.trunc.v2f64(<2 x double>) + +define void @ceil_v8bf16(ptr %x) { +; CHECK-LABEL: ceil_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a1, 3 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 
+; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.ceil.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} + +define void @ceil_v6bf16(ptr %x) { +; CHECK-LABEL: ceil_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a1, 3 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = call <6 x bfloat> @llvm.ceil.v6bf16(<6 x bfloat> %a) + store <6 x bfloat> %b, ptr %x + ret void +} define void @ceil_v8f16(ptr %x) { ; ZVFH-LABEL: ceil_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI119_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI119_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI177_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI177_0)(a1) ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a1, 3 @@ -2892,15 +4197,14 @@ define void @ceil_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.ceil.v8f16(<8 x half>) define void @ceil_v6f16(ptr %x) { ; ZVFH-LABEL: ceil_v6f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI120_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI120_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI178_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI178_0)(a1) ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -2939,7 +4243,6 @@ define void @ceil_v6f16(ptr %x) { store <6 x half> %b, ptr %x ret void } -declare <6 x half> @llvm.ceil.v6f16(<6 x half>) define void @ceil_v4f32(ptr %x) { ; CHECK-LABEL: ceil_v4f32: @@ -2963,15 +4266,14 @@ define void @ceil_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.ceil.v4f32(<4 x float>) define void @ceil_v2f64(ptr %x) { ; CHECK-LABEL: ceil_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI122_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI122_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI180_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI180_0)(a1) ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a1, 3 @@ -2987,15 +4289,69 @@ define void @ceil_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.ceil.v2f64(<2 x double>) + +define void @floor_v8bf16(ptr %x) { +; CHECK-LABEL: floor_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a1, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; 
CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.floor.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} + +define void @floor_v6bf16(ptr %x) { +; CHECK-LABEL: floor_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a1, 2 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = call <6 x bfloat> @llvm.floor.v6bf16(<6 x bfloat> %a) + store <6 x bfloat> %b, ptr %x + ret void +} define void @floor_v8f16(ptr %x) { ; ZVFH-LABEL: floor_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI123_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI123_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI183_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI183_0)(a1) ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a1, 2 @@ -3032,15 +4388,14 @@ define void @floor_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.floor.v8f16(<8 x half>) define void @floor_v6f16(ptr %x) { ; ZVFH-LABEL: floor_v6f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI124_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI124_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI184_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI184_0)(a1) ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -3079,7 +4434,6 @@ define void @floor_v6f16(ptr %x) { store <6 x half> %b, ptr %x ret void } -declare <6 x half> @llvm.floor.v6f16(<6 x half>) define void @floor_v4f32(ptr %x) { ; CHECK-LABEL: floor_v4f32: @@ -3103,15 +4457,14 @@ define void @floor_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.floor.v4f32(<4 x float>) define void @floor_v2f64(ptr %x) { ; CHECK-LABEL: floor_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI126_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI126_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI186_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI186_0)(a1) ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a1, 2 @@ -3127,15 +4480,69 @@ define void @floor_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.floor.v2f64(<2 x double>) + +define void @round_v8bf16(ptr %x) { +; CHECK-LABEL: round_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; 
CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a1, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.round.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} + +define void @round_v6bf16(ptr %x) { +; CHECK-LABEL: round_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: fsrmi a1, 4 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: fsrm a1 +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = call <6 x bfloat> @llvm.round.v6bf16(<6 x bfloat> %a) + store <6 x bfloat> %b, ptr %x + ret void +} define void @round_v8f16(ptr %x) { ; ZVFH-LABEL: round_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI127_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI127_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI189_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI189_0)(a1) ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: fsrmi a1, 4 @@ -3172,15 +4579,14 @@ define void @round_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.round.v8f16(<8 x half>) define void @round_v6f16(ptr %x) { ; ZVFH-LABEL: round_v6f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 6, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI128_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI128_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI190_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI190_0)(a1) ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 @@ -3219,7 +4625,6 @@ define void @round_v6f16(ptr %x) { store <6 x half> %b, ptr %x ret void } -declare <6 x half> @llvm.round.v6f16(<6 x half>) define void @round_v4f32(ptr %x) { ; CHECK-LABEL: round_v4f32: @@ -3243,15 +4648,14 @@ define void @round_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.round.v4f32(<4 x float>) define void @round_v2f64(ptr %x) { ; CHECK-LABEL: round_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI130_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI130_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI192_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI192_0)(a1) ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: fsrmi a1, 4 @@ -3267,15 +4671,39 @@ define void @round_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.round.v2f64(<2 x double>) + +define void @rint_v8bf16(ptr %x) { +; CHECK-LABEL: rint_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, 
ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.rint.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} define void @rint_v8f16(ptr %x) { ; ZVFH-LABEL: rint_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI131_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI131_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI194_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI194_0)(a1) ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -3308,7 +4736,6 @@ define void @rint_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.rint.v8f16(<8 x half>) define void @rint_v4f32(ptr %x) { ; CHECK-LABEL: rint_v4f32: @@ -3330,15 +4757,14 @@ define void @rint_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.rint.v4f32(<4 x float>) define void @rint_v2f64(ptr %x) { ; CHECK-LABEL: rint_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI133_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI133_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI196_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI196_0)(a1) ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: vfcvt.x.f.v v9, v8, v0.t @@ -3352,15 +4778,41 @@ define void @rint_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.rint.v2f64(<2 x double>) + +define void @nearbyint_v8bf16(ptr %x) { +; CHECK-LABEL: nearbyint_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a0) +; CHECK-NEXT: vfwcvtbf16.f.f.v v10, v8 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfabs.v v8, v10 +; CHECK-NEXT: lui a1, 307200 +; CHECK-NEXT: fmv.w.x fa5, a1 +; CHECK-NEXT: vmflt.vf v0, v8, fa5 +; CHECK-NEXT: frflags a1 +; CHECK-NEXT: vfcvt.x.f.v v8, v10, v0.t +; CHECK-NEXT: vfcvt.f.x.v v8, v8, v0.t +; CHECK-NEXT: fsflags a1 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, mu +; CHECK-NEXT: vfsgnj.vv v10, v8, v10, v0.t +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v8, v10 +; CHECK-NEXT: vse16.v v8, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = call <8 x bfloat> @llvm.nearbyint.v8bf16(<8 x bfloat> %a) + store <8 x bfloat> %b, ptr %x + ret void +} define void @nearbyint_v8f16(ptr %x) { ; ZVFH-LABEL: nearbyint_v8f16: ; ZVFH: # %bb.0: ; ZVFH-NEXT: vsetivli zero, 8, e16, m1, ta, ma ; ZVFH-NEXT: vle16.v v8, (a0) -; ZVFH-NEXT: lui a1, %hi(.LCPI134_0) -; ZVFH-NEXT: flh fa5, %lo(.LCPI134_0)(a1) +; ZVFH-NEXT: lui a1, %hi(.LCPI198_0) +; ZVFH-NEXT: flh fa5, %lo(.LCPI198_0)(a1) ; ZVFH-NEXT: vfabs.v v9, v8 ; ZVFH-NEXT: vmflt.vf v0, v9, fa5 ; ZVFH-NEXT: frflags a1 @@ -3397,7 +4849,6 @@ define void @nearbyint_v8f16(ptr %x) { store <8 x half> %b, ptr %x ret void } -declare <8 x half> @llvm.nearbyint.v8f16(<8 x half>) define void 
@nearbyint_v4f32(ptr %x) { ; CHECK-LABEL: nearbyint_v4f32: @@ -3421,15 +4872,14 @@ define void @nearbyint_v4f32(ptr %x) { store <4 x float> %b, ptr %x ret void } -declare <4 x float> @llvm.nearbyint.v4f32(<4 x float>) define void @nearbyint_v2f64(ptr %x) { ; CHECK-LABEL: nearbyint_v2f64: ; CHECK: # %bb.0: ; CHECK-NEXT: vsetivli zero, 2, e64, m1, ta, ma ; CHECK-NEXT: vle64.v v8, (a0) -; CHECK-NEXT: lui a1, %hi(.LCPI136_0) -; CHECK-NEXT: fld fa5, %lo(.LCPI136_0)(a1) +; CHECK-NEXT: lui a1, %hi(.LCPI200_0) +; CHECK-NEXT: fld fa5, %lo(.LCPI200_0)(a1) ; CHECK-NEXT: vfabs.v v9, v8 ; CHECK-NEXT: vmflt.vf v0, v9, fa5 ; CHECK-NEXT: frflags a1 @@ -3445,7 +4895,65 @@ define void @nearbyint_v2f64(ptr %x) { store <2 x double> %b, ptr %x ret void } -declare <2 x double> @llvm.nearbyint.v2f64(<2 x double>) + +define void @fmuladd_v8bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fmuladd_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v11 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = load <8 x bfloat>, ptr %z + %d = call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %c) + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fmuladd_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v11 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfadd.vv v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = load <6 x bfloat>, ptr %z + %d = call <6 x bfloat> @llvm.fmuladd.v6bf16(<6 x bfloat> %a, <6 x bfloat> %b, <6 x bfloat> %c) + store <6 x bfloat> %d, ptr %x + ret void +} define void @fmuladd_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fmuladd_v8f16: @@ -3485,7 +4993,6 @@ define void @fmuladd_v8f16(ptr %x, ptr %y, ptr %z) { store <8 x half> %d, ptr %x ret void } -declare <8 x half> @llvm.fmuladd.v8f16(<8 x half>, <8 x half>, <8 x half>) define void @fmuladd_v6f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fmuladd_v6f16: @@ -3526,7 +5033,6 @@ define void @fmuladd_v6f16(ptr %x, ptr %y, ptr %z) { store <6 x half> %d, ptr %x ret void } -declare <6 x half> @llvm.fmuladd.v6f16(<6 x half>, <6 x half>, <6 x half>) define void 
@fmuladd_v4f32(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fmuladd_v4f32: @@ -3545,7 +5051,6 @@ define void @fmuladd_v4f32(ptr %x, ptr %y, ptr %z) { store <4 x float> %d, ptr %x ret void } -declare <4 x float> @llvm.fmuladd.v4f32(<4 x float>, <4 x float>, <4 x float>) define void @fmuladd_v2f64(ptr %x, ptr %y, ptr %z) { ; CHECK-LABEL: fmuladd_v2f64: @@ -3564,7 +5069,67 @@ define void @fmuladd_v2f64(ptr %x, ptr %y, ptr %z) { store <2 x double> %d, ptr %x ret void } -declare <2 x double> @llvm.fmuladd.v2f64(<2 x double>, <2 x double>, <2 x double>) + +define void @fmsub_fmuladd_v8bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fmsub_fmuladd_v8bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v11 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <8 x bfloat>, ptr %x + %b = load <8 x bfloat>, ptr %y + %c = load <8 x bfloat>, ptr %z + %neg = fneg <8 x bfloat> %c + %d = call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> %a, <8 x bfloat> %b, <8 x bfloat> %neg) + store <8 x bfloat> %d, ptr %x + ret void +} + +define void @fmsub_fmuladd_v6bf16(ptr %x, ptr %y, ptr %z) { +; CHECK-LABEL: fmsub_fmuladd_v6bf16: +; CHECK: # %bb.0: +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vle16.v v8, (a1) +; CHECK-NEXT: vle16.v v9, (a0) +; CHECK-NEXT: vle16.v v10, (a2) +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v14, v9 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfmul.vv v8, v14, v12 +; CHECK-NEXT: vsetvli zero, zero, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v11, v8 +; CHECK-NEXT: vfwcvtbf16.f.f.v v8, v11 +; CHECK-NEXT: vfwcvtbf16.f.f.v v12, v10 +; CHECK-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; CHECK-NEXT: vfsub.vv v8, v8, v12 +; CHECK-NEXT: vsetivli zero, 6, e16, m1, ta, ma +; CHECK-NEXT: vfncvtbf16.f.f.w v10, v8 +; CHECK-NEXT: vse16.v v10, (a0) +; CHECK-NEXT: ret + %a = load <6 x bfloat>, ptr %x + %b = load <6 x bfloat>, ptr %y + %c = load <6 x bfloat>, ptr %z + %neg = fneg <6 x bfloat> %c + %d = call <6 x bfloat> @llvm.fmuladd.v6bf16(<6 x bfloat> %a, <6 x bfloat> %b, <6 x bfloat> %neg) + store <6 x bfloat> %d, ptr %x + ret void +} define void @fmsub_fmuladd_v8f16(ptr %x, ptr %y, ptr %z) { ; ZVFH-LABEL: fmsub_fmuladd_v8f16: -- GitLab From 2e8ad49e7cffb22a169c22d02607708c71a80c65 Mon Sep 17 00:00:00 2001 From: Jorge Gorbe Moya Date: Tue, 15 Oct 2024 14:51:31 -0700 Subject: [PATCH 040/329] [SandboxVec] Add pass to create Regions from metadata. Generalize SandboxVec pass pipelines. (#112288) My previous attempt (#111904) hacked creation of Regions from metadata into the bottom-up vectorizer. I got some feedback that it should be its own pass. 
So now we have two SandboxIR function passes (`BottomUpVec` and
`RegionsFromMetadata`) that are interchangeable, and we could have other
SandboxIR function passes doing other kinds of transforms, so this
commit revamps pipeline creation and parsing.

First, `sandboxir::PassManager::setPassPipeline` now accepts pass
arguments in angle brackets. Pass arguments are arbitrary strings that
must be parsed by each pass; the only requirement is that nested angle
bracket pairs must be balanced, to allow for nested pipelines with more
arguments. For example:

```
bottom-up-vec<region-pass-1,region-pass-2<arg1,arg2>,region-pass-3>
```

This has complicated the parser a little bit (the loop over pipeline
characters now contains a small state machine), and we now have some new
test cases to exercise the new features.

The main SandboxVectorizerPass now contains a customizable pipeline of
SandboxIR function passes, defined by the `sbvec-passes` flag. Region
passes for the bottom-up vectorizer pass are now given as pass arguments
(like in the example above).

Because we now have several classes that can build sub-pass pipelines,
I've moved the logic that interacts with PassRegistry.def into its own
files (SandboxVectorizerPassBuilder.{h,cpp}) so it can be easily reused.

Finally, I've added a `RegionsFromMetadata` function pass, which will
allow us to run region passes in isolation from lit tests without
relying on the bottom-up vectorizer, and a new lit test that does
exactly this.

Note that the new pipeline parser now allows empty pipelines. This is
useful for testing. For example, if we use

```
-sbvec-passes="bottom-up-vec<>"
```

SandboxVectorizer converts LLVM IR to SandboxIR and runs the bottom-up
vectorizer, but no region passes afterwards. With

```
-sbvec-passes=""
```

SandboxVectorizer converts LLVM IR to SandboxIR and runs no passes on
it. This is useful to exercise SandboxIR conversion on its own.
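As an illustrative sketch (not part of this change's diff, and an
arbitrary combination of the pass names registered below), the same
two-level pipeline can also be built programmatically:

```c++
// Sketch: build a function-pass pipeline where bottom-up-vec parses the
// string between its angle brackets as its own region-pass pipeline.
sandboxir::FunctionPassManager FPM("fpm");
FPM.setPassPipeline(
    "bottom-up-vec<null,null>,regions-from-metadata<print-instruction-count>",
    sandboxir::SandboxVectorizerPassBuilder::createFunctionPass);
FPM.printPipeline(outs()); // Prints the pass hierarchy, one pass per line.
```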
---
 llvm/include/llvm/SandboxIR/Pass.h            |   2 +-
 llvm/include/llvm/SandboxIR/PassManager.h     | 133 +++++++++++++++---
 .../SandboxVectorizer/Passes/BottomUpVec.h    |  10 +-
 .../Passes/PrintInstructionCount.h            |  23 +++
 .../Passes/RegionsFromMetadata.h              |  38 +++++
 .../SandboxVectorizer/SandboxVectorizer.h     |   6 +-
 .../SandboxVectorizerPassBuilder.h            |  32 +++++
 llvm/lib/Transforms/Vectorize/CMakeLists.txt  |   2 +
 .../SandboxVectorizer/Passes/BottomUpVec.cpp  |  41 +-----
 .../SandboxVectorizer/Passes/PassRegistry.def |  14 +-
 .../Passes/RegionsFromMetadata.cpp            |  29 ++++
 .../SandboxVectorizer/SandboxVectorizer.cpp   |  36 ++++-
 .../SandboxVectorizerPassBuilder.cpp          |  32 +++++
 .../default_pass_pipeline.ll                  |   1 +
 .../regions-from-metadata.ll                  |  15 ++
 .../SandboxVectorizer/user_pass_pipeline.ll   |  22 ++-
 llvm/unittests/SandboxIR/PassTest.cpp         |  52 +++++--
 17 files changed, 409 insertions(+), 79 deletions(-)
 create mode 100644 llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/PrintInstructionCount.h
 create mode 100644 llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.h
 create mode 100644 llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h
 create mode 100644 llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp
 create mode 100644 llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp
 create mode 100644 llvm/test/Transforms/SandboxVectorizer/regions-from-metadata.ll

diff --git a/llvm/include/llvm/SandboxIR/Pass.h b/llvm/include/llvm/SandboxIR/Pass.h
index 211f10f5d57c..5ed9d7442ee7 100644
--- a/llvm/include/llvm/SandboxIR/Pass.h
+++ b/llvm/include/llvm/SandboxIR/Pass.h
@@ -43,7 +43,7 @@ public:
   LLVM_DUMP_METHOD virtual void dump() const;
 #endif
   /// Similar to print() but adds a newline. Used for testing.
-  void printPipeline(raw_ostream &OS) const { OS << Name << "\n"; }
+  virtual void printPipeline(raw_ostream &OS) const { OS << Name << "\n"; }
 };
 
 /// A pass that runs on a sandbox::Function.
diff --git a/llvm/include/llvm/SandboxIR/PassManager.h b/llvm/include/llvm/SandboxIR/PassManager.h
index 247c43615f57..e8221996bc8f 100644
--- a/llvm/include/llvm/SandboxIR/PassManager.h
+++ b/llvm/include/llvm/SandboxIR/PassManager.h
@@ -32,11 +32,20 @@ class Value;
 
 /// Base class.
 template <typename ParentPass, typename ContainedPass>
 class PassManager : public ParentPass {
+public:
+  // CreatePassFunc(StringRef PassName, StringRef PassArgs).
+  using CreatePassFunc =
+      std::function<std::unique_ptr<ContainedPass>(StringRef, StringRef)>;
+
 protected:
   /// The list of passes that this pass manager will run.
   SmallVector<std::unique_ptr<ContainedPass>> Passes;
 
   PassManager(StringRef Name) : ParentPass(Name) {}
+  PassManager(StringRef Name, StringRef Pipeline, CreatePassFunc CreatePass)
+      : ParentPass(Name) {
+    setPassPipeline(Pipeline, CreatePass);
+  }
   PassManager(const PassManager &) = delete;
   PassManager(PassManager &&) = default;
   virtual ~PassManager() = default;
@@ -49,41 +58,125 @@ public:
     Passes.push_back(std::move(Pass));
   }
 
-  using CreatePassFunc =
-      std::function<std::unique_ptr<ContainedPass>(StringRef)>;
-
   /// Parses \p Pipeline as a comma-separated sequence of pass names and sets
   /// the pass pipeline, using \p CreatePass to instantiate passes by name.
   ///
-  /// After calling this function, the PassManager contains only the specified
-  /// pipeline, any previously added passes are cleared.
+  /// Passes can have arguments, for example:
+  ///   "pass1<arg1,arg2>,pass2,pass3<arg3>"
+  ///
+  /// The arguments between angle brackets are treated as a mostly opaque string
+  /// and each pass is responsible for parsing its arguments. The exception to
+  /// this are nested angle brackets, which must match pair-wise to allow
+  /// arguments to contain nested pipelines, like:
+  ///
+  ///   "pass1<subpass1,subpass2<arg1,arg2>,subpass3>"
+  ///
+  /// An empty args string is treated the same as no args, so "pass" and
+  /// "pass<>" are equivalent.
   void setPassPipeline(StringRef Pipeline, CreatePassFunc CreatePass) {
     static constexpr const char EndToken = '\0';
+    static constexpr const char BeginArgsToken = '<';
+    static constexpr const char EndArgsToken = '>';
     static constexpr const char PassDelimToken = ',';
 
     assert(Passes.empty() &&
            "setPassPipeline called on a non-empty sandboxir::PassManager");
+
+    // Accept an empty pipeline as a special case. This can be useful, for
+    // example, to test conversion to SandboxIR without running any passes on
+    // it.
+    if (Pipeline.empty())
+      return;
+
     // Add EndToken to the end to ease parsing.
     std::string PipelineStr = std::string(Pipeline) + EndToken;
-    int FlagBeginIdx = 0;
-
-    for (auto [Idx, C] : enumerate(PipelineStr)) {
-      // Keep moving Idx until we find the end of the pass name.
-      bool FoundDelim = C == EndToken || C == PassDelimToken;
-      if (!FoundDelim)
-        continue;
-      unsigned Sz = Idx - FlagBeginIdx;
-      std::string PassName(&PipelineStr[FlagBeginIdx], Sz);
-      FlagBeginIdx = Idx + 1;
+    Pipeline = StringRef(PipelineStr);
 
+    auto AddPass = [this, CreatePass](StringRef PassName, StringRef PassArgs) {
+      if (PassName.empty()) {
+        errs() << "Found empty pass name.\n";
+        exit(1);
+      }
       // Get the pass that corresponds to PassName and add it to the pass
       // manager.
-      auto Pass = CreatePass(PassName);
+      auto Pass = CreatePass(PassName, PassArgs);
       if (Pass == nullptr) {
         errs() << "Pass '" << PassName << "' not registered!\n";
         exit(1);
       }
       addPass(std::move(Pass));
+    };
+
+    enum class State {
+      ScanName,  // reading a pass name
+      ScanArgs,  // reading a list of args
+      ArgsEnded, // read the last '>' in an args list, must read delimiter next
+    } CurrentState = State::ScanName;
+    int PassBeginIdx = 0;
+    int ArgsBeginIdx;
+    StringRef PassName;
+    int NestedArgs = 0;
+    for (auto [Idx, C] : enumerate(Pipeline)) {
+      switch (CurrentState) {
+      case State::ScanName:
+        if (C == BeginArgsToken) {
+          // Save pass name for later and begin scanning args.
+          PassName = Pipeline.slice(PassBeginIdx, Idx);
+          ArgsBeginIdx = Idx + 1;
+          ++NestedArgs;
+          CurrentState = State::ScanArgs;
+          break;
+        }
+        if (C == EndArgsToken) {
+          errs() << "Unexpected '>' in pass pipeline.\n";
+          exit(1);
+        }
+        if (C == EndToken || C == PassDelimToken) {
+          // Delimiter found, add the pass (with empty args), stay in the
+          // ScanName state.
+          AddPass(Pipeline.slice(PassBeginIdx, Idx), StringRef());
+          PassBeginIdx = Idx + 1;
+        }
+        break;
+      case State::ScanArgs:
+        // While scanning args, we only care about making sure nesting of angle
+        // brackets is correct.
+        if (C == BeginArgsToken) {
+          ++NestedArgs;
+          break;
+        }
+        if (C == EndArgsToken) {
+          --NestedArgs;
+          if (NestedArgs == 0) {
+            // Done scanning args.
+            AddPass(PassName, Pipeline.slice(ArgsBeginIdx, Idx));
+            CurrentState = State::ArgsEnded;
+          } else if (NestedArgs < 0) {
+            errs() << "Unexpected '>' in pass pipeline.\n";
+            exit(1);
+          }
+          break;
+        }
+        if (C == EndToken) {
+          errs() << "Missing '>' in pass pipeline. End-of-string reached while "
+                    "reading arguments for pass '"
+                 << PassName << "'.\n";
+          exit(1);
+        }
+        break;
+      case State::ArgsEnded:
+        // Once we're done scanning args, only a delimiter is valid. This avoids
+        // accepting strings like "foo<args><args>" or "foo<args>bar".
+ if (C == EndToken || C == PassDelimToken) { + PassBeginIdx = Idx + 1; + CurrentState = State::ScanName; + } else { + errs() << "Expected delimiter or end-of-string after pass " + "arguments.\n"; + exit(1); + } + break; + } } } @@ -101,7 +194,7 @@ public: } #endif /// Similar to print() but prints one pass per line. Used for testing. - void printPipeline(raw_ostream &OS) const { + void printPipeline(raw_ostream &OS) const override { OS << this->getName() << "\n"; for (const auto &PassPtr : Passes) PassPtr->printPipeline(OS); @@ -112,12 +205,18 @@ class FunctionPassManager final : public PassManager { public: FunctionPassManager(StringRef Name) : PassManager(Name) {} + FunctionPassManager(StringRef Name, StringRef Pipeline, + CreatePassFunc CreatePass) + : PassManager(Name, Pipeline, CreatePass) {} bool runOnFunction(Function &F) final; }; class RegionPassManager final : public PassManager { public: RegionPassManager(StringRef Name) : PassManager(Name) {} + RegionPassManager(StringRef Name, StringRef Pipeline, + CreatePassFunc CreatePass) + : PassManager(Name, Pipeline, CreatePass) {} bool runOnRegion(Region &R) final; }; diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h index 02abdf0a1ef0..5cd47efd6b34 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h @@ -13,15 +13,15 @@ #define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_BOTTOMUPVEC_H #include "llvm/ADT/ArrayRef.h" +#include "llvm/ADT/StringRef.h" #include "llvm/SandboxIR/Constant.h" #include "llvm/SandboxIR/Pass.h" #include "llvm/SandboxIR/PassManager.h" +#include "llvm/Support/raw_ostream.h" #include "llvm/Transforms/Vectorize/SandboxVectorizer/Legality.h" namespace llvm::sandboxir { -class RegionPassManager; - class BottomUpVec final : public FunctionPass { bool Change = false; LegalityAnalysis Legality; @@ -32,8 +32,12 @@ class BottomUpVec final : public FunctionPass { RegionPassManager RPM; public: - BottomUpVec(); + BottomUpVec(StringRef Pipeline); bool runOnFunction(Function &F) final; + void printPipeline(raw_ostream &OS) const final { + OS << getName() << "\n"; + RPM.printPipeline(OS); + } }; } // namespace llvm::sandboxir diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/PrintInstructionCount.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/PrintInstructionCount.h new file mode 100644 index 000000000000..9d88bc828038 --- /dev/null +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/PrintInstructionCount.h @@ -0,0 +1,23 @@ +#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_PRINTINSTRUCTIONCOUNT_H +#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_PRINTINSTRUCTIONCOUNT_H + +#include "llvm/SandboxIR/Pass.h" +#include "llvm/SandboxIR/Region.h" + +namespace llvm::sandboxir { + +/// A Region pass that prints the instruction count for the region to stdout. +/// Used to test -sbvec-passes while we don't have any actual optimization +/// passes. 
+class PrintInstructionCount final : public RegionPass {
+public:
+  PrintInstructionCount() : RegionPass("null") {}
+  bool runOnRegion(Region &R) final {
+    outs() << "InstructionCount: " << std::distance(R.begin(), R.end()) << "\n";
+    return false;
+  }
+};
+
+} // namespace llvm::sandboxir
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_PRINTINSTRUCTIONCOUNTPASS_H
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.h
new file mode 100644
index 000000000000..3d82a61c9015
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.h
@@ -0,0 +1,38 @@
+//===- RegionsFromMetadata.h ------------------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// A SandboxIR function pass that builds regions from IR metadata and then runs
+// a pipeline of region passes on them. This is useful to test region passes in
+// isolation without relying on the output of the bottom-up vectorizer.
+//
+
+#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_REGIONSFROMMETADATA_H
+#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_REGIONSFROMMETADATA_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/SandboxIR/Pass.h"
+#include "llvm/SandboxIR/PassManager.h"
+
+namespace llvm::sandboxir {
+
+class RegionsFromMetadata final : public FunctionPass {
+  // The PM containing the pipeline of region passes.
+  RegionPassManager RPM;
+
+public:
+  RegionsFromMetadata(StringRef Pipeline);
+  bool runOnFunction(Function &F) final;
+  void printPipeline(raw_ostream &OS) const final {
+    OS << getName() << "\n";
+    RPM.printPipeline(OS);
+  }
+};
+
+} // namespace llvm::sandboxir
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_PASSES_REGIONSFROMMETADATA_H
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h
index b7cb418c0032..b83744cf9e6c 100644
--- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h
@@ -11,7 +11,7 @@
 #include <memory>
 
 #include "llvm/IR/PassManager.h"
-#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h"
+#include "llvm/SandboxIR/PassManager.h"
 
 namespace llvm {
 
@@ -20,8 +20,8 @@ class TargetTransformInfo;
 class SandboxVectorizerPass : public PassInfoMixin<SandboxVectorizerPass> {
   TargetTransformInfo *TTI = nullptr;
 
-  // The main vectorizer pass.
-  sandboxir::BottomUpVec BottomUpVecPass;
+  // A pipeline of SandboxIR function passes run by the vectorizer.
+  sandboxir::FunctionPassManager FPM;
 
   bool runImpl(Function &F);
 
diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h
new file mode 100644
index 000000000000..e3d6ecae836f
--- /dev/null
+++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h
@@ -0,0 +1,32 @@
+//===- SandboxVectorizerPassBuilder.h ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// Utility functions so passes with sub-pipelines can create SandboxVectorizer
+// passes without replicating the same logic in each pass.
+//
+#ifndef LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZERPASSBUILDER_H
+#define LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZERPASSBUILDER_H
+
+#include "llvm/ADT/StringRef.h"
+#include "llvm/SandboxIR/Pass.h"
+
+#include <memory>
+
+namespace llvm::sandboxir {
+
+class SandboxVectorizerPassBuilder {
+public:
+  static std::unique_ptr<FunctionPass> createFunctionPass(StringRef Name,
+                                                          StringRef Args);
+  static std::unique_ptr<RegionPass> createRegionPass(StringRef Name,
+                                                      StringRef Args);
+};
+
+} // namespace llvm::sandboxir
+
+#endif // LLVM_TRANSFORMS_VECTORIZE_SANDBOXVECTORIZER_SANDBOXVECTORIZERPASSBUILDER_H
diff --git a/llvm/lib/Transforms/Vectorize/CMakeLists.txt b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
index 9c2e7c1e0c5b..f4e98e576379 100644
--- a/llvm/lib/Transforms/Vectorize/CMakeLists.txt
+++ b/llvm/lib/Transforms/Vectorize/CMakeLists.txt
@@ -6,7 +6,9 @@ add_llvm_component_library(LLVMVectorize
   SandboxVectorizer/DependencyGraph.cpp
   SandboxVectorizer/Interval.cpp
   SandboxVectorizer/Passes/BottomUpVec.cpp
+  SandboxVectorizer/Passes/RegionsFromMetadata.cpp
   SandboxVectorizer/SandboxVectorizer.cpp
+  SandboxVectorizer/SandboxVectorizerPassBuilder.cpp
   SandboxVectorizer/SeedCollector.cpp
   SLPVectorizer.cpp
   Vectorize.cpp
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
index 77198f932a3e..6171d5e52b58 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.cpp
@@ -7,43 +7,17 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h"
+
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/SandboxIR/Function.h"
 #include "llvm/SandboxIR/Instruction.h"
-#include "llvm/Support/CommandLine.h"
-#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h"
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h"
 
 namespace llvm::sandboxir {
 
-static cl::opt<bool>
-    PrintPassPipeline("sbvec-print-pass-pipeline", cl::init(false), cl::Hidden,
-                      cl::desc("Prints the pass pipeline and returns."));
-
-/// A magic string for the default pass pipeline.
-static const char *DefaultPipelineMagicStr = "*";
-
-static cl::opt<std::string> UserDefinedPassPipeline(
-    "sbvec-passes", cl::init(DefaultPipelineMagicStr), cl::Hidden,
-    cl::desc("Comma-separated list of vectorizer passes. If not set "
-             "we run the predefined pipeline."));
-
-static std::unique_ptr<RegionPass> createRegionPass(StringRef Name) {
-#define REGION_PASS(NAME, CREATE_PASS)                                         \
-  if (Name == NAME)                                                            \
-    return std::make_unique<decltype(CREATE_PASS)>(CREATE_PASS);
-#include "PassRegistry.def"
-  return nullptr;
-}
-
-BottomUpVec::BottomUpVec() : FunctionPass("bottom-up-vec"), RPM("rpm") {
-  // Create a pipeline to be run on each Region created by BottomUpVec.
-  if (UserDefinedPassPipeline == DefaultPipelineMagicStr) {
-    // TODO: Add default passes to RPM.
-  } else {
-    // Create the user-defined pipeline.
-    RPM.setPassPipeline(UserDefinedPassPipeline, createRegionPass);
-  }
-}
+BottomUpVec::BottomUpVec(StringRef Pipeline)
+    : FunctionPass("bottom-up-vec"),
+      RPM("rpm", Pipeline, SandboxVectorizerPassBuilder::createRegionPass) {}
 
 // TODO: This is a temporary function that returns some seeds.
 // Replace this with SeedCollector's function when it lands.
@@ -82,11 +56,6 @@ void BottomUpVec::vectorizeRec(ArrayRef<Value *> Bndl) {
 void BottomUpVec::tryVectorize(ArrayRef<Value *> Bndl) { vectorizeRec(Bndl); }
 
 bool BottomUpVec::runOnFunction(Function &F) {
-  if (PrintPassPipeline) {
-    RPM.printPipeline(outs());
-    return false;
-  }
-
   Change = false;
   // TODO: Start from innermost BBs first
   for (auto &BB : F) {
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def
index bbb0dcba1ea5..0dc72842f1ab 100644
--- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/PassRegistry.def
@@ -14,9 +14,19 @@
 // NOTE: NO INCLUDE GUARD DESIRED!
 
 #ifndef REGION_PASS
-#define REGION_PASS(NAME, CREATE_PASS)
+#define REGION_PASS(NAME, CLASS_NAME)
 #endif
 
-REGION_PASS("null", NullPass())
+REGION_PASS("null", ::llvm::sandboxir::NullPass)
+REGION_PASS("print-instruction-count", ::llvm::sandboxir::PrintInstructionCount)
 
 #undef REGION_PASS
+
+#ifndef FUNCTION_PASS_WITH_PARAMS
+#define FUNCTION_PASS_WITH_PARAMS(NAME, CLASS_NAME)
+#endif
+
+FUNCTION_PASS_WITH_PARAMS("bottom-up-vec", ::llvm::sandboxir::BottomUpVec)
+FUNCTION_PASS_WITH_PARAMS("regions-from-metadata", ::llvm::sandboxir::RegionsFromMetadata)
+
+#undef FUNCTION_PASS_WITH_PARAMS
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp
new file mode 100644
index 000000000000..5887d5e8bc26
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.cpp
@@ -0,0 +1,29 @@
+//===- RegionsFromMetadata.cpp - A helper to test RegionPasses -----------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.h" + +#include "llvm/SandboxIR/Region.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h" + +namespace llvm::sandboxir { + +RegionsFromMetadata::RegionsFromMetadata(StringRef Pipeline) + : FunctionPass("regions-from-metadata"), + RPM("rpm", Pipeline, SandboxVectorizerPassBuilder::createRegionPass) {} + +bool RegionsFromMetadata::runOnFunction(Function &F) { + SmallVector> Regions = + sandboxir::Region::createRegionsFromMD(F); + for (auto &R : Regions) { + RPM.runOnRegion(*R); + } + return false; +} + +} // namespace llvm::sandboxir diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp index ba4899cc624e..c68f9482e337 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.cpp @@ -9,14 +9,39 @@ #include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizer.h" #include "llvm/Analysis/TargetTransformInfo.h" #include "llvm/SandboxIR/Constant.h" -#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h" +#include "llvm/Support/CommandLine.h" +#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h" using namespace llvm; #define SV_NAME "sandbox-vectorizer" #define DEBUG_TYPE SV_NAME -SandboxVectorizerPass::SandboxVectorizerPass() = default; +static cl::opt + PrintPassPipeline("sbvec-print-pass-pipeline", cl::init(false), cl::Hidden, + cl::desc("Prints the pass pipeline and returns.")); + +/// A magic string for the default pass pipeline. +static const char *DefaultPipelineMagicStr = "*"; + +static cl::opt UserDefinedPassPipeline( + "sbvec-passes", cl::init(DefaultPipelineMagicStr), cl::Hidden, + cl::desc("Comma-separated list of vectorizer passes. If not set " + "we run the predefined pipeline.")); + +SandboxVectorizerPass::SandboxVectorizerPass() : FPM("fpm") { + if (UserDefinedPassPipeline == DefaultPipelineMagicStr) { + // TODO: Add region passes to the default pipeline. + FPM.setPassPipeline( + "bottom-up-vec<>", + sandboxir::SandboxVectorizerPassBuilder::createFunctionPass); + } else { + // Create the user-defined pipeline. + FPM.setPassPipeline( + UserDefinedPassPipeline, + sandboxir::SandboxVectorizerPassBuilder::createFunctionPass); + } +} SandboxVectorizerPass::SandboxVectorizerPass(SandboxVectorizerPass &&) = default; @@ -37,6 +62,11 @@ PreservedAnalyses SandboxVectorizerPass::run(Function &F, } bool SandboxVectorizerPass::runImpl(Function &LLVMF) { + if (PrintPassPipeline) { + FPM.printPipeline(outs()); + return false; + } + // If the target claims to have no vector registers early return. if (!TTI->getNumberOfRegisters(TTI->getRegisterClassForType(true))) { LLVM_DEBUG(dbgs() << "SBVec: Target has no vector registers, return.\n"); @@ -52,5 +82,5 @@ bool SandboxVectorizerPass::runImpl(Function &LLVMF) { // Create SandboxIR for LLVMF and run BottomUpVec on it. 
   sandboxir::Context Ctx(LLVMF.getContext());
   sandboxir::Function &F = *Ctx.createFunction(&LLVMF);
-  return BottomUpVecPass.runOnFunction(F);
+  return FPM.runOnFunction(F);
 }
diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp
new file mode 100644
index 000000000000..5ecf7b2ed0d2
--- /dev/null
+++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.cpp
@@ -0,0 +1,32 @@
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/SandboxVectorizerPassBuilder.h"
+
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/BottomUpVec.h"
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/NullPass.h"
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/PrintInstructionCount.h"
+#include "llvm/Transforms/Vectorize/SandboxVectorizer/Passes/RegionsFromMetadata.h"
+
+namespace llvm::sandboxir {
+
+std::unique_ptr<RegionPass>
+SandboxVectorizerPassBuilder::createRegionPass(StringRef Name, StringRef Args) {
+#define REGION_PASS(NAME, CLASS_NAME)                                          \
+  if (Name == NAME) {                                                          \
+    assert(Args.empty() && "Unexpected arguments for pass '" NAME "'.");       \
+    return std::make_unique<CLASS_NAME>();                                     \
+  }
+// TODO: Support region passes with params.
+#include "Passes/PassRegistry.def"
+  return nullptr;
+}
+
+std::unique_ptr<FunctionPass>
+SandboxVectorizerPassBuilder::createFunctionPass(StringRef Name,
+                                                 StringRef Args) {
+#define FUNCTION_PASS_WITH_PARAMS(NAME, CLASS_NAME)                            \
+  if (Name == NAME)                                                            \
+    return std::make_unique<CLASS_NAME>(Args);
+#include "Passes/PassRegistry.def"
+  return nullptr;
+}
+
+} // namespace llvm::sandboxir
diff --git a/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll b/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll
index 86bfbee63647..1d7be43336c8 100644
--- a/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll
+++ b/llvm/test/Transforms/SandboxVectorizer/default_pass_pipeline.ll
@@ -4,6 +4,7 @@
 ; This checks the default pass pipeline for the sandbox vectorizer.
 define void @pipeline() {
+; CHECK: bottom-up-vec
 ; CHECK: rpm
 ; CHECK-EMPTY:
   ret void
diff --git a/llvm/test/Transforms/SandboxVectorizer/regions-from-metadata.ll b/llvm/test/Transforms/SandboxVectorizer/regions-from-metadata.ll
new file mode 100644
index 000000000000..3e57bde76e72
--- /dev/null
+++ b/llvm/test/Transforms/SandboxVectorizer/regions-from-metadata.ll
@@ -0,0 +1,15 @@
+; RUN: opt -disable-output --passes=sandbox-vectorizer \
+; RUN:   -sbvec-passes='regions-from-metadata<print-instruction-count>' %s | FileCheck %s
+
+define i8 @foo(i8 %v0, i8 %v1) {
+  %t0 = add i8 %v0, 1, !sandboxvec !0
+  %t1 = add i8 %t0, %v1, !sandboxvec !1
+  %t2 = add i8 %t1, %v1, !sandboxvec !1
+  ret i8 %t2
+}
+
+!0 = distinct !{!"sandboxregion"}
+!1 = distinct !{!"sandboxregion"}
+
+; CHECK: InstructionCount: 1
+; CHECK: InstructionCount: 2
diff --git a/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll b/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll
index 2e6dab0aa29c..b11b55ed9601 100644
--- a/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll
+++ b/llvm/test/Transforms/SandboxVectorizer/user_pass_pipeline.ll
@@ -1,12 +1,28 @@
-; RUN: opt -passes=sandbox-vectorizer -sbvec-print-pass-pipeline -sbvec-passes=null,null %s -disable-output | FileCheck %s
+; RUN: opt -passes=sandbox-vectorizer -sbvec-print-pass-pipeline \
+; RUN:    -disable-output -sbvec-passes="bottom-up-vec<null,null>" %s \
+; RUN:    | FileCheck %s
+;
+; RUN: opt -passes=sandbox-vectorizer -sbvec-print-pass-pipeline \
+; RUN:    -disable-output -sbvec-passes="bottom-up-vec<>,regions-from-metadata<>" %s \
+; RUN:    | FileCheck --check-prefix CHECK-MULTIPLE-FUNCTION-PASSES %s
 
 ; !!!WARNING!!! This won't get updated by update_test_checks.py !
 ; This checks the user defined pass pipeline.
 define void @pipeline() {
+  ret void
+}
+
+; CHECK: fpm
+; CHECK: bottom-up-vec
 ; CHECK: rpm
 ; CHECK: null
 ; CHECK: null
 ; CHECK-EMPTY:
-  ret void
-}
+
+; CHECK-MULTIPLE-FUNCTION-PASSES: fpm
+; CHECK-MULTIPLE-FUNCTION-PASSES: bottom-up-vec
+; CHECK-MULTIPLE-FUNCTION-PASSES: rpm
+; CHECK-MULTIPLE-FUNCTION-PASSES: regions-from-metadata
+; CHECK-MULTIPLE-FUNCTION-PASSES: rpm
+; CHECK-MULTIPLE-FUNCTION-PASSES-EMPTY:
diff --git a/llvm/unittests/SandboxIR/PassTest.cpp b/llvm/unittests/SandboxIR/PassTest.cpp
index ae7284ecf2de..866bd8233d80 100644
--- a/llvm/unittests/SandboxIR/PassTest.cpp
+++ b/llvm/unittests/SandboxIR/PassTest.cpp
@@ -265,39 +265,45 @@ define void @f() {
                                                   "f");
   class FooPass final : public FunctionPass {
     std::string &Str;
+    std::string Args;
 
   public:
-    FooPass(std::string &Str) : FunctionPass("foo-pass"), Str(Str) {}
+    FooPass(std::string &Str, llvm::StringRef Args)
+        : FunctionPass("foo-pass"), Str(Str), Args(Args.str()) {}
     bool runOnFunction(Function &F) final {
-      Str += "foo";
+      Str += "foo<" + Args + ">";
       return false;
     }
   };
   class BarPass final : public FunctionPass {
     std::string &Str;
+    std::string Args;
 
  public:
-    BarPass(std::string &Str) : FunctionPass("bar-pass"), Str(Str) {}
+    BarPass(std::string &Str, llvm::StringRef Args)
+        : FunctionPass("bar-pass"), Str(Str), Args(Args.str()) {}
     bool runOnFunction(Function &F) final {
-      Str += "bar";
+      Str += "bar<" + Args + ">";
       return false;
     }
   };
 
   std::string Str;
   auto CreatePass =
-      [&Str](llvm::StringRef Name) -> std::unique_ptr<FunctionPass> {
+      [&Str](llvm::StringRef Name,
+             llvm::StringRef Args) -> std::unique_ptr<FunctionPass> {
     if (Name == "foo")
-      return std::make_unique<FooPass>(Str);
+      return std::make_unique<FooPass>(Str, Args);
    if (Name == "bar")
-      return std::make_unique<BarPass>(Str);
+      return std::make_unique<BarPass>(Str, Args);
     return nullptr;
   };
 
  FunctionPassManager FPM("test-fpm");
-  FPM.setPassPipeline("foo,bar,foo", CreatePass);
+  FPM.setPassPipeline("foo,bar>>,foo",
+                      CreatePass);
   FPM.runOnFunction(*F);
-  EXPECT_EQ(Str, "foobarfoo");
+  EXPECT_EQ(Str, "foobar>>foo<>");
 
   // A second call to setPassPipeline will trigger an assertion in debug mode.
 #ifndef NDEBUG
@@ -308,8 +314,32 @@ define void @f() {
   // Fresh PM for the death tests so they die from bad pipeline strings, rather
   // than from multiple setPassPipeline calls.
   FunctionPassManager FPM2("test-fpm");
+  // Bad/empty pass names.
   EXPECT_DEATH(FPM2.setPassPipeline("bad-pass-name", CreatePass),
                ".*not registered.*");
-  EXPECT_DEATH(FPM2.setPassPipeline("", CreatePass), ".*not registered.*");
-  EXPECT_DEATH(FPM2.setPassPipeline(",", CreatePass), ".*not registered.*");
+  EXPECT_DEATH(FPM2.setPassPipeline(",", CreatePass), ".*empty pass name.*");
+  EXPECT_DEATH(FPM2.setPassPipeline("<>", CreatePass), ".*empty pass name.*");
+  EXPECT_DEATH(FPM2.setPassPipeline("<>foo", CreatePass),
+               ".*empty pass name.*");
+  EXPECT_DEATH(FPM2.setPassPipeline("foo,<>", CreatePass),
+               ".*empty pass name.*");
+
+  // Mismatched argument brackets.
+  EXPECT_DEATH(FPM2.setPassPipeline("foo<", CreatePass), ".*Missing '>'.*");
+  EXPECT_DEATH(FPM2.setPassPipeline("foo<bar", CreatePass), ".*Missing '>'.*");
+  EXPECT_DEATH(FPM2.setPassPipeline("foo<bar<>", CreatePass),
+               ".*Missing '>'.*");
+  EXPECT_DEATH(FPM2.setPassPipeline("foo>", CreatePass), ".*Unexpected '>'.*");
+  EXPECT_DEATH(FPM2.setPassPipeline(">foo", CreatePass), ".*Unexpected '>'.*");
+  // Extra garbage between args and next delimiter/end-of-string.
+  EXPECT_DEATH(FPM2.setPassPipeline("foo>>", CreatePass),
+               ".*Expected delimiter.*");
+  EXPECT_DEATH(FPM2.setPassPipeline("bar<>foo", CreatePass),
+               ".*Expected delimiter.*");
+  EXPECT_DEATH(FPM2.setPassPipeline("bar<>foo,baz", CreatePass),
+               ".*Expected delimiter.*");
+  EXPECT_DEATH(FPM2.setPassPipeline("foo", CreatePass),
+               ".*Expected delimiter.*");
+  EXPECT_DEATH(FPM2.setPassPipeline("foobar", CreatePass),
+               ".*Expected delimiter.*");
 }
-- 
GitLab


From d741435d776a1381bfb0d588f912c7ee7819d921 Mon Sep 17 00:00:00 2001
From: ravil-mobile 
Date: Tue, 15 Oct 2024 23:54:39 +0200
Subject: [PATCH 041/329] [MLIR][ROCDL] Added `SchedGroupBarrier` and
 `IglpOpt` ops (#112237)

This PR adds the missing `rocdl.sched.group.barrier` and `rocdl.iglp.opt`
ops to the ROCDL dialect (see
[here](https://github.com/llvm/llvm-project/blob/ec78f0da0e9b1b8e2b2323e434ea742e272dd913/clang/include/clang/Basic/BuiltinsAMDGPU.def#L66-L68)).
The ops are converted to the corresponding intrinsic calls during the
translation from MLIR to LLVM IR. These intrinsics are hints to the
instruction scheduler of the AMDGPU backend.
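For reference, the new ops and their lowering, mirroring the tests added
below:

  rocdl.sched.group.barrier 8, 1, 0  // -> call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0)
  rocdl.iglp.opt 0                   // -> call void @llvm.amdgcn.iglp.opt(i32 0)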
--- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 18 ++++++++++++++++++ mlir/test/Dialect/LLVMIR/rocdl.mlir | 12 ++++++++++++ mlir/test/Target/LLVMIR/rocdl.mlir | 16 ++++++++++++++++ 3 files changed, 46 insertions(+) diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index b80d9ae88910..c40ae4b1016b 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -297,6 +297,24 @@ def ROCDL_SchedBarrier : ROCDL_IntrOp<"sched.barrier", [], [], [], 0>, "createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_sched_barrier,builder.getInt32(op.getMask()));"; } +def ROCDL_SchedGroupBarrier : ROCDL_IntrOp<"sched.group.barrier", [], [], [], 0>, + Arguments<(ins I32Attr:$mask, I32Attr:$size, I32Attr:$groupId)> { + let results = (outs); + let assemblyFormat = "$mask `,` $size `,` $groupId attr-dict"; + string llvmBuilder = [{ + createIntrinsicCall(builder, + llvm::Intrinsic::amdgcn_sched_group_barrier, + {builder.getInt32(op.getMask()), builder.getInt32(op.getSize()), builder.getInt32(op.getGroupId())}); + }]; +} + +def ROCDL_IglpOpt : ROCDL_IntrOp<"iglp.opt", [], [], [], 0>, + Arguments<(ins I32Attr:$variant)> { + let results = (outs); + let assemblyFormat = "$variant attr-dict"; + string llvmBuilder = + "createIntrinsicCall(builder, llvm::Intrinsic::amdgcn_iglp_opt,builder.getInt32(op.getVariant()));"; +} //===---------------------------------------------------------------------===// // Xdlops intrinsics diff --git a/mlir/test/Dialect/LLVMIR/rocdl.mlir b/mlir/test/Dialect/LLVMIR/rocdl.mlir index 397d66d92bc5..4afa839aa3ea 100644 --- a/mlir/test/Dialect/LLVMIR/rocdl.mlir +++ b/mlir/test/Dialect/LLVMIR/rocdl.mlir @@ -41,6 +41,18 @@ func.func @rocdl.sched_barrier() { llvm.return } +func.func @rocdl_sched_group_barrier() { + // CHECK: rocdl.sched.group.barrier + rocdl.sched.group.barrier 8, 1, 0 + llvm.return +} + +func.func @rocdl_iglp_opt() { + // CHECK: rocdl.iglp.opt + rocdl.iglp.opt 0 + llvm.return +} + func.func @rocdl.setprio() { // CHECK: rocdl.s.setprio rocdl.s.setprio 0 diff --git a/mlir/test/Target/LLVMIR/rocdl.mlir b/mlir/test/Target/LLVMIR/rocdl.mlir index 97276b087b7e..2f34070147be 100644 --- a/mlir/test/Target/LLVMIR/rocdl.mlir +++ b/mlir/test/Target/LLVMIR/rocdl.mlir @@ -179,6 +179,22 @@ llvm.func @rocdl.schedbarrier() { llvm.return } +llvm.func @rocdl.sched.group.barrier() { + // CHECK-LABEL: rocdl.sched.group.barrier + // CHECK-NEXT: call void @llvm.amdgcn.sched.group.barrier(i32 8, i32 1, i32 0) + rocdl.sched.group.barrier 8, 1, 0 + llvm.return +} + +llvm.func @rocdl.iglp.opt() { + // CHECK-LABEL: rocdl.iglp.opt + // CHECK-NEXT: call void @llvm.amdgcn.iglp.opt(i32 0) + rocdl.iglp.opt 0 + // CHECK-NEXT: call void @llvm.amdgcn.iglp.opt(i32 1) + rocdl.iglp.opt 1 + llvm.return +} + llvm.func @rocdl.xdlops(%arg0 : f32, %arg1 : f32, %arg2 : vector<32 x f32>, %arg3: i32, %arg4 : vector<16 x f32>, %arg5 : vector<4xf32>, -- GitLab From 4c894730a1f7d6bf4d955843c80d257cda17e308 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Tue, 15 Oct 2024 23:01:10 +0100 Subject: [PATCH 042/329] [RISCV] Fix bf16 cost model tests. 
NFC These were inadvertently changed in #112393 --- .../test/Analysis/CostModel/RISCV/arith-fp.ll | 58 +++++++++---------- .../Analysis/CostModel/RISCV/reduce-fmul.ll | 10 ++-- 2 files changed, 34 insertions(+), 34 deletions(-) diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll index 35619db0b499..20d47001739e 100644 --- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll @@ -9,11 +9,11 @@ define void @fadd() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fadd bfloat undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fadd float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fadd double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fadd <1 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fadd <2 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fadd <4 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fadd <8 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fadd <16 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fadd <1 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fadd <2 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fadd <4 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fadd <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fadd <16 x bfloat> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fadd undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fadd undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fadd undef, undef @@ -137,11 +137,11 @@ define void @fsub() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = fsub half undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fsub float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fsub double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fsub <1 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fsub <2 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fsub <4 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fsub <8 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fsub <16 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fsub <1 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fsub <2 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fsub <4 x bfloat> undef, undef 
+; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fsub <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fsub <16 x bfloat> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fsub undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fsub undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fsub undef, undef @@ -265,11 +265,11 @@ define void @fmul() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fmul bfloat undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fmul float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fmul double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fmul <1 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fmul <2 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fmul <4 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fmul <8 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fmul <16 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fmul <1 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fmul <2 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V4BF16 = fmul <4 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fmul <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fmul <16 x bfloat> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fmul undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fmul undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fmul undef, undef @@ -393,11 +393,11 @@ define void @fdiv() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %BF16 = fdiv bfloat undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F32 = fdiv float undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F64 = fdiv double undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V1BF16 = fdiv <1 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V2BF16 = fdiv <2 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V4BF16 = fdiv <4 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %V8BF16 = fdiv <8 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V16BF16 = fdiv <16 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = fdiv <1 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V2BF16 = fdiv <2 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 
5 for instruction: %V4BF16 = fdiv <4 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %V8BF16 = fdiv <8 x bfloat> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 18 for instruction: %V16BF16 = fdiv <16 x bfloat> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV1BF16 = fdiv undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %NXV2BF16 = fdiv undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 9 for instruction: %NXV4BF16 = fdiv undef, undef @@ -889,11 +889,11 @@ define void @fma() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %BF16 = call bfloat @llvm.fma.bf16(bfloat undef, bfloat undef, bfloat undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F32 = call float @llvm.fma.f32(float undef, float undef, float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %F64 = call double @llvm.fma.f64(double undef, double undef, double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 5 for instruction: %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %V16BF16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1BF16 = call <1 x bfloat> @llvm.fma.v1bf16(<1 x bfloat> undef, <1 x bfloat> undef, <1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V2BF16 = call <2 x bfloat> @llvm.fma.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V4BF16 = call <4 x bfloat> @llvm.fma.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V8BF16 = call <8 x bfloat> @llvm.fma.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V16BF16 = call <16 x bfloat> @llvm.fma.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV1BF16 = call @llvm.fma.nxv1bf16( undef, undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV2BF16 = call @llvm.fma.nxv2bf16( undef, undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %NXV4BF16 = call @llvm.fma.nxv4bf16( undef, undef, undef) @@ -1017,10 +1017,10 @@ define void @fmuladd() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.fmuladd.bf16(bfloat undef, bfloat undef, bfloat undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call float 
@llvm.fmuladd.f32(float undef, float undef, float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call double @llvm.fmuladd.f64(double undef, double undef, double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %4 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %5 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %6 = call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %7 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <2 x bfloat> @llvm.fmuladd.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef, <2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <4 x bfloat> @llvm.fmuladd.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef, <4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <8 x bfloat> @llvm.fmuladd.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef, <8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <16 x bfloat> @llvm.fmuladd.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef, <16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %8 = call <2 x float> @llvm.fmuladd.v2f32(<2 x float> undef, <2 x float> undef, <2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %9 = call <4 x float> @llvm.fmuladd.v4f32(<4 x float> undef, <4 x float> undef, <4 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %10 = call <8 x float> @llvm.fmuladd.v8f32(<8 x float> undef, <8 x float> undef, <8 x float> undef) diff --git a/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll b/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll index 913ce40f133d..162562c7b893 100644 --- a/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll +++ b/llvm/test/Analysis/CostModel/RISCV/reduce-fmul.ll @@ -6,11 +6,11 @@ define void @reduce_fmul_bfloat() { ; FP-REDUCE-LABEL: 'reduce_fmul_bfloat' ; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %V1 = call fast bfloat @llvm.vector.reduce.fmul.v1bf16(bfloat 0xR0000, <1 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 7 for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0xR0000, <2 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 13 for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 19 for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0xR0000, <8 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 49 for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0xR0000, <16 x bfloat> undef) -; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 151 for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0xR0000, <32 x bfloat> undef) +; FP-REDUCE-NEXT: Cost 
Model: Found an estimated cost of 8 for instruction: %V2 = call fast bfloat @llvm.vector.reduce.fmul.v2bf16(bfloat 0xR0000, <2 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 15 for instruction: %V4 = call fast bfloat @llvm.vector.reduce.fmul.v4bf16(bfloat 0xR0000, <4 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 28 for instruction: %V8 = call fast bfloat @llvm.vector.reduce.fmul.v8bf16(bfloat 0xR0000, <8 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 73 for instruction: %V16 = call fast bfloat @llvm.vector.reduce.fmul.v16bf16(bfloat 0xR0000, <16 x bfloat> undef) +; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 211 for instruction: %v32 = call fast bfloat @llvm.vector.reduce.fmul.v32bf16(bfloat 0xR0000, <32 x bfloat> undef) ; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 541 for instruction: %V64 = call fast bfloat @llvm.vector.reduce.fmul.v64bf16(bfloat 0xR0000, <64 x bfloat> undef) ; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 573 for instruction: %V128 = call fast bfloat @llvm.vector.reduce.fmul.v128bf16(bfloat 0xR0000, <128 x bfloat> undef) ; FP-REDUCE-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void -- GitLab From 7cbb36590384b8b71076a91f8958df556d773238 Mon Sep 17 00:00:00 2001 From: Peter Klausler Date: Tue, 15 Oct 2024 15:11:25 -0700 Subject: [PATCH 043/329] [flang] Fix broken shared library build (#112444) I just introduced a dependency from the Evaluate library to the Semantics library, which is circular in a shared library build. Rearrange the code a little to ensure that the dependence is only on a header. --- flang/include/flang/Semantics/type.h | 13 ++++++++++++- flang/lib/Semantics/type.cpp | 6 ------ 2 files changed, 12 insertions(+), 7 deletions(-) diff --git a/flang/include/flang/Semantics/type.h b/flang/include/flang/Semantics/type.h index 1292c381b65f..352219150205 100644 --- a/flang/include/flang/Semantics/type.h +++ b/flang/include/flang/Semantics/type.h @@ -29,6 +29,13 @@ namespace Fortran::parser { struct Keyword; } +namespace Fortran::evaluate { // avoid including all of Evaluate/tools.h +template +std::optional AreEquivalentInInterface(const Expr &, const Expr &); +extern template std::optional AreEquivalentInInterface( + const Expr &, const Expr &); +} // namespace Fortran::evaluate + namespace Fortran::semantics { class Scope; @@ -110,7 +117,11 @@ public: return category_ == that.category_ && expr_ == that.expr_; } bool operator!=(const ParamValue &that) const { return !(*this == that); } - bool IsEquivalentInInterface(const ParamValue &) const; + bool IsEquivalentInInterface(const ParamValue &that) const { + return (category_ == that.category_ && + expr_.has_value() == that.expr_.has_value() && + (!expr_ || evaluate::AreEquivalentInInterface(*expr_, *that.expr_))); + } std::string AsFortran() const; private: diff --git a/flang/lib/Semantics/type.cpp b/flang/lib/Semantics/type.cpp index 7f5f4e98a7d6..e867d7ad6e25 100644 --- a/flang/lib/Semantics/type.cpp +++ b/flang/lib/Semantics/type.cpp @@ -758,12 +758,6 @@ void ParamValue::SetExplicit(SomeIntExpr &&x) { expr_ = std::move(x); } -bool ParamValue::IsEquivalentInInterface(const ParamValue &that) const { - return (category_ == that.category_ && - expr_.has_value() == that.expr_.has_value() && - (!expr_ || evaluate::AreEquivalentInInterface(*expr_, *that.expr_))); -} - std::string ParamValue::AsFortran() const { switch (category_) { SWITCH_COVERS_ALL_CASES -- GitLab From 
ddc3f2dd26c10b830d7137fc5f89049feec29033 Mon Sep 17 00:00:00 2001 From: wldfngrs Date: Tue, 15 Oct 2024 23:40:08 +0100 Subject: [PATCH 044/329] [libc] Add sinpif16 function (#110994) Half-precision floating point (16-bit) implementation of the trigonometric function Sin for inputs scaled by pi --- libc/config/linux/aarch64/entrypoints.txt | 1 + libc/config/linux/x86_64/entrypoints.txt | 1 + libc/docs/math/index.rst | 2 +- libc/newhdrgen/yaml/math.yaml | 7 ++ libc/src/math/CMakeLists.txt | 1 + libc/src/math/generic/CMakeLists.txt | 19 +++ libc/src/math/generic/sinpif16.cpp | 136 +++++++++++++++++++++ libc/src/math/sinpif16.h | 21 ++++ libc/test/src/math/CMakeLists.txt | 11 ++ libc/test/src/math/sinpif16_test.cpp | 40 ++++++ libc/test/src/math/smoke/CMakeLists.txt | 11 ++ libc/test/src/math/smoke/sinpif16_test.cpp | 42 +++++++ libc/utils/MPFRWrapper/MPFRUtils.cpp | 18 ++- 13 files changed, 307 insertions(+), 3 deletions(-) create mode 100644 libc/src/math/generic/sinpif16.cpp create mode 100644 libc/src/math/sinpif16.h create mode 100644 libc/test/src/math/sinpif16_test.cpp create mode 100644 libc/test/src/math/smoke/sinpif16_test.cpp diff --git a/libc/config/linux/aarch64/entrypoints.txt b/libc/config/linux/aarch64/entrypoints.txt index 32c0d1994893..885827d304ef 100644 --- a/libc/config/linux/aarch64/entrypoints.txt +++ b/libc/config/linux/aarch64/entrypoints.txt @@ -679,6 +679,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.scalbnf16 libc.src.math.setpayloadf16 libc.src.math.setpayloadsigf16 + libc.src.math.sinpif16 libc.src.math.totalorderf16 libc.src.math.totalordermagf16 libc.src.math.truncf16 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 1cd817171de4..2589da3756e1 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -677,6 +677,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.scalbnf16 libc.src.math.setpayloadf16 libc.src.math.setpayloadsigf16 + libc.src.math.sinpif16 libc.src.math.totalorderf16 libc.src.math.totalordermagf16 libc.src.math.truncf16 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 806dcc64bb93..72e8f6689a36 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -342,7 +342,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | sinh | |check| | | | | | 7.12.5.5 | F.10.2.5 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| sinpi | |check| | | | | | 7.12.4.13 | F.10.1.13 | +| sinpi | |check| | | | |check| | | 7.12.4.13 | F.10.1.13 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | sqrt | |check| | |check| | |check| | | |check| | 7.12.7.10 | F.10.4.10 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/newhdrgen/yaml/math.yaml b/libc/newhdrgen/yaml/math.yaml index d8b810b542cb..98ea1a0d25fb 100644 --- a/libc/newhdrgen/yaml/math.yaml +++ b/libc/newhdrgen/yaml/math.yaml @@ -2297,6 +2297,13 @@ functions: return_type: float arguments: - type: float + - name: sinpif16 + standards: + - 
stdc + return_type: _Float16 + arguments: + - type: _Float16 + guard: LIBC_TYPES_HAS_FLOAT16 - name: sqrt standards: - stdc diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index 53907e47323e..7803369583de 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -475,6 +475,7 @@ add_math_entrypoint_object(sincosf) add_math_entrypoint_object(sin) add_math_entrypoint_object(sinf) add_math_entrypoint_object(sinpif) +add_math_entrypoint_object(sinpif16) add_math_entrypoint_object(sinh) add_math_entrypoint_object(sinhf) diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index b58935abdf05..1ad611fa168c 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -528,6 +528,25 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + sinpif16 + SRCS + sinpif16.cpp + HDRS + ../sinpif16.h + DEPENDS + libc.src.__support.common + libc.src.__support.FPUtil.cast + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.multiply_add + libc.src.__support.FPUtil.nearest_integer + libc.src.__support.FPUtil.polyeval + libc.src.__support.macros.properties.types + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( tan SRCS diff --git a/libc/src/math/generic/sinpif16.cpp b/libc/src/math/generic/sinpif16.cpp new file mode 100644 index 000000000000..17cca583e0c0 --- /dev/null +++ b/libc/src/math/generic/sinpif16.cpp @@ -0,0 +1,136 @@ +//===-- Half-precision sinpif function ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sinpif16.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/nearest_integer.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" + +namespace LIBC_NAMESPACE_DECL { + +// Lookup table for sin(k * pi / 32) with k = 0, ..., 63. 
+// Table is generated with Sollya as follows:
+// > display = hexadecimal;
+// > for k from 0 to 63 do { round(sin(k * pi/32), SG, RN); };
+static constexpr float SIN_K_PI_OVER_32[64] = {
+    0x0.0p0,        0x1.917a6cp-4,  0x1.8f8b84p-3,  0x1.294062p-2,
+    0x1.87de2ap-2,  0x1.e2b5d4p-2,  0x1.1c73b4p-1,  0x1.44cf32p-1,
+    0x1.6a09e6p-1,  0x1.8bc806p-1,  0x1.a9b662p-1,  0x1.c38b3p-1,
+    0x1.d906bcp-1,  0x1.e9f416p-1,  0x1.f6297cp-1,  0x1.fd88dap-1,
+    0x1p0,          0x1.fd88dap-1,  0x1.f6297cp-1,  0x1.e9f416p-1,
+    0x1.d906bcp-1,  0x1.c38b3p-1,   0x1.a9b662p-1,  0x1.8bc806p-1,
+    0x1.6a09e6p-1,  0x1.44cf32p-1,  0x1.1c73b4p-1,  0x1.e2b5d4p-2,
+    0x1.87de2ap-2,  0x1.294062p-2,  0x1.8f8b84p-3,  0x1.917a6cp-4,
+    0x0.0p0,        -0x1.917a6cp-4, -0x1.8f8b84p-3, -0x1.294062p-2,
+    -0x1.87de2ap-2, -0x1.e2b5d4p-2, -0x1.1c73b4p-1, -0x1.44cf32p-1,
+    -0x1.6a09e6p-1, -0x1.8bc806p-1, -0x1.a9b662p-1, -0x1.c38b3p-1,
+    -0x1.d906bcp-1, -0x1.e9f416p-1, -0x1.f6297cp-1, -0x1.fd88dap-1,
+    -0x1p0,         -0x1.fd88dap-1, -0x1.f6297cp-1, -0x1.e9f416p-1,
+    -0x1.d906bcp-1, -0x1.c38b3p-1,  -0x1.a9b662p-1, -0x1.8bc806p-1,
+    -0x1.6a09e6p-1, -0x1.44cf32p-1, -0x1.1c73b4p-1, -0x1.e2b5d4p-2,
+    -0x1.87de2ap-2, -0x1.294062p-2, -0x1.8f8b84p-3, -0x1.917a6cp-4};
+
+static LIBC_INLINE int32_t range_reduction(float x, float &y) {
+  float kf = fputil::nearest_integer(x * 32);
+  y = fputil::multiply_add<float>(x, 32.0, -kf);
+
+  return static_cast<int32_t>(kf);
+}
+
+LLVM_LIBC_FUNCTION(float16, sinpif16, (float16 x)) {
+  using FPBits = typename fputil::FPBits<float16>;
+  FPBits xbits(x);
+
+  uint16_t x_u = xbits.uintval();
+  uint16_t x_abs = x_u & 0x7fff;
+
+  // Range reduction:
+  // For |x| > 1/32, we perform range reduction as follows:
+  // Find k and y such that:
+  //   x = (k + y) * 1/32
+  //   k is an integer
+  //   |y| < 0.5
+  //
+  // This is done by performing:
+  //   k = round(x * 32)
+  //   y = x * 32 - k
+  //
+  // Once k and y are computed, we then deduce the answer by the sine of sum
+  // formula:
+  //   sin(x * pi) = sin((k + y) * pi/32)
+  //           = sin(k * pi/32) * cos(y * pi/32) + sin (y * pi/32) * cos (k *
+  //           pi/32)
+  // The values of sin(k * pi/32) and cos (k * pi/32) for k = 0...63 are
+  // precomputed and stored using a vector of 64 single precision floats. sin(y
+  // * pi/32) and cos(y * pi/32) are computed using degree-9 chebyshev
+  // polynomials generated by Sollya.
+
+  // For signed zeros
+  if (LIBC_UNLIKELY(x_abs == 0U))
+    return x;
+
+  // Numbers greater or equal to 2^10 are integers, or infinity, or NaN
+  if (LIBC_UNLIKELY(x_abs >= 0x6400)) {
+    // Check for NaN or infinity values
+    if (LIBC_UNLIKELY(x_abs >= 0x7c00)) {
+      // If value is equal to infinity
+      if (x_abs == 0x7c00) {
+        fputil::set_errno_if_required(EDOM);
+        fputil::raise_except_if_required(FE_INVALID);
+      }
+
+      return x + FPBits::quiet_nan().get_val();
+    }
+    return FPBits::zero(xbits.sign()).get_val();
+  }
+
+  float f32 = x;
+  float y;
+  int32_t k = range_reduction(f32, y);
+
+  float sin_k = SIN_K_PI_OVER_32[k & 63];
+  float cos_k = SIN_K_PI_OVER_32[(k + 16) & 63];
+
+  // Recall:
+  //   sin(x * pi) = sin((k + y) * pi/32)
+  //           = sin(y * pi/32) * cos(k * pi/32) + cos(y * pi/32) * sin(k *
+  //           pi/32)
+  // Recall, after range reduction, -0.5 <= y <= 0.5. For very small
+  // values of y, calculating sin(y * pi/32) can be inaccurate. Generating a
+  // polynomial for sin(y * pi/32)/y instead significantly reduces the relative
+  // errors.
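+  // Worked example: for x = 0.25, k = round(0.25 * 32) = 8 and y = 0, so the
+  // polynomial terms below vanish and the result is exactly
+  // sin_k = SIN_K_PI_OVER_32[8] = sin(8 * pi/32) = sin(pi/4) ~= 0x1.6a09e6p-1.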
+  float ysq = y * y;
+
+  // Degree-6 minimax even polynomial for sin(y*pi/32)/y generated by Sollya
+  // with: > Q = fpminimax(sin(y*pi/32)/y, [|0, 2, 4, 6|], [|SG...|], [0, 0.5]);
+  float sin_y = y * fputil::polyeval(ysq, 0x1.921fb6p-4f, -0x1.4aeabcp-13f,
+                                     0x1.a03354p-21f, -0x1.ad02d2p-20f);
+
+  // Note that cosm1_y = cos(y*pi/32) - 1 = cos_y - 1
+  // Derivation:
+  //   sin(x * pi) = sin((k + y) * pi/32)
+  //           = sin_y * cos_k + cos_y * sin_k
+  //           = cos_k * sin_y + sin_k * (1 + cos_y - 1)
+  // Degree-6 minimax even polynomial for cos(y*pi/32) generated by Sollya with:
+  // > P = fpminimax(cos(y*pi/32), [|0, 2, 4, 6|],[|1, SG...|], [0, 0.5]);
+  float cosm1_y = ysq * fputil::polyeval(ysq, -0x1.3bd3ccp-8f, 0x1.03a61ap-18f,
+                                         0x1.a6f7a2p-29f);
+
+  if (LIBC_UNLIKELY(sin_y == 0 && sin_k == 0))
+    return FPBits::zero(xbits.sign()).get_val();
+
+  // Since, cosm1_y = cos_y - 1, therefore:
+  //   sin(x * pi) = cos_k * sin_y + sin_k + (cosm1_y * sin_k)
+  return fputil::cast<float16>(fputil::multiply_add(
+      sin_y, cos_k, fputil::multiply_add(cosm1_y, sin_k, sin_k)));
+}
+} // namespace LIBC_NAMESPACE_DECL
diff --git a/libc/src/math/sinpif16.h b/libc/src/math/sinpif16.h
new file mode 100644
index 000000000000..33a0ae265840
--- /dev/null
+++ b/libc/src/math/sinpif16.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for sinpif16 ---------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_SINPIF16_H
+#define LLVM_LIBC_SRC_MATH_SINPIF16_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 sinpif16(float16 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_SINPIF16_H
diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt
index 07a9405081f9..12e1d078b29b 100644
--- a/libc/test/src/math/CMakeLists.txt
+++ b/libc/test/src/math/CMakeLists.txt
@@ -90,6 +90,17 @@ add_fp_unittest(
   libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  sinpif16_test
+  NEED_MPFR
+  SUITE
+    libc-math-unittests
+  SRCS
+    sinpif16_test.cpp
+  DEPENDS
+    libc.src.math.sinpif16
+)
+
 add_fp_unittest(
   sin_test
   NEED_MPFR
diff --git a/libc/test/src/math/sinpif16_test.cpp b/libc/test/src/math/sinpif16_test.cpp
new file mode 100644
index 000000000000..8477124b2e6e
--- /dev/null
+++ b/libc/test/src/math/sinpif16_test.cpp
@@ -0,0 +1,40 @@
+//===-- Exhaustive test for sinpif16 --------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===---------------------------------------------------------------------===//
+
+#include "src/math/sinpif16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+#include "utils/MPFRWrapper/MPFRUtils.h"
+
+using LlvmLibcSinpif16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+namespace mpfr = LIBC_NAMESPACE::testing::mpfr;
+
+// Range: [0, Inf]
+static constexpr uint16_t POS_START = 0x0000U;
+static constexpr uint16_t POS_STOP = 0x7c00U;
+
+// Range: [-Inf, 0]
+static constexpr uint16_t NEG_START = 0x8000U;
+static constexpr uint16_t NEG_STOP = 0xfc00U;
+
+TEST_F(LlvmLibcSinpif16Test, PositiveRange) {
+  for (uint16_t v = POS_START; v <= POS_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinpi, x,
+                                   LIBC_NAMESPACE::sinpif16(x), 0.5);
+  }
+}
+
+TEST_F(LlvmLibcSinpif16Test, NegativeRange) {
+  for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) {
+    float16 x = FPBits(v).get_val();
+    EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinpi, x,
+                                   LIBC_NAMESPACE::sinpif16(x), 0.5);
+  }
+}
diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt
index c4787229c3ec..447ea6952713 100644
--- a/libc/test/src/math/smoke/CMakeLists.txt
+++ b/libc/test/src/math/smoke/CMakeLists.txt
@@ -51,6 +51,17 @@ add_fp_unittest(
   libc.src.__support.FPUtil.fp_bits
 )
 
+add_fp_unittest(
+  sinpif16_test
+  SUITE
+    libc-math-smoke-tests
+  SRCS
+    sinpif16_test.cpp
+  DEPENDS
+    libc.src.errno.errno
+    libc.src.math.sinpif16
+)
+
 add_fp_unittest(
   sincosf_test
   SUITE
diff --git a/libc/test/src/math/smoke/sinpif16_test.cpp b/libc/test/src/math/smoke/sinpif16_test.cpp
new file mode 100644
index 000000000000..0bcd38a60d84
--- /dev/null
+++ b/libc/test/src/math/smoke/sinpif16_test.cpp
@@ -0,0 +1,42 @@
+//===-- Unittests for sinpif16 --------------------------------------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "src/errno/libc_errno.h"
+#include "src/math/sinpif16.h"
+#include "test/UnitTest/FPMatcher.h"
+#include "test/UnitTest/Test.h"
+
+using LlvmLibcSinpif16Test = LIBC_NAMESPACE::testing::FPTest<float16>;
+
+TEST_F(LlvmLibcSinpif16Test, SpecialNumbers) {
+  LIBC_NAMESPACE::libc_errno = 0;
+
+  EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinpif16(aNaN));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(zero, LIBC_NAMESPACE::sinpif16(zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::sinpif16(neg_zero));
+  EXPECT_MATH_ERRNO(0);
+
+  EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinpif16(inf));
+  EXPECT_MATH_ERRNO(EDOM);
+
+  EXPECT_FP_EQ(aNaN, LIBC_NAMESPACE::sinpif16(neg_inf));
+  EXPECT_MATH_ERRNO(EDOM);
+}
+
+TEST_F(LlvmLibcSinpif16Test, Integers) {
+  EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::sinpif16(-0x420));
+  EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::sinpif16(-0x1p+10));
+  EXPECT_FP_EQ(neg_zero, LIBC_NAMESPACE::sinpif16(-0x1.4p+14));
+  EXPECT_FP_EQ(zero, LIBC_NAMESPACE::sinpif16(0x420));
+  EXPECT_FP_EQ(zero, LIBC_NAMESPACE::sinpif16(0x1.cp+15));
+  EXPECT_FP_EQ(zero, LIBC_NAMESPACE::sinpif16(0x1.cp+7));
+}
diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp
index 27ff1f7190ef..eecffc782c1a 100644
--- a/libc/utils/MPFRWrapper/MPFRUtils.cpp
+++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp
@@ -488,14 +488,28 @@ public:
     (MPFR_VERSION_MAJOR == 4 && MPFR_VERSION_MINOR >= 2)
     mpfr_sinpi(result.value, value, mpfr_rounding);
+    return result;
 #else
+    if (mpfr_integer_p(value)) {
+      mpfr_set_si(result.value, 0, mpfr_rounding);
+      return result;
+    }
+
+    MPFRNumber value_mul_two(*this);
+    mpfr_mul_si(value_mul_two.value, value, 2, MPFR_RNDN);
+
+    if (mpfr_integer_p(value_mul_two.value)) {
+      auto d = mpfr_get_si(value, MPFR_RNDD);
+      mpfr_set_si(result.value, (d & 1) ? -1 : 1, mpfr_rounding);
+      return result;
+    }
+
     MPFRNumber value_pi(0.0, 1280);
     mpfr_const_pi(value_pi.value, MPFR_RNDN);
     mpfr_mul(value_pi.value, value_pi.value, value, MPFR_RNDN);
     mpfr_sin(result.value, value_pi.value, mpfr_rounding);
-#endif
-    return result;
+#endif
  }
 
  MPFRNumber sinh() const {
-- 
GitLab


From 9b422d14f35ebe4991c47bd5bcfb1dd858e841e6 Mon Sep 17 00:00:00 2001
From: Rahul Joshi 
Date: Tue, 15 Oct 2024 15:48:42 -0700
Subject: [PATCH 045/329] [Clang][TableGen] Use const pointers for various
 Init objects in NeonEmitter (#112317)

Use const pointers for various Init objects in NeonEmitter.

This is a part of effort to have better const correctness in TableGen
backends:

https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089

---
 clang/utils/TableGen/NeonEmitter.cpp | 82 +++++++++++++++-------------
 1 file changed, 45 insertions(+), 37 deletions(-)

diff --git a/clang/utils/TableGen/NeonEmitter.cpp b/clang/utils/TableGen/NeonEmitter.cpp
index d4b42360e7fd..adff7c70219b 100644
--- a/clang/utils/TableGen/NeonEmitter.cpp
+++ b/clang/utils/TableGen/NeonEmitter.cpp
@@ -321,7 +321,7 @@ class Intrinsic {
   ClassKind CK;
   /// The list of DAGs for the body. May be empty, in which case we should
   /// emit a builtin call.
-  ListInit *Body;
+  const ListInit *Body;
   /// The architectural ifdef guard.
   std::string ArchGuard;
   /// The architectural target() guard.
@@ -372,9 +372,9 @@ class Intrinsic { public: Intrinsic(const Record *R, StringRef Name, StringRef Proto, TypeSpec OutTS, - TypeSpec InTS, ClassKind CK, ListInit *Body, NeonEmitter &Emitter, - StringRef ArchGuard, StringRef TargetGuard, bool IsUnavailable, - bool BigEndianSafe) + TypeSpec InTS, ClassKind CK, const ListInit *Body, + NeonEmitter &Emitter, StringRef ArchGuard, StringRef TargetGuard, + bool IsUnavailable, bool BigEndianSafe) : R(R), Name(Name.str()), OutTS(OutTS), InTS(InTS), CK(CK), Body(Body), ArchGuard(ArchGuard.str()), TargetGuard(TargetGuard.str()), IsUnavailable(IsUnavailable), BigEndianSafe(BigEndianSafe), @@ -554,19 +554,20 @@ private: DagEmitter(Intrinsic &Intr, StringRef CallPrefix) : Intr(Intr), CallPrefix(CallPrefix) { } - std::pair emitDagArg(Init *Arg, std::string ArgName); - std::pair emitDagSaveTemp(DagInit *DI); - std::pair emitDagSplat(DagInit *DI); - std::pair emitDagDup(DagInit *DI); - std::pair emitDagDupTyped(DagInit *DI); - std::pair emitDagShuffle(DagInit *DI); - std::pair emitDagCast(DagInit *DI, bool IsBitCast); - std::pair emitDagCall(DagInit *DI, + std::pair emitDagArg(const Init *Arg, + std::string ArgName); + std::pair emitDagSaveTemp(const DagInit *DI); + std::pair emitDagSplat(const DagInit *DI); + std::pair emitDagDup(const DagInit *DI); + std::pair emitDagDupTyped(const DagInit *DI); + std::pair emitDagShuffle(const DagInit *DI); + std::pair emitDagCast(const DagInit *DI, bool IsBitCast); + std::pair emitDagCall(const DagInit *DI, bool MatchMangledName); - std::pair emitDagNameReplace(DagInit *DI); - std::pair emitDagLiteral(DagInit *DI); - std::pair emitDagOp(DagInit *DI); - std::pair emitDag(DagInit *DI); + std::pair emitDagNameReplace(const DagInit *DI); + std::pair emitDagLiteral(const DagInit *DI); + std::pair emitDagOp(const DagInit *DI); + std::pair emitDag(const DagInit *DI); }; }; @@ -1410,9 +1411,9 @@ void Intrinsic::emitBody(StringRef CallPrefix) { // We have a list of "things to output". The last should be returned. for (auto *I : Body->getValues()) { - if (StringInit *SI = dyn_cast(I)) { + if (const auto *SI = dyn_cast(I)) { Lines.push_back(replaceParamsIn(SI->getAsString())); - } else if (DagInit *DI = dyn_cast(I)) { + } else if (const auto *DI = dyn_cast(I)) { DagEmitter DE(*this, CallPrefix); Lines.push_back(DE.emitDag(DI).second + ";"); } @@ -1438,9 +1439,9 @@ void Intrinsic::emitReturn() { emitNewLine(); } -std::pair Intrinsic::DagEmitter::emitDag(DagInit *DI) { +std::pair Intrinsic::DagEmitter::emitDag(const DagInit *DI) { // At this point we should only be seeing a def. - DefInit *DefI = cast(DI->getOperator()); + const DefInit *DefI = cast(DI->getOperator()); std::string Op = DefI->getAsString(); if (Op == "cast" || Op == "bitcast") @@ -1467,7 +1468,8 @@ std::pair Intrinsic::DagEmitter::emitDag(DagInit *DI) { return std::make_pair(Type::getVoid(), ""); } -std::pair Intrinsic::DagEmitter::emitDagOp(DagInit *DI) { +std::pair +Intrinsic::DagEmitter::emitDagOp(const DagInit *DI) { std::string Op = cast(DI->getArg(0))->getAsUnquotedString(); if (DI->getNumArgs() == 2) { // Unary op. 
@@ -1486,7 +1488,7 @@ std::pair Intrinsic::DagEmitter::emitDagOp(DagInit *DI) { } std::pair -Intrinsic::DagEmitter::emitDagCall(DagInit *DI, bool MatchMangledName) { +Intrinsic::DagEmitter::emitDagCall(const DagInit *DI, bool MatchMangledName) { std::vector Types; std::vector Values; for (unsigned I = 0; I < DI->getNumArgs() - 1; ++I) { @@ -1498,7 +1500,7 @@ Intrinsic::DagEmitter::emitDagCall(DagInit *DI, bool MatchMangledName) { // Look up the called intrinsic. std::string N; - if (StringInit *SI = dyn_cast(DI->getArg(0))) + if (const auto *SI = dyn_cast(DI->getArg(0))) N = SI->getAsUnquotedString(); else N = emitDagArg(DI->getArg(0), "").second; @@ -1529,8 +1531,8 @@ Intrinsic::DagEmitter::emitDagCall(DagInit *DI, bool MatchMangledName) { return std::make_pair(Callee.getReturnType(), S); } -std::pair Intrinsic::DagEmitter::emitDagCast(DagInit *DI, - bool IsBitCast){ +std::pair +Intrinsic::DagEmitter::emitDagCast(const DagInit *DI, bool IsBitCast) { // (cast MOD* VAL) -> cast VAL to type given by MOD. std::pair R = emitDagArg(DI->getArg(DI->getNumArgs() - 1), @@ -1552,7 +1554,7 @@ std::pair Intrinsic::DagEmitter::emitDagCast(DagInit *DI, castToType = Intr.Variables[std::string(DI->getArgNameStr(ArgIdx))].getType(); } else { - StringInit *SI = dyn_cast(DI->getArg(ArgIdx)); + const auto *SI = dyn_cast(DI->getArg(ArgIdx)); assert_with_loc(SI, "Expected string type or $Name for cast type"); if (SI->getAsUnquotedString() == "R") { @@ -1599,7 +1601,8 @@ std::pair Intrinsic::DagEmitter::emitDagCast(DagInit *DI, return std::make_pair(castToType, S); } -std::pair Intrinsic::DagEmitter::emitDagShuffle(DagInit *DI){ +std::pair +Intrinsic::DagEmitter::emitDagShuffle(const DagInit *DI) { // See the documentation in arm_neon.td for a description of these operators. class LowHalf : public SetTheory::Operator { public: @@ -1710,7 +1713,8 @@ std::pair Intrinsic::DagEmitter::emitDagShuffle(DagInit *DI){ return std::make_pair(T, S); } -std::pair Intrinsic::DagEmitter::emitDagDup(DagInit *DI) { +std::pair +Intrinsic::DagEmitter::emitDagDup(const DagInit *DI) { assert_with_loc(DI->getNumArgs() == 1, "dup() expects one argument"); std::pair A = emitDagArg(DI->getArg(0), std::string(DI->getArgNameStr(0))); @@ -1729,7 +1733,8 @@ std::pair Intrinsic::DagEmitter::emitDagDup(DagInit *DI) { return std::make_pair(T, S); } -std::pair Intrinsic::DagEmitter::emitDagDupTyped(DagInit *DI) { +std::pair +Intrinsic::DagEmitter::emitDagDupTyped(const DagInit *DI) { assert_with_loc(DI->getNumArgs() == 2, "dup_typed() expects two arguments"); std::pair B = emitDagArg(DI->getArg(1), std::string(DI->getArgNameStr(1))); @@ -1737,7 +1742,7 @@ std::pair Intrinsic::DagEmitter::emitDagDupTyped(DagInit *DI) "dup_typed() requires a scalar as the second argument"); Type T; // If the type argument is a constant string, construct the type directly. 
- if (StringInit *SI = dyn_cast(DI->getArg(0))) { + if (const auto *SI = dyn_cast(DI->getArg(0))) { T = Type::fromTypedefName(SI->getAsUnquotedString()); assert_with_loc(!T.isVoid(), "Unknown typedef"); } else @@ -1755,7 +1760,8 @@ std::pair Intrinsic::DagEmitter::emitDagDupTyped(DagInit *DI) return std::make_pair(T, S); } -std::pair Intrinsic::DagEmitter::emitDagSplat(DagInit *DI) { +std::pair +Intrinsic::DagEmitter::emitDagSplat(const DagInit *DI) { assert_with_loc(DI->getNumArgs() == 2, "splat() expects two arguments"); std::pair A = emitDagArg(DI->getArg(0), std::string(DI->getArgNameStr(0))); @@ -1774,7 +1780,8 @@ std::pair Intrinsic::DagEmitter::emitDagSplat(DagInit *DI) { return std::make_pair(Intr.getBaseType(), S); } -std::pair Intrinsic::DagEmitter::emitDagSaveTemp(DagInit *DI) { +std::pair +Intrinsic::DagEmitter::emitDagSaveTemp(const DagInit *DI) { assert_with_loc(DI->getNumArgs() == 2, "save_temp() expects two arguments"); std::pair A = emitDagArg(DI->getArg(1), std::string(DI->getArgNameStr(1))); @@ -1797,7 +1804,7 @@ std::pair Intrinsic::DagEmitter::emitDagSaveTemp(DagInit *DI) } std::pair -Intrinsic::DagEmitter::emitDagNameReplace(DagInit *DI) { +Intrinsic::DagEmitter::emitDagNameReplace(const DagInit *DI) { std::string S = Intr.Name; assert_with_loc(DI->getNumArgs() == 2, "name_replace requires 2 arguments!"); @@ -1812,14 +1819,15 @@ Intrinsic::DagEmitter::emitDagNameReplace(DagInit *DI) { return std::make_pair(Type::getVoid(), S); } -std::pair Intrinsic::DagEmitter::emitDagLiteral(DagInit *DI){ +std::pair +Intrinsic::DagEmitter::emitDagLiteral(const DagInit *DI) { std::string Ty = cast(DI->getArg(0))->getAsUnquotedString(); std::string Value = cast(DI->getArg(1))->getAsUnquotedString(); return std::make_pair(Type::fromTypedefName(Ty), Value); } std::pair -Intrinsic::DagEmitter::emitDagArg(Init *Arg, std::string ArgName) { +Intrinsic::DagEmitter::emitDagArg(const Init *Arg, std::string ArgName) { if (!ArgName.empty()) { assert_with_loc(!Arg->isComplete(), "Arguments must either be DAGs or names, not both!"); @@ -1830,7 +1838,7 @@ Intrinsic::DagEmitter::emitDagArg(Init *Arg, std::string ArgName) { } assert(Arg && "Neither ArgName nor Arg?!"); - DagInit *DI = dyn_cast(Arg); + const auto *DI = dyn_cast(Arg); assert_with_loc(DI, "Arguments must either be DAGs or names!"); return emitDag(DI); @@ -1994,7 +2002,7 @@ void NeonEmitter::createIntrinsic(const Record *R, // decent location information even when highly nested. CurrentRecord = R; - ListInit *Body = OperationRec->getValueAsListInit("Ops"); + const ListInit *Body = OperationRec->getValueAsListInit("Ops"); std::vector TypeSpecs = TypeSpec::fromTypeSpecs(Types); -- GitLab From ffc5b191c840d6f93bc49770ffc9aa3a9ea79d02 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Tue, 15 Oct 2024 15:49:23 -0700 Subject: [PATCH 046/329] [Clang][TableGen] Use const pointers for various Init objects in Diagnostic Emitter (#112318) Use const pointers for various Init objects in Diagnostic Emitter. 
This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- clang/utils/TableGen/ClangDiagnosticsEmitter.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp index 325d63de1563..34e2e8f47ae7 100644 --- a/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp +++ b/clang/utils/TableGen/ClangDiagnosticsEmitter.cpp @@ -83,7 +83,7 @@ getCategoryFromDiagGroup(const Record *Group, static std::string getDiagnosticCategory(const Record *R, DiagGroupParentMap &DiagGroupParents) { // If the diagnostic is in a group, and that group has a category, use it. - if (DefInit *Group = dyn_cast(R->getValueInit("Group"))) { + if (const auto *Group = dyn_cast(R->getValueInit("Group"))) { // Check the diagnostic's diag group for a category. std::string CatName = getCategoryFromDiagGroup(Group->getDef(), DiagGroupParents); @@ -161,7 +161,7 @@ static void groupDiagnostics(ArrayRef Diags, for (unsigned i = 0, e = Diags.size(); i != e; ++i) { const Record *R = Diags[i]; - DefInit *DI = dyn_cast(R->getValueInit("Group")); + const auto *DI = dyn_cast(R->getValueInit("Group")); if (!DI) continue; assert(R->getValueAsDef("Class")->getName() != "CLASS_NOTE" && @@ -359,7 +359,7 @@ void InferPedantic::compute(VecOrSet DiagsInPedantic, const Record *R = Diags[i]; if (isExtension(R) && isOffByDefault(R)) { DiagsSet.insert(R); - if (DefInit *Group = dyn_cast(R->getValueInit("Group"))) { + if (const auto *Group = dyn_cast(R->getValueInit("Group"))) { const Record *GroupRec = Group->getDef(); if (!isSubGroupOfGroup(GroupRec, "pedantic")) { markGroup(GroupRec); @@ -378,13 +378,13 @@ void InferPedantic::compute(VecOrSet DiagsInPedantic, // Check if the group is implicitly in -Wpedantic. If so, // the diagnostic should not be directly included in the -Wpedantic // diagnostic group. - if (DefInit *Group = dyn_cast(R->getValueInit("Group"))) + if (const auto *Group = dyn_cast(R->getValueInit("Group"))) if (groupInPedantic(Group->getDef())) continue; // The diagnostic is not included in a group that is (transitively) in // -Wpedantic. Include it in -Wpedantic directly. - if (RecordVec *V = DiagsInPedantic.dyn_cast()) + if (auto *V = DiagsInPedantic.dyn_cast()) V->push_back(R); else { DiagsInPedantic.get()->insert(R); @@ -413,7 +413,7 @@ void InferPedantic::compute(VecOrSet DiagsInPedantic, if (Parents.size() > 0 && AllParentsInPedantic) continue; - if (RecordVec *V = GroupsInPedantic.dyn_cast()) + if (auto *V = GroupsInPedantic.dyn_cast()) V->push_back(Group); else { GroupsInPedantic.get()->insert(Group); @@ -1443,7 +1443,7 @@ void clang::EmitClangDiagsDefs(const RecordKeeper &Records, raw_ostream &OS, // Check if this is an error that is accidentally in a warning // group. if (isError(R)) { - if (DefInit *Group = dyn_cast(R.getValueInit("Group"))) { + if (const auto *Group = dyn_cast(R.getValueInit("Group"))) { const Record *GroupRec = Group->getDef(); const std::string &GroupName = std::string(GroupRec->getValueAsString("GroupName")); @@ -1478,7 +1478,7 @@ void clang::EmitClangDiagsDefs(const RecordKeeper &Records, raw_ostream &OS, // Warning group associated with the diagnostic. This is stored as an index // into the alphabetically sorted warning group table. 
-  if (DefInit *DI = dyn_cast<DefInit>(R.getValueInit("Group"))) {
+  if (const auto *DI = dyn_cast<DefInit>(R.getValueInit("Group"))) {
     std::map<std::string, GroupInfo>::iterator I = DiagsInGroup.find(
         std::string(DI->getDef()->getValueAsString("GroupName")));
     assert(I != DiagsInGroup.end());
-- 
GitLab


From 08ed19994b2688a9643430b48669a3aef3dd3216 Mon Sep 17 00:00:00 2001
From: Joseph Huber 
Date: Tue, 15 Oct 2024 18:02:08 -0500
Subject: [PATCH 047/329] [libc] Fix incorrect RPC usage in tests

---
 .../startup/gpu/rpc_interface_test.cpp        | 29 ++++++++++++-------
 .../test/integration/startup/gpu/rpc_test.cpp |  6 ++--
 2 files changed, 21 insertions(+), 14 deletions(-)

diff --git a/libc/test/integration/startup/gpu/rpc_interface_test.cpp b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
index 674e2cc1ed74..2dafa911783f 100644
--- a/libc/test/integration/startup/gpu/rpc_interface_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_interface_test.cpp
@@ -18,19 +18,26 @@ using namespace LIBC_NAMESPACE;
 static void test_interface(bool end_with_send) {
   uint64_t cnt = 0;
   rpc::Client::Port port = rpc::client.open();
-  port.send([&](rpc::Buffer *buffer) { buffer->data[0] = end_with_send; });
-  port.send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
-  port.recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
-  port.send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
-  port.recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
-  port.send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
-  port.send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
-  port.recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
-  port.recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
+  port.send(
+      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = end_with_send; });
+  port.send(
+      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
+  port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+  port.send(
+      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
+  port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+  port.send(
+      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
+  port.send(
+      [&](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = cnt = cnt + 1; });
+  port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
+  port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
   if (end_with_send)
-    port.send([&](rpc::Buffer *buffer) { buffer->data[0] = cnt = cnt + 1; });
+    port.send([&](rpc::Buffer *buffer, uint32_t) {
+      buffer->data[0] = cnt = cnt + 1;
+    });
   else
-    port.recv([&](rpc::Buffer *buffer) { cnt = buffer->data[0]; });
+    port.recv([&](rpc::Buffer *buffer, uint32_t) { cnt = buffer->data[0]; });
   port.close();
 
   ASSERT_TRUE(cnt == 9 && "Invalid number of increments");
diff --git a/libc/test/integration/startup/gpu/rpc_test.cpp b/libc/test/integration/startup/gpu/rpc_test.cpp
index 4032d890c53e..bec8171180a0 100644
--- a/libc/test/integration/startup/gpu/rpc_test.cpp
+++ b/libc/test/integration/startup/gpu/rpc_test.cpp
@@ -20,10 +20,10 @@ static void test_add_simple() {
   for (uint32_t i = 0; i < num_additions; ++i) {
     rpc::Client::Port port = rpc::client.open();
     port.send_and_recv(
-        [=](rpc::Buffer *buffer) {
+        [=](rpc::Buffer *buffer, uint32_t) {
          reinterpret_cast<uint64_t *>(buffer->data)[0] = cnt;
        },
-        [&](rpc::Buffer *buffer) {
+        [&](rpc::Buffer *buffer, uint32_t) {
          cnt = reinterpret_cast<uint64_t *>(buffer->data)[0];
        });
    port.close();
@@ -34,7
+34,7 @@ static void test_add_simple() {

 // Test to ensure that the RPC mechanism doesn't hang on divergence.
 static void test_noop(uint8_t data) {
   rpc::Client::Port port = rpc::client.open();
-  port.send([=](rpc::Buffer *buffer) { buffer->data[0] = data; });
+  port.send([=](rpc::Buffer *buffer, uint32_t) { buffer->data[0] = data; });
   port.close();
 }
--
GitLab

From cc13d4fb4a208363ed1dab29829cd200a3e39c52 Mon Sep 17 00:00:00 2001
From: Jonas Devlieghere
Date: Tue, 15 Oct 2024 16:07:17 -0700
Subject: [PATCH 048/329] [lldb] Make the system log a NOOP on non-Darwin
 platforms

As discussed in #111911, we have consensus that as it stands, the system
log is only meaningful on Darwin and that by default it should be a NOOP
on other platforms.

---
 lldb/source/Host/common/Host.cpp | 26 ++++----------------------
 1 file changed, 4 insertions(+), 22 deletions(-)

diff --git a/lldb/source/Host/common/Host.cpp b/lldb/source/Host/common/Host.cpp
index 6857a29b0399..03ea2f242d3c 100644
--- a/lldb/source/Host/common/Host.cpp
+++ b/lldb/source/Host/common/Host.cpp
@@ -90,30 +90,12 @@ using namespace lldb;
 using namespace lldb_private;

 #if !defined(__APPLE__)
-#if !defined(_WIN32)
-#include <syslog.h>
-void Host::SystemLog(Severity severity, llvm::StringRef message) {
-  static llvm::once_flag g_openlog_once;
-  llvm::call_once(g_openlog_once,
-                  [] { openlog("lldb", LOG_PID | LOG_NDELAY, LOG_USER); });
-  int level = LOG_DEBUG;
-  switch (severity) {
-  case lldb::eSeverityInfo:
-    level = LOG_INFO;
-    break;
-  case lldb::eSeverityWarning:
-    level = LOG_WARNING;
-    break;
-  case lldb::eSeverityError:
-    level = LOG_ERR;
-    break;
-  }
-  syslog(level, "%s", message.data());
-}
-#else
+// The system log is currently only meaningful on Darwin, where this means
+// os_log. The meaning of a "system log" isn't as clear on other platforms, and
+// therefore we don't provide a default implementation. Vendors are free to
+// implement this function if they have a use for it.
 void Host::SystemLog(Severity severity, llvm::StringRef message) {}
 #endif
-#endif

 static constexpr Log::Category g_categories[] = {
     {{"system"}, {"system log"}, SystemLog::System}};
--
GitLab

From 1b6a46ab8ee79be5c278fe60fa3ad65790cb1dfe Mon Sep 17 00:00:00 2001
From: Tyler Kenney
Date: Tue, 15 Oct 2024 19:57:58 -0400
Subject: [PATCH 049/329] [ORC][COFF] Remove the `ExecutionSession&` argument
 to `COFFPlatform` factory & constructor (#112419)

We can get a reference to the `ExecutionSession` from the
`ObjectLinkingLayer` argument, so there's no need to pass it in
separately. This mirrors recent changes to `ElfNixPlatform` and
`MachOPlatform` by @lhames in
https://github.com/llvm/llvm-project/commit/3dba4ca155e0b460ca82917b25d3624eb5825940
and
https://github.com/llvm/llvm-project/commit/cc20dd285ab72292a1d383d0779aecbe5e1ccf81.

---
 .../llvm/ExecutionEngine/Orc/COFFPlatform.h   | 13 ++++-----
 llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp | 27 ++++++++++---------
 llvm/lib/ExecutionEngine/Orc/LLJIT.cpp        |  2 +-
 llvm/tools/llvm-jitlink/llvm-jitlink.cpp      |  6 ++---
 4 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/Orc/COFFPlatform.h b/llvm/include/llvm/ExecutionEngine/Orc/COFFPlatform.h
index 6bbc9b211333..f44b6b3860fc 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/COFFPlatform.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/COFFPlatform.h
@@ -40,18 +40,16 @@ public:
   /// Try to create a COFFPlatform instance, adding the ORC runtime to the
   /// given JITDylib.
static Expected> - Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, + Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, std::unique_ptr OrcRuntimeArchiveBuffer, LoadDynamicLibrary LoadDynLibrary, bool StaticVCRuntime = false, const char *VCRuntimePath = nullptr, std::optional RuntimeAliases = std::nullopt); static Expected> - Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, const char *OrcRuntimePath, - LoadDynamicLibrary LoadDynLibrary, bool StaticVCRuntime = false, - const char *VCRuntimePath = nullptr, + Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, + const char *OrcRuntimePath, LoadDynamicLibrary LoadDynLibrary, + bool StaticVCRuntime = false, const char *VCRuntimePath = nullptr, std::optional RuntimeAliases = std::nullopt); ExecutionSession &getExecutionSession() const { return ES; } @@ -138,8 +136,7 @@ private: static bool supportedTarget(const Triple &TT); COFFPlatform( - ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, + ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, std::unique_ptr OrcRuntimeGenerator, std::unique_ptr OrcRuntimeArchiveBuffer, std::unique_ptr OrcRuntimeArchive, diff --git a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp index cdfcae86f79c..f46cb906bb75 100644 --- a/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp +++ b/llvm/lib/ExecutionEngine/Orc/COFFPlatform.cpp @@ -159,11 +159,14 @@ private: namespace llvm { namespace orc { -Expected> COFFPlatform::Create( - ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, std::unique_ptr OrcRuntimeArchiveBuffer, - LoadDynamicLibrary LoadDynLibrary, bool StaticVCRuntime, - const char *VCRuntimePath, std::optional RuntimeAliases) { +Expected> +COFFPlatform::Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, + std::unique_ptr OrcRuntimeArchiveBuffer, + LoadDynamicLibrary LoadDynLibrary, bool StaticVCRuntime, + const char *VCRuntimePath, + std::optional RuntimeAliases) { + + auto &ES = ObjLinkingLayer.getExecutionSession(); // If the target is not supported then bail out immediately. if (!supportedTarget(ES.getTargetTriple())) @@ -214,7 +217,7 @@ Expected> COFFPlatform::Create( // Create the instance. 
Error Err = Error::success(); auto P = std::unique_ptr(new COFFPlatform( - ES, ObjLinkingLayer, PlatformJD, std::move(*OrcRuntimeArchiveGenerator), + ObjLinkingLayer, PlatformJD, std::move(*OrcRuntimeArchiveGenerator), std::move(OrcRuntimeArchiveBuffer), std::move(RuntimeArchive), std::move(LoadDynLibrary), StaticVCRuntime, VCRuntimePath, Err)); if (Err) @@ -223,8 +226,8 @@ Expected> COFFPlatform::Create( } Expected> -COFFPlatform::Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, const char *OrcRuntimePath, +COFFPlatform::Create(ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, + const char *OrcRuntimePath, LoadDynamicLibrary LoadDynLibrary, bool StaticVCRuntime, const char *VCRuntimePath, std::optional RuntimeAliases) { @@ -233,7 +236,7 @@ COFFPlatform::Create(ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, if (!ArchiveBuffer) return createFileError(OrcRuntimePath, ArchiveBuffer.getError()); - return Create(ES, ObjLinkingLayer, PlatformJD, std::move(*ArchiveBuffer), + return Create(ObjLinkingLayer, PlatformJD, std::move(*ArchiveBuffer), std::move(LoadDynLibrary), StaticVCRuntime, VCRuntimePath, std::move(RuntimeAliases)); } @@ -382,14 +385,14 @@ bool COFFPlatform::supportedTarget(const Triple &TT) { } COFFPlatform::COFFPlatform( - ExecutionSession &ES, ObjectLinkingLayer &ObjLinkingLayer, - JITDylib &PlatformJD, + ObjectLinkingLayer &ObjLinkingLayer, JITDylib &PlatformJD, std::unique_ptr OrcRuntimeGenerator, std::unique_ptr OrcRuntimeArchiveBuffer, std::unique_ptr OrcRuntimeArchive, LoadDynamicLibrary LoadDynLibrary, bool StaticVCRuntime, const char *VCRuntimePath, Error &Err) - : ES(ES), ObjLinkingLayer(ObjLinkingLayer), + : ES(ObjLinkingLayer.getExecutionSession()), + ObjLinkingLayer(ObjLinkingLayer), LoadDynLibrary(std::move(LoadDynLibrary)), OrcRuntimeArchiveBuffer(std::move(OrcRuntimeArchiveBuffer)), OrcRuntimeArchive(std::move(OrcRuntimeArchive)), diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp index d3db89a2c3e9..7e3f58c0059c 100644 --- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp +++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp @@ -1172,7 +1172,7 @@ Expected ExecutorNativePlatform::operator()(LLJIT &J) { StaticVCRuntime = VCRuntime->second; } if (auto P = COFFPlatform::Create( - ES, *ObjLinkingLayer, PlatformJD, std::move(RuntimeArchiveBuffer), + *ObjLinkingLayer, PlatformJD, std::move(RuntimeArchiveBuffer), LoadAndLinkDynLibrary(J), StaticVCRuntime, VCRuntimePath)) J.getExecutionSession().setPlatform(std::move(*P)); else diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp index 108cadd2e016..2d1c19d2a519 100644 --- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp +++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp @@ -1057,9 +1057,9 @@ Session::Session(std::unique_ptr EPC, Error &Err) return loadAndLinkDynamicLibrary(JD, DLLName); }; - if (auto P = COFFPlatform::Create(ES, ObjLayer, *PlatformJD, - OrcRuntime.c_str(), - std::move(LoadDynLibrary))) + if (auto P = + COFFPlatform::Create(ObjLayer, *PlatformJD, OrcRuntime.c_str(), + std::move(LoadDynLibrary))) ES.setPlatform(std::move(*P)); else { Err = P.takeError(); -- GitLab From ed0fd13783a68af6033b2c489eb830af0726856c Mon Sep 17 00:00:00 2001 From: Christopher Ferris Date: Tue, 15 Oct 2024 17:14:50 -0700 Subject: [PATCH 050/329] [scudo] Double frees result in chunk state error (#110345) Fixes bug where a device that supports tagged pointers doesn't use the tagged pointer when computing the checksum. 
Add tests to verify that double frees result in a chunk state error, not
corrupted-header errors.

---
 compiler-rt/lib/scudo/standalone/combined.h   | 22 ++++++++++++-------
 .../scudo/standalone/tests/combined_test.cpp  | 21 ++++++++++++++++++
 2 files changed, 35 insertions(+), 8 deletions(-)

diff --git a/compiler-rt/lib/scudo/standalone/combined.h b/compiler-rt/lib/scudo/standalone/combined.h
index 323a8b9d76c9..5deb8c97f1c8 100644
--- a/compiler-rt/lib/scudo/standalone/combined.h
+++ b/compiler-rt/lib/scudo/standalone/combined.h
@@ -1255,22 +1255,26 @@ private:
     else
       Header->State = Chunk::State::Quarantined;

-    void *BlockBegin;
-    if (LIKELY(!useMemoryTagging<Config>(Options))) {
+    if (LIKELY(!useMemoryTagging<Config>(Options)))
       Header->OriginOrWasZeroed = 0U;
-      if (BypassQuarantine && allocatorSupportsMemoryTagging<Config>())
-        Ptr = untagPointer(Ptr);
-      BlockBegin = getBlockBegin(Ptr, Header);
-    } else {
+    else {
       Header->OriginOrWasZeroed =
           Header->ClassId && !TSDRegistry.getDisableMemInit();
-      BlockBegin =
-          retagBlock(Options, TaggedPtr, Ptr, Header, Size, BypassQuarantine);
     }

     Chunk::storeHeader(Cookie, Ptr, Header);

     if (BypassQuarantine) {
+      void *BlockBegin;
+      if (LIKELY(!useMemoryTagging<Config>(Options))) {
+        // Must do this after storeHeader because loadHeader uses a tagged ptr.
+        if (allocatorSupportsMemoryTagging<Config>())
+          Ptr = untagPointer(Ptr);
+        BlockBegin = getBlockBegin(Ptr, Header);
+      } else {
+        BlockBegin = retagBlock(Options, TaggedPtr, Ptr, Header, Size, true);
+      }
+
       const uptr ClassId = Header->ClassId;
       if (LIKELY(ClassId)) {
         bool CacheDrained;
@@ -1288,6 +1292,8 @@ private:
         Secondary.deallocate(Options, BlockBegin);
       }
     } else {
+      if (UNLIKELY(useMemoryTagging<Config>(Options)))
+        retagBlock(Options, TaggedPtr, Ptr, Header, Size, false);
       typename TSDRegistryT::ScopedTSD TSD(TSDRegistry);
       Quarantine.put(&TSD->getQuarantineCache(),
                      QuarantineCallback(*this, TSD->getCache()), Ptr, Size);
diff --git a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
index 16b19e807e11..ff98eb3397ee 100644
--- a/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
+++ b/compiler-rt/lib/scudo/standalone/tests/combined_test.cpp
@@ -534,6 +534,27 @@ SCUDO_TYPED_TEST(ScudoCombinedDeathTest, UseAfterFree) {
   }
 }

+SCUDO_TYPED_TEST(ScudoCombinedDeathTest, DoubleFreeFromPrimary) {
+  auto *Allocator = this->Allocator.get();
+
+  for (scudo::uptr SizeLog = 0U; SizeLog <= 20U; SizeLog++) {
+    const scudo::uptr Size = 1U << SizeLog;
+    if (!isPrimaryAllocation<TestAllocator<TypeParam>>(Size, 0))
+      break;
+
+    // Verify that a double free results in a chunk state error.
+    EXPECT_DEATH(
+        {
+          // Allocate from primary
+          void *P = Allocator->allocate(Size, Origin);
+          ASSERT_TRUE(P != nullptr);
+          Allocator->deallocate(P, Origin);
+          Allocator->deallocate(P, Origin);
+        },
+        "invalid chunk state");
+  }
+}
+
 SCUDO_TYPED_TEST(ScudoCombinedDeathTest, DisableMemoryTagging) {
   auto *Allocator = this->Allocator.get();
--
GitLab

From eca3206d29e7ce97dd6336deaa3da96be37f8277 Mon Sep 17 00:00:00 2001
From: Muhammad Omair Javaid
Date: Wed, 16 Oct 2024 05:30:50 +0500
Subject: [PATCH 051/329] [lldb] Fix command-expr-diagnostics.test for Windows
 (#112109)

This adds a minor change to command-expr-diagnostics.test to make it
pass on Windows. Clang produces PDB debug info on Windows by default,
which caused the main symbol to be ignored due to optimization.

The problem is fixed by adding -gdwarf to the command line, making sure
DWARF debug info gets generated on both Windows and Linux.
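As a rough illustration of the idea (this sketch is not part of the
patch; the file names are hypothetical):

    # Default on a Windows host: clang emits CodeView/PDB debug info.
    clang test.c -g -o test.exe
    # With -gdwarf, DWARF debug info is emitted instead, so the same
    # lit RUN line behaves identically on Windows and Linux.
    clang test.c -gdwarf -o test.exe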
---
 lldb/test/Shell/Commands/command-expr-diagnostics.test | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lldb/test/Shell/Commands/command-expr-diagnostics.test b/lldb/test/Shell/Commands/command-expr-diagnostics.test
index b242dba1980f..3dab204ca87d 100644
--- a/lldb/test/Shell/Commands/command-expr-diagnostics.test
+++ b/lldb/test/Shell/Commands/command-expr-diagnostics.test
@@ -17,7 +17,7 @@
 # CHECK3: {{^ error: use of undeclared identifier 'a'}}

 # RUN: echo "int main(){return 0;}">%t.c
-# RUN: %clang_host %t.c -o %t.exe
+# RUN: %clang_host %t.c -gdwarf -o %t.exe
 # RUN: echo quit | %lldb %t.exe -o "b main" -o r -o \
 # RUN: "expr --top-level -- template <class T> T FOO(T x) { return x/2;}" -o \
 # RUN: "expression -- FOO(\"\")" 2>&1 | FileCheck %s --check-prefix=CHECK4
--
GitLab

From 69f7758ddba662b63667507f2c472c008909dd7e Mon Sep 17 00:00:00 2001
From: Muhammad Omair Javaid
Date: Wed, 16 Oct 2024 06:05:30 +0500
Subject: [PATCH 052/329] Revert "[lldb] Fix command-expr-diagnostics.test for
 Windows (#112109)"

This reverts commit eca3206d29e7ce97dd6336deaa3da96be37f8277.

This broke the LLDB Linux bot for no apparent reason. I'll post a more
suitable fix later. Disabled command-expr-diagnostics.test on Windows
for now.

---
 lldb/test/Shell/Commands/command-expr-diagnostics.test | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/lldb/test/Shell/Commands/command-expr-diagnostics.test b/lldb/test/Shell/Commands/command-expr-diagnostics.test
index 3dab204ca87d..72df47bbbdc1 100644
--- a/lldb/test/Shell/Commands/command-expr-diagnostics.test
+++ b/lldb/test/Shell/Commands/command-expr-diagnostics.test
@@ -1,3 +1,4 @@
+# XFAIL: system-windows
 # RUN: echo quit | %lldb -o "expression a+b" \
 # RUN: | FileCheck %s --strict-whitespace --check-prefix=CHECK1
 # (lldb) expression a+b
@@ -17,7 +18,7 @@
 # CHECK3: {{^ error: use of undeclared identifier 'a'}}

 # RUN: echo "int main(){return 0;}">%t.c
-# RUN: %clang_host %t.c -gdwarf -o %t.exe
+# RUN: %clang_host %t.c -o %t.exe
 # RUN: echo quit | %lldb %t.exe -o "b main" -o r -o \
 # RUN: "expr --top-level -- template <class T> T FOO(T x) { return x/2;}" -o \
 # RUN: "expression -- FOO(\"\")" 2>&1 | FileCheck %s --check-prefix=CHECK4
--
GitLab

From 210140ab6ac8126e5beec65209602cd55c4d09c7 Mon Sep 17 00:00:00 2001
From: Xing Guo
Date: Wed, 16 Oct 2024 09:42:40 +0800
Subject: [PATCH 053/329] [JITLink] Add support for R_X86_64_SIZE* relocations.
 (#110081)

This patch adds support for the R_X86_64_SIZE32/R_X86_64_SIZE64
relocation types by introducing edge kinds x86_64::Size32/x86_64::Size64.

The calculation for these relocations is Z + A, where:

Z - Represents the size of the symbol whose index resides in the
relocation entry.

A - Represents the addend used to compute the value of the relocation
field.
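To make the Z + A computation concrete, here is a small sketch (the
symbol, size, and addend are made-up values for illustration; the
actual fixup logic is in the applyFixup change below):

    #include <cstdint>
    // Suppose a symbol "main" has size Z = 24 bytes, and the relocation
    // referencing it carries addend A = 2.
    uint64_t Z = 24;        // size of the symbol named by the relocation
    int64_t A = 2;          // addend from the relocation entry
    uint64_t Value = Z + A; // 26 is written at the fixup location
    // For Size32 the result must additionally satisfy isUInt<32>(Value),
    // otherwise an out-of-range error is returned.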
Ref: [System V Application Binary Interface x86-64](https://gitlab.com/x86-psABIs/x86-64-ABI/-/jobs/artifacts/master/raw/x86-64-ABI/abi.pdf?job=build) --- .../llvm/ExecutionEngine/JITLink/x86_64.h | 34 +++++++++++++++++++ .../ExecutionEngine/JITLink/ELF_x86_64.cpp | 6 ++++ llvm/lib/ExecutionEngine/JITLink/x86_64.cpp | 4 +++ .../JITLink/x86-64/ELF_R_X86_64_SIZE.s | 27 +++++++++++++++ 4 files changed, 71 insertions(+) create mode 100644 llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_SIZE.s diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h index 24cf982fc3ab..0d7e0fdb5820 100644 --- a/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h +++ b/llvm/include/llvm/ExecutionEngine/JITLink/x86_64.h @@ -142,6 +142,24 @@ enum EdgeKind_x86_64 : Edge::Kind { /// an out-of-range error will be returned. NegDelta32, + /// A 64-bit size relocation. + /// + /// Fixup expression: + /// Fixup <- Size + Addend : uint64 + /// + Size64, + + /// A 32-bit size relocation. + /// + /// Fixup expression: + /// Fixup <- Size + Addend : uint32 + /// + /// Errors: + /// - The result of the fixup expression must fit into an uint32, otherwise + /// an out-of-range error will be returned. + /// + Size32, + /// A 64-bit GOT delta. /// /// Delta from the global offset table to the target @@ -531,6 +549,22 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, return makeTargetOutOfRangeError(G, B, E); break; } + + case Size64: { + uint64_t Value = E.getTarget().getSize() + E.getAddend(); + *(ulittle64_t *)FixupPtr = Value; + break; + } + + case Size32: { + uint64_t Value = E.getTarget().getSize() + E.getAddend(); + if (LLVM_LIKELY(isUInt<32>(Value))) + *(ulittle32_t *)FixupPtr = Value; + else + return makeTargetOutOfRangeError(G, B, E); + break; + } + case Delta64FromGOT: { assert(GOTSymbol && "No GOT section symbol"); int64_t Value = diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp index 6a32ccc37765..44122726fb5c 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_x86_64.cpp @@ -182,6 +182,12 @@ private: case ELF::R_X86_64_64: Kind = x86_64::Pointer64; break; + case ELF::R_X86_64_SIZE32: + Kind = x86_64::Size32; + break; + case ELF::R_X86_64_SIZE64: + Kind = x86_64::Size64; + break; case ELF::R_X86_64_GOTPCREL: Kind = x86_64::RequestGOTAndTransformToDelta32; break; diff --git a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp index cca4358a3776..e5b48d2c3fab 100644 --- a/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/x86_64.cpp @@ -42,6 +42,10 @@ const char *getEdgeKindName(Edge::Kind K) { return "NegDelta64"; case NegDelta32: return "NegDelta32"; + case Size64: + return "Size64"; + case Size32: + return "Size32"; case Delta64FromGOT: return "Delta64FromGOT"; case PCRel32: diff --git a/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_SIZE.s b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_SIZE.s new file mode 100644 index 000000000000..abde122f76e2 --- /dev/null +++ b/llvm/test/ExecutionEngine/JITLink/x86-64/ELF_R_X86_64_SIZE.s @@ -0,0 +1,27 @@ +# Checks that JITLink is able to handle R_X86_64_SIZE32/R_X86_64_SIZE64 relocations. 
+# RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent \ +# RUN: -filetype=obj -o %t.1.o %s +# RUN: llvm-jitlink -noexec %t.1.o + +# Checks that JITLink emits an error message when the fixup cannot fit into a 32-bit value. +# RUN: llvm-mc -triple=x86_64-unknown-linux -position-independent --defsym=OVERFLOW=1 \ +# RUN: -filetype=obj -o %t.2.o %s +# RUN: not llvm-jitlink -noexec %t.2.o 2>&1 | FileCheck %s +# CHECK: llvm-jitlink error: In graph {{.*}}, section .text: relocation target "main" at address {{.*}} is out of range of Size32 fixup at {{.*}} (main, {{.*}}) + + .text + .globl main + .type main,@function +main: + xorl %eax, %eax + movq main@SIZE + 2, %rbx # Generate R_X86_64_SIZE32 relocation. +.ifndef OVERFLOW + movl main@SIZE + 1, %ebx # Generate R_X86_64_SIZE32 relocation. +.else + movl main@SIZE - 32, %ebx # Generate R_X86_64_SIZE32 relocation whose fixup overflows. +.endif + retq + .size main, .-main + + .data + .quad main@SIZE + 1 # Generate R_X86_64_SIZE64 relocation. -- GitLab From 6d13cc9411b998aabf1a55e0813236ba7a278929 Mon Sep 17 00:00:00 2001 From: Finn Plummer <50529406+inbelic@users.noreply.github.com> Date: Tue, 15 Oct 2024 18:49:40 -0700 Subject: [PATCH 054/329] [HLSL] Implement `WaveReadLaneAt` intrinsic (#111010) - create a clang built-in in Builtins.td - add semantic checking in SemaHLSL.cpp - link the WaveReadLaneAt api in hlsl_intrinsics.h - add lowering to spirv backend op GroupNonUniformShuffle with Scope = 2 (Group) in SPIRVInstructionSelector.cpp - add WaveReadLaneAt intrinsic to IntrinsicsDirectX.td and mapping to DXIL.td - add tests for HLSL intrinsic lowering to spirv intrinsic in WaveReadLaneAt.hlsl - add tests for sema checks in WaveReadLaneAt-errors.hlsl - add spir-v backend tests in WaveReadLaneAt.ll - add test to show scalar dxil lowering functionality - note that this doesn't include support for the scalarizer to handle WaveReadLaneAt will be added in a future pr This is the first part #70104 --- clang/include/clang/Basic/Builtins.td | 6 ++ .../clang/Basic/DiagnosticSemaKinds.td | 2 + clang/lib/CodeGen/CGBuiltin.cpp | 18 +++++ clang/lib/CodeGen/CGHLSLRuntime.h | 1 + clang/lib/Headers/hlsl/hlsl_intrinsics.h | 80 +++++++++++++++++++ clang/lib/Sema/SemaHLSL.cpp | 39 +++++++++ .../CodeGenHLSL/builtins/WaveReadLaneAt.hlsl | 74 +++++++++++++++++ .../BuiltIns/WaveReadLaneAt-errors.hlsl | 38 +++++++++ llvm/include/llvm/IR/IntrinsicsDirectX.td | 1 + llvm/include/llvm/IR/IntrinsicsSPIRV.td | 1 + llvm/lib/Target/DirectX/DXIL.td | 10 +++ .../Target/SPIRV/SPIRVInstructionSelector.cpp | 26 ++++++ llvm/test/CodeGen/DirectX/WaveReadLaneAt.ll | 61 ++++++++++++++ .../SPIRV/hlsl-intrinsics/WaveReadLaneAt.ll | 56 +++++++++++++ 14 files changed, 413 insertions(+) create mode 100644 clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl create mode 100644 clang/test/SemaHLSL/BuiltIns/WaveReadLaneAt-errors.hlsl create mode 100644 llvm/test/CodeGen/DirectX/WaveReadLaneAt.ll create mode 100644 llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveReadLaneAt.ll diff --git a/clang/include/clang/Basic/Builtins.td b/clang/include/clang/Basic/Builtins.td index bda8a48be92b..382fb6b7a3c0 100644 --- a/clang/include/clang/Basic/Builtins.td +++ b/clang/include/clang/Basic/Builtins.td @@ -4761,6 +4761,12 @@ def HLSLWaveIsFirstLane : LangBuiltin<"HLSL_LANG"> { let Prototype = "bool()"; } +def HLSLWaveReadLaneAt : LangBuiltin<"HLSL_LANG"> { + let Spellings = ["__builtin_hlsl_wave_read_lane_at"]; + let Attributes = [NoThrow, Const]; + let Prototype = "void(...)"; +} + def HLSLClamp : 
LangBuiltin<"HLSL_LANG"> { let Spellings = ["__builtin_hlsl_elementwise_clamp"]; let Attributes = [NoThrow, Const]; diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index e9111394bcd3..c458a62d9be4 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -9230,6 +9230,8 @@ def err_typecheck_cond_incompatible_operands : Error< def err_typecheck_expect_scalar_or_vector : Error< "invalid operand of type %0 where %1 or " "a vector of such type is required">; +def err_typecheck_expect_any_scalar_or_vector : Error< + "invalid operand of type %0 where a scalar or vector is required">; def err_typecheck_expect_flt_or_vector : Error< "invalid operand of type %0 where floating, complex or " "a vector of such types is required">; diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 157e743a39bf..12f99d9f1178 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -18905,6 +18905,24 @@ case Builtin::BI__builtin_hlsl_elementwise_isinf: { return EmitRuntimeCall( Intrinsic::getOrInsertDeclaration(&CGM.getModule(), ID)); } + case Builtin::BI__builtin_hlsl_wave_read_lane_at: { + // Due to the use of variadic arguments we must explicitly retreive them and + // create our function type. + Value *OpExpr = EmitScalarExpr(E->getArg(0)); + Value *OpIndex = EmitScalarExpr(E->getArg(1)); + llvm::FunctionType *FT = llvm::FunctionType::get( + OpExpr->getType(), ArrayRef{OpExpr->getType(), OpIndex->getType()}, + false); + + // Get overloaded name + std::string Name = + Intrinsic::getName(CGM.getHLSLRuntime().getWaveReadLaneAtIntrinsic(), + ArrayRef{OpExpr->getType()}, &CGM.getModule()); + return EmitRuntimeCall(CGM.CreateRuntimeFunction(FT, Name, {}, + /*Local=*/false, + /*AssumeConvergent=*/true), + ArrayRef{OpExpr, OpIndex}, "hlsl.wave.readlane"); + } case Builtin::BI__builtin_hlsl_elementwise_sign: { auto *Arg0 = E->getArg(0); Value *Op0 = EmitScalarExpr(Arg0); diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index 282fa44af212..f7621ee20b12 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -90,6 +90,7 @@ public: GENERATE_HLSL_INTRINSIC_FUNCTION(SDot, sdot) GENERATE_HLSL_INTRINSIC_FUNCTION(UDot, udot) GENERATE_HLSL_INTRINSIC_FUNCTION(WaveIsFirstLane, wave_is_first_lane) + GENERATE_HLSL_INTRINSIC_FUNCTION(WaveReadLaneAt, wave_readlane) //===----------------------------------------------------------------------===// // End of reserved area for HLSL intrinsic getters. diff --git a/clang/lib/Headers/hlsl/hlsl_intrinsics.h b/clang/lib/Headers/hlsl/hlsl_intrinsics.h index 137467e5a782..30dce60b3ff7 100644 --- a/clang/lib/Headers/hlsl/hlsl_intrinsics.h +++ b/clang/lib/Headers/hlsl/hlsl_intrinsics.h @@ -2097,6 +2097,86 @@ _HLSL_AVAILABILITY(shadermodel, 6.0) _HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_is_first_lane) __attribute__((convergent)) bool WaveIsFirstLane(); +//===----------------------------------------------------------------------===// +// WaveReadLaneAt builtins +//===----------------------------------------------------------------------===// + +// \brief Returns the value of the expression for the given lane index within +// the specified wave. 
+ +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) bool WaveReadLaneAt(bool, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) bool2 WaveReadLaneAt(bool2, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) bool3 WaveReadLaneAt(bool3, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) bool4 WaveReadLaneAt(bool4, int32_t); + +#ifdef __HLSL_ENABLE_16_BIT +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int16_t WaveReadLaneAt(int16_t, int32_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int16_t2 WaveReadLaneAt(int16_t2, int32_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int16_t3 WaveReadLaneAt(int16_t3, int32_t); +_HLSL_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int16_t4 WaveReadLaneAt(int16_t4, int32_t); +#endif + +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) half WaveReadLaneAt(half, int32_t); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) half2 WaveReadLaneAt(half2, int32_t); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) half3 WaveReadLaneAt(half3, int32_t); +_HLSL_16BIT_AVAILABILITY(shadermodel, 6.0) +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) half4 WaveReadLaneAt(half4, int32_t); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int WaveReadLaneAt(int, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int2 WaveReadLaneAt(int2, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int3 WaveReadLaneAt(int3, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int4 WaveReadLaneAt(int4, int32_t); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) float WaveReadLaneAt(float, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) float2 WaveReadLaneAt(float2, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) float3 WaveReadLaneAt(float3, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) float4 WaveReadLaneAt(float4, int32_t); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int64_t WaveReadLaneAt(int64_t, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int64_t2 WaveReadLaneAt(int64_t2, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int64_t3 WaveReadLaneAt(int64_t3, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) int64_t4 WaveReadLaneAt(int64_t4, int32_t); + +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) double WaveReadLaneAt(double, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) 
double2 WaveReadLaneAt(double2, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) double3 WaveReadLaneAt(double3, int32_t); +_HLSL_BUILTIN_ALIAS(__builtin_hlsl_wave_read_lane_at) +__attribute__((convergent)) double4 WaveReadLaneAt(double4, int32_t); + //===----------------------------------------------------------------------===// // sign builtins //===----------------------------------------------------------------------===// diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 137b15c8fcfe..698fdbed0484 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1751,6 +1751,22 @@ static bool CheckScalarOrVector(Sema *S, CallExpr *TheCall, QualType Scalar, return false; } +static bool CheckAnyScalarOrVector(Sema *S, CallExpr *TheCall, + unsigned ArgIndex) { + assert(TheCall->getNumArgs() >= ArgIndex); + QualType ArgType = TheCall->getArg(ArgIndex)->getType(); + auto *VTy = ArgType->getAs(); + // not the scalar or vector + if (!(ArgType->isScalarType() || + (VTy && VTy->getElementType()->isScalarType()))) { + S->Diag(TheCall->getArg(0)->getBeginLoc(), + diag::err_typecheck_expect_any_scalar_or_vector) + << ArgType; + return true; + } + return false; +} + static bool CheckBoolSelect(Sema *S, CallExpr *TheCall) { assert(TheCall->getNumArgs() == 3); Expr *Arg1 = TheCall->getArg(1); @@ -1993,6 +2009,29 @@ bool SemaHLSL::CheckBuiltinFunctionCall(unsigned BuiltinID, CallExpr *TheCall) { return true; break; } + case Builtin::BI__builtin_hlsl_wave_read_lane_at: { + if (SemaRef.checkArgCount(TheCall, 2)) + return true; + + // Ensure index parameter type can be interpreted as a uint + ExprResult Index = TheCall->getArg(1); + QualType ArgTyIndex = Index.get()->getType(); + if (!ArgTyIndex->isIntegerType()) { + SemaRef.Diag(TheCall->getArg(1)->getBeginLoc(), + diag::err_typecheck_convert_incompatible) + << ArgTyIndex << SemaRef.Context.UnsignedIntTy << 1 << 0 << 0; + return true; + } + + // Ensure input expr type is a scalar/vector and the same as the return type + if (CheckAnyScalarOrVector(&SemaRef, TheCall, 0)) + return true; + + ExprResult Expr = TheCall->getArg(0); + QualType ArgTyExpr = Expr.get()->getType(); + TheCall->setType(ArgTyExpr); + break; + } case Builtin::BI__builtin_hlsl_wave_get_lane_index: { if (SemaRef.checkArgCount(TheCall, 0)) return true; diff --git a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl new file mode 100644 index 000000000000..03e149d0a9f2 --- /dev/null +++ b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl @@ -0,0 +1,74 @@ +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -triple \ +// RUN: dxil-pc-shadermodel6.3-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-DXIL +// RUN: %clang_cc1 -std=hlsl2021 -finclude-default-header -fnative-half-type -triple \ +// RUN: spirv-pc-vulkan-compute %s -emit-llvm -disable-llvm-passes -o - | \ +// RUN: FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV + +// Test basic lowering to runtime function call for int values. 
+ +// CHECK-LABEL: test_int +int test_int(int expr, uint idx) { + // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok0]]) ] + // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i32([[TY]] %[[#]], i32 %[[#]]) + // CHECK: ret [[TY]] %[[RET]] + return WaveReadLaneAt(expr, idx); +} + +// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.i32([[TY]], i32) #[[#attr:]] +// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.i32([[TY]], i32) #[[#attr:]] + +#ifdef __HLSL_ENABLE_16_BIT +// CHECK-LABEL: test_int16 +int16_t test_int16(int16_t expr, uint idx) { + // CHECK-SPIRV: %[[#entry_tok1:]] = call token @llvm.experimental.convergence.entry() + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok1]]) ] + // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) + // CHECK: ret [[TY]] %[[RET]] + return WaveReadLaneAt(expr, idx); +} + +// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.i16([[TY]], i32) #[[#attr:]] +// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.i16([[TY]], i32) #[[#attr:]] +#endif + +// Test basic lowering to runtime function call with array and float values. + +// CHECK-LABEL: test_half +half test_half(half expr, uint idx) { + // CHECK-SPIRV: %[[#entry_tok2:]] = call token @llvm.experimental.convergence.entry() + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok2]]) ] + // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) + // CHECK: ret [[TY]] %[[RET]] + return WaveReadLaneAt(expr, idx); +} + +// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.f16([[TY]], i32) #[[#attr:]] +// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.f16([[TY]], i32) #[[#attr:]] + +// CHECK-LABEL: test_double +double test_double(double expr, uint idx) { + // CHECK-SPIRV: %[[#entry_tok3:]] = call token @llvm.experimental.convergence.entry() + // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok3]]) ] + // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) + // CHECK: ret [[TY]] %[[RET]] + return WaveReadLaneAt(expr, idx); +} + +// CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.f64([[TY]], i32) #[[#attr:]] +// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.f64([[TY]], i32) #[[#attr:]] + +// CHECK-LABEL: test_floatv4 +float4 test_floatv4(float4 expr, uint idx) { + // CHECK-SPIRV: %[[#entry_tok4:]] = call token @llvm.experimental.convergence.entry() + // CHECK-SPIRV: %[[RET1:.*]] = call [[TY1:.*]] @llvm.spv.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok4]]) ] + // CHECK-DXIL: %[[RET1:.*]] = call [[TY1:.*]] @llvm.dx.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) + // CHECK: ret [[TY1]] %[[RET1]] + return WaveReadLaneAt(expr, idx); +} + +// CHECK-DXIL: declare [[TY1]] @llvm.dx.wave.readlane.v4f32([[TY1]], i32) #[[#attr]] +// CHECK-SPIRV: declare [[TY1]] @llvm.spv.wave.readlane.v4f32([[TY1]], i32) #[[#attr]] + +// CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}} diff --git a/clang/test/SemaHLSL/BuiltIns/WaveReadLaneAt-errors.hlsl 
b/clang/test/SemaHLSL/BuiltIns/WaveReadLaneAt-errors.hlsl new file mode 100644 index 000000000000..ef8299b59ca7 --- /dev/null +++ b/clang/test/SemaHLSL/BuiltIns/WaveReadLaneAt-errors.hlsl @@ -0,0 +1,38 @@ +// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -emit-llvm-only -disable-llvm-passes -verify + +bool test_too_few_arg() { + return __builtin_hlsl_wave_read_lane_at(); + // expected-error@-1 {{too few arguments to function call, expected 2, have 0}} +} + +float2 test_too_few_arg_1(float2 p0) { + return __builtin_hlsl_wave_read_lane_at(p0); + // expected-error@-1 {{too few arguments to function call, expected 2, have 1}} +} + +float2 test_too_many_arg(float2 p0) { + return __builtin_hlsl_wave_read_lane_at(p0, p0, p0); + // expected-error@-1 {{too many arguments to function call, expected 2, have 3}} +} + +float3 test_index_double_type_check(float3 p0, double idx) { + return __builtin_hlsl_wave_read_lane_at(p0, idx); + // expected-error@-1 {{passing 'double' to parameter of incompatible type 'unsigned int'}} +} + +float3 test_index_int3_type_check(float3 p0, int3 idxs) { + return __builtin_hlsl_wave_read_lane_at(p0, idxs); + // expected-error@-1 {{passing 'int3' (aka 'vector') to parameter of incompatible type 'unsigned int'}} +} + +struct S { float f; }; + +float3 test_index_S_type_check(float3 p0, S idx) { + return __builtin_hlsl_wave_read_lane_at(p0, idx); + // expected-error@-1 {{passing 'S' to parameter of incompatible type 'unsigned int'}} +} + +S test_expr_struct_type_check(S p0, int idx) { + return __builtin_hlsl_wave_read_lane_at(p0, idx); + // expected-error@-1 {{invalid operand of type 'S' where a scalar or vector is required}} +} diff --git a/llvm/include/llvm/IR/IntrinsicsDirectX.td b/llvm/include/llvm/IR/IntrinsicsDirectX.td index 45aea1ccdb6d..27a437a83be6 100644 --- a/llvm/include/llvm/IR/IntrinsicsDirectX.td +++ b/llvm/include/llvm/IR/IntrinsicsDirectX.td @@ -86,6 +86,7 @@ def int_dx_normalize : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ def int_dx_rsqrt : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; def int_dx_wave_getlaneindex : DefaultAttrsIntrinsic<[llvm_i32_ty], [], [IntrConvergent, IntrNoMem]>; def int_dx_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; +def int_dx_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; def int_dx_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>; def int_dx_step : DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty, LLVMMatchType<0>], [IntrNoMem]>; def int_dx_radians : DefaultAttrsIntrinsic<[llvm_anyfloat_ty], [LLVMMatchType<0>], [IntrNoMem]>; diff --git a/llvm/include/llvm/IR/IntrinsicsSPIRV.td b/llvm/include/llvm/IR/IntrinsicsSPIRV.td index 3d61456589ee..6df2eb156a07 100644 --- a/llvm/include/llvm/IR/IntrinsicsSPIRV.td +++ b/llvm/include/llvm/IR/IntrinsicsSPIRV.td @@ -84,6 +84,7 @@ let TargetPrefix = "spv" in { [llvm_anyint_ty, LLVMScalarOrSameVectorWidth<0, LLVMVectorElementType<0>>], [IntrNoMem, Commutative] >; def int_spv_wave_is_first_lane : DefaultAttrsIntrinsic<[llvm_i1_ty], [], [IntrConvergent]>; + def int_spv_wave_readlane : DefaultAttrsIntrinsic<[llvm_any_ty], [LLVMMatchType<0>, llvm_i32_ty], [IntrConvergent, IntrNoMem]>; def int_spv_sign : DefaultAttrsIntrinsic<[LLVMScalarOrSameVectorWidth<0, llvm_i32_ty>], [llvm_any_ty], [IntrNoMem]>; def int_spv_radians : 
DefaultAttrsIntrinsic<[LLVMMatchType<0>], [llvm_anyfloat_ty], [IntrNoMem]>; diff --git a/llvm/lib/Target/DirectX/DXIL.td b/llvm/lib/Target/DirectX/DXIL.td index e8f56b18730d..147b32b1ca99 100644 --- a/llvm/lib/Target/DirectX/DXIL.td +++ b/llvm/lib/Target/DirectX/DXIL.td @@ -802,6 +802,16 @@ def WaveIsFirstLane : DXILOp<110, waveIsFirstLane> { let attributes = [Attributes]; } +def WaveReadLaneAt: DXILOp<117, waveReadLaneAt> { + let Doc = "returns the value from the specified lane"; + let LLVMIntrinsic = int_dx_wave_readlane; + let arguments = [OverloadTy, Int32Ty]; + let result = OverloadTy; + let overloads = [Overloads]; + let stages = [Stages]; + let attributes = [Attributes]; +} + def WaveGetLaneIndex : DXILOp<111, waveGetLaneIndex> { let Doc = "returns the index of the current lane in the wave"; let LLVMIntrinsic = int_dx_wave_getlaneindex; diff --git a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp index fd92346717c4..d9377fe4b91a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp +++ b/llvm/lib/Target/SPIRV/SPIRVInstructionSelector.cpp @@ -230,6 +230,9 @@ private: bool selectSpvThreadId(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const; + bool selectWaveReadLaneAt(Register ResVReg, const SPIRVType *ResType, + MachineInstr &I) const; + bool selectUnmergeValues(MachineInstr &I) const; void selectHandleFromBinding(Register &ResVReg, const SPIRVType *ResType, @@ -417,6 +420,7 @@ bool SPIRVInstructionSelector::spvSelect(Register ResVReg, case TargetOpcode::G_INTRINSIC: case TargetOpcode::G_INTRINSIC_W_SIDE_EFFECTS: + case TargetOpcode::G_INTRINSIC_CONVERGENT: case TargetOpcode::G_INTRINSIC_CONVERGENT_W_SIDE_EFFECTS: return selectIntrinsic(ResVReg, ResType, I); case TargetOpcode::G_BITREVERSE: @@ -1758,6 +1762,26 @@ bool SPIRVInstructionSelector::selectSign(Register ResVReg, return Result; } +bool SPIRVInstructionSelector::selectWaveReadLaneAt(Register ResVReg, + const SPIRVType *ResType, + MachineInstr &I) const { + assert(I.getNumOperands() == 4); + assert(I.getOperand(2).isReg()); + assert(I.getOperand(3).isReg()); + MachineBasicBlock &BB = *I.getParent(); + + // IntTy is used to define the execution scope, set to 3 to denote a + // cross-lane interaction equivalent to a SPIR-V subgroup. 
+ SPIRVType *IntTy = GR.getOrCreateSPIRVIntegerType(32, I, TII); + return BuildMI(BB, I, I.getDebugLoc(), + TII.get(SPIRV::OpGroupNonUniformShuffle)) + .addDef(ResVReg) + .addUse(GR.getSPIRVTypeID(ResType)) + .addUse(GR.getOrCreateConstInt(3, I, IntTy, TII)) + .addUse(I.getOperand(2).getReg()) + .addUse(I.getOperand(3).getReg()); +} + bool SPIRVInstructionSelector::selectBitreverse(Register ResVReg, const SPIRVType *ResType, MachineInstr &I) const { @@ -2543,6 +2567,8 @@ bool SPIRVInstructionSelector::selectIntrinsic(Register ResVReg, .addUse(GR.getSPIRVTypeID(ResType)) .addUse(GR.getOrCreateConstInt(3, I, IntTy, TII)); } + case Intrinsic::spv_wave_readlane: + return selectWaveReadLaneAt(ResVReg, ResType, I); case Intrinsic::spv_step: return selectExtInst(ResVReg, ResType, I, CL::step, GL::Step); case Intrinsic::spv_radians: diff --git a/llvm/test/CodeGen/DirectX/WaveReadLaneAt.ll b/llvm/test/CodeGen/DirectX/WaveReadLaneAt.ll new file mode 100644 index 000000000000..0024ba66c0ca --- /dev/null +++ b/llvm/test/CodeGen/DirectX/WaveReadLaneAt.ll @@ -0,0 +1,61 @@ +; RUN: opt -S -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s + +; Test that for scalar values, WaveReadLaneAt maps down to the DirectX op + +define noundef half @wave_rla_half(half noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call half @dx.op.waveReadLaneAt.f16(i32 117, half %expr, i32 %idx) + %ret = call half @llvm.dx.wave.readlane.f16(half %expr, i32 %idx) + ret half %ret +} + +define noundef float @wave_rla_float(float noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call float @dx.op.waveReadLaneAt.f32(i32 117, float %expr, i32 %idx) + %ret = call float @llvm.dx.wave.readlane(float %expr, i32 %idx) + ret float %ret +} + +define noundef double @wave_rla_double(double noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr, i32 %idx) + %ret = call double @llvm.dx.wave.readlane(double %expr, i32 %idx) + ret double %ret +} + +define noundef i1 @wave_rla_i1(i1 noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call i1 @dx.op.waveReadLaneAt.i1(i32 117, i1 %expr, i32 %idx) + %ret = call i1 @llvm.dx.wave.readlane.i1(i1 %expr, i32 %idx) + ret i1 %ret +} + +define noundef i16 @wave_rla_i16(i16 noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call i16 @dx.op.waveReadLaneAt.i16(i32 117, i16 %expr, i32 %idx) + %ret = call i16 @llvm.dx.wave.readlane.i16(i16 %expr, i32 %idx) + ret i16 %ret +} + +define noundef i32 @wave_rla_i32(i32 noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call i32 @dx.op.waveReadLaneAt.i32(i32 117, i32 %expr, i32 %idx) + %ret = call i32 @llvm.dx.wave.readlane.i32(i32 %expr, i32 %idx) + ret i32 %ret +} + +define noundef i64 @wave_rla_i64(i64 noundef %expr, i32 noundef %idx) { +entry: +; CHECK: call i64 @dx.op.waveReadLaneAt.i64(i32 117, i64 %expr, i32 %idx) + %ret = call i64 @llvm.dx.wave.readlane.i64(i64 %expr, i32 %idx) + ret i64 %ret +} + +declare half @llvm.dx.wave.readlane.f16(half, i32) +declare float @llvm.dx.wave.readlane.f32(float, i32) +declare double @llvm.dx.wave.readlane.f64(double, i32) + +declare i1 @llvm.dx.wave.readlane.i1(i1, i32) +declare i16 @llvm.dx.wave.readlane.i16(i16, i32) +declare i32 @llvm.dx.wave.readlane.i32(i32, i32) +declare i64 @llvm.dx.wave.readlane.i64(i64, i32) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveReadLaneAt.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveReadLaneAt.ll new file mode 100644 index 000000000000..8ba17df30c36 --- /dev/null +++ 
b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/WaveReadLaneAt.ll @@ -0,0 +1,56 @@ +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv32v1.3-vulkan-unknown %s -o - -filetype=obj | spirv-val %} + +; Test lowering to spir-v backend for various types and scalar/vector + +; CHECK-DAG: %[[#uint:]] = OpTypeInt 32 0 +; CHECK-DAG: %[[#f32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#v4_float:]] = OpTypeVector %[[#f32]] 4 +; CHECK-DAG: %[[#bool:]] = OpTypeBool +; CHECK-DAG: %[[#v4_bool:]] = OpTypeVector %[[#bool]] 4 +; CHECK-DAG: %[[#scope:]] = OpConstant %[[#uint]] 3 + +; CHECK-LABEL: Begin function test_float +; CHECK: %[[#fexpr:]] = OpFunctionParameter %[[#f32]] +; CHECK: %[[#idx1:]] = OpFunctionParameter %[[#uint]] +define float @test_float(float %fexpr, i32 %idx) { +entry: +; CHECK: %[[#fret:]] = OpGroupNonUniformShuffle %[[#f32]] %[[#scope]] %[[#fexpr]] %[[#idx1]] + %0 = call float @llvm.spv.wave.readlane.f32(float %fexpr, i32 %idx) + ret float %0 +} + +; CHECK-LABEL: Begin function test_int +; CHECK: %[[#iexpr:]] = OpFunctionParameter %[[#uint]] +; CHECK: %[[#idx2:]] = OpFunctionParameter %[[#uint]] +define i32 @test_int(i32 %iexpr, i32 %idx) { +entry: +; CHECK: %[[#iret:]] = OpGroupNonUniformShuffle %[[#uint]] %[[#scope]] %[[#iexpr]] %[[#idx2]] + %0 = call i32 @llvm.spv.wave.readlane.i32(i32 %iexpr, i32 %idx) + ret i32 %0 +} + +; CHECK-LABEL: Begin function test_vbool +; CHECK: %[[#vbexpr:]] = OpFunctionParameter %[[#v4_bool]] +; CHECK: %[[#idx3:]] = OpFunctionParameter %[[#uint]] +define <4 x i1> @test_vbool(<4 x i1> %vbexpr, i32 %idx) { +entry: +; CHECK: %[[#vbret:]] = OpGroupNonUniformShuffle %[[#v4_bool]] %[[#scope]] %[[#vbexpr]] %[[#idx3]] + %0 = call <4 x i1> @llvm.spv.wave.readlane.v4i1(<4 x i1> %vbexpr, i32 %idx) + ret <4 x i1> %0 +} + +; CHECK-LABEL: Begin function test_vfloat +; CHECK: %[[#vfexpr:]] = OpFunctionParameter %[[#v4_float]] +; CHECK: %[[#idx4:]] = OpFunctionParameter %[[#uint]] +define <4 x float> @test_vfloat(<4 x float> %vfexpr, i32 %idx) { +entry: +; CHECK: %[[#vbret:]] = OpGroupNonUniformShuffle %[[#v4_float]] %[[#scope]] %[[#vfexpr]] %[[#idx4]] + %0 = call <4 x float> @llvm.spv.wave.readlane.v4f32(<4 x float> %vfexpr, i32 %idx) + ret <4 x float> %0 +} + +declare float @llvm.spv.wave.readlane.f32(float, i32) +declare i32 @llvm.spv.wave.readlane.i32(i32, i32) +declare <4 x i1> @llvm.spv.wave.readlane.v4i1(<4 x i1>, i32) +declare <4 x float> @llvm.spv.wave.readlane.v4f32(<4 x float>, i32) -- GitLab From bb89988174e5f1a9a419637cadae07e4e8c61c3e Mon Sep 17 00:00:00 2001 From: Daniel Chen Date: Tue, 15 Oct 2024 22:34:49 -0400 Subject: [PATCH 055/329] [AIX][CMake] Disable `openmp` as LLVM_ENABLE_PROJECTS (#110152) in favor of LLVM_ENABLE_RUNTIMES --- llvm/CMakeLists.txt | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/llvm/CMakeLists.txt b/llvm/CMakeLists.txt index 741c95f3a7d0..cde4a999ea2e 100644 --- a/llvm/CMakeLists.txt +++ b/llvm/CMakeLists.txt @@ -117,6 +117,12 @@ endif() # This allows an easy way of setting up a build directory for llvm and another # one for llvm+clang+... using the same sources. set(LLVM_ALL_PROJECTS "bolt;clang;clang-tools-extra;compiler-rt;cross-project-tests;libc;libclc;lld;lldb;mlir;openmp;polly;pstl") +if (${CMAKE_SYSTEM_NAME} MATCHES "AIX") + # Disallow 'openmp' as a LLVM PROJECT on AIX as the supported way is to use + # LLVM_ENABLE_RUNTIMES. 
+  list(REMOVE_ITEM LLVM_ALL_PROJECTS openmp)
+endif()
+
 # The flang project is not yet part of "all" projects (see C++ requirements)
 set(LLVM_EXTRA_PROJECTS "flang")
 # List of all known projects in the mono repo
--
GitLab

From 4cc6a08142129d351840c3c63d9372a2b66930b2 Mon Sep 17 00:00:00 2001
From: westtide
Date: Wed, 16 Oct 2024 11:22:44 +0800
Subject: [PATCH 056/329] Update LLVMLibCArchitectures.cmake (#112464)

Hi there,

When building llvm-libc on the openEuler system, I encountered an issue
as shown in the image below:
![image](https://github.com/user-attachments/assets/75667de4-5bea-4a95-be28-ed34db0e05b9)

This issue happens because the regular expression used in
`libc/cmake/modules/LLVMLibCArchitectures.cmake`:
`string(REGEX MATCH "Target: [-_a-z0-9.]+[ \r\n]+")` does not handle
the capital letters in `openEuler`. To fix this, I modified the regular
expression to: `string(REGEX MATCH "Target: [-_a-zA-Z0-9.]+[ \r\n]+")`.
This change makes it compatible with capital letters.

---
 libc/cmake/modules/LLVMLibCArchitectures.cmake | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/libc/cmake/modules/LLVMLibCArchitectures.cmake b/libc/cmake/modules/LLVMLibCArchitectures.cmake
index 7711127c1a81..1e5ed723194a 100644
--- a/libc/cmake/modules/LLVMLibCArchitectures.cmake
+++ b/libc/cmake/modules/LLVMLibCArchitectures.cmake
@@ -84,7 +84,7 @@ if(NOT (libc_compiler_info_result EQUAL "0"))
   message(FATAL_ERROR "libc build: error querying compiler info from the "
                       "compiler: ${libc_compiler_info}")
 endif()
-string(REGEX MATCH "Target: [-_a-z0-9.]+[ \r\n]+"
+string(REGEX MATCH "Target: [-_a-zA-Z0-9.]+[ \r\n]+"
        libc_compiler_target_info ${libc_compiler_info})
 if(NOT libc_compiler_target_info)
   message(FATAL_ERROR "libc build: could not read compiler target info from:\n"
--
GitLab

From b2b0e6c01e819e078fb25a7f882de58a72cfc130 Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Tue, 15 Oct 2024 20:44:01 -0700
Subject: [PATCH 057/329] [NFC][lsan] Fix name of local var

---
 compiler-rt/lib/lsan/lsan_common.cpp | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp
index c05e0dd0a933..8cdc6d1651f1 100644
--- a/compiler-rt/lib/lsan/lsan_common.cpp
+++ b/compiler-rt/lib/lsan/lsan_common.cpp
@@ -712,11 +712,11 @@ static bool ReportUnsuspendedThreads(
   Sort(threads.data(), threads.size());

-  InternalMmapVector<tid_t> unsuspended;
-  GetRunningThreadsLocked(&unsuspended);
+  InternalMmapVector<tid_t> known_threads;
+  GetRunningThreadsLocked(&known_threads);

   bool succeded = true;
-  for (auto os_id : unsuspended) {
+  for (auto os_id : known_threads) {
     uptr i = InternalLowerBound(threads, os_id);
     if (i >= threads.size() || threads[i] != os_id) {
       succeded = false;
--
GitLab

From 4c2c177567390cd3d8de3fd757e9234f1da832b7 Mon Sep 17 00:00:00 2001
From: wanglei
Date: Wed, 16 Oct 2024 11:58:00 +0800
Subject: [PATCH 058/329] [LoongArch] Add options for annotate tablejump

This aligns with GCC. LoongArch kernel developers requested that this
option record correlation info in a section: pairs of addresses linking
each jump instruction (jr) with its `MachineJumpTableEntry`.
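As a rough sketch of the emitted correlation info (label names are
illustrative; the section name and the `jrtb_` prefix come from the
AsmPrinter changes below), each entry is a pointer-sized pair:

    .section .discard.tablejump_annotate
    .dword .Ljrtb_0    # address of the `jr` instruction
    .dword .LJTI0_0    # address of its jump table

On LA32 the entries would be emitted with .word instead of .dword.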
Reviewed By: heiher Pull Request: https://github.com/llvm/llvm-project/pull/102411 --- clang/include/clang/Driver/Options.td | 7 +- clang/lib/Driver/ToolChains/Clang.cpp | 8 ++ .../Driver/loongarch-mannotate-tablejump.c | 13 ++ .../Target/LoongArch/LoongArchAsmPrinter.cpp | 45 ++++++ .../Target/LoongArch/LoongArchAsmPrinter.h | 1 + .../LoongArch/LoongArchExpandPseudoInsts.cpp | 49 +++++++ .../LoongArch/LoongArchMachineFunctionInfo.h | 13 ++ .../CodeGen/LoongArch/annotate-tablejump.ll | 133 ++++++++++++++++++ 8 files changed, 268 insertions(+), 1 deletion(-) create mode 100644 clang/test/Driver/loongarch-mannotate-tablejump.c create mode 100644 llvm/test/CodeGen/LoongArch/annotate-tablejump.ll diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 6491e9ac73ce..2072ae45d554 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5384,9 +5384,14 @@ def mlasx : Flag<["-"], "mlasx">, Group, HelpText<"Enable Loongson Advanced SIMD Extension (LASX).">; def mno_lasx : Flag<["-"], "mno-lasx">, Group, HelpText<"Disable Loongson Advanced SIMD Extension (LASX).">; +let Flags = [TargetSpecific] in { def msimd_EQ : Joined<["-"], "msimd=">, Group, - Flags<[TargetSpecific]>, HelpText<"Select the SIMD extension(s) to be enabled in LoongArch either 'none', 'lsx', 'lasx'.">; +def mannotate_tablejump : Flag<["-"], "mannotate-tablejump">, Group, + HelpText<"Enable annotate table jump instruction to correlate it with the jump table.">; +def mno_annotate_tablejump : Flag<["-"], "mno-annotate-tablejump">, Group, + HelpText<"Disable annotate table jump instruction to correlate it with the jump table.">; +} // let Flags = [TargetSpecific] def mnop_mcount : Flag<["-"], "mnop-mcount">, HelpText<"Generate mcount/__fentry__ calls as nops. To activate they need to be patched in.">, Visibility<[ClangOption, CC1Option]>, Group, MarshallingInfoFlag>; diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 9d2f7a8960b4..c132fa35098a 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -1870,6 +1870,14 @@ void Clang::AddLoongArchTargetArgs(const ArgList &Args, CmdArgs.push_back("-tune-cpu"); CmdArgs.push_back(Args.MakeArgString(TuneCPU)); } + + if (Arg *A = Args.getLastArg(options::OPT_mannotate_tablejump, + options::OPT_mno_annotate_tablejump)) { + if (A->getOption().matches(options::OPT_mannotate_tablejump)) { + CmdArgs.push_back("-mllvm"); + CmdArgs.push_back("-loongarch-annotate-tablejump"); + } + } } void Clang::AddMIPSTargetArgs(const ArgList &Args, diff --git a/clang/test/Driver/loongarch-mannotate-tablejump.c b/clang/test/Driver/loongarch-mannotate-tablejump.c new file mode 100644 index 000000000000..586e62c35f4f --- /dev/null +++ b/clang/test/Driver/loongarch-mannotate-tablejump.c @@ -0,0 +1,13 @@ +/// Test -m[no-]annotate-tablejump options. 
+ +// RUN: %clang --target=loongarch64 -mannotate-tablejump %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-ANOTATE +// RUN: %clang --target=loongarch64 -mno-annotate-tablejump %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-NO-ANOTATE +// RUN: %clang --target=loongarch64 -mannotate-tablejump -mno-annotate-tablejump %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-NO-ANOTATE +// RUN: %clang --target=loongarch64 -mno-annotate-tablejump -mannotate-tablejump %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-ANOTATE + +// CC1-ANOTATE: "-loongarch-annotate-tablejump" +// CC1-NO-ANOTATE-NOT: "-loongarch-annotate-tablejump" diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp index d643017b90db..89f8978e6873 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.cpp @@ -13,18 +13,30 @@ #include "LoongArchAsmPrinter.h" #include "LoongArch.h" +#include "LoongArchInstrInfo.h" +#include "LoongArchMachineFunctionInfo.h" #include "LoongArchTargetMachine.h" #include "MCTargetDesc/LoongArchInstPrinter.h" +#include "MCTargetDesc/LoongArchMCTargetDesc.h" #include "TargetInfo/LoongArchTargetInfo.h" #include "llvm/CodeGen/AsmPrinter.h" +#include "llvm/CodeGen/MachineJumpTableInfo.h" +#include "llvm/CodeGen/MachineModuleInfoImpls.h" #include "llvm/MC/MCContext.h" #include "llvm/MC/MCInstBuilder.h" +#include "llvm/MC/MCSectionELF.h" #include "llvm/MC/TargetRegistry.h" using namespace llvm; #define DEBUG_TYPE "loongarch-asm-printer" +cl::opt LArchAnnotateTableJump( + "loongarch-annotate-tablejump", cl::Hidden, + cl::desc( + "Annotate table jump instruction to correlate it with the jump table."), + cl::init(false)); + // Simple pseudo-instructions have their lowering (with expansion to real // instructions) auto-generated. #include "LoongArchGenMCPseudoLowering.inc" @@ -238,6 +250,39 @@ void LoongArchAsmPrinter::emitSled(const MachineInstr &MI, SledKind Kind) { recordSled(BeginOfSled, MI, Kind, 2); } +void LoongArchAsmPrinter::emitJumpTableInfo() { + AsmPrinter::emitJumpTableInfo(); + + if (!LArchAnnotateTableJump) + return; + + assert(TM.getTargetTriple().isOSBinFormatELF()); + + unsigned Size = getDataLayout().getPointerSize(); + auto *LAFI = MF->getInfo(); + unsigned EntrySize = LAFI->getJumpInfoSize(); + + if (0 == EntrySize) + return; + + // Emit an additional section to store the correlation info as pairs of + // addresses, each pair contains the address of a jump instruction (jr) and + // the address of the jump table. + OutStreamer->switchSection(MMI->getContext().getELFSection( + ".discard.tablejump_annotate", ELF::SHT_PROGBITS, 0)); + + for (unsigned Idx = 0; Idx < EntrySize; ++Idx) { + OutStreamer->emitValue( + MCSymbolRefExpr::create(LAFI->getJumpInfoJrMI(Idx)->getPreInstrSymbol(), + OutContext), + Size); + OutStreamer->emitValue( + MCSymbolRefExpr::create( + GetJTISymbol(LAFI->getJumpInfoJTIMO(Idx)->getIndex()), OutContext), + Size); + } +} + bool LoongArchAsmPrinter::runOnMachineFunction(MachineFunction &MF) { AsmPrinter::runOnMachineFunction(MF); // Emit the XRay table for this function. 
diff --git a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h index fc12f1079490..312631e4f0dc 100644 --- a/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h +++ b/llvm/lib/Target/LoongArch/LoongArchAsmPrinter.h @@ -55,6 +55,7 @@ public: bool lowerOperand(const MachineOperand &MO, MCOperand &MCOp) const { return lowerLoongArchMachineOperandToMCOperand(MO, MCOp, *this); } + void emitJumpTableInfo() override; }; } // end namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp index 33b93e42bb5c..4a10f2e282d1 100644 --- a/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchExpandPseudoInsts.cpp @@ -13,6 +13,7 @@ #include "LoongArch.h" #include "LoongArchInstrInfo.h" +#include "LoongArchMachineFunctionInfo.h" #include "LoongArchTargetMachine.h" #include "MCTargetDesc/LoongArchBaseInfo.h" #include "MCTargetDesc/LoongArchMCTargetDesc.h" @@ -27,6 +28,8 @@ using namespace llvm; +extern cl::opt LArchAnnotateTableJump; + #define LOONGARCH_PRERA_EXPAND_PSEUDO_NAME \ "LoongArch Pre-RA pseudo instruction expansion pass" #define LOONGARCH_EXPAND_PSEUDO_NAME \ @@ -103,6 +106,8 @@ private: MachineBasicBlock::iterator MBBI, MachineBasicBlock::iterator &NextMBBI, bool IsTailCall); + void annotateTableJump(MachineBasicBlock &MBB, + MachineBasicBlock::iterator MBBI); }; char LoongArchPreRAExpandPseudo::ID = 0; @@ -167,6 +172,12 @@ bool LoongArchPreRAExpandPseudo::expandMI( case LoongArch::PseudoTAIL_MEDIUM: case LoongArch::PseudoTAIL_LARGE: return expandFunctionCALL(MBB, MBBI, NextMBBI, /*IsTailCall=*/true); + case LoongArch::PseudoBRIND: + // If the PseudoBRIND is used to table jump, then emit a label to annotate + // the `jr` instruction, and save the instructions. + if (LArchAnnotateTableJump) + annotateTableJump(MBB, MBBI); + break; } return false; } @@ -601,6 +612,44 @@ bool LoongArchPreRAExpandPseudo::expandFunctionCALL( return true; } +void LoongArchPreRAExpandPseudo::annotateTableJump( + MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI) { + MachineFunction *MF = MBB.getParent(); + MachineRegisterInfo &MRI = MBB.getParent()->getRegInfo(); + + bool IsFound = false; + + std::function FindJTIMI = [&](MachineInstr *MInst, + int FindDepth) { + if (FindDepth < 0) + return; + for (auto &MO : MInst->all_uses()) { + if (IsFound) + return; + Register Reg = MO.getReg(); + if (!Reg.isVirtual()) + continue; + MachineInstr *DefMI = MRI.getVRegDef(Reg); + if (!DefMI) + continue; + for (unsigned Idx = 0; Idx < DefMI->getNumOperands(); ++Idx) { + MachineOperand &MO = DefMI->getOperand(Idx); + if (MO.isJTI()) { + MBBI->setPreInstrSymbol( + *MF, MF->getContext().createNamedTempSymbol("jrtb_")); + MF->getInfo()->setJumpInfo(&*MBBI, &MO); + IsFound = true; + return; + } + } + FindJTIMI(DefMI, --FindDepth); + } + }; + + // FindDepth = 3, probably sufficient. + FindJTIMI(&*MBBI, /*FindDepth=*/3); +} + class LoongArchExpandPseudo : public MachineFunctionPass { public: const LoongArchInstrInfo *TII; diff --git a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h index a7366a5dba04..daa47c4dc7e3 100644 --- a/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h +++ b/llvm/lib/Target/LoongArch/LoongArchMachineFunctionInfo.h @@ -39,6 +39,10 @@ private: /// Registers that have been sign extended from i32. 
SmallVector SExt32Registers; + /// Pairs of `jr` instructions and corresponding JTI operands, used for the + /// `annotate-tablejump` option. + SmallVector, 4> JumpInfos; + public: LoongArchMachineFunctionInfo(const Function &F, const TargetSubtargetInfo *STI) {} @@ -71,6 +75,15 @@ public: bool isSExt32Register(Register Reg) const { return is_contained(SExt32Registers, Reg); } + + void setJumpInfo(MachineInstr *JrMI, MachineOperand *JTIMO) { + JumpInfos.push_back(std::make_pair(JrMI, JTIMO)); + } + unsigned getJumpInfoSize() { return JumpInfos.size(); } + MachineInstr *getJumpInfoJrMI(unsigned Idx) { return JumpInfos[Idx].first; } + MachineOperand *getJumpInfoJTIMO(unsigned Idx) { + return JumpInfos[Idx].second; + } }; } // end namespace llvm diff --git a/llvm/test/CodeGen/LoongArch/annotate-tablejump.ll b/llvm/test/CodeGen/LoongArch/annotate-tablejump.ll new file mode 100644 index 000000000000..a8c660609d99 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/annotate-tablejump.ll @@ -0,0 +1,133 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc --mtriple=loongarch32 -mattr=+d \ +; RUN: --min-jump-table-entries=4 < %s \ +; RUN: --loongarch-annotate-tablejump \ +; RUN: | FileCheck %s --check-prefix=LA32-JT +; RUN: llc --mtriple=loongarch64 -mattr=+d \ +; RUN: --min-jump-table-entries=4 < %s \ +; RUN: --loongarch-annotate-tablejump \ +; RUN: | FileCheck %s --check-prefix=LA64-JT + +define void @switch_4_arms(i32 %in, ptr %out) nounwind { +; LA32-JT-LABEL: switch_4_arms: +; LA32-JT: # %bb.0: # %entry +; LA32-JT-NEXT: addi.w $a3, $a0, -1 +; LA32-JT-NEXT: ori $a2, $zero, 3 +; LA32-JT-NEXT: bltu $a2, $a3, .LBB0_7 +; LA32-JT-NEXT: # %bb.1: # %entry +; LA32-JT-NEXT: pcalau12i $a4, %pc_hi20(.LJTI0_0) +; LA32-JT-NEXT: addi.w $a4, $a4, %pc_lo12(.LJTI0_0) +; LA32-JT-NEXT: alsl.w $a3, $a3, $a4, 2 +; LA32-JT-NEXT: ld.w $a3, $a3, 0 +; LA32-JT-NEXT: .Ljrtb_0: +; LA32-JT-NEXT: jr $a3 +; LA32-JT-NEXT: .LBB0_2: # %bb1 +; LA32-JT-NEXT: ori $a3, $zero, 4 +; LA32-JT-NEXT: b .LBB0_6 +; LA32-JT-NEXT: .LBB0_3: # %bb2 +; LA32-JT-NEXT: ori $a3, $zero, 3 +; LA32-JT-NEXT: b .LBB0_6 +; LA32-JT-NEXT: .LBB0_4: # %bb3 +; LA32-JT-NEXT: ori $a3, $zero, 2 +; LA32-JT-NEXT: b .LBB0_6 +; LA32-JT-NEXT: .LBB0_5: # %bb4 +; LA32-JT-NEXT: ori $a3, $zero, 1 +; LA32-JT-NEXT: .LBB0_6: # %exit +; LA32-JT-NEXT: st.w $a3, $a1, 0 +; LA32-JT-NEXT: .LBB0_7: # %exit +; LA32-JT-NEXT: addi.w $a3, $a0, -5 +; LA32-JT-NEXT: bltu $a2, $a3, .LBB0_9 +; LA32-JT-NEXT: # %bb.8: # %exit +; LA32-JT-NEXT: pcalau12i $a4, %pc_hi20(.LJTI0_1) +; LA32-JT-NEXT: addi.w $a4, $a4, %pc_lo12(.LJTI0_1) +; LA32-JT-NEXT: alsl.w $a3, $a3, $a4, 2 +; LA32-JT-NEXT: ld.w $a3, $a3, 0 +; LA32-JT-NEXT: .Ljrtb_1: +; LA32-JT-NEXT: jr $a3 +; LA32-JT-NEXT: .LBB0_9: # %exit2 +; LA32-JT-NEXT: ret +; +; LA64-JT-LABEL: switch_4_arms: +; LA64-JT: # %bb.0: # %entry +; LA64-JT-NEXT: addi.w $a0, $a0, 0 +; LA64-JT-NEXT: addi.d $a3, $a0, -1 +; LA64-JT-NEXT: ori $a2, $zero, 3 +; LA64-JT-NEXT: bltu $a2, $a3, .LBB0_7 +; LA64-JT-NEXT: # %bb.1: # %entry +; LA64-JT-NEXT: slli.d $a3, $a3, 3 +; LA64-JT-NEXT: pcalau12i $a4, %pc_hi20(.LJTI0_0) +; LA64-JT-NEXT: addi.d $a4, $a4, %pc_lo12(.LJTI0_0) +; LA64-JT-NEXT: ldx.d $a3, $a3, $a4 +; LA64-JT-NEXT: .Ljrtb_0: +; LA64-JT-NEXT: jr $a3 +; LA64-JT-NEXT: .LBB0_2: # %bb1 +; LA64-JT-NEXT: ori $a3, $zero, 4 +; LA64-JT-NEXT: b .LBB0_6 +; LA64-JT-NEXT: .LBB0_3: # %bb2 +; LA64-JT-NEXT: ori $a3, $zero, 3 +; LA64-JT-NEXT: b .LBB0_6 +; LA64-JT-NEXT: .LBB0_4: # %bb3 +; LA64-JT-NEXT: ori $a3, $zero, 2 +; 
LA64-JT-NEXT: b .LBB0_6 +; LA64-JT-NEXT: .LBB0_5: # %bb4 +; LA64-JT-NEXT: ori $a3, $zero, 1 +; LA64-JT-NEXT: .LBB0_6: # %exit +; LA64-JT-NEXT: st.w $a3, $a1, 0 +; LA64-JT-NEXT: .LBB0_7: # %exit +; LA64-JT-NEXT: addi.d $a3, $a0, -5 +; LA64-JT-NEXT: bltu $a2, $a3, .LBB0_9 +; LA64-JT-NEXT: # %bb.8: # %exit +; LA64-JT-NEXT: slli.d $a3, $a3, 3 +; LA64-JT-NEXT: pcalau12i $a4, %pc_hi20(.LJTI0_1) +; LA64-JT-NEXT: addi.d $a4, $a4, %pc_lo12(.LJTI0_1) +; LA64-JT-NEXT: ldx.d $a3, $a3, $a4 +; LA64-JT-NEXT: .Ljrtb_1: +; LA64-JT-NEXT: jr $a3 +; LA64-JT-NEXT: .LBB0_9: # %exit2 +; LA64-JT-NEXT: ret +entry: + switch i32 %in, label %exit [ + i32 1, label %bb1 + i32 2, label %bb2 + i32 3, label %bb3 + i32 4, label %bb4 + ] +bb1: + store i32 4, ptr %out + br label %exit +bb2: + store i32 3, ptr %out + br label %exit +bb3: + store i32 2, ptr %out + br label %exit +bb4: + store i32 1, ptr %out + br label %exit +exit: + switch i32 %in, label %exit2 [ + i32 5, label %bb1 + i32 6, label %bb2 + i32 7, label %bb3 + i32 8, label %bb4 + ] +exit2: + ret void +} + +; UTC_ARGS: --disable + +; LA32-JT-LABEL: .LJTI0_0: +; LA32-JT: .section .discard.tablejump_annotate,"",@progbits +; LA32-JT-NEXT: .word .Ljrtb_0 +; LA32-JT-NEXT: .word .LJTI0_0 +; LA32-JT-NEXT: .word .Ljrtb_1 +; LA32-JT-NEXT: .word .LJTI0_1 + +; UTC_ARGS: --disable +; LA64-JT-LABEL: .LJTI0_0: +; LA64-JT: .section .discard.tablejump_annotate,"",@progbits +; LA64-JT-NEXT: .dword .Ljrtb_0 +; LA64-JT-NEXT: .dword .LJTI0_0 +; LA64-JT-NEXT: .dword .Ljrtb_1 +; LA64-JT-NEXT: .dword .LJTI0_1 -- GitLab From 7937fe1a17f2c883f41e1bdefd0a1d9c93861532 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Tue, 15 Oct 2024 16:20:58 +1100 Subject: [PATCH 059/329] [ORC][llvm-jitlink] Add support for forced loading of archive members. This patch adds support for forced loading of archive members, similar to the behavior of the -all_load and -ObjC options in ld64. To enable this, the StaticLibraryDefinitionGenerator class constructors are extended with a VisitMember callback that is called on each member file in the archive at generator construction time. This callback can be used to unconditionally add the member file to a JITDylib at that point. To test this the llvm-jitlink utility is extended with -all_load (all platforms) and -ObjC (darwin only) options. Since we can't refer to symbols in the test objects directly (these would always cause the member to be linked in, even without the new flags) we instead test side-effects of force loading: execution of constructors and registration of Objective-C metadata. 
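For reference, a minimal usage sketch of the new VisitMember callback
(hypothetical client code; assumes an existing ObjectLayer ObjLayer and a
JITDylib JD — only the generator API below is taken from this patch):

  // Force-load every object-file member of libFoo.a into JD, then add the
  // generator so any remaining symbols still resolve lazily as before.
  auto G = StaticLibraryDefinitionGenerator::Load(
      ObjLayer, "libFoo.a",
      StaticLibraryDefinitionGenerator::loadAllObjectFileMembers(ObjLayer,
                                                                 JD));
  if (!G)
    return G.takeError();
  JD.addGenerator(std::move(*G));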
rdar://134446111
---
 .../Darwin/Generic/Inputs/EmptyClassFoo.m     |  7 ++++
 .../Generic/llvm-jitlink-force-link-objc.m    | 14 +++++++
 .../Inputs/SetGlobalIntXInConstructor.cpp     | 12 ++++++
 .../Linux/Generic/llvm-jitlink-all-load.c     | 14 +++++++
 compiler-rt/test/orc/lit.cfg.py               |  5 ++-
 .../llvm/ExecutionEngine/Orc/ExecutionUtils.h | 15 ++++++-
 llvm/include/llvm/ExecutionEngine/Orc/MachO.h | 17 ++++++++
 .../ExecutionEngine/Orc/ExecutionUtils.cpp    | 42 ++++++++++++++++---
 llvm/lib/ExecutionEngine/Orc/MachO.cpp        | 35 ++++++++++++++++
 llvm/tools/llvm-jitlink/llvm-jitlink.cpp      | 32 +++++++++++---
 10 files changed, 180 insertions(+), 13 deletions(-)
 create mode 100644 compiler-rt/test/orc/TestCases/Darwin/Generic/Inputs/EmptyClassFoo.m
 create mode 100644 compiler-rt/test/orc/TestCases/Darwin/Generic/llvm-jitlink-force-link-objc.m
 create mode 100644 compiler-rt/test/orc/TestCases/Linux/Generic/Inputs/SetGlobalIntXInConstructor.cpp
 create mode 100644 compiler-rt/test/orc/TestCases/Linux/Generic/llvm-jitlink-all-load.c

diff --git a/compiler-rt/test/orc/TestCases/Darwin/Generic/Inputs/EmptyClassFoo.m b/compiler-rt/test/orc/TestCases/Darwin/Generic/Inputs/EmptyClassFoo.m
new file mode 100644
index 000000000000..12e3dea5af37
--- /dev/null
+++ b/compiler-rt/test/orc/TestCases/Darwin/Generic/Inputs/EmptyClassFoo.m
@@ -0,0 +1,7 @@
+#include <Foundation/Foundation.h>
+
+@interface Foo : NSObject
+@end
+
+@implementation Foo
+@end
diff --git a/compiler-rt/test/orc/TestCases/Darwin/Generic/llvm-jitlink-force-link-objc.m b/compiler-rt/test/orc/TestCases/Darwin/Generic/llvm-jitlink-force-link-objc.m
new file mode 100644
index 000000000000..89cd6bd01671
--- /dev/null
+++ b/compiler-rt/test/orc/TestCases/Darwin/Generic/llvm-jitlink-force-link-objc.m
@@ -0,0 +1,14 @@
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clang -c -o %t/EmptyClassFoo.o %S/Inputs/EmptyClassFoo.m
+// RUN: ar r %t/libFooClass.a %t/EmptyClassFoo.o
+// RUN: %clang -c -o %t/force-objc.o %s
+// RUN: %llvm_jitlink -ObjC %t/force-objc.o -L%t -lFooClass
+//
+// REQUIRES: system-darwin && host-arch-compatible
+
+id objc_getClass(const char *name);
+
+int main(int argc, char *argv[]) {
+  // Return success if we find Foo, error otherwise.
+  return objc_getClass("Foo") ? 0 : 1;
+}
diff --git a/compiler-rt/test/orc/TestCases/Linux/Generic/Inputs/SetGlobalIntXInConstructor.cpp b/compiler-rt/test/orc/TestCases/Linux/Generic/Inputs/SetGlobalIntXInConstructor.cpp
new file mode 100644
index 000000000000..8c67a34fe243
--- /dev/null
+++ b/compiler-rt/test/orc/TestCases/Linux/Generic/Inputs/SetGlobalIntXInConstructor.cpp
@@ -0,0 +1,12 @@
+extern "C" int x;
+
+namespace {
+
+struct Init {
+public:
+  Init() { x = 1; }
+};
+
+Init SetX;
+
+} // namespace
diff --git a/compiler-rt/test/orc/TestCases/Linux/Generic/llvm-jitlink-all-load.c b/compiler-rt/test/orc/TestCases/Linux/Generic/llvm-jitlink-all-load.c
new file mode 100644
index 000000000000..dde7dacd92b3
--- /dev/null
+++ b/compiler-rt/test/orc/TestCases/Linux/Generic/llvm-jitlink-all-load.c
@@ -0,0 +1,14 @@
+// Check that the -all_load flag to llvm-jitlink causes all objects from
+// archives to be loaded, regardless of whether or not they're referenced.
+//
+// RUN: rm -rf %t && mkdir -p %t
+// RUN: %clangxx -c -o %t/SetX.o %S/Inputs/SetGlobalIntXInConstructor.cpp
+// RUN: ar r %t/libSetX.a %t/SetX.o
+// RUN: %clang -c -o %t/all_load.o %s
+// RUN: %llvm_jitlink -all_load %t/all_load.o -L%t -lSetX
+//
+// REQUIRES: system-darwin && host-arch-compatible
+
+int x = 0;
+
+int main(int argc, char *argv[]) { return x == 1 ? 0 : 1; }
diff --git a/compiler-rt/test/orc/lit.cfg.py b/compiler-rt/test/orc/lit.cfg.py
index 897cefb3d193..6dfa94b11cc9 100644
--- a/compiler-rt/test/orc/lit.cfg.py
+++ b/compiler-rt/test/orc/lit.cfg.py
@@ -14,6 +14,8 @@ host_arch_compatible = config.target_arch == config.host_arch
 if config.host_arch == "x86_64h" and config.target_arch == "x86_64":
     host_arch_compatible = True
+if host_arch_compatible:
+    config.available_features.add("host-arch-compatible")
 config.test_target_is_host_executable = (
     config.target_os == config.host_os and host_arch_compatible
 )
@@ -71,9 +73,10 @@ config.substitutions.append(
         (lli + " -jit-kind=orc -jit-linker=jitlink -orc-runtime=" + orc_rt_path),
     )
 )
+config.substitutions.append(("%ar", "ar"))
 
 # Default test suffixes.
-config.suffixes = [".c", ".cpp", ".S", ".ll", ".test"]
+config.suffixes = [".c", ".cpp", ".m", ".S", ".ll", ".test"]
 
 # Exclude Inputs directories.
 config.excludes = ["Inputs"]
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
index 44db455f218a..741dcc236b30 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutionUtils.h
@@ -268,11 +268,21 @@ private:
 /// the containing object being added to the JITDylib.
 class StaticLibraryDefinitionGenerator : public DefinitionGenerator {
 public:
-  // Interface builder function for objects loaded from this archive.
+  /// Interface builder function for objects loaded from this archive.
   using GetObjectFileInterface =
       unique_function<Expected<MaterializationUnit::Interface>(
           ExecutionSession &ES, MemoryBufferRef ObjBuffer)>;
 
+  /// Callback for visiting archive members at construction time.
+  /// Can be used to pre-load archive members.
+  using VisitMembersFunction = unique_function<Error(MemoryBufferRef)>;
+
+  /// A VisitMembersFunction that unconditionally loads all object files from
+  /// the archive.
+  /// Archive members that are not valid object files will be skipped.
+  static VisitMembersFunction loadAllObjectFileMembers(ObjectLayer &L,
+                                                       JITDylib &JD);
+
   /// Try to create a StaticLibraryDefinitionGenerator from the given path.
   ///
   /// This call will succeed if the file at the given path is a static library
   /// or a MachO universal binary containing a static library that is compatible
   /// with the ExecutionSession's triple. Otherwise it will return an error.
   static Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>>
   Load(ObjectLayer &L, const char *FileName,
+       VisitMembersFunction VisitMembers = VisitMembersFunction(),
        GetObjectFileInterface GetObjFileInterface = GetObjectFileInterface());
 
   /// Try to create a StaticLibrarySearchGenerator from the given memory buffer
   /// and Archive object.
   static Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>>
   Create(ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer,
          std::unique_ptr<object::Archive> Archive,
+         VisitMembersFunction VisitMembers = VisitMembersFunction(),
          GetObjectFileInterface GetObjFileInterface = GetObjectFileInterface());
 
   /// Try to create a StaticLibrarySearchGenerator from the given memory buffer.
   ///
   /// This call will succeed if the buffer contains a valid static library or a
   /// MachO universal binary containing a static library that is compatible
   /// with the ExecutionSession's triple. Otherwise it will return an error.
  static Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>>
  Create(ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer,
+        VisitMembersFunction VisitMembers = VisitMembersFunction(),
         GetObjectFileInterface GetObjFileInterface = GetObjectFileInterface());
 
   /// Returns a list of filenames of dynamic libraries that this archive has
diff --git a/llvm/include/llvm/ExecutionEngine/Orc/MachO.h b/llvm/include/llvm/ExecutionEngine/Orc/MachO.h
index 0ae7fc80acad..a9d34a82d53d 100644
--- a/llvm/include/llvm/ExecutionEngine/Orc/MachO.h
+++ b/llvm/include/llvm/ExecutionEngine/Orc/MachO.h
@@ -28,6 +28,9 @@ class MachOUniversalBinary;
 
 namespace orc {
 
+class JITDylib;
+class ObjectLayer;
+
 /// Check that the given buffer contains a MachO object file compatible with the
 /// given triple.
 /// ObjIsSlice should be set to true if Obj is a slice of a universal binary
@@ -72,6 +75,20 @@ getMachOSliceRangeForTriple(object::MachOUniversalBinary &UB, const Triple &TT);
 Expected<std::pair<size_t, size_t>>
 getMachOSliceRangeForTriple(MemoryBufferRef UBBuf, const Triple &TT);
 
+/// For use with StaticLibraryDefinitionGenerators.
+class ForceLoadMachOArchiveMembers {
+public:
+  ForceLoadMachOArchiveMembers(ObjectLayer &L, JITDylib &JD, bool ObjCOnly)
+      : L(L), JD(JD), ObjCOnly(ObjCOnly) {}
+
+  Error operator()(MemoryBufferRef MemberBuf);
+
+private:
+  ObjectLayer &L;
+  JITDylib &JD;
+  bool ObjCOnly;
+};
+
 } // namespace orc
 } // namespace llvm
diff --git a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
index c4a65ebbe5a3..1dcf91443d55 100644
--- a/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/ExecutionUtils.cpp
@@ -273,9 +273,24 @@ Error DynamicLibrarySearchGenerator::tryToGenerate(
   return JD.define(absoluteSymbols(std::move(NewSymbols)));
 }
 
+StaticLibraryDefinitionGenerator::VisitMembersFunction
+StaticLibraryDefinitionGenerator::loadAllObjectFileMembers(ObjectLayer &L,
+                                                           JITDylib &JD) {
+  return [&](MemoryBufferRef Buf) -> Error {
+    switch (identify_magic(Buf.getBuffer())) {
+    case file_magic::elf_relocatable:
+    case file_magic::macho_object:
+    case file_magic::coff_object:
+      return L.add(JD, MemoryBuffer::getMemBuffer(Buf));
+    default:
+      return Error::success();
+    }
+  };
+}
+
 Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>>
 StaticLibraryDefinitionGenerator::Load(
-    ObjectLayer &L, const char *FileName,
+    ObjectLayer &L, const char *FileName, VisitMembersFunction VisitMembers,
     GetObjectFileInterface GetObjFileInterface) {
 
   const auto &TT = L.getExecutionSession().getTargetTriple();
@@ -283,17 +298,33 @@ StaticLibraryDefinitionGenerator::Load(
   if (!Linkable)
     return Linkable.takeError();
 
-  return Create(L, std::move(Linkable->first), std::move(GetObjFileInterface));
+  return Create(L, std::move(Linkable->first), std::move(VisitMembers),
+                std::move(GetObjFileInterface));
 }
 
 Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>>
 StaticLibraryDefinitionGenerator::Create(
     ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer,
-    std::unique_ptr<object::Archive> Archive,
+    std::unique_ptr<object::Archive> Archive, VisitMembersFunction VisitMembers,
     GetObjectFileInterface GetObjFileInterface) {
 
   Error Err = Error::success();
 
+  if (VisitMembers) {
+    for (auto Child : Archive->children(Err)) {
+      if (auto ChildBuf = Child.getMemoryBufferRef()) {
+        if (auto Err2 = VisitMembers(*ChildBuf))
+          return std::move(Err2);
+      } else {
+        // We silently allow non-object archive members. This matches the
+        // behavior of ld.
+        consumeError(ChildBuf.takeError());
+      }
+    }
+    if (Err)
+      return std::move(Err);
+  }
+
   std::unique_ptr<StaticLibraryDefinitionGenerator> ADG(
       new StaticLibraryDefinitionGenerator(
           L, std::move(ArchiveBuffer), std::move(Archive),
@@ -308,6 +339,7 @@ StaticLibraryDefinitionGenerator::Create(
 Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>>
 StaticLibraryDefinitionGenerator::Create(
     ObjectLayer &L, std::unique_ptr<MemoryBuffer> ArchiveBuffer,
+    VisitMembersFunction VisitMembers,
     GetObjectFileInterface GetObjFileInterface) {
 
   auto B = object::createBinary(ArchiveBuffer->getMemBufferRef());
@@ -319,7 +351,7 @@ StaticLibraryDefinitionGenerator::Create(
     return Create(L, std::move(ArchiveBuffer),
                   std::unique_ptr<object::Archive>(
                       static_cast<object::Archive *>(B->release())),
-                  std::move(GetObjFileInterface));
+                  std::move(VisitMembers), std::move(GetObjFileInterface));
 
   // If this is a universal binary then search for a slice matching the given
   // Triple.
@@ -341,7 +373,7 @@ StaticLibraryDefinitionGenerator::Create(
       return Archive.takeError();
 
     return Create(L, std::move(ArchiveBuffer), std::move(*Archive),
-                  std::move(GetObjFileInterface));
+                  std::move(VisitMembers), std::move(GetObjFileInterface));
   }
 
   return make_error<StringError>(Twine("Unrecognized file type for ") +
diff --git a/llvm/lib/ExecutionEngine/Orc/MachO.cpp b/llvm/lib/ExecutionEngine/Orc/MachO.cpp
index dc6a55758d25..784c3487d64f 100644
--- a/llvm/lib/ExecutionEngine/Orc/MachO.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/MachO.cpp
@@ -10,6 +10,7 @@
 
 #include "llvm/ADT/ScopeExit.h"
 #include "llvm/BinaryFormat/MachO.h"
+#include "llvm/ExecutionEngine/Orc/Layer.h"
 #include "llvm/Object/MachOUniversal.h"
 #include "llvm/Support/FileSystem.h"
 
@@ -228,5 +229,39 @@ getMachOSliceRangeForTriple(MemoryBufferRef UBBuf, const Triple &TT) {
   return getMachOSliceRangeForTriple(**UB, TT);
 }
 
+Error ForceLoadMachOArchiveMembers::operator()(MemoryBufferRef MemberBuf) {
+  if (!ObjCOnly)
+    return L.add(JD, MemoryBuffer::getMemBuffer(MemberBuf));
+
+  // We need to check whether this archive member contains any Objective-C
+  // or Swift metadata.
+
+  auto Obj = object::ObjectFile::createObjectFile(MemberBuf);
+  if (!Obj) {
+    // We silently ignore invalid files.
+    consumeError(Obj.takeError());
+    return Error::success();
+  }
+
+  if (auto *MachOObj = dyn_cast<object::MachOObjectFile>(&**Obj)) {
+    // Load the object if any recognized special section is present.
+    for (auto Sec : MachOObj->sections()) {
+      auto SegName =
+          MachOObj->getSectionFinalSegmentName(Sec.getRawDataRefImpl());
+      if (auto SecName = Sec.getName()) {
+        if (*SecName == "__objc_classlist" || *SecName == "__objc_protolist" ||
+            *SecName == "__objc_clsrolist" || *SecName == "__objc_catlist" ||
+            *SecName == "__objc_catlist2" || *SecName == "__objc_nlcatlist" ||
+            (SegName == "__TEXT" && (*SecName).starts_with("__swift") &&
+             *SecName != "__swift_modhash"))
+          return L.add(JD, MemoryBuffer::getMemBuffer(MemberBuf));
+      } else
+        return SecName.takeError();
+    }
+  }
+
+  return Error::success();
+}
+
 } // End namespace orc.
 } // End namespace llvm.
diff --git a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
index 2d1c19d2a519..d90176818c8e 100644
--- a/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
+++ b/llvm/tools/llvm-jitlink/llvm-jitlink.cpp
@@ -30,6 +30,7 @@
 #include "llvm/ExecutionEngine/Orc/IndirectionUtils.h"
 #include "llvm/ExecutionEngine/Orc/JITTargetMachineBuilder.h"
 #include "llvm/ExecutionEngine/Orc/LoadLinkableFile.h"
+#include "llvm/ExecutionEngine/Orc/MachO.h"
 #include "llvm/ExecutionEngine/Orc/MachOPlatform.h"
 #include "llvm/ExecutionEngine/Orc/MapperJITLinkMemoryManager.h"
 #include "llvm/ExecutionEngine/Orc/ObjectFileInterface.h"
@@ -265,6 +266,17 @@ static cl::opt<std::string>
     OverrideTriple("triple", cl::desc("Override target triple detection"),
                    cl::init(""), cl::cat(JITLinkCategory));
 
+static cl::opt<bool> AllLoad("all_load",
+                             cl::desc("Load all members of static archives"),
+                             cl::init(false), cl::cat(JITLinkCategory));
+
+static cl::opt<bool> ForceLoadObjC(
+    "ObjC",
+    cl::desc("Load all members of static archives that implement "
+             "Objective-C classes or categories, or Swift structs, "
+             "classes or extensions"),
+    cl::init(false), cl::cat(JITLinkCategory));
+
 static ExitOnError ExitOnErr;
 
 static LLVM_ATTRIBUTE_USED void linkComponents() {
@@ -1957,10 +1969,9 @@ static Error addLibraries(Session &S,
   });
 
   // 3. Process library loads.
-  auto AddArchive = [&](const char *Path, const LibraryLoad &LL)
+  auto AddArchive = [&](JITDylib &JD, const char *Path, const LibraryLoad &LL)
       -> Expected<std::unique_ptr<StaticLibraryDefinitionGenerator>> {
-    unique_function<Expected<MaterializationUnit::Interface>(
-        ExecutionSession & ES, MemoryBufferRef ObjBuffer)>
+    StaticLibraryDefinitionGenerator::GetObjectFileInterface
         GetObjFileInterface;
     switch (LL.Modifier) {
     case LibraryLoad::Standard:
@@ -1970,8 +1981,17 @@ static Error addLibraries(Session &S,
       GetObjFileInterface = getObjectFileInterfaceHidden;
       break;
     }
+
+    StaticLibraryDefinitionGenerator::VisitMembersFunction VisitMembers;
+    if (AllLoad)
+      VisitMembers = StaticLibraryDefinitionGenerator::loadAllObjectFileMembers(
+          S.ObjLayer, JD);
+    else if (S.ES.getTargetTriple().isOSBinFormatMachO() && ForceLoadObjC)
+      VisitMembers = ForceLoadMachOArchiveMembers(S.ObjLayer, JD, true);
+
     auto G = StaticLibraryDefinitionGenerator::Load(
-        S.ObjLayer, Path, std::move(GetObjFileInterface));
+        S.ObjLayer, Path, std::move(VisitMembers),
+        std::move(GetObjFileInterface));
     if (!G)
       return G.takeError();
 
@@ -2009,7 +2029,7 @@
     }
 
     if (LL.IsPath) {
-      auto G = AddArchive(LL.LibName.c_str(), LL);
+      auto G = AddArchive(JD, LL.LibName.c_str(), LL);
       if (!G)
         return createFileError(LL.LibName, G.takeError());
       JD.addGenerator(std::move(*G));
@@ -2065,7 +2085,7 @@
     }
     case file_magic::archive:
     case file_magic::macho_universal_binary: {
-      auto G = AddArchive(LibPath.data(), LL);
+      auto G = AddArchive(JD, LibPath.data(), LL);
       if (!G)
         return G.takeError();
       JD.addGenerator(std::move(*G));
--
GitLab

From cfc10bea9d18d9560cfbadca8609eb79aa3f09c3 Mon Sep 17 00:00:00 2001
From: Karl-Johan Karlsson
Date: Wed, 16 Oct 2024 06:34:54 +0200
Subject: [PATCH 060/329] [lld] Fix warning in SymbolTable.cpp (#112323)

Fix gcc warning:

lld/ELF/SymbolTable.cpp:340:33: warning: enumeral and non-enumeral type in conditional expression [-Wextra]
---
 lld/ELF/SymbolTable.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lld/ELF/SymbolTable.cpp b/lld/ELF/SymbolTable.cpp
index 02814e041390..674b1ef983f8 100644
--- a/lld/ELF/SymbolTable.cpp
+++ b/lld/ELF/SymbolTable.cpp
@@ -337,7 +337,7 @@ void
SymbolTable::scanVersionScript() { globalAsteriskFound = !isLocal; } } - assignWildcard(pat, isLocal ? VER_NDX_LOCAL : ver->id, ver->name); + assignWildcard(pat, isLocal ? (uint16_t)VER_NDX_LOCAL : ver->id, ver->name); }; for (VersionDefinition &v : llvm::reverse(ctx.arg.versionDefinitions)) { for (SymbolVersion &pat : v.nonLocalPatterns) -- GitLab From 4db57ab958f5bac1d85927a955f989625badf962 Mon Sep 17 00:00:00 2001 From: Tyler Nowicki Date: Wed, 16 Oct 2024 00:37:29 -0400 Subject: [PATCH 061/329] [Coroutines] Improve dump of BB label to avoid str copies (#112374) * This avoids the need to call printAsOperand that requires use of an ostream and thus avoids a str copy. * ModuleSlotTracker is used to get a BB # for BB's without names when dumping SuspendCrossingInfo and materialization info. * getBasicBlockLabel() is changed to dumpBasicBlockLabel() that directly prints the label to dbgs() * The label corresponds with the print-before BB #s. * This change does not require any additional arguments to be added to dump() methods, at least those that currently do not require any args. Co-authored-by: tnowicki --- .../Coroutines/SuspendCrossingInfo.h | 5 ++- .../Coroutines/MaterializationUtils.cpp | 25 +++++++----- .../Coroutines/SuspendCrossingInfo.cpp | 39 +++++++++++-------- 3 files changed, 43 insertions(+), 26 deletions(-) diff --git a/llvm/include/llvm/Transforms/Coroutines/SuspendCrossingInfo.h b/llvm/include/llvm/Transforms/Coroutines/SuspendCrossingInfo.h index 49cae6dde47e..88cbf88acc4c 100644 --- a/llvm/include/llvm/Transforms/Coroutines/SuspendCrossingInfo.h +++ b/llvm/include/llvm/Transforms/Coroutines/SuspendCrossingInfo.h @@ -25,6 +25,8 @@ namespace llvm { +class ModuleSlotTracker; + // Provides two way mapping between the blocks and numbers. 
 class BlockToIndexMapping {
   SmallVector<BasicBlock *, 32> V;
 
@@ -96,7 +98,8 @@ public:
   // Print order is in RPO
   void dump() const;
   void dump(StringRef Label, BitVector const &BV,
-            const ReversePostOrderTraversal<Function *> &RPOT) const;
+            const ReversePostOrderTraversal<Function *> &RPOT,
+            ModuleSlotTracker &MST) const;
 #endif
 
   SuspendCrossingInfo(Function &F,
diff --git a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
index c3ea0977d421..6327cea64c0d 100644
--- a/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
+++ b/llvm/lib/Transforms/Coroutines/MaterializationUtils.cpp
@@ -15,6 +15,7 @@
 #include "llvm/IR/Dominators.h"
 #include "llvm/IR/InstIterator.h"
 #include "llvm/IR/Instruction.h"
+#include "llvm/IR/ModuleSlotTracker.h"
 #include "llvm/Transforms/Coroutines/SpillUtils.h"
 #include <deque>
 
@@ -104,19 +105,25 @@ struct RematGraph {
   }
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-  static std::string getBasicBlockLabel(const BasicBlock *BB) {
-    if (BB->hasName())
-      return BB->getName().str();
-
-    std::string S;
-    raw_string_ostream OS(S);
-    BB->printAsOperand(OS, false);
-    return OS.str().substr(1);
+  static void dumpBasicBlockLabel(const BasicBlock *BB,
+                                  ModuleSlotTracker &MST) {
+    if (BB->hasName()) {
+      dbgs() << BB->getName();
+      return;
+    }
+
+    dbgs() << MST.getLocalSlot(BB);
   }
 
   void dump() const {
+    BasicBlock *BB = EntryNode->Node->getParent();
+    Function *F = BB->getParent();
+
+    ModuleSlotTracker MST(F->getParent());
+    MST.incorporateFunction(*F);
+
     dbgs() << "Entry (";
-    dbgs() << getBasicBlockLabel(EntryNode->Node->getParent());
+    dumpBasicBlockLabel(BB, MST);
     dbgs() << ") : " << *EntryNode->Node << "\n";
     for (auto &E : Remats) {
       dbgs() << *(E.first) << "\n";
diff --git a/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.cpp b/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.cpp
index f18f23306bef..c9bb3395a994 100644
--- a/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.cpp
+++ b/llvm/lib/Transforms/Coroutines/SuspendCrossingInfo.cpp
@@ -13,6 +13,7 @@
 //===----------------------------------------------------------------------===//
 
 #include "llvm/Transforms/Coroutines/SuspendCrossingInfo.h"
+#include "llvm/IR/ModuleSlotTracker.h"
 
 // The "coro-suspend-crossing" flag is very noisy. There is another debug type,
 // "coro-frame", which results in leaner debug spew.
@@ -20,24 +21,26 @@
 namespace llvm {
 
 #if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP)
-static std::string getBasicBlockLabel(const BasicBlock *BB) {
-  if (BB->hasName())
-    return BB->getName().str();
-
-  std::string S;
-  raw_string_ostream OS(S);
-  BB->printAsOperand(OS, false);
-  return OS.str().substr(1);
+static void dumpBasicBlockLabel(const BasicBlock *BB, ModuleSlotTracker &MST) {
+  if (BB->hasName()) {
+    dbgs() << BB->getName();
+    return;
+  }
+
+  dbgs() << MST.getLocalSlot(BB);
 }
 
-LLVM_DUMP_METHOD void SuspendCrossingInfo::dump(
-    StringRef Label, BitVector const &BV,
-    const ReversePostOrderTraversal<Function *> &RPOT) const {
+LLVM_DUMP_METHOD void
+SuspendCrossingInfo::dump(StringRef Label, BitVector const &BV,
+                          const ReversePostOrderTraversal<Function *> &RPOT,
+                          ModuleSlotTracker &MST) const {
   dbgs() << Label << ":";
   for (const BasicBlock *BB : RPOT) {
     auto BBNo = Mapping.blockToIndex(BB);
-    if (BV[BBNo])
-      dbgs() << " " << getBasicBlockLabel(BB);
+    if (BV[BBNo]) {
+      dbgs() << " ";
+      dumpBasicBlockLabel(BB, MST);
+    }
   }
   dbgs() << "\n";
 }
@@ -49,12 +52,16 @@ LLVM_DUMP_METHOD void SuspendCrossingInfo::dump() const {
   BasicBlock *const B = Mapping.indexToBlock(0);
   Function *F = B->getParent();
 
+  ModuleSlotTracker MST(F->getParent());
+  MST.incorporateFunction(*F);
+
   ReversePostOrderTraversal<Function *> RPOT(F);
   for (const BasicBlock *BB : RPOT) {
     auto BBNo = Mapping.blockToIndex(BB);
-    dbgs() << getBasicBlockLabel(BB) << ":\n";
-    dump(" Consumes", Block[BBNo].Consumes, RPOT);
-    dump(" Kills", Block[BBNo].Kills, RPOT);
+    dumpBasicBlockLabel(BB, MST);
+    dbgs() << ":\n";
+    dump(" Consumes", Block[BBNo].Consumes, RPOT, MST);
+    dump(" Kills", Block[BBNo].Kills, RPOT, MST);
   }
   dbgs() << "\n";
 }
--
GitLab

From 5716f836d25e93bf8f664a14fe55c70e07a369be Mon Sep 17 00:00:00 2001
From: Jameson Nash
Date: Wed, 16 Oct 2024 00:48:43 -0400
Subject: [PATCH 062/329] [JITLink] Fix i686 R_386_32 and other relocation values (#111091)

Fix R_386_32 and other relocations by correcting Addend computations.
---
 .../llvm/ExecutionEngine/JITLink/i386.h       | 33 +++++--------------
 llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp | 20 +++++++++--
 .../i386/ELF_i386_absolute_relocations_16.s   | 10 +++++-
 .../i386/ELF_i386_absolute_relocations_32.s   | 16 ++++++---
 .../ELF_i386_pc_relative_relocations_32.s     |  7 ++--
 .../i386/ELF_i386_small_pic_relocations_got.s | 12 +++----
 .../i386/ELF_i386_small_pic_relocations_plt.s |  6 ++--
 7 files changed, 60 insertions(+), 44 deletions(-)

diff --git a/llvm/include/llvm/ExecutionEngine/JITLink/i386.h b/llvm/include/llvm/ExecutionEngine/JITLink/i386.h
index f8d24d8bf31c..efe8182934dd 100644
--- a/llvm/include/llvm/ExecutionEngine/JITLink/i386.h
+++ b/llvm/include/llvm/ExecutionEngine/JITLink/i386.h
@@ -39,12 +39,8 @@ enum EdgeKind_i386 : Edge::Kind {
   /// Represents a data/control flow instruction using PC-relative addressing
   /// to a target.
/// - /// The fixup expression for this kind includes an implicit offset to account - /// for the PC (unlike the Delta edges) so that a PCRel16 with a target - /// T and addend zero is a call/branch to the start (offset zero) of T. - /// /// Fixup expression: - /// Fixup <- Target - (Fixup + 4) + Addend : int16 + /// Fixup <- Target - Fixup + Addend : int16 /// /// Errors: /// - The result of the fixup expression must fit into an int16, otherwise @@ -86,7 +78,7 @@ enum EdgeKind_i386 : Edge::Kind { /// Delta from the fixup to the target. /// /// Fixup expression: - /// Fixup <- Target - Fixup + Addend : int64 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -130,12 +122,8 @@ enum EdgeKind_i386 : Edge::Kind { /// Represents a PC-relative call or branch to a target. This can be used to /// identify, record, and/or patch call sites. /// - /// The fixup expression for this kind includes an implicit offset to account - /// for the PC (unlike the Delta edges) so that a Branch32PCRel with a target - /// T and addend zero is a call/branch to the start (offset zero) of T. - /// /// Fixup expression: - /// Fixup <- Target - (Fixup + 4) + Addend : int32 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -164,7 +152,7 @@ enum EdgeKind_i386 : Edge::Kind { /// target may be recorded to allow manipulation at runtime. /// /// Fixup expression: - /// Fixup <- Target - Fixup + Addend - 4 : int32 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -180,7 +168,7 @@ enum EdgeKind_i386 : Edge::Kind { /// is within range of the fixup location. 
/// /// Fixup expression: - /// Fixup <- Target - Fixup + Addend - 4: int32 + /// Fixup <- Target - Fixup + Addend : int32 /// /// Errors: /// - The result of the fixup expression must fit into an int32, otherwise @@ -215,8 +203,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, } case i386::PCRel32: { - int32_t Value = - E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend(); + int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); *(little32_t *)FixupPtr = Value; break; } @@ -231,8 +218,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, } case i386::PCRel16: { - int32_t Value = - E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend(); + int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); if (LLVM_LIKELY(isInt<16>(Value))) *(little16_t *)FixupPtr = Value; else @@ -257,8 +243,7 @@ inline Error applyFixup(LinkGraph &G, Block &B, const Edge &E, case i386::BranchPCRel32: case i386::BranchPCRel32ToPtrJumpStub: case i386::BranchPCRel32ToPtrJumpStubBypassable: { - int32_t Value = - E.getTarget().getAddress() - (FixupAddress + 4) + E.getAddend(); + int32_t Value = E.getTarget().getAddress() - FixupAddress + E.getAddend(); *(little32_t *)FixupPtr = Value; break; } diff --git a/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp b/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp index 860165365a7e..2d5f28cad1cc 100644 --- a/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp +++ b/llvm/lib/ExecutionEngine/JITLink/ELF_i386.cpp @@ -186,15 +186,29 @@ private: int64_t Addend = 0; switch (*Kind) { - case i386::EdgeKind_i386::Delta32: { + case i386::EdgeKind_i386::None: + break; + case i386::EdgeKind_i386::Pointer32: + case i386::EdgeKind_i386::PCRel32: + case i386::EdgeKind_i386::RequestGOTAndTransformToDelta32FromGOT: + case i386::EdgeKind_i386::Delta32: + case i386::EdgeKind_i386::Delta32FromGOT: + case i386::EdgeKind_i386::BranchPCRel32: + case i386::EdgeKind_i386::BranchPCRel32ToPtrJumpStub: + case i386::EdgeKind_i386::BranchPCRel32ToPtrJumpStubBypassable: { const char *FixupContent = BlockToFix.getContent().data() + (FixupAddress - BlockToFix.getAddress()); - Addend = *(const support::ulittle32_t *)FixupContent; + Addend = *(const support::little32_t *)FixupContent; break; } - default: + case i386::EdgeKind_i386::Pointer16: + case i386::EdgeKind_i386::PCRel16: { + const char *FixupContent = BlockToFix.getContent().data() + + (FixupAddress - BlockToFix.getAddress()); + Addend = *(const support::little16_t *)FixupContent; break; } + } Edge::OffsetT Offset = FixupAddress - BlockToFix.getAddress(); Edge GE(*Kind, Offset, *GraphSymbol, Addend); diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_16.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_16.s index 47142c4be3c0..092f7d753c7e 100644 --- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_16.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_16.s @@ -22,4 +22,12 @@ main: .type bar,@function bar: retw $external_data - .size bar, .-bar \ No newline at end of file + .size bar, .-bar + +# jitlink-check: decode_operand(baz, 0) = external_data + 23 + .globl baz + .align 2, 0x90 + .type baz,@function +baz: + retw $external_data+23 + .size baz, .-baz diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_32.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_32.s index e4b02a794bbc..a66ad8e7cda6 100644 --- 
a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_32.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_absolute_relocations_32.s @@ -7,17 +7,25 @@ # Test ELF 32 bit absolute relocations .text - .globl main + .globl main .p2align 4, 0x90 .type main,@function -main: +main: retl .size main, .-main # jitlink-check: decode_operand(foo, 0) = external_data - .globl foo + .globl foo .p2align 4, 0x90 .type foo,@function foo: movl external_data, %eax - .size foo, .-foo \ No newline at end of file + .size foo, .-foo + +# jitlink-check: decode_operand(bar, 0) = external_data + 4000 + .globl bar + .p2align 4, 0x90 + .type bar,@function +bar: + movl external_data + 4000, %eax + .size bar, .-bar diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_pc_relative_relocations_32.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_pc_relative_relocations_32.s index df74c7bb3932..0717c8f434d5 100644 --- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_pc_relative_relocations_32.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_pc_relative_relocations_32.s @@ -33,11 +33,12 @@ foo: # Tests PC relative relocation for negative offset from PC -# jitlink-check: decode_operand(baz, 0) = fooz - next_pc(baz) +# jitlink-check: decode_operand(baz, 0) = fooz - next_pc(baz) + 1 .globl fooz .p2align 4 .type fooz,@function fooz: + nop retl .size fooz, .-fooz @@ -45,5 +46,5 @@ fooz: .p2align 4 .type baz,@function baz: - calll fooz - .size baz, .-baz \ No newline at end of file + calll fooz+1 + .size baz, .-baz diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s index 91049a8a87a5..080341ac3bfe 100644 --- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s +++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_got.s @@ -19,29 +19,29 @@ main: # Test GOT32 handling. # # We want to check both the offset to the GOT entry and its contents. -# jitlink-check: decode_operand(test_got, 4) = got_addr(elf_sm_pic_reloc_got.o, named_data1) - _GLOBAL_OFFSET_TABLE_ +# jitlink-check: decode_operand(test_got, 4) = got_addr(elf_sm_pic_reloc_got.o, named_data1) - _GLOBAL_OFFSET_TABLE_ + 42 # jitlink-check: *{4}(got_addr(elf_sm_pic_reloc_got.o, named_data1)) = named_data1 # -# jitlink-check: decode_operand(test_got+6, 4) = got_addr(elf_sm_pic_reloc_got.o, named_data2) - _GLOBAL_OFFSET_TABLE_ +# jitlink-check: decode_operand(test_got+6, 4) = got_addr(elf_sm_pic_reloc_got.o, named_data2) - _GLOBAL_OFFSET_TABLE_ + 5 # jitlink-check: *{4}(got_addr(elf_sm_pic_reloc_got.o, named_data2)) = named_data2 .globl test_got .p2align 4, 0x90 .type test_got,@function test_got: - leal named_data1@GOT, %eax - leal named_data2@GOT, %eax + leal named_data1@GOT+42, %eax + leal named_data2@GOT+5, %eax .size test_got, .-test_got # Test GOTOFF64 handling. 
-# jitlink-check: decode_operand(test_gotoff, 1) = named_func - _GLOBAL_OFFSET_TABLE_
+# jitlink-check: decode_operand(test_gotoff, 1) = named_func - _GLOBAL_OFFSET_TABLE_ + 99
         .globl  test_gotoff
         .p2align        4, 0x90
         .type   test_gotoff,@function
 test_gotoff:
-        mov $named_func@GOTOFF, %eax
+        mov $named_func@GOTOFF+99, %eax
 
         .size   test_gotoff, .-test_gotoff
diff --git a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_plt.s b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_plt.s
index e5725a2b52c3..ce565ca2fcdd 100644
--- a/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_plt.s
+++ b/llvm/test/ExecutionEngine/JITLink/i386/ELF_i386_small_pic_relocations_plt.s
@@ -27,12 +27,12 @@ main:
 # for position independent code, first, as there may be future use-cases
 # where we would want to disable the optimization.
 #
-# jitlink-check: decode_operand(test_call_extern_plt, 0) = external_func - next_pc(test_call_extern_plt)
+# jitlink-check: decode_operand(test_call_extern_plt, 0) = external_func - next_pc(test_call_extern_plt) + 53
 # jitlink-check: *{4}(got_addr(elf_sm_pic_reloc_plt.o, external_func))= external_func
         .globl  test_call_extern_plt
         .p2align        4, 0x90
         .type   test_call_extern_plt,@function
 test_call_extern_plt:
-        call    external_func@plt
+        call    external_func@plt + 53
 
-        .size   test_call_extern_plt, .-test_call_extern_plt
\ No newline at end of file
+        .size   test_call_extern_plt, .-test_call_extern_plt
--
GitLab

From d8fadad07c952c4aea967aefb0900e4e43ad0555 Mon Sep 17 00:00:00 2001
From: Sirui Mu
Date: Wed, 16 Oct 2024 12:51:50 +0800
Subject: [PATCH 063/329] [mlir][LLVMIR] Add operand bundle support for llvm.intr.assume (#112143)

This patch adds operand bundle support for `llvm.intr.assume`.

This patch actually contains two parts:

- `llvm.intr.assume` now accepts operand bundle related attributes and
  operands. `llvm.intr.assume` does not impose constraints on the operand
  bundles, but obviously only a small set of operand bundles is meaningful.
  I plan to add some of those (e.g. `aligned` and `separate_storage` are
  what interest me but other people may be interested in other operand
  bundles as well) in future patches.

- The definitions of `llvm.call`, `llvm.invoke`, and `llvm.call_intrinsic`
  actually define `op_bundle_tags` as an operation property. It turns out
  this approach would introduce some unnecessary burden if applied equally
  to the intrinsic operations, because properties are not available through
  `Operation *` yet we have to operate on `Operation *` during the
  import/export of intrinsics. This PR therefore changes it from a property
  to an array attribute.
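For illustration, a minimal sketch of the new syntax (the "align" tag is a
hypothetical choice, since the op accepts arbitrary tags):

  %true = llvm.mlir.constant(true) : i1
  // %ptr : !llvm.ptr and %align : i32 are assumed to be defined elsewhere.
  llvm.intr.assume %true ["align"(%ptr, %align : !llvm.ptr, i32)] : i1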
--- .../Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td | 1 + .../mlir/Dialect/LLVMIR/LLVMDialect.td | 2 + .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 44 +++++++-- .../include/mlir/Dialect/LLVMIR/LLVMOpBase.td | 25 +++-- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 18 +--- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 2 +- .../include/mlir/Target/LLVMIR/ModuleImport.h | 2 + mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 96 ++++++++++++------- .../LLVMIR/LLVMIRToLLVMTranslation.cpp | 6 ++ .../LLVMIR/LLVMToLLVMIRTranslation.cpp | 16 +++- .../Dialect/NVVM/LLVMIRToNVVMTranslation.cpp | 6 ++ mlir/lib/Target/LLVMIR/ModuleImport.cpp | 32 ++++++- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 37 ++++++- .../expand-then-convert-to-llvm.mlir | 2 +- .../MemRefToLLVM/memref-to-llvm.mlir | 4 +- mlir/test/Dialect/LLVMIR/inlining.mlir | 4 +- mlir/test/Dialect/LLVMIR/roundtrip.mlir | 27 ++++++ mlir/test/Target/LLVMIR/Import/intrinsic.ll | 12 ++- .../test/Target/LLVMIR/llvmir-intrinsics.mlir | 15 +++ mlir/test/Target/LLVMIR/llvmir-invalid.mlir | 2 +- 20 files changed, 276 insertions(+), 77 deletions(-) diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td index 0e38325f9891..e81db32bcaad 100644 --- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td @@ -71,6 +71,7 @@ class ArmSME_IntrOp immArgPositions=*/immArgPositions, /*list immArgAttrNames=*/immArgAttrNames>; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td index 27a2b418aadb..ea82f7f7b8e1 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td @@ -59,6 +59,8 @@ def LLVM_Dialect : Dialect { static StringRef getStructRetAttrName() { return "llvm.sret"; } static StringRef getWriteOnlyAttrName() { return "llvm.writeonly"; } static StringRef getZExtAttrName() { return "llvm.zeroext"; } + static StringRef getOpBundleSizesAttrName() { return "op_bundle_sizes"; } + static StringRef getOpBundleTagsAttrName() { return "op_bundle_tags"; } // TODO Restrict the usage of this to parameter attributes once there is an // alternative way of modeling memory effects on FunctionOpInterface. 
/// Name of the attribute that will cause the creation of a readnone memory diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index ab40c8ec4b65..845c88b1be77 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -120,7 +120,8 @@ def LLVM_Log2Op : LLVM_UnaryIntrOpF<"log2">; def LLVM_LogOp : LLVM_UnaryIntrOpF<"log">; def LLVM_Prefetch : LLVM_ZeroResultIntrOp<"prefetch", [0], /*traits=*/[], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0, - /*immArgPositions=*/[1, 2, 3], /*immArgAttrNames=*/["rw", "hint", "cache"] + /*requiresOpBundles=*/0, /*immArgPositions=*/[1, 2, 3], + /*immArgAttrNames=*/["rw", "hint", "cache"] > { let arguments = (ins LLVM_AnyPointer:$addr, I32Attr:$rw, I32Attr:$hint, I32Attr:$cache); } @@ -176,7 +177,8 @@ class LLVM_MemcpyIntrOpBase : DeclareOpInterfaceMethods, DeclareOpInterfaceMethods], /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1, - /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> { + /*requiresOpBundles=*/0, /*immArgPositions=*/[3], + /*immArgAttrNames=*/["isVolatile"]> { dag args = (ins Arg:$dst, Arg:$src, AnySignlessInteger:$len, I1Attr:$isVolatile); @@ -206,7 +208,8 @@ def LLVM_MemcpyInlineOp : DeclareOpInterfaceMethods, DeclareOpInterfaceMethods], /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1, - /*immArgPositions=*/[2, 3], /*immArgAttrNames=*/["len", "isVolatile"]> { + /*requiresOpBundles=*/0, /*immArgPositions=*/[2, 3], + /*immArgAttrNames=*/["len", "isVolatile"]> { dag args = (ins Arg:$dst, Arg:$src, APIntAttr:$len, I1Attr:$isVolatile); @@ -232,7 +235,8 @@ def LLVM_MemsetOp : LLVM_ZeroResultIntrOp<"memset", [0, 2], DeclareOpInterfaceMethods, DeclareOpInterfaceMethods], /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1, - /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> { + /*requiresOpBundles=*/0, /*immArgPositions=*/[3], + /*immArgAttrNames=*/["isVolatile"]> { dag args = (ins Arg:$dst, I8:$val, AnySignlessInteger:$len, I1Attr:$isVolatile); // Append the alias attributes defined by LLVM_IntrOpBase. 
@@ -286,7 +290,8 @@ def LLVM_NoAliasScopeDeclOp class LLVM_LifetimeBaseOp : LLVM_ZeroResultIntrOp], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0, - /*immArgPositions=*/[0], /*immArgAttrNames=*/["size"]> { + /*requiresOpBundles=*/0, /*immArgPositions=*/[0], + /*immArgAttrNames=*/["size"]> { let arguments = (ins I64Attr:$size, LLVM_AnyPointer:$ptr); let assemblyFormat = "$size `,` $ptr attr-dict `:` qualified(type($ptr))"; } @@ -306,7 +311,8 @@ def LLVM_InvariantStartOp : LLVM_OneResultIntrOp<"invariant.start", [], [1], def LLVM_InvariantEndOp : LLVM_ZeroResultIntrOp<"invariant.end", [2], [DeclareOpInterfaceMethods], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0, - /*immArgPositions=*/[1], /*immArgAttrNames=*/["size"]> { + /*requiresOpBundles=*/0, /*immArgPositions=*/[1], + /*immArgAttrNames=*/["size"]> { let arguments = (ins LLVM_DefaultPointer:$start, I64Attr:$size, LLVM_AnyPointer:$ptr); @@ -368,7 +374,7 @@ class LLVM_ConstrainedIntr mlirOperands; SmallVector mlirAttrs; if (failed(moduleImport.convertIntrinsicArguments( - llvmOperands.take_front( }] # numArgs # [{), + llvmOperands.take_front( }] # numArgs # [{), {}, false, {}, {}, mlirOperands, mlirAttrs))) { return failure(); } @@ -429,7 +435,26 @@ def LLVM_USHLSat : LLVM_BinarySameArgsIntrOpI<"ushl.sat">; // def LLVM_AssumeOp - : LLVM_ZeroResultIntrOp<"assume", []>, Arguments<(ins I1:$cond)>; + : LLVM_ZeroResultIntrOp<"assume", /*overloadedOperands=*/[], /*traits=*/[], + /*requiresAccessGroup=*/0, + /*requiresAliasAnalysis=*/0, + /*requiresOpBundles=*/1> { + dag args = (ins I1:$cond); + let arguments = !con(args, opBundleArgs); + + let assemblyFormat = [{ + $cond + ( custom($op_bundle_operands, type($op_bundle_operands), + $op_bundle_tags)^ )? + `:` type($cond) attr-dict + }]; + + let builders = [ + OpBuilder<(ins "Value":$cond)> + ]; + + let hasVerifier = 1; +} def LLVM_SSACopyOp : LLVM_OneResultIntrOp<"ssa.copy", [], [0], [Pure, SameOperandsAndResultType]> { @@ -992,7 +1017,8 @@ def LLVM_DebugTrap : LLVM_ZeroResultIntrOp<"debugtrap">; def LLVM_UBSanTrap : LLVM_ZeroResultIntrOp<"ubsantrap", /*overloadedOperands=*/[], /*traits=*/[], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0, - /*immArgPositions=*/[0], /*immArgAttrNames=*/["failureKind"]> { + /*requiresOpBundles=*/0, /*immArgPositions=*/[0], + /*immArgAttrNames=*/["failureKind"]> { let arguments = (ins I8Attr:$failureKind); } diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td index c3d352d8d0dd..a38dafa4d9cf 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td @@ -291,7 +291,7 @@ class LLVM_IntrOpBase overloadedResults, list overloadedOperands, list traits, int numResults, bit requiresAccessGroup = 0, bit requiresAliasAnalysis = 0, - bit requiresFastmath = 0, + bit requiresFastmath = 0, bit requiresOpBundles = 0, list immArgPositions = [], list immArgAttrNames = []> : LLVM_OpBase:$noalias_scopes, OptionalAttr:$tbaa), (ins ))); + dag opBundleArgs = !if(!gt(requiresOpBundles, 0), + (ins VariadicOfVariadic:$op_bundle_operands, + DenseI32ArrayAttr:$op_bundle_sizes, + OptionalAttr:$op_bundle_tags), + (ins )); string llvmEnumName = enumName; string overloadedResultsCpp = "{" # !interleave(overloadedResults, ", ") # "}"; string overloadedOperandsCpp = "{" # !interleave(overloadedOperands, ", ") # "}"; @@ -336,6 +342,8 @@ class LLVM_IntrOpBase mlirAttrs; if (failed(moduleImport.convertIntrinsicArguments( llvmOperands, + llvmOpBundles, + }] # 
!if(!gt(requiresOpBundles, 0), "true", "false") # [{, }] # immArgPositionsCpp # [{, }] # immArgAttrNamesCpp # [{, mlirOperands, @@ -381,12 +389,14 @@ class LLVM_IntrOp overloadedResults, list overloadedOperands, list traits, int numResults, bit requiresAccessGroup = 0, bit requiresAliasAnalysis = 0, bit requiresFastmath = 0, + bit requiresOpBundles = 0, list immArgPositions = [], list immArgAttrNames = []> : LLVM_IntrOpBase; + requiresFastmath, requiresOpBundles, immArgPositions, + immArgAttrNames>; // Base class for LLVM intrinsic operations returning no results. Places the // intrinsic into the LLVM dialect and prefixes its name with "intr.". @@ -406,11 +416,13 @@ class LLVM_ZeroResultIntrOp overloadedOperands = [], list traits = [], bit requiresAccessGroup = 0, bit requiresAliasAnalysis = 0, + bit requiresOpBundles = 0, list immArgPositions = [], list immArgAttrNames = []> : LLVM_IntrOp; + /*requiresFastMath=*/0, requiresOpBundles, immArgPositions, + immArgAttrNames>; // Base class for LLVM intrinsic operations returning one result. Places the // intrinsic into the LLVM dialect and prefixes its name with "intr.". This is @@ -422,11 +434,12 @@ class LLVM_OneResultIntrOp overloadedResults = [], list overloadedOperands = [], list traits = [], bit requiresFastmath = 0, - list immArgPositions = [], - list immArgAttrNames = []> + list immArgPositions = [], + list immArgAttrNames = []> : LLVM_IntrOp; + requiresFastmath, /*requiresOpBundles=*/0, immArgPositions, + immArgAttrNames>; def LLVM_OneResultOpBuilder : OpBuilder<(ins "Type":$resultType, "ValueRange":$operands, diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index bbca7bc7286a..d5def510a904 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -559,11 +559,7 @@ def LLVM_InvokeOp : LLVM_Op<"invoke", [ VariadicOfVariadic:$op_bundle_operands, DenseI32ArrayAttr:$op_bundle_sizes, - DefaultValuedProperty< - ArrayProperty, - "ArrayRef{}", - "SmallVector{}" - >:$op_bundle_tags); + OptionalAttr:$op_bundle_tags); let results = (outs Optional:$result); let successors = (successor AnySuccessor:$normalDest, AnySuccessor:$unwindDest); @@ -678,11 +674,7 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call", VariadicOfVariadic:$op_bundle_operands, DenseI32ArrayAttr:$op_bundle_sizes, - DefaultValuedProperty< - ArrayProperty, - "ArrayRef{}", - "SmallVector{}" - >:$op_bundle_tags); + OptionalAttr:$op_bundle_tags); // Append the aliasing related attributes defined in LLVM_MemAccessOpBase. 
let arguments = !con(args, aliasAttrs); let results = (outs Optional:$result); @@ -1930,11 +1922,7 @@ def LLVM_CallIntrinsicOp VariadicOfVariadic:$op_bundle_operands, DenseI32ArrayAttr:$op_bundle_sizes, - DefaultValuedProperty< - ArrayProperty, - "ArrayRef{}", - "SmallVector{}" - >:$op_bundle_tags); + OptionalAttr:$op_bundle_tags); let results = (outs Optional:$results); let llvmBuilder = [{ return convertCallLLVMIntrinsicOp(op, builder, moduleTranslation); diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index c40ae4b1016b..3695708439d9 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -98,7 +98,7 @@ class ROCDL_IntrOp overloadedResults, LLVM_IntrOpBase; + requiresAliasAnalysis, 0, 0, immArgPositions, immArgAttrNames>; //===----------------------------------------------------------------------===// // ROCDL special register op definitions diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h index 9f300bcafea5..bbb7af58d273 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h @@ -243,6 +243,8 @@ public: /// corresponding MLIR attribute names. LogicalResult convertIntrinsicArguments(ArrayRef values, + ArrayRef opBundles, + bool requiresOpBundles, ArrayRef immArgPositions, ArrayRef immArgAttrNames, SmallVectorImpl &valuesOut, diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 12ed8cc88ae7..cc73878a64ff 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -241,13 +241,18 @@ static void printOneOpBundle(OpAsmPrinter &p, OperandRange operands, static void printOpBundles(OpAsmPrinter &p, Operation *op, OperandRangeRange opBundleOperands, TypeRangeRange opBundleOperandTypes, - ArrayRef opBundleTags) { + std::optional opBundleTags) { + if (opBundleOperands.empty()) + return; + assert(opBundleTags && "expect operand bundle tags"); + p << "["; llvm::interleaveComma( - llvm::zip(opBundleOperands, opBundleOperandTypes, opBundleTags), p, + llvm::zip(opBundleOperands, opBundleOperandTypes, *opBundleTags), p, [&p](auto bundle) { + auto bundleTag = cast(std::get<2>(bundle)).getValue(); printOneOpBundle(p, std::get<0>(bundle), std::get<1>(bundle), - std::get<2>(bundle)); + bundleTag); }); p << "]"; } @@ -256,7 +261,7 @@ static ParseResult parseOneOpBundle( OpAsmParser &p, SmallVector> &opBundleOperands, SmallVector> &opBundleOperandTypes, - SmallVector &opBundleTags) { + SmallVector &opBundleTags) { SMLoc currentParserLoc = p.getCurrentLocation(); SmallVector operands; SmallVector types; @@ -276,7 +281,7 @@ static ParseResult parseOneOpBundle( opBundleOperands.push_back(std::move(operands)); opBundleOperandTypes.push_back(std::move(types)); - opBundleTags.push_back(std::move(tag)); + opBundleTags.push_back(StringAttr::get(p.getContext(), tag)); return success(); } @@ -285,16 +290,17 @@ static std::optional parseOpBundles( OpAsmParser &p, SmallVector> &opBundleOperands, SmallVector> &opBundleOperandTypes, - SmallVector &opBundleTags) { + ArrayAttr &opBundleTags) { if (p.parseOptionalLSquare()) return std::nullopt; if (succeeded(p.parseOptionalRSquare())) return success(); + SmallVector opBundleTagAttrs; auto bundleParser = [&] { return parseOneOpBundle(p, opBundleOperands, opBundleOperandTypes, - opBundleTags); + opBundleTagAttrs); }; if 
(p.parseCommaSeparatedList(bundleParser)) return failure(); @@ -302,6 +308,8 @@ static std::optional parseOpBundles( if (p.parseRSquare()) return failure(); + opBundleTags = ArrayAttr::get(p.getContext(), opBundleTagAttrs); + return success(); } @@ -1039,7 +1047,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results, /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt, + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } @@ -1066,7 +1074,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt, + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } @@ -1079,7 +1087,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr, /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt, + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } @@ -1092,7 +1100,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func, /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr, /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt, + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } @@ -1192,12 +1200,20 @@ LogicalResult verifyCallOpVarCalleeType(OpTy callOp) { template static LogicalResult verifyOperandBundles(OpType &op) { OperandRangeRange opBundleOperands = op.getOpBundleOperands(); - ArrayRef opBundleTags = op.getOpBundleTags(); + std::optional opBundleTags = op.getOpBundleTags(); - if (opBundleTags.size() != opBundleOperands.size()) + auto isStringAttr = [](Attribute tagAttr) { + return isa(tagAttr); + }; + if (opBundleTags && !llvm::all_of(*opBundleTags, isStringAttr)) + return op.emitError("operand bundle tag must be a StringAttr"); + + size_t numOpBundles = opBundleOperands.size(); + size_t numOpBundleTags = opBundleTags ? 
opBundleTags->size() : 0; + if (numOpBundles != numOpBundleTags) return op.emitError("expected ") - << opBundleOperands.size() - << " operand bundle tags, but actually got " << opBundleTags.size(); + << numOpBundles << " operand bundle tags, but actually got " + << numOpBundleTags; return success(); } @@ -1329,7 +1345,8 @@ void CallOp::print(OpAsmPrinter &p) { {getCalleeAttrName(), getTailCallKindAttrName(), getVarCalleeTypeAttrName(), getCConvAttrName(), getOperandSegmentSizesAttrName(), - getOpBundleSizesAttrName()}); + getOpBundleSizesAttrName(), + getOpBundleTagsAttrName()}); p << " : "; if (!isDirect) @@ -1437,7 +1454,7 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) { SmallVector operands; SmallVector> opBundleOperands; SmallVector> opBundleOperandTypes; - SmallVector opBundleTags; + ArrayAttr opBundleTags; // Default to C Calling Convention if no keyword is provided. result.addAttribute( @@ -1483,9 +1500,9 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) { parser, opBundleOperands, opBundleOperandTypes, opBundleTags); result && failed(*result)) return failure(); - if (!opBundleTags.empty()) - result.getOrAddProperties().op_bundle_tags = - std::move(opBundleTags); + if (opBundleTags && !opBundleTags.empty()) + result.addAttribute(CallOp::getOpBundleTagsAttrName(result.name).getValue(), + opBundleTags); if (parser.parseOptionalAttrDict(result.attributes)) return failure(); @@ -1525,8 +1542,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func, auto calleeType = func.getFunctionType(); build(builder, state, getCallOpResultTypes(calleeType), getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), ops, - normalOps, unwindOps, nullptr, nullptr, {}, std::nullopt, normal, - unwind); + normalOps, unwindOps, nullptr, nullptr, {}, {}, normal, unwind); } void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys, @@ -1535,7 +1551,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys, ValueRange unwindOps) { build(builder, state, tys, /*var_callee_type=*/nullptr, callee, ops, normalOps, unwindOps, nullptr, - nullptr, {}, std::nullopt, normal, unwind); + nullptr, {}, {}, normal, unwind); } void InvokeOp::build(OpBuilder &builder, OperationState &state, @@ -1544,7 +1560,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, Block *unwind, ValueRange unwindOps) { build(builder, state, getCallOpResultTypes(calleeType), getCallOpVarCalleeType(calleeType), callee, ops, normalOps, unwindOps, - nullptr, nullptr, {}, std::nullopt, normal, unwind); + nullptr, nullptr, {}, {}, normal, unwind); } SuccessorOperands InvokeOp::getSuccessorOperands(unsigned index) { @@ -1634,7 +1650,8 @@ void InvokeOp::print(OpAsmPrinter &p) { p.printOptionalAttrDict((*this)->getAttrs(), {getCalleeAttrName(), getOperandSegmentSizeAttr(), getCConvAttrName(), getVarCalleeTypeAttrName(), - getOpBundleSizesAttrName()}); + getOpBundleSizesAttrName(), + getOpBundleTagsAttrName()}); p << " : "; if (!isDirect) @@ -1657,7 +1674,7 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) { TypeAttr varCalleeType; SmallVector> opBundleOperands; SmallVector> opBundleOperandTypes; - SmallVector opBundleTags; + ArrayAttr opBundleTags; Block *normalDest, *unwindDest; SmallVector normalOperands, unwindOperands; Builder &builder = parser.getBuilder(); @@ -1703,9 +1720,10 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) { parser, opBundleOperands, 
opBundleOperandTypes, opBundleTags); result && failed(*result)) return failure(); - if (!opBundleTags.empty()) - result.getOrAddProperties().op_bundle_tags = - std::move(opBundleTags); + if (opBundleTags && !opBundleTags.empty()) + result.addAttribute( + InvokeOp::getOpBundleTagsAttrName(result.name).getValue(), + opBundleTags); if (parser.parseOptionalAttrDict(result.attributes)) return failure(); @@ -3333,7 +3351,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, mlir::StringAttr intrin, mlir::ValueRange args) { build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args, FastmathFlagsAttr{}, - /*op_bundle_operands=*/{}); + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}); } void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, @@ -3341,14 +3359,14 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, mlir::LLVM::FastmathFlagsAttr fastMathFlags) { build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args, fastMathFlags, - /*op_bundle_operands=*/{}); + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}); } void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, mlir::Type resultType, mlir::StringAttr intrin, mlir::ValueRange args) { build(builder, state, {resultType}, intrin, args, FastmathFlagsAttr{}, - /*op_bundle_operands=*/{}); + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}); } void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, @@ -3356,7 +3374,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, mlir::StringAttr intrin, mlir::ValueRange args, mlir::LLVM::FastmathFlagsAttr fastMathFlags) { build(builder, state, resultTypes, intrin, args, fastMathFlags, - /*op_bundle_operands=*/{}); + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}); } //===----------------------------------------------------------------------===// @@ -3413,6 +3431,18 @@ void InlineAsmOp::getEffects( } } +//===----------------------------------------------------------------------===// +// AssumeOp (intrinsic) +//===----------------------------------------------------------------------===// + +void LLVM::AssumeOp::build(OpBuilder &builder, OperationState &state, + mlir::Value cond) { + return build(builder, state, cond, /*op_bundle_operands=*/{}, + /*op_bundle_tags=*/{}); +} + +LogicalResult LLVM::AssumeOp::verify() { return verifyOperandBundles(*this); } + //===----------------------------------------------------------------------===// // masked_gather (intrinsic) //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp index d034e576dfc5..4fd043c7c93e 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp @@ -68,6 +68,12 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder, if (isConvertibleIntrinsic(intrinsicID)) { SmallVector args(inst->args()); ArrayRef llvmOperands(args); + + SmallVector llvmOpBundles; + llvmOpBundles.reserve(inst->getNumOperandBundles()); + for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i) + llvmOpBundles.push_back(inst->getOperandBundleAt(i)); + #include "mlir/Dialect/LLVMIR/LLVMIntrinsicFromLLVMIRConversions.inc" } diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index 
a8595d14ccf2..2084e527773c 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -114,17 +114,27 @@ convertOperandBundle(OperandRange bundleOperands, StringRef bundleTag, } static SmallVector -convertOperandBundles(OperandRangeRange bundleOperands, - ArrayRef bundleTags, +convertOperandBundles(OperandRangeRange bundleOperands, ArrayAttr bundleTags, LLVM::ModuleTranslation &moduleTranslation) { SmallVector bundles; bundles.reserve(bundleOperands.size()); - for (auto [operands, tag] : llvm::zip_equal(bundleOperands, bundleTags)) + for (auto [operands, tagAttr] : llvm::zip_equal(bundleOperands, bundleTags)) { + StringRef tag = cast(tagAttr).getValue(); bundles.push_back(convertOperandBundle(operands, tag, moduleTranslation)); + } return bundles; } +static SmallVector +convertOperandBundles(OperandRangeRange bundleOperands, + std::optional bundleTags, + LLVM::ModuleTranslation &moduleTranslation) { + if (!bundleTags) + return {}; + return convertOperandBundles(bundleOperands, *bundleTags, moduleTranslation); +} + /// Builder for LLVM_CallIntrinsicOp static LogicalResult convertCallLLVMIntrinsicOp(CallIntrinsicOp op, llvm::IRBuilderBase &builder, diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp index bc830a77f3c5..2c0b665ad0d8 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp @@ -50,6 +50,12 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder, if (isConvertibleIntrinsic(intrinsicID)) { SmallVector args(inst->args()); ArrayRef llvmOperands(args); + + SmallVector llvmOpBundles; + llvmOpBundles.reserve(inst->getNumOperandBundles()); + for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i) + llvmOpBundles.push_back(inst->getOperandBundleAt(i)); + #include "mlir/Dialect/LLVMIR/NVVMFromLLVMIRConversions.inc" } diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index bd861f3a69e5..6e97b2a50af8 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1311,7 +1311,8 @@ ModuleImport::convertValues(ArrayRef values) { } LogicalResult ModuleImport::convertIntrinsicArguments( - ArrayRef values, ArrayRef immArgPositions, + ArrayRef values, ArrayRef opBundles, + bool requiresOpBundles, ArrayRef immArgPositions, ArrayRef immArgAttrNames, SmallVectorImpl &valuesOut, SmallVectorImpl &attrsOut) { assert(immArgPositions.size() == immArgAttrNames.size() && @@ -1341,6 +1342,35 @@ LogicalResult ModuleImport::convertIntrinsicArguments( valuesOut.push_back(*mlirValue); } + SmallVector opBundleSizes; + SmallVector opBundleTagAttrs; + if (requiresOpBundles) { + opBundleSizes.reserve(opBundles.size()); + opBundleTagAttrs.reserve(opBundles.size()); + + for (const llvm::OperandBundleUse &bundle : opBundles) { + opBundleSizes.push_back(bundle.Inputs.size()); + opBundleTagAttrs.push_back(StringAttr::get(context, bundle.getTagName())); + + for (const llvm::Use &opBundleOperand : bundle.Inputs) { + auto operandMlirValue = convertValue(opBundleOperand.get()); + if (failed(operandMlirValue)) + return failure(); + valuesOut.push_back(*operandMlirValue); + } + } + + auto opBundleSizesAttr = DenseI32ArrayAttr::get(context, opBundleSizes); + auto opBundleSizesAttrNameAttr = + StringAttr::get(context, LLVMDialect::getOpBundleSizesAttrName()); 
+ attrsOut.push_back({opBundleSizesAttrNameAttr, opBundleSizesAttr}); + + auto opBundleTagsAttr = ArrayAttr::get(context, opBundleTagAttrs); + auto opBundleTagsAttrNameAttr = + StringAttr::get(context, LLVMDialect::getOpBundleTagsAttrName()); + attrsOut.push_back({opBundleTagsAttrNameAttr, opBundleTagsAttr}); + } + return success(); } diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 6e005f9ec5df..e4c097c0daed 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -55,6 +55,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include #include #define DEBUG_TYPE "llvm-dialect-to-llvm-ir" @@ -854,8 +855,40 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall( "LLVM `immArgPositions` and MLIR `immArgAttrNames` should have equal " "length"); + SmallVector opBundles; + size_t numOpBundleOperands = 0; + auto opBundleSizesAttr = cast_if_present( + intrOp->getAttr(LLVMDialect::getOpBundleSizesAttrName())); + auto opBundleTagsAttr = cast_if_present( + intrOp->getAttr(LLVMDialect::getOpBundleTagsAttrName())); + + if (opBundleSizesAttr && opBundleTagsAttr) { + ArrayRef opBundleSizes = opBundleSizesAttr.asArrayRef(); + assert(opBundleSizes.size() == opBundleTagsAttr.size() && + "operand bundles and tags do not match"); + + numOpBundleOperands = + std::reduce(opBundleSizes.begin(), opBundleSizes.end()); + assert(numOpBundleOperands <= intrOp->getNumOperands() && + "operand bundle operands is more than the number of operands"); + + ValueRange operands = intrOp->getOperands().take_back(numOpBundleOperands); + size_t nextOperandIdx = 0; + opBundles.reserve(opBundleSizesAttr.size()); + + for (auto [opBundleTagAttr, bundleSize] : + llvm::zip(opBundleTagsAttr, opBundleSizes)) { + auto bundleTag = cast(opBundleTagAttr).str(); + auto bundleOperands = moduleTranslation.lookupValues( + operands.slice(nextOperandIdx, bundleSize)); + opBundles.emplace_back(std::move(bundleTag), std::move(bundleOperands)); + nextOperandIdx += bundleSize; + } + } + // Map operands and attributes to LLVM values. 
- auto operands = moduleTranslation.lookupValues(intrOp->getOperands()); + auto opOperands = intrOp->getOperands().drop_back(numOpBundleOperands); + auto operands = moduleTranslation.lookupValues(opOperands); SmallVector args(immArgPositions.size() + operands.size()); for (auto [immArgPos, immArgName] : llvm::zip(immArgPositions, immArgAttrNames)) { @@ -890,7 +923,7 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall( llvm::Function *llvmIntr = llvm::Intrinsic::getOrInsertDeclaration( module, intrinsic, overloadedTypes); - return builder.CreateCall(llvmIntr, args); + return builder.CreateCall(llvmIntr, args, opBundles); } /// Given a single MLIR operation, create the corresponding LLVM IR operation diff --git a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir index b86103422b07..55b1bc9c545a 100644 --- a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir @@ -684,7 +684,7 @@ func.func @collapse_static_shape_with_non_identity_layout(%arg: memref<1x1x8x8xf // CHECK: %[[INT_TO_PTR:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64 // CHECK: %[[AND:.*]] = llvm.and %[[INT_TO_PTR]], {{.*}} : i64 // CHECK: %[[CMP:.*]] = llvm.icmp "eq" %[[AND]], {{.*}} : i64 -// CHECK: "llvm.intr.assume"(%[[CMP]]) : (i1) -> () +// CHECK: llvm.intr.assume %[[CMP]] : i1 // CHECK: %[[LD_ADDR:.*]] = llvm.getelementptr %[[BUFF_ADDR]][%{{.*}}] : (!llvm.ptr, i64) -> !llvm.ptr, f32 // CHECK: %[[VAL:.*]] = llvm.load %[[LD_ADDR]] : !llvm.ptr -> f32 // CHECK: return %[[VAL]] : f32 diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir index 9dc22abf143b..48dc9079333d 100644 --- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir @@ -160,7 +160,7 @@ func.func @assume_alignment(%0 : memref<4x4xf16>) { // CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[PTR]] : !llvm.ptr to i64 // CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64 // CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64 - // CHECK-NEXT: "llvm.intr.assume"(%[[CONDITION]]) : (i1) -> () + // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1 memref.assume_alignment %0, 16 : memref<4x4xf16> return } @@ -177,7 +177,7 @@ func.func @assume_alignment_w_offset(%0 : memref<4x4xf16, strided<[?, ?], offset // CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64 // CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64 // CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64 - // CHECK-NEXT: "llvm.intr.assume"(%[[CONDITION]]) : (i1) -> () + // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1 memref.assume_alignment %0, 16 : memref<4x4xf16, strided<[?, ?], offset: ?>> return } diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir index f9551e311df5..0b7ca3f2bb04 100644 --- a/mlir/test/Dialect/LLVMIR/inlining.mlir +++ b/mlir/test/Dialect/LLVMIR/inlining.mlir @@ -18,7 +18,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 { "llvm.intr.memset"(%ptr, %byte, %0) <{isVolatile = true}> : (!llvm.ptr, i8, i32) -> () "llvm.intr.memmove"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> () "llvm.intr.memcpy"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> () - 
"llvm.intr.assume"(%true) : (i1) -> () + llvm.intr.assume %true : i1 llvm.fence release %2 = llvm.atomicrmw add %ptr, %0 monotonic : !llvm.ptr, i32 %3 = llvm.cmpxchg %ptr, %0, %1 acq_rel monotonic : !llvm.ptr, i32 @@ -44,7 +44,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 { // CHECK: "llvm.intr.memset"(%[[PTR]] // CHECK: "llvm.intr.memmove"(%[[PTR]], %[[PTR]] // CHECK: "llvm.intr.memcpy"(%[[PTR]], %[[PTR]] -// CHECK: "llvm.intr.assume" +// CHECK: llvm.intr.assume // CHECK: llvm.fence release // CHECK: llvm.atomicrmw add %[[PTR]], %[[CST]] monotonic // CHECK: llvm.cmpxchg %[[PTR]], %[[CST]], %[[RES]] acq_rel monotonic diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index 3062cdc38c0a..b8ce7db795a1 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -836,3 +836,30 @@ llvm.func @test_call_intrin_with_opbundle(%arg0 : !llvm.ptr) { llvm.call_intrinsic "llvm.assume"(%0) ["align"(%arg0, %1 : !llvm.ptr, i32)] : (i1) -> () llvm.return } + +// CHECK-LABEL: @test_assume_intr_no_opbundle +llvm.func @test_assume_intr_no_opbundle(%arg0 : !llvm.ptr) { + %0 = llvm.mlir.constant(1 : i1) : i1 + // CHECK: llvm.intr.assume %0 : i1 + llvm.intr.assume %0 : i1 + llvm.return +} + +// CHECK-LABEL: @test_assume_intr_empty_opbundle +llvm.func @test_assume_intr_empty_opbundle(%arg0 : !llvm.ptr) { + %0 = llvm.mlir.constant(1 : i1) : i1 + // CHECK: llvm.intr.assume %0 : i1 + llvm.intr.assume %0 [] : i1 + llvm.return +} + +// CHECK-LABEL: @test_assume_intr_with_opbundles +llvm.func @test_assume_intr_with_opbundles(%arg0 : !llvm.ptr) { + %0 = llvm.mlir.constant(1 : i1) : i1 + %1 = llvm.mlir.constant(2 : i32) : i32 + %2 = llvm.mlir.constant(3 : i32) : i32 + %3 = llvm.mlir.constant(4 : i32) : i32 + // CHECK: llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1 + llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1 + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index 28a1bd21c82a..606b11175f57 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -630,11 +630,21 @@ define void @va_intrinsics_test(ptr %0, ptr %1, ...) 
{ ; CHECK-LABEL: @assume ; CHECK-SAME: %[[TRUE:[a-zA-Z0-9]+]] define void @assume(i1 %true) { - ; CHECK: "llvm.intr.assume"(%[[TRUE]]) : (i1) -> () + ; CHECK: llvm.intr.assume %[[TRUE]] : i1 call void @llvm.assume(i1 %true) ret void } +; CHECK-LABEL: @assume_with_opbundles +; CHECK-SAME: %[[TRUE:[a-zA-Z0-9]+]] +; CHECK-SAME: %[[PTR:[a-zA-Z0-9]+]] +define void @assume_with_opbundles(i1 %true, ptr %p) { + ; CHECK: %[[ALIGN:.+]] = llvm.mlir.constant(8 : i32) : i32 + ; CHECK: llvm.intr.assume %[[TRUE]] ["align"(%[[PTR]], %[[ALIGN]] : !llvm.ptr, i32)] : i1 + call void @llvm.assume(i1 %true) ["align"(ptr %p, i32 8)] + ret void +} + ; CHECK-LABEL: @is_constant ; CHECK-SAME: %[[VAL:[a-zA-Z0-9]+]] define void @is_constant(i32 %0) { diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index 0634a7ba907f..cb712eb4e126 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -363,6 +363,21 @@ llvm.func @umin_test(%arg0: i32, %arg1: i32, %arg2: vector<8xi32>, %arg3: vector llvm.return } +// CHECK-LABEL: @assume_without_opbundles +llvm.func @assume_without_opbundles(%cond: i1) { + // CHECK: call void @llvm.assume(i1 %{{.+}}) + llvm.intr.assume %cond : i1 + llvm.return +} + +// CHECK-LABEL: @assume_with_opbundles +llvm.func @assume_with_opbundles(%cond: i1, %p: !llvm.ptr) { + %0 = llvm.mlir.constant(8 : i32) : i32 + // CHECK: call void @llvm.assume(i1 %{{.+}}) [ "align"(ptr %{{.+}}, i32 8) ] + llvm.intr.assume %cond ["align"(%p, %0 : !llvm.ptr, i32)] : i1 + llvm.return +} + // CHECK-LABEL: @vector_reductions llvm.func @vector_reductions(%arg0: f32, %arg1: vector<8xf32>, %arg2: vector<8xi32>) { // CHECK: call i32 @llvm.vector.reduce.add.v8i32 diff --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir index af0981440a17..15658ea60681 100644 --- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir @@ -188,7 +188,7 @@ llvm.func @sadd_overflow_intr_wrong_type(%arg0 : i32, %arg1 : f32) -> !llvm.stru llvm.func @assume_intr_wrong_type(%cond : i16) { // expected-error @below{{op operand #0 must be 1-bit signless integer, but got 'i16'}} - "llvm.intr.assume"(%cond) : (i16) -> () + llvm.intr.assume %cond : i16 llvm.return } -- GitLab From 4245c00faf5eb525ea9167c7a30c6cfe260b8676 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Wed, 16 Oct 2024 15:52:40 +1100 Subject: [PATCH 064/329] [ORC] Fix LLJIT's atexit declaration for clang-repl on SystemZ. The atexit needs a signext attribute on its return type. See https://github.com/llvm/llvm-project/issues/109658. 
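As a quick illustration (a hand-written sketch, not part of the patch; the
function name register_dtor is made up), this is the shape of the JIT-side
declaration on a target such as s390x, where the ABI requires narrow integer
returns to be extended:

  ; SystemZ wants i32 returns marked with an extension attribute, so the
  ; atexit declaration and its call sites carry signext:
  declare signext i32 @atexit(ptr)

  define void @register_dtor(ptr %cb) {
    %rc = call signext i32 @atexit(ptr %cb)
    ret void
  }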
---
 llvm/lib/ExecutionEngine/Orc/LLJIT.cpp | 14 ++++++++++----
 1 file changed, 10 insertions(+), 4 deletions(-)

diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
index 7e3f58c0059c..c56ec196772b 100644
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -7,6 +7,8 @@
 //===----------------------------------------------------------------------===//

 #include "llvm/ExecutionEngine/Orc/LLJIT.h"
+
+#include "llvm/Analysis/TargetLibraryInfo.h"
 #include "llvm/Config/llvm-config.h" // for LLVM_ENABLE_THREADS
 #include "llvm/ExecutionEngine/JITLink/EHFrameSupport.h"
 #include "llvm/ExecutionEngine/JITLink/JITLinkMemoryManager.h"
@@ -195,10 +197,14 @@ public:
 auto *IntTy = Type::getIntNTy(*Ctx, sizeof(int) * CHAR_BIT);
 auto *AtExitCallbackTy = FunctionType::get(VoidTy, {}, false);
 auto *AtExitCallbackPtrTy = PointerType::getUnqual(AtExitCallbackTy);
- addHelperAndWrapper(*M, "atexit",
- FunctionType::get(IntTy, {AtExitCallbackPtrTy}, false),
- GlobalValue::HiddenVisibility, "__lljit.atexit_helper",
- {PlatformInstanceDecl, DSOHandle});
+ auto *AtExit = addHelperAndWrapper(
+ *M, "atexit", FunctionType::get(IntTy, {AtExitCallbackPtrTy}, false),
+ GlobalValue::HiddenVisibility, "__lljit.atexit_helper",
+ {PlatformInstanceDecl, DSOHandle});
+ Attribute::AttrKind AtExitExtAttr =
+ TargetLibraryInfo::getExtAttrForI32Return(J.getTargetTriple());
+ if (AtExitExtAttr != Attribute::None)
+ AtExit->addRetAttr(AtExitExtAttr);

 return J.addIRModule(JD, ThreadSafeModule(std::move(M), std::move(Ctx)));
 }
-- 
GitLab


From f3648046ec67b6be1743cc7760fc57820bcdc7f7 Mon Sep 17 00:00:00 2001
From: Elvis Wang
Date: Wed, 16 Oct 2024 12:57:43 +0800
Subject: [PATCH 065/329] [RISCV] Fix vp-intrinsics args in cost model tests. NFC (#112463)

This patch contains the following changes to fix the vp-intrinsic tests:

1. v*float -> v*f32, v*double -> v*f64, and v*half -> v*f16
2. Fix the order of the type-mangling suffixes on the vp intrinsics
   (the result type comes before the source type; a short example
   follows below).
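A minimal hand-written example of the corrected naming (the function name
vp_sext_example is made up; the intrinsic and its mangling come from the
diff below):

  ; The overloaded suffixes name the result type first, then the source
  ; type, so sign-extending <2 x i8> to <2 x i16> uses .v2i16.v2i8:
  define <2 x i16> @vp_sext_example(<2 x i8> %x, <2 x i1> %m, i32 %evl) {
    %r = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> %x, <2 x i1> %m, i32 %evl)
    ret <2 x i16> %r
  }
  declare <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8>, <2 x i1>, i32)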
--- llvm/test/Analysis/CostModel/RISCV/cast.ll | 2238 ++++++++++---------- llvm/test/Analysis/CostModel/RISCV/cmp.ll | 64 +- 2 files changed, 1151 insertions(+), 1151 deletions(-) diff --git a/llvm/test/Analysis/CostModel/RISCV/cast.ll b/llvm/test/Analysis/CostModel/RISCV/cast.ll index 853eef6bcb2e..04048b8ba17f 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cast.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cast.ll @@ -660,16 +660,16 @@ define void @sext() { %v2i1_v2i32 = sext <2 x i1> undef to <2 x i32> %v2i1_v2i64 = sext <2 x i1> undef to <2 x i64> - %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i8.v2i16(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i8.v2i32(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i8.v2i64(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i16.v2i32(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i16.v2i64(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i32.v2i64(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i1.v2i8(<2 x i1> undef, <2 x i1> undef, i32 undef) - %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i1.v2i16(<2 x i1> undef, <2 x i1> undef, i32 undef) - %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i1.v2i32(<2 x i1> undef, <2 x i1> undef, i32 undef) - %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i1.v2i64(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.sext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.sext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.sext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.sext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) %v4i8_v4i16 = sext <4 x i8> undef to <4 x i16> %v4i8_v4i32 = sext <4 x i8> undef to <4 x i32> @@ -682,16 +682,16 @@ define void @sext() { %v4i1_v4i32 = sext <4 x i1> undef to <4 x i32> %v4i1_v4i64 = sext <4 x i1> undef to <4 x i64> - %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i8.v4i16(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i8.v4i32(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i8.v4i64(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i16.v4i32(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i16.v4i64(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i32.v4i64(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i1.v4i8(<4 x i1> undef, <4 x i1> undef, i32 
undef) - %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i1.v4i16(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i1.v4i32(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i1.v4i64(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.sext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.sext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.sext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.sext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) %v8i8_v8i16 = sext <8 x i8> undef to <8 x i16> %v8i8_v8i32 = sext <8 x i8> undef to <8 x i32> @@ -704,16 +704,16 @@ define void @sext() { %v8i1_v8i32 = sext <8 x i1> undef to <8 x i32> %v8i1_v8i64 = sext <8 x i1> undef to <8 x i64> - %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i8.v8i16(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i8.v8i32(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i8.v8i64(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i16.v8i32(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i16.v8i64(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i32.v8i64(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i1.v8i8(<8 x i1> undef, <8 x i1> undef, i32 undef) - %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i1.v8i16(<8 x i1> undef, <8 x i1> undef, i32 undef) - %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i1.v8i32(<8 x i1> undef, <8 x i1> undef, i32 undef) - %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.sext.v8i1.v8i64(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.sext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.sext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.sext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.sext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8i1_v8i64 = call <8 x i64> 
@llvm.vp.sext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) %v16i8_v16i16 = sext <16 x i8> undef to <16 x i16> %v16i8_v16i32 = sext <16 x i8> undef to <16 x i32> @@ -726,16 +726,16 @@ define void @sext() { %v16i1_v16i32 = sext <16 x i1> undef to <16 x i32> %v16i1_v16i64 = sext <16 x i1> undef to <16 x i64> - %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i8.v16i16(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i8.v16i32(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i8.v16i64(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i16.v16i32(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i16.v16i64(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i32.v16i64(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i1.v16i8(<16 x i1> undef, <16 x i1> undef, i32 undef) - %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i1.v16i16(<16 x i1> undef, <16 x i1> undef, i32 undef) - %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i1.v16i32(<16 x i1> undef, <16 x i1> undef, i32 undef) - %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i1.v16i64(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.sext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.sext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.sext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.sext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) %v32i8_v32i16 = sext <32 x i8> undef to <32 x i16> %v32i8_v32i32 = sext <32 x i8> undef to <32 x i32> @@ -748,16 +748,16 @@ define void @sext() { %v32i1_v32i32 = sext <32 x i1> undef to <32 x i32> %v32i1_v32i64 = sext <32 x i1> undef to <32 x i64> - %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i8.v32i16(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i8.v32i32(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i8.v32i64(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i16.v32i32(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i16.v32i64(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i32.v32i64(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i1.v32i8(<32 x i1> undef, <32 x i1> undef, i32 undef) - %vp_v32i1_v32i16 = call <32 
x i16> @llvm.vp.sext.v32i1.v32i16(<32 x i1> undef, <32 x i1> undef, i32 undef) - %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i1.v32i32(<32 x i1> undef, <32 x i1> undef, i32 undef) - %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i1.v32i64(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.sext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.sext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.sext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.sext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) %v64i8_v64i16 = sext <64 x i8> undef to <64 x i16> %v64i8_v64i32 = sext <64 x i8> undef to <64 x i32> @@ -770,16 +770,16 @@ define void @sext() { %v64i1_v64i32 = sext <64 x i1> undef to <64 x i32> %v64i1_v64i64 = sext <64 x i1> undef to <64 x i64> - %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i8.v64i16(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i8.v64i32(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i8.v64i64(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i16.v64i32(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i16.v64i64(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i32.v64i64(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i1.v64i8(<64 x i1> undef, <64 x i1> undef, i32 undef) - %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.sext.v64i1.v64i16(<64 x i1> undef, <64 x i1> undef, i32 undef) - %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i1.v64i32(<64 x i1> undef, <64 x i1> undef, i32 undef) - %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i1.v64i64(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.sext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.sext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i16 = call <64 x i16> 
@llvm.vp.sext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.sext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.sext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) %v128i8_v128i16 = sext <128 x i8> undef to <128 x i16> %v128i8_v128i32 = sext <128 x i8> undef to <128 x i32> @@ -792,16 +792,16 @@ define void @sext() { %v128i1_v128i32 = sext <128 x i1> undef to <128 x i32> %v128i1_v128i64 = sext <128 x i1> undef to <128 x i64> - %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i8.v128i16(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i8.v128i32(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i8.v128i64(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i16.v128i32(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i16.v128i64(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i32.v128i64(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i1.v128i8(<128 x i1> undef, <128 x i1> undef, i32 undef) - %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i1.v128i16(<128 x i1> undef, <128 x i1> undef, i32 undef) - %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i1.v128i32(<128 x i1> undef, <128 x i1> undef, i32 undef) - %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i1.v128i64(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.sext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.sext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.sext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.sext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) %v256i8_v256i16 = sext <256 x i8> undef to <256 x i16> %v256i8_v256i32 = sext <256 x i8> undef to <256 x i32> @@ -814,16 +814,16 @@ define void @sext() { %v256i1_v256i32 = sext <256 x i1> undef to <256 x i32> %v256i1_v256i64 = sext <256 x i1> undef to <256 x i64> - %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i8.v256i16(<256 x i8> undef, <256 x i1> undef, i32 undef) - %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i8.v256i32(<256 x i8> undef, <256 x i1> undef, i32 undef) - %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i8.v256i64(<256 x i8> undef, <256 x i1> undef, i32 undef) - %vp_v256i16_v256i32 = call <256 x i32> 
@llvm.vp.sext.v256i16.v256i32(<256 x i16> undef, <256 x i1> undef, i32 undef)
- %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i16.v256i64(<256 x i16> undef, <256 x i1> undef, i32 undef)
- %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i32.v256i64(<256 x i32> undef, <256 x i1> undef, i32 undef)
- %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i1.v256i8(<256 x i1> undef, <256 x i1> undef, i32 undef)
- %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i1.v256i16(<256 x i1> undef, <256 x i1> undef, i32 undef)
- %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i1.v256i32(<256 x i1> undef, <256 x i1> undef, i32 undef)
- %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i1.v256i64(<256 x i1> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.sext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.sext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.sext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.sext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)

 %nxv1i8_nxv1i16 = sext <vscale x 1 x i8> undef to <vscale x 1 x i16>
 %nxv1i8_nxv1i32 = sext <vscale x 1 x i8> undef to <vscale x 1 x i32>
@@ -836,16 +836,16 @@ define void @sext() {
 %nxv1i1_nxv1i32 = sext <vscale x 1 x i1> undef to <vscale x 1 x i32>
 %nxv1i1_nxv1i64 = sext <vscale x 1 x i1> undef to <vscale x 1 x i64>

- %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i8.nxv1i16(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i8.nxv1i32(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i8.nxv1i64(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i16.nxv1i32(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i16.nxv1i64(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i32.nxv1i64(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.sext.nxv1i1.nxv1i8(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i1.nxv1i16(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i1.nxv1i32(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i1.nxv1i64(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.sext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.sext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.sext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.sext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)

 %nxv2i8_nxv2i16 = sext <vscale x 2 x i8> undef to <vscale x 2 x i16>
 %nxv2i8_nxv2i32 = sext <vscale x 2 x i8> undef to <vscale x 2 x i32>
@@ -858,16 +858,16 @@ define void @sext() {
 %nxv2i1_nxv2i32 = sext <vscale x 2 x i1> undef to <vscale x 2 x i32>
 %nxv2i1_nxv2i64 = sext <vscale x 2 x i1> undef to <vscale x 2 x i64>

- %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i8.nxv2i16(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i8.nxv2i32(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i8.nxv2i64(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i16.nxv2i32(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i16.nxv2i64(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i32.nxv2i64(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.sext.nxv2i1.nxv2i8(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i1.nxv2i16(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i1.nxv2i32(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i1.nxv2i64(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.sext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.sext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.sext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.sext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)

 %nxv4i8_nxv4i16 = sext <vscale x 4 x i8> undef to <vscale x 4 x i16>
 %nxv4i8_nxv4i32 = sext <vscale x 4 x i8> undef to <vscale x 4 x i32>
@@ -880,16 +880,16 @@ define void @sext() {
 %nxv4i1_nxv4i32 = sext <vscale x 4 x i1> undef to <vscale x 4 x i32>
 %nxv4i1_nxv4i64 = sext <vscale x 4 x i1> undef to <vscale x 4 x i64>

- %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i8.nxv4i16(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i8.nxv4i32(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i8.nxv4i64(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i16.nxv4i32(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i16.nxv4i64(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i32.nxv4i64(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.sext.nxv4i1.nxv4i8(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i1.nxv4i16(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i1.nxv4i32(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i1.nxv4i64(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.sext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.sext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.sext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.sext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)

 %nxv8i8_nxv8i16 = sext <vscale x 8 x i8> undef to <vscale x 8 x i16>
 %nxv8i8_nxv8i32 = sext <vscale x 8 x i8> undef to <vscale x 8 x i32>
@@ -902,16 +902,16 @@ define void @sext() {
 %nxv8i1_nxv8i32 = sext <vscale x 8 x i1> undef to <vscale x 8 x i32>
 %nxv8i1_nxv8i64 = sext <vscale x 8 x i1> undef to <vscale x 8 x i64>

- %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i8.nxv8i16(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i8.nxv8i32(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i8.nxv8i64(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i16.nxv8i32(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i16.nxv8i64(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i32.nxv8i64(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.sext.nxv8i1.nxv8i8(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i1.nxv8i16(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i1.nxv8i32(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i1.nxv8i64(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.sext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.sext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.sext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.sext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)

 %nxv16i8_nxv16i16 = sext <vscale x 16 x i8> undef to <vscale x 16 x i16>
 %nxv16i8_nxv16i32 = sext <vscale x 16 x i8> undef to <vscale x 16 x i32>
@@ -924,16 +924,16 @@ define void @sext() {
 %nxv16i1_nxv16i32 = sext <vscale x 16 x i1> undef to <vscale x 16 x i32>
 %nxv16i1_nxv16i64 = sext <vscale x 16 x i1> undef to <vscale x 16 x i64>

- %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i8.nxv16i16(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i8.nxv16i32(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i8.nxv16i64(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i16.nxv16i32(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i16.nxv16i64(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i32.nxv16i64(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.sext.nxv16i1.nxv16i8(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i1.nxv16i16(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i1.nxv16i32(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i1.nxv16i64(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.sext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.sext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.sext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.sext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)

 %nxv32i8_nxv32i16 = sext <vscale x 32 x i8> undef to <vscale x 32 x i16>
 %nxv32i8_nxv32i32 = sext <vscale x 32 x i8> undef to <vscale x 32 x i32>
@@ -946,16 +946,16 @@ define void @sext() {
 %nxv32i1_nxv32i32 = sext <vscale x 32 x i1> undef to <vscale x 32 x i32>
 %nxv32i1_nxv32i64 = sext <vscale x 32 x i1> undef to <vscale x 32 x i64>

- %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i8.nxv32i16(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i8.nxv32i32(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i8.nxv32i64(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i16.nxv32i32(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i16.nxv32i64(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i32.nxv32i64(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.sext.nxv32i1.nxv32i8(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i1.nxv32i16(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i1.nxv32i32(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i1.nxv32i64(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.sext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.sext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.sext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.sext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)

 %nxv64i8_nxv64i16 = sext <vscale x 64 x i8> undef to <vscale x 64 x i16>
 %nxv64i8_nxv64i32 = sext <vscale x 64 x i8> undef to <vscale x 64 x i32>
@@ -968,16 +968,16 @@ define void @sext() {
 %nxv64i1_nxv64i32 = sext <vscale x 64 x i1> undef to <vscale x 64 x i32>
 %nxv64i1_nxv64i64 = sext <vscale x 64 x i1> undef to <vscale x 64 x i64>

- %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i8.nxv64i16(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i8.nxv64i32(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i8.nxv64i64(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i16.nxv64i32(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i16.nxv64i64(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i32.nxv64i64(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.sext.nxv64i1.nxv64i8(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i1.nxv64i16(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i1.nxv64i32(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i1.nxv64i64(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.sext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.sext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.sext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.sext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)

 %nxv128i8_nxv128i16 = sext <vscale x 128 x i8> undef to <vscale x 128 x i16>
 %nxv128i8_nxv128i32 = sext <vscale x 128 x i8> undef to <vscale x 128 x i32>
@@ -990,16 +990,16 @@ define void @sext() {
 %nxv128i1_nxv128i32 = sext <vscale x 128 x i1> undef to <vscale x 128 x i32>
 %nxv128i1_nxv128i128 = sext <vscale x 128 x i1> undef to <vscale x 128 x i128>

- %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i8.nxv128i16(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i8.nxv128i32(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i8.nxv128i128(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i16.nxv128i32(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i16.nxv128i128(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i32.nxv128i128(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.sext.nxv128i1.nxv128i8(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i1.nxv128i16(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i1.nxv128i32(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
- %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i1.nxv128i128(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i8_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i8_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i8_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i8(<vscale x 128 x i8> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i16_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i16_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i16(<vscale x 128 x i16> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i32_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i32(<vscale x 128 x i32> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i1_nxv128i8 = call <vscale x 128 x i8> @llvm.vp.sext.nxv128i8.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i1_nxv128i16 = call <vscale x 128 x i16> @llvm.vp.sext.nxv128i16.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i1_nxv128i32 = call <vscale x 128 x i32> @llvm.vp.sext.nxv128i32.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)
+ %vp_nxv128i1_nxv128i128 = call <vscale x 128 x i128> @llvm.vp.sext.nxv128i128.nxv128i1(<vscale x 128 x i1> undef, <vscale x 128 x i1> undef, i32 undef)

 ret void
 }
@@ -1662,16 +1662,16 @@ define void @zext() {
 %v2i1_v2i32 = zext <2 x i1> undef to <2 x i32>
 %v2i1_v2i64 = zext <2 x i1> undef to <2 x i64>

- %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i8.v2i16(<2 x i8> undef, <2 x i1> undef, i32 undef)
- %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i8.v2i32(<2 x i8> undef, <2 x i1> undef, i32 undef)
- %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i8.v2i64(<2 x i8> undef, <2 x i1> undef, i32 undef)
- %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i16.v2i32(<2 x i16> undef, <2 x i1> undef, i32 undef)
- %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i16.v2i64(<2 x i16>
undef, <2 x i1> undef, i32 undef) - %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i32.v2i64(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i1.v2i8(<2 x i1> undef, <2 x i1> undef, i32 undef) - %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i1.v2i16(<2 x i1> undef, <2 x i1> undef, i32 undef) - %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i1.v2i32(<2 x i1> undef, <2 x i1> undef, i32 undef) - %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i1.v2i64(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i8 = call <2 x i8> @llvm.vp.zext.v2i8.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i16 = call <2 x i16> @llvm.vp.zext.v2i16.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i32 = call <2 x i32> @llvm.vp.zext.v2i32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2i1_v2i64 = call <2 x i64> @llvm.vp.zext.v2i64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) %v4i8_v4i16 = zext <4 x i8> undef to <4 x i16> %v4i8_v4i32 = zext <4 x i8> undef to <4 x i32> @@ -1684,16 +1684,16 @@ define void @zext() { %v4i1_v4i32 = zext <4 x i1> undef to <4 x i32> %v4i1_v4i64 = zext <4 x i1> undef to <4 x i64> - %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i8.v4i16(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i8.v4i32(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i8.v4i64(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i16.v4i32(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i16.v4i64(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i32.v4i64(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i1.v4i8(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i1.v4i16(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i1.v4i32(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i1.v4i64(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i1_v4i8 = call <4 x i8> @llvm.vp.zext.v4i8.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + 
+  %vp_v4i1_v4i16 = call <4 x i16> @llvm.vp.zext.v4i16.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+  %vp_v4i1_v4i32 = call <4 x i32> @llvm.vp.zext.v4i32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)
+  %vp_v4i1_v4i64 = call <4 x i64> @llvm.vp.zext.v4i64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef)

   %v8i8_v8i16 = zext <8 x i8> undef to <8 x i16>
   %v8i8_v8i32 = zext <8 x i8> undef to <8 x i32>
@@ -1706,16 +1706,16 @@ define void @zext() {
   %v8i1_v8i32 = zext <8 x i1> undef to <8 x i32>
   %v8i1_v8i64 = zext <8 x i1> undef to <8 x i64>

-  %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i8.v8i16(<8 x i8> undef, <8 x i1> undef, i32 undef)
-  %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i8.v8i32(<8 x i8> undef, <8 x i1> undef, i32 undef)
-  %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i8.v8i64(<8 x i8> undef, <8 x i1> undef, i32 undef)
-  %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i16.v8i32(<8 x i16> undef, <8 x i1> undef, i32 undef)
-  %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i16.v8i64(<8 x i16> undef, <8 x i1> undef, i32 undef)
-  %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i32.v8i64(<8 x i32> undef, <8 x i1> undef, i32 undef)
-  %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i1.v8i8(<8 x i1> undef, <8 x i1> undef, i32 undef)
-  %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i1.v8i16(<8 x i1> undef, <8 x i1> undef, i32 undef)
-  %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i1.v8i32(<8 x i1> undef, <8 x i1> undef, i32 undef)
-  %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i1.v8i64(<8 x i1> undef, <8 x i1> undef, i32 undef)
+  %vp_v8i8_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+  %vp_v8i8_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+  %vp_v8i8_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef)
+  %vp_v8i16_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+  %vp_v8i16_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef)
+  %vp_v8i32_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef)
+  %vp_v8i1_v8i8 = call <8 x i8> @llvm.vp.zext.v8i8.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+  %vp_v8i1_v8i16 = call <8 x i16> @llvm.vp.zext.v8i16.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+  %vp_v8i1_v8i32 = call <8 x i32> @llvm.vp.zext.v8i32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)
+  %vp_v8i1_v8i64 = call <8 x i64> @llvm.vp.zext.v8i64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef)

   %v16i8_v16i16 = zext <16 x i8> undef to <16 x i16>
   %v16i8_v16i32 = zext <16 x i8> undef to <16 x i32>
@@ -1728,16 +1728,16 @@ define void @zext() {
   %v16i1_v16i32 = zext <16 x i1> undef to <16 x i32>
   %v16i1_v16i64 = zext <16 x i1> undef to <16 x i64>

-  %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i8.v16i16(<16 x i8> undef, <16 x i1> undef, i32 undef)
-  %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i8.v16i32(<16 x i8> undef, <16 x i1> undef, i32 undef)
-  %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i8.v16i64(<16 x i8> undef, <16 x i1> undef, i32 undef)
-  %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i16.v16i32(<16 x i16> undef, <16 x i1> undef, i32 undef)
-  %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i16.v16i64(<16 x i16> undef, <16 x i1> undef, i32 undef)
-  %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i32.v16i64(<16 x i32> undef, <16 x i1> undef, i32 undef)
-  %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i1.v16i8(<16 x i1> undef, <16 x i1> undef, i32 undef)
-  %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i1.v16i16(<16 x i1> undef, <16 x i1> undef, i32 undef)
-  %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i1.v16i32(<16 x i1> undef, <16 x i1> undef, i32 undef)
-  %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i1.v16i64(<16 x i1> undef, <16 x i1> undef, i32 undef)
+  %vp_v16i8_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+  %vp_v16i8_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+  %vp_v16i8_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef)
+  %vp_v16i16_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+  %vp_v16i16_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef)
+  %vp_v16i32_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef)
+  %vp_v16i1_v16i8 = call <16 x i8> @llvm.vp.zext.v16i8.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+  %vp_v16i1_v16i16 = call <16 x i16> @llvm.vp.zext.v16i16.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+  %vp_v16i1_v16i32 = call <16 x i32> @llvm.vp.zext.v16i32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)
+  %vp_v16i1_v16i64 = call <16 x i64> @llvm.vp.zext.v16i64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef)

   %v32i8_v32i16 = zext <32 x i8> undef to <32 x i16>
   %v32i8_v32i32 = zext <32 x i8> undef to <32 x i32>
@@ -1750,16 +1750,16 @@ define void @zext() {
   %v32i1_v32i32 = zext <32 x i1> undef to <32 x i32>
   %v32i1_v32i64 = zext <32 x i1> undef to <32 x i64>

-  %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i8.v32i16(<32 x i8> undef, <32 x i1> undef, i32 undef)
-  %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i8.v32i32(<32 x i8> undef, <32 x i1> undef, i32 undef)
-  %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i8.v32i64(<32 x i8> undef, <32 x i1> undef, i32 undef)
-  %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i16.v32i32(<32 x i16> undef, <32 x i1> undef, i32 undef)
-  %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i16.v32i64(<32 x i16> undef, <32 x i1> undef, i32 undef)
-  %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i32.v32i64(<32 x i32> undef, <32 x i1> undef, i32 undef)
-  %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i1.v32i8(<32 x i1> undef, <32 x i1> undef, i32 undef)
-  %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i1.v32i16(<32 x i1> undef, <32 x i1> undef, i32 undef)
-  %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i1.v32i32(<32 x i1> undef, <32 x i1> undef, i32 undef)
-  %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i1.v32i64(<32 x i1> undef, <32 x i1> undef, i32 undef)
+  %vp_v32i8_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+  %vp_v32i8_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+  %vp_v32i8_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef)
+  %vp_v32i16_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+  %vp_v32i16_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef)
+  %vp_v32i32_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef)
+  %vp_v32i1_v32i8 = call <32 x i8> @llvm.vp.zext.v32i8.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+  %vp_v32i1_v32i16 = call <32 x i16> @llvm.vp.zext.v32i16.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+  %vp_v32i1_v32i32 = call <32 x i32> @llvm.vp.zext.v32i32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)
+  %vp_v32i1_v32i64 = call <32 x i64> @llvm.vp.zext.v32i64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef)

   %v64i8_v64i16 = zext <64 x i8> undef to <64 x i16>
   %v64i8_v64i32 = zext <64 x i8> undef to <64 x i32>
@@ -1772,16 +1772,16 @@ define void @zext() {
   %v64i1_v64i32 = zext <64 x i1> undef to <64 x i32>
   %v64i1_v64i64 = zext <64 x i1> undef to <64 x i64>

-  %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i8.v64i16(<64 x i8> undef, <64 x i1> undef, i32 undef)
-  %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i8.v64i32(<64 x i8> undef, <64 x i1> undef, i32 undef)
-  %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i8.v64i64(<64 x i8> undef, <64 x i1> undef, i32 undef)
-  %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i16.v64i32(<64 x i16> undef, <64 x i1> undef, i32 undef)
-  %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i16.v64i64(<64 x i16> undef, <64 x i1> undef, i32 undef)
-  %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i32.v64i64(<64 x i32> undef, <64 x i1> undef, i32 undef)
-  %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i1.v64i8(<64 x i1> undef, <64 x i1> undef, i32 undef)
-  %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i1.v64i16(<64 x i1> undef, <64 x i1> undef, i32 undef)
-  %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i1.v64i32(<64 x i1> undef, <64 x i1> undef, i32 undef)
-  %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i1.v64i64(<64 x i1> undef, <64 x i1> undef, i32 undef)
+  %vp_v64i8_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+  %vp_v64i8_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+  %vp_v64i8_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef)
+  %vp_v64i16_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+  %vp_v64i16_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef)
+  %vp_v64i32_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef)
+  %vp_v64i1_v64i8 = call <64 x i8> @llvm.vp.zext.v64i8.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+  %vp_v64i1_v64i16 = call <64 x i16> @llvm.vp.zext.v64i16.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+  %vp_v64i1_v64i32 = call <64 x i32> @llvm.vp.zext.v64i32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)
+  %vp_v64i1_v64i64 = call <64 x i64> @llvm.vp.zext.v64i64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef)

   %v128i8_v128i16 = zext <128 x i8> undef to <128 x i16>
   %v128i8_v128i32 = zext <128 x i8> undef to <128 x i32>
@@ -1794,16 +1794,16 @@ define void @zext() {
   %v128i1_v128i32 = zext <128 x i1> undef to <128 x i32>
   %v128i1_v128i64 = zext <128 x i1> undef to <128 x i64>

-  %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i8.v128i16(<128 x i8> undef, <128 x i1> undef, i32 undef)
-  %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i8.v128i32(<128 x i8> undef, <128 x i1> undef, i32 undef)
-  %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i8.v128i64(<128 x i8> undef, <128 x i1> undef, i32 undef)
-  %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i16.v128i32(<128 x i16> undef, <128 x i1> undef, i32 undef)
-  %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i16.v128i64(<128 x i16> undef, <128 x i1> undef, i32 undef)
-  %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i32.v128i64(<128 x i32> undef, <128 x i1> undef, i32 undef)
-  %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i1.v128i8(<128 x i1> undef, <128 x i1> undef, i32 undef)
-  %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i1.v128i16(<128 x i1> undef, <128 x i1> undef, i32 undef)
-  %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i1.v128i32(<128 x i1> undef, <128 x i1> undef, i32 undef)
-  %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i1.v128i64(<128 x i1> undef, <128 x i1> undef, i32 undef)
+  %vp_v128i8_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+  %vp_v128i8_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+  %vp_v128i8_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef)
+  %vp_v128i16_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+  %vp_v128i16_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef)
+  %vp_v128i32_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef)
+  %vp_v128i1_v128i8 = call <128 x i8> @llvm.vp.zext.v128i8.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+  %vp_v128i1_v128i16 = call <128 x i16> @llvm.vp.zext.v128i16.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+  %vp_v128i1_v128i32 = call <128 x i32> @llvm.vp.zext.v128i32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)
+  %vp_v128i1_v128i64 = call <128 x i64> @llvm.vp.zext.v128i64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef)

   %v256i8_v256i16 = zext <256 x i8> undef to <256 x i16>
   %v256i8_v256i32 = zext <256 x i8> undef to <256 x i32>
@@ -1816,16 +1816,16 @@ define void @zext() {
   %v256i1_v256i32 = zext <256 x i1> undef to <256 x i32>
   %v256i1_v256i64 = zext <256 x i1> undef to <256 x i64>

-  %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i8.v256i16(<256 x i8> undef, <256 x i1> undef, i32 undef)
-  %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i8.v256i32(<256 x i8> undef, <256 x i1> undef, i32 undef)
-  %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i8.v256i64(<256 x i8> undef, <256 x i1> undef, i32 undef)
-  %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i16.v256i32(<256 x i16> undef, <256 x i1> undef, i32 undef)
-  %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i16.v256i64(<256 x i16> undef, <256 x i1> undef, i32 undef)
-  %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i32.v256i64(<256 x i32> undef, <256 x i1> undef, i32 undef)
-  %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i1.v256i8(<256 x i1> undef, <256 x i1> undef, i32 undef)
-  %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i1.v256i16(<256 x i1> undef, <256 x i1> undef, i32 undef)
-  %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i1.v256i32(<256 x i1> undef, <256 x i1> undef, i32 undef)
-  %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i1.v256i64(<256 x i1> undef, <256 x i1> undef, i32 undef)
+  %vp_v256i8_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+  %vp_v256i8_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+  %vp_v256i8_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+  %vp_v256i16_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+  %vp_v256i16_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+  %vp_v256i32_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+  %vp_v256i1_v256i8 = call <256 x i8> @llvm.vp.zext.v256i8.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+  %vp_v256i1_v256i16 = call <256 x i16> @llvm.vp.zext.v256i16.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+  %vp_v256i1_v256i32 = call <256 x i32> @llvm.vp.zext.v256i32.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)
+  %vp_v256i1_v256i64 = call <256 x i64> @llvm.vp.zext.v256i64.v256i1(<256 x i1> undef, <256 x i1> undef, i32 undef)

   %nxv1i8_nxv1i16 = zext <vscale x 1 x i8> undef to <vscale x 1 x i16>
   %nxv1i8_nxv1i32 = zext <vscale x 1 x i8> undef to <vscale x 1 x i32>
@@ -1838,16 +1838,16 @@ define void @zext() {
   %nxv1i1_nxv1i32 = zext <vscale x 1 x i1> undef to <vscale x 1 x i32>
   %nxv1i1_nxv1i64 = zext <vscale x 1 x i1> undef to <vscale x 1 x i64>

-  %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i8.nxv1i16(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i8.nxv1i32(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i8.nxv1i64(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i16.nxv1i32(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i16.nxv1i64(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i32.nxv1i64(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.zext.nxv1i1.nxv1i8(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i1.nxv1i16(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i1.nxv1i32(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
-  %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i1.nxv1i64(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1i8_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1i8_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1i8_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1i16_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1i16_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1i32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1i1_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.zext.nxv1i8.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1i1_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.zext.nxv1i16.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1i1_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.zext.nxv1i32.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)
+  %vp_nxv1i1_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.zext.nxv1i64.nxv1i1(<vscale x 1 x i1> undef, <vscale x 1 x i1> undef, i32 undef)

   %nxv2i8_nxv2i16 = zext <vscale x 2 x i8> undef to <vscale x 2 x i16>
   %nxv2i8_nxv2i32 = zext <vscale x 2 x i8> undef to <vscale x 2 x i32>
@@ -1860,16 +1860,16 @@ define void @zext() {
   %nxv2i1_nxv2i32 = zext <vscale x 2 x i1> undef to <vscale x 2 x i32>
   %nxv2i1_nxv2i64 = zext <vscale x 2 x i1> undef to <vscale x 2 x i64>

-  %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i8.nxv2i16(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i8.nxv2i32(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i8.nxv2i64(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i16.nxv2i32(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i16.nxv2i64(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i32.nxv2i64(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.zext.nxv2i1.nxv2i8(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i1.nxv2i16(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i1.nxv2i32(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
-  %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i1.nxv2i64(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2i8_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2i8_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2i8_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2i16_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2i16_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2i32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2i1_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.zext.nxv2i8.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2i1_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.zext.nxv2i16.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2i1_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.zext.nxv2i32.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)
+  %vp_nxv2i1_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.zext.nxv2i64.nxv2i1(<vscale x 2 x i1> undef, <vscale x 2 x i1> undef, i32 undef)

   %nxv4i8_nxv4i16 = zext <vscale x 4 x i8> undef to <vscale x 4 x i16>
   %nxv4i8_nxv4i32 = zext <vscale x 4 x i8> undef to <vscale x 4 x i32>
@@ -1882,16 +1882,16 @@ define void @zext() {
   %nxv4i1_nxv4i32 = zext <vscale x 4 x i1> undef to <vscale x 4 x i32>
   %nxv4i1_nxv4i64 = zext <vscale x 4 x i1> undef to <vscale x 4 x i64>

-  %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i8.nxv4i16(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i8.nxv4i32(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i8.nxv4i64(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i16.nxv4i32(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i16.nxv4i64(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i32.nxv4i64(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.zext.nxv4i1.nxv4i8(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i1.nxv4i16(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i1.nxv4i32(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
-  %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i1.nxv4i64(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4i8_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4i8_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4i8_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4i16_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4i16_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4i32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4i1_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.zext.nxv4i8.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4i1_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.zext.nxv4i16.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4i1_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.zext.nxv4i32.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)
+  %vp_nxv4i1_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.zext.nxv4i64.nxv4i1(<vscale x 4 x i1> undef, <vscale x 4 x i1> undef, i32 undef)

   %nxv8i8_nxv8i16 = zext <vscale x 8 x i8> undef to <vscale x 8 x i16>
   %nxv8i8_nxv8i32 = zext <vscale x 8 x i8> undef to <vscale x 8 x i32>
@@ -1904,16 +1904,16 @@ define void @zext() {
   %nxv8i1_nxv8i32 = zext <vscale x 8 x i1> undef to <vscale x 8 x i32>
   %nxv8i1_nxv8i64 = zext <vscale x 8 x i1> undef to <vscale x 8 x i64>

-  %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i8.nxv8i16(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i8.nxv8i32(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i8.nxv8i64(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i16.nxv8i32(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i16.nxv8i64(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i32.nxv8i64(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.zext.nxv8i1.nxv8i8(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i1.nxv8i16(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i1.nxv8i32(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
-  %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i1.nxv8i64(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8i8_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8i8_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8i8_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8i16_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8i16_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8i32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8i1_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.zext.nxv8i8.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8i1_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.zext.nxv8i16.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8i1_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.zext.nxv8i32.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)
+  %vp_nxv8i1_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.zext.nxv8i64.nxv8i1(<vscale x 8 x i1> undef, <vscale x 8 x i1> undef, i32 undef)

   %nxv16i8_nxv16i16 = zext <vscale x 16 x i8> undef to <vscale x 16 x i16>
   %nxv16i8_nxv16i32 = zext <vscale x 16 x i8> undef to <vscale x 16 x i32>
@@ -1926,16 +1926,16 @@ define void @zext() {
   %nxv16i1_nxv16i32 = zext <vscale x 16 x i1> undef to <vscale x 16 x i32>
   %nxv16i1_nxv16i64 = zext <vscale x 16 x i1> undef to <vscale x 16 x i64>

-  %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i8.nxv16i16(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i8.nxv16i32(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i8.nxv16i64(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i16.nxv16i32(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i16.nxv16i64(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i32.nxv16i64(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.zext.nxv16i1.nxv16i8(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i1.nxv16i16(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i1.nxv16i32(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
-  %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i1.nxv16i64(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16i8_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16i8_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16i8_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16i16_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16i16_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16i32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16i1_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.zext.nxv16i8.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16i1_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.zext.nxv16i16.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16i1_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.zext.nxv16i32.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)
+  %vp_nxv16i1_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.zext.nxv16i64.nxv16i1(<vscale x 16 x i1> undef, <vscale x 16 x i1> undef, i32 undef)

   %nxv32i8_nxv32i16 = zext <vscale x 32 x i8> undef to <vscale x 32 x i16>
   %nxv32i8_nxv32i32 = zext <vscale x 32 x i8> undef to <vscale x 32 x i32>
@@ -1948,16 +1948,16 @@ define void @zext() {
   %nxv32i1_nxv32i32 = zext <vscale x 32 x i1> undef to <vscale x 32 x i32>
   %nxv32i1_nxv32i64 = zext <vscale x 32 x i1> undef to <vscale x 32 x i64>

-  %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i8.nxv32i16(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i8.nxv32i32(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i8.nxv32i64(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i16.nxv32i32(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i16.nxv32i64(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i32.nxv32i64(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.zext.nxv32i1.nxv32i8(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i1.nxv32i16(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i1.nxv32i32(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
-  %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i1.nxv32i64(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32i8_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32i8_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32i8_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32i16_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32i16_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32i32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32i1_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.zext.nxv32i8.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32i1_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.zext.nxv32i16.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32i1_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.zext.nxv32i32.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)
+  %vp_nxv32i1_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.zext.nxv32i64.nxv32i1(<vscale x 32 x i1> undef, <vscale x 32 x i1> undef, i32 undef)

   %nxv64i8_nxv64i16 = zext <vscale x 64 x i8> undef to <vscale x 64 x i16>
   %nxv64i8_nxv64i32 = zext <vscale x 64 x i8> undef to <vscale x 64 x i32>
@@ -1970,16 +1970,16 @@ define void @zext() {
   %nxv64i1_nxv64i32 = zext <vscale x 64 x i1> undef to <vscale x 64 x i32>
   %nxv64i1_nxv64i64 = zext <vscale x 64 x i1> undef to <vscale x 64 x i64>

-  %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i8.nxv64i16(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i8.nxv64i32(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i8.nxv64i64(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i16.nxv64i32(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i16.nxv64i64(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i32.nxv64i64(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.zext.nxv64i1.nxv64i8(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i1.nxv64i16(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i1.nxv64i32(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
-  %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i1.nxv64i64(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64i8_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64i8_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64i8_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64i16_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64i16_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64i32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64i1_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.zext.nxv64i8.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64i1_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.zext.nxv64i16.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64i1_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.zext.nxv64i32.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)
+  %vp_nxv64i1_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.zext.nxv64i64.nxv64i1(<vscale x 64 x i1> undef, <vscale x 64 x i1> undef, i32 undef)

   %nxv128i8_nxv128i16 = zext <vscale x 128 x i8> undef to <vscale x 128 x i16>
   %nxv128i8_nxv128i32 = zext <vscale x 128 x i8> undef to <vscale x 128 x i32>
-1992,16 +1992,16 @@ define void @zext() { %nxv128i1_nxv128i32 = zext undef to %nxv128i1_nxv128i128 = zext undef to - %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i8.nxv128i16( undef, undef, i32 undef) - %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i8.nxv128i32( undef, undef, i32 undef) - %vp_nxv128i8_nxv128i128 = call @llvm.vp.zext.nxv128i8.nxv128i128( undef, undef, i32 undef) - %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i16.nxv128i32( undef, undef, i32 undef) - %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i16.nxv128i128( undef, undef, i32 undef) - %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i32.nxv128i128( undef, undef, i32 undef) - %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i1.nxv128i8( undef, undef, i32 undef) - %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i1.nxv128i16( undef, undef, i32 undef) - %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i1.nxv128i32( undef, undef, i32 undef) - %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i1.nxv128i128( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i8( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i8( undef, undef, i32 undef) + %vp_nxv128i8_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i8( undef, undef, i32 undef) + %vp_nxv128i16_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i16( undef, undef, i32 undef) + %vp_nxv128i16_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i16( undef, undef, i32 undef) + %vp_nxv128i32_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i32( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i8 = call @llvm.vp.zext.nxv128i8.nxv128i1( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i16 = call @llvm.vp.zext.nxv128i16.nxv128i1( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i32 = call @llvm.vp.zext.nxv128i32.nxv128i1( undef, undef, i32 undef) + %vp_nxv128i1_nxv128i128 = call @llvm.vp.zext.nxv128i128.nxv128i1( undef, undef, i32 undef) ret void } @@ -2640,11 +2640,11 @@ define void @trunc() { %v2i4_v2i2 = trunc <2 x i4> undef to <2 x i2> %v2i6_v2i4 = trunc <2 x i6> undef to <2 x i4> - %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i16.v2i2(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i16.v2i4(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i16.v2i6(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i4.v2i2(<2 x i4> undef, <2 x i1> undef, i32 undef) - %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i6.v2i4(<2 x i6> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i6 = call <2 x i6> @llvm.vp.trunc.v2i6.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i4_v2i2 = call <2 x i2> @llvm.vp.trunc.v2i2.v2i4(<2 x i4> undef, <2 x i1> undef, i32 undef) + %vp_v2i6_v2i4 = call <2 x i4> @llvm.vp.trunc.v2i4.v2i6(<2 x i6> undef, <2 x i1> undef, i32 undef) %v2i16_v2i8 = trunc <2 x i16> undef to <2 x i8> %v2i32_v2i8 = trunc <2 x i32> undef to <2 x i8> @@ -2657,16 +2657,16 @@ define void @trunc() { %v2i32_v2i1 = trunc <2 x i32> undef to <2 x i1> %v2i64_v2i1 = trunc <2 x i64> undef to <2 x i1> - %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i16.v2i8(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i32.v2i8(<2 x i32> undef, <2 x i1> 
undef, i32 undef) - %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i64.v2i8(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i32.v2i16(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i64.v2i16(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i64.v2i32(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i8.v2i1(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i16.v2i1(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i32.v2i1(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i64.v2i1(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i8 = call <2 x i8> @llvm.vp.trunc.v2i8.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i16 = call <2 x i16> @llvm.vp.trunc.v2i16.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i32 = call <2 x i32> @llvm.vp.trunc.v2i32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2i8_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2i16_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2i32_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2i64_v2i1 = call <2 x i1> @llvm.vp.trunc.v2i1.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) %v4i16_v4i8 = trunc <4 x i16> undef to <4 x i8> %v4i32_v4i8 = trunc <4 x i32> undef to <4 x i8> @@ -2679,16 +2679,16 @@ define void @trunc() { %v4i32_v4i1 = trunc <4 x i32> undef to <4 x i1> %v4i64_v4i1 = trunc <4 x i64> undef to <4 x i1> - %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i16.v4i8(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i32.v4i8(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i64.v4i8(<4 x i64> undef, <4 x i1> undef, i32 undef) - %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i32.v4i16(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i64.v4i16(<4 x i64> undef, <4 x i1> undef, i32 undef) - %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i64.v4i32(<4 x i64> undef, <4 x i1> undef, i32 undef) - %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i8.v4i1(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i16.v4i1(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i32.v4i1(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i64.v4i1(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i8 = call <4 x i8> @llvm.vp.trunc.v4i8.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i32(<4 x i32> undef, <4 x 
i1> undef, i32 undef) + %vp_v4i64_v4i16 = call <4 x i16> @llvm.vp.trunc.v4i16.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i32 = call <4 x i32> @llvm.vp.trunc.v4i32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4i8_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4i16_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4i32_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4i64_v4i1 = call <4 x i1> @llvm.vp.trunc.v4i1.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) %v8i16_v8i8 = trunc <8 x i16> undef to <8 x i8> %v8i32_v8i8 = trunc <8 x i32> undef to <8 x i8> @@ -2701,16 +2701,16 @@ define void @trunc() { %v8i32_v8i1 = trunc <8 x i32> undef to <8 x i1> %v8i64_v8i1 = trunc <8 x i64> undef to <8 x i1> - %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i16.v8i8(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i32.v8i8(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i64.v8i8(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i32.v8i16(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i64.v8i16(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i64.v8i32(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i8.v8i1(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i16.v8i1(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i32.v8i1(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i64.v8i1(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i8 = call <8 x i8> @llvm.vp.trunc.v8i8.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i16 = call <8 x i16> @llvm.vp.trunc.v8i16.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i32 = call <8 x i32> @llvm.vp.trunc.v8i32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8i8_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8i16_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8i32_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8i64_v8i1 = call <8 x i1> @llvm.vp.trunc.v8i1.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) %v16i16_v16i8 = trunc <2 x i16> undef to <2 x i8> %v16i32_v16i8 = trunc <2 x i32> undef to <2 x i8> @@ -2723,16 +2723,16 @@ define void @trunc() { %v16i32_v16i1 = trunc <2 x i32> undef to <2 x i1> %v16i64_v16i1 = trunc <2 x i64> undef to <2 x i1> - %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i16.v16i8(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i32.v16i8(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i64.v16i8(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16i32_v16i16 = 
call <16 x i16> @llvm.vp.trunc.v16i32.v16i16(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i64.v16i16(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i64.v16i32(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i8.v16i1(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i16.v16i1(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i32.v16i1(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i64.v16i1(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i8 = call <16 x i8> @llvm.vp.trunc.v16i8.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i16 = call <16 x i16> @llvm.vp.trunc.v16i16.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i32 = call <16 x i32> @llvm.vp.trunc.v16i32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16i8_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16i16_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16i32_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16i64_v16i1 = call <16 x i1> @llvm.vp.trunc.v16i1.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) %v32i16_v32i8 = trunc <16 x i16> undef to <16 x i8> %v32i32_v32i8 = trunc <16 x i32> undef to <16 x i8> @@ -2745,16 +2745,16 @@ define void @trunc() { %v32i32_v32i1 = trunc <16 x i32> undef to <16 x i1> %v32i64_v32i1 = trunc <16 x i64> undef to <16 x i1> - %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i16.v32i8(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i32.v32i8(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i64.v32i8(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i32.v32i16(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i64.v32i16(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i64.v32i32(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i8.v32i1(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i16.v32i1(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i32.v32i1(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i64.v32i1(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i8 = call <32 x i8> @llvm.vp.trunc.v32i8.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + 
%vp_v32i32_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i16 = call <32 x i16> @llvm.vp.trunc.v32i16.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i32 = call <32 x i32> @llvm.vp.trunc.v32i32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32i8_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32i16_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32i32_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32i64_v32i1 = call <32 x i1> @llvm.vp.trunc.v32i1.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) %v64i16_v64i8 = trunc <64 x i16> undef to <64 x i8> %v64i32_v64i8 = trunc <64 x i32> undef to <64 x i8> @@ -2767,16 +2767,16 @@ define void @trunc() { %v64i32_v64i1 = trunc <64 x i32> undef to <64 x i1> %v64i64_v64i1 = trunc <64 x i64> undef to <64 x i1> - %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i16.v64i8(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i32.v64i8(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i64.v64i8(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i32.v64i16(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i64.v64i16(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i64.v64i32(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i8.v64i1(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i16.v64i1(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i32.v64i1(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i64.v64i1(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i8 = call <64 x i8> @llvm.vp.trunc.v64i8.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i16 = call <64 x i16> @llvm.vp.trunc.v64i16.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i32 = call <64 x i32> @llvm.vp.trunc.v64i32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64i8_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64i16_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64i32_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64i64_v64i1 = call <64 x i1> @llvm.vp.trunc.v64i1.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) %v128i16_v128i8 = trunc <128 x i16> undef to <128 x i8> %v128i32_v128i8 = trunc <128 x i32> undef to <128 x i8> @@ -2789,16 +2789,16 @@ define void @trunc() { %v128i32_v128i1 = trunc <128 x i32> undef to <128 x i1> %v128i64_v128i1 = trunc <128 x i64> undef to <128 x i1> - %vp_v128i16_v128i8 = call <128 x i8> 
@llvm.vp.trunc.v128i16.v128i8(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i32.v128i8(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i64.v128i8(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i32.v128i16(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i64.v128i16(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i64.v128i32(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i8.v128i1(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i16.v128i1(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i32.v128i1(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i64.v128i1(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i8 = call <128 x i8> @llvm.vp.trunc.v128i8.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i16 = call <128 x i16> @llvm.vp.trunc.v128i16.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i32 = call <128 x i32> @llvm.vp.trunc.v128i32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128i8_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128i16_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128i32_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128i64_v128i1 = call <128 x i1> @llvm.vp.trunc.v128i1.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) %v256i16_v256i8 = trunc <256 x i16> undef to <256 x i8> %v256i32_v256i8 = trunc <256 x i32> undef to <256 x i8> @@ -2811,16 +2811,16 @@ define void @trunc() { %v256i32_v256i1 = trunc <256 x i32> undef to <256 x i1> %v256i64_v256i1 = trunc <256 x i64> undef to <256 x i1> - %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i16.v256i8(<256 x i16> undef, <256 x i1> undef, i32 undef) - %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i32.v256i8(<256 x i32> undef, <256 x i1> undef, i32 undef) - %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i64.v256i8(<256 x i64> undef, <256 x i1> undef, i32 undef) - %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i32.v256i16(<256 x i32> undef, <256 x i1> undef, i32 undef) - %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i64.v256i16(<256 x i64> undef, <256 x i1> undef, i32 undef) - %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i64.v256i32(<256 x i64> undef, <256 x i1> undef, i32 undef) - %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i8.v256i1(<256 x i8> undef, <256 x i1> undef, i32 undef) - %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i16.v256i1(<256 x i16> undef, <256 x i1> undef, i32 undef) - %vp_v256i32_v256i1 = call 
<256 x i1> @llvm.vp.trunc.v256i32.v256i1(<256 x i32> undef, <256 x i1> undef, i32 undef)
- %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i64.v256i1(<256 x i64> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i16_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i32_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i64_v256i8 = call <256 x i8> @llvm.vp.trunc.v256i8.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i32_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i64_v256i16 = call <256 x i16> @llvm.vp.trunc.v256i16.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i64_v256i32 = call <256 x i32> @llvm.vp.trunc.v256i32.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i8_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i8(<256 x i8> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i16_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i16(<256 x i16> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i32_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i32(<256 x i32> undef, <256 x i1> undef, i32 undef)
+ %vp_v256i64_v256i1 = call <256 x i1> @llvm.vp.trunc.v256i1.v256i64(<256 x i64> undef, <256 x i1> undef, i32 undef)

 %nxv1i16_nxv1i8 = trunc <vscale x 1 x i16> undef to <vscale x 1 x i8>
 %nxv1i32_nxv1i8 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i8>
@@ -2833,16 +2833,16 @@ define void @trunc() {
 %nxv1i32_nxv1i1 = trunc <vscale x 1 x i32> undef to <vscale x 1 x i1>
 %nxv1i64_nxv1i1 = trunc <vscale x 1 x i64> undef to <vscale x 1 x i1>

- %vp_nxv1i16_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i16.nxv1i8(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i32.nxv1i8(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i64.nxv1i8(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i32.nxv1i16(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i64.nxv1i16(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.trunc.nxv1i64.nxv1i32(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i8_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i8.nxv1i1(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i16_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i16.nxv1i1(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i32.nxv1i1(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1i64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i64.nxv1i1(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i16_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.trunc.nxv1i8.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.trunc.nxv1i16.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.trunc.nxv1i32.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i8_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i8(<vscale x 1 x i8> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i16_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i16(<vscale x 1 x i16> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i32(<vscale x 1 x i32> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1i64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.trunc.nxv1i1.nxv1i64(<vscale x 1 x i64> undef, <vscale x 1 x i1> undef, i32 undef)

 %nxv2i16_nxv2i8 = trunc <vscale x 2 x i16> undef to <vscale x 2 x i8>
 %nxv2i32_nxv2i8 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i8>
@@ -2855,16 +2855,16 @@ define void @trunc() {
 %nxv2i32_nxv2i1 = trunc <vscale x 2 x i32> undef to <vscale x 2 x i1>
 %nxv2i64_nxv2i1 = trunc <vscale x 2 x i64> undef to <vscale x 2 x i1>

- %vp_nxv2i16_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i16.nxv2i8(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i32.nxv2i8(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i64.nxv2i8(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i32.nxv2i16(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i64.nxv2i16(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.trunc.nxv2i64.nxv2i32(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i8_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i8.nxv2i1(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i16_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i16.nxv2i1(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i32.nxv2i1(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2i64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i64.nxv2i1(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i16_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.trunc.nxv2i8.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.trunc.nxv2i16.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.trunc.nxv2i32.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i8_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i8(<vscale x 2 x i8> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i16_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i16(<vscale x 2 x i16> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i32(<vscale x 2 x i32> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2i64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.trunc.nxv2i1.nxv2i64(<vscale x 2 x i64> undef, <vscale x 2 x i1> undef, i32 undef)

 %nxv4i16_nxv4i8 = trunc <vscale x 4 x i16> undef to <vscale x 4 x i8>
 %nxv4i32_nxv4i8 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i8>
@@ -2877,16 +2877,16 @@ define void @trunc() {
 %nxv4i32_nxv4i1 = trunc <vscale x 4 x i32> undef to <vscale x 4 x i1>
 %nxv4i64_nxv4i1 = trunc <vscale x 4 x i64> undef to <vscale x 4 x i1>

- %vp_nxv4i16_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i16.nxv4i8(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i32.nxv4i8(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i64.nxv4i8(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i32.nxv4i16(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i64.nxv4i16(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.trunc.nxv4i64.nxv4i32(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i8_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i8.nxv4i1(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i16_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i16.nxv4i1(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i32.nxv4i1(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4i64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i64.nxv4i1(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i16_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.trunc.nxv4i8.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.trunc.nxv4i16.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.trunc.nxv4i32.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i8_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i8(<vscale x 4 x i8> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i16_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i16(<vscale x 4 x i16> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i32(<vscale x 4 x i32> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4i64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.trunc.nxv4i1.nxv4i64(<vscale x 4 x i64> undef, <vscale x 4 x i1> undef, i32 undef)

 %nxv8i16_nxv8i8 = trunc <vscale x 8 x i16> undef to <vscale x 8 x i8>
 %nxv8i32_nxv8i8 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i8>
@@ -2899,16 +2899,16 @@ define void @trunc() {
 %nxv8i32_nxv8i1 = trunc <vscale x 8 x i32> undef to <vscale x 8 x i1>
 %nxv8i64_nxv8i1 = trunc <vscale x 8 x i64> undef to <vscale x 8 x i1>

- %vp_nxv8i16_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i16.nxv8i8(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i32.nxv8i8(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i64.nxv8i8(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i32.nxv8i16(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i64.nxv8i16(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.trunc.nxv8i64.nxv8i32(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i8_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i8.nxv8i1(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i16_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i16.nxv8i1(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i32.nxv8i1(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8i64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i64.nxv8i1(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i16_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.trunc.nxv8i8.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.trunc.nxv8i16.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.trunc.nxv8i32.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i8_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i8(<vscale x 8 x i8> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i16_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i16(<vscale x 8 x i16> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i32(<vscale x 8 x i32> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8i64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.trunc.nxv8i1.nxv8i64(<vscale x 8 x i64> undef, <vscale x 8 x i1> undef, i32 undef)

 %nxv16i16_nxv16i8 = trunc <vscale x 16 x i16> undef to <vscale x 16 x i8>
 %nxv16i32_nxv16i8 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i8>
@@ -2921,16 +2921,16 @@ define void @trunc() {
 %nxv16i32_nxv16i1 = trunc <vscale x 16 x i32> undef to <vscale x 16 x i1>
 %nxv16i64_nxv16i1 = trunc <vscale x 16 x i64> undef to <vscale x 16 x i1>

- %vp_nxv16i16_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i16.nxv16i8(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i32.nxv16i8(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i64.nxv16i8(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i32.nxv16i16(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i64.nxv16i16(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.trunc.nxv16i64.nxv16i32(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i8_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i8.nxv16i1(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i16_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i16.nxv16i1(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i32.nxv16i1(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16i64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i64.nxv16i1(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i16_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.trunc.nxv16i8.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.trunc.nxv16i16.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.trunc.nxv16i32.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i8_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i8(<vscale x 16 x i8> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i16_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i16(<vscale x 16 x i16> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i32(<vscale x 16 x i32> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16i64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.trunc.nxv16i1.nxv16i64(<vscale x 16 x i64> undef, <vscale x 16 x i1> undef, i32 undef)

 %nxv32i16_nxv32i8 = trunc <vscale x 32 x i16> undef to <vscale x 32 x i8>
 %nxv32i32_nxv32i8 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i8>
@@ -2943,16 +2943,16 @@ define void @trunc() {
 %nxv32i32_nxv32i1 = trunc <vscale x 32 x i32> undef to <vscale x 32 x i1>
 %nxv32i64_nxv32i1 = trunc <vscale x 32 x i64> undef to <vscale x 32 x i1>

- %vp_nxv32i16_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i16.nxv32i8(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i32.nxv32i8(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i64.nxv32i8(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i32.nxv32i16(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i64.nxv32i16(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.trunc.nxv32i64.nxv32i32(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i8_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i8.nxv32i1(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i16_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i16.nxv32i1(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i32.nxv32i1(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32i64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i64.nxv32i1(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i16_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.trunc.nxv32i8.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.trunc.nxv32i16.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.trunc.nxv32i32.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i8_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i8(<vscale x 32 x i8> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i16_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i16(<vscale x 32 x i16> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i32(<vscale x 32 x i32> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32i64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.trunc.nxv32i1.nxv32i64(<vscale x 32 x i64> undef, <vscale x 32 x i1> undef, i32 undef)

 %nxv64i16_nxv64i8 = trunc <vscale x 64 x i16> undef to <vscale x 64 x i8>
 %nxv64i32_nxv64i8 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i8>
@@ -2965,16 +2965,16 @@ define void @trunc() {
 %nxv64i32_nxv64i1 = trunc <vscale x 64 x i32> undef to <vscale x 64 x i1>
 %nxv64i64_nxv64i1 = trunc <vscale x 64 x i64> undef to <vscale x 64 x i1>

- %vp_nxv64i16_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i16.nxv64i8(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i32.nxv64i8(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i64.nxv64i8(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i32.nxv64i16(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i64.nxv64i16(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.trunc.nxv64i64.nxv64i32(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i8_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i8.nxv64i1(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i16_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i16.nxv64i1(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i32.nxv64i1(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64i64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i64.nxv64i1(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i16_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.trunc.nxv64i8.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.trunc.nxv64i16.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.trunc.nxv64i32.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i8_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i8(<vscale x 64 x i8> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i16_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i16(<vscale x 64 x i16> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i32(<vscale x 64 x i32> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64i64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.trunc.nxv64i1.nxv64i64(<vscale x 64 x i64> undef, <vscale x 64 x i1> undef, i32 undef)

 ret void
 }
@@ -3071,113 +3071,113 @@ define void @fpext() {
 %v2f16_v2f64 = fpext <2 x half> undef to <2 x double>
 %v2f32_v2f64 = fpext <2 x float> undef to <2 x double>

- %vp_v2f16_v2f32 = call <2 x float> @llvm.vp.fpext.v2half.v2float(<2 x half> undef, <2 x i1> undef, i32 undef)
- %vp_v2f16_v2f64 = call <2 x double> @llvm.vp.fpext.v2half.v2double(<2 x half> undef, <2 x i1> undef, i32 undef)
- %vp_v2f32_v2f64 = call <2 x double> @llvm.vp.fpext.v2float.v2double(<2 x float> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f16_v2f32 = call <2 x float> @llvm.vp.fpext.v2f32.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f16_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f16(<2 x half> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f32_v2f64 = call <2 x double> @llvm.vp.fpext.v2f64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)

 %v4f16_v4f32 = fpext <4 x half> undef to <4 x float>
 %v4f16_v4f64 = fpext <4 x half> undef to <4 x double>
 %v4f32_v4f64 = fpext <4 x float> undef to <4 x double>

- %vp_v4f16_v4f32 = call <4 x float> @llvm.vp.fpext.v4half.v4float(<4 x half> undef, <4 x i1> undef, i32 undef)
- %vp_v4f16_v4f64 = call <4 x double> @llvm.vp.fpext.v4half.v4double(<4 x half> undef, <4 x i1> undef, i32 undef)
- %vp_v4f32_v4f64 = call <4 x double> @llvm.vp.fpext.v4float.v4double(<4 x float> undef, <4 x i1> undef, i32 undef)
+ %vp_v4f16_v4f32 = call <4 x float> @llvm.vp.fpext.v4f32.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef)
+ %vp_v4f16_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f16(<4 x half> undef, <4 x i1> undef, i32 undef)
+ %vp_v4f32_v4f64 = call <4 x double> @llvm.vp.fpext.v4f64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)

 %v8f16_v8f32 = fpext <8 x half> undef to <8 x float>
 %v8f16_v8f64 = fpext <8 x half> undef to <8 x double>
 %v8f32_v8f64 = fpext <8 x float> undef to <8 x double>

- %vp_v8f16_v8f32 = call <8 x float> @llvm.vp.fpext.v8half.v8float(<8 x half> undef, <8 x i1> undef, i32 undef)
- %vp_v8f16_v8f64 = call <8 x double> @llvm.vp.fpext.v8half.v8double(<8 x half> undef, <8 x i1> undef, i32 undef)
- %vp_v8f32_v8f64 = call <8 x double> @llvm.vp.fpext.v8float.v8double(<8 x float> undef, <8 x i1> undef, i32 undef)
+ %vp_v8f16_v8f32 = call <8 x float> @llvm.vp.fpext.v8f32.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef)
+ %vp_v8f16_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f16(<8 x half> undef, <8 x i1> undef, i32 undef)
+ %vp_v8f32_v8f64 = call <8 x double> @llvm.vp.fpext.v8f64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)

 %v16f16_v16f32 = fpext <16 x half> undef to <16 x float>
 %v16f16_v16f64 = fpext <16 x half> undef to <16 x double>
 %v16f32_v16f64 = fpext <16 x float> undef to <16 x double>

- %vp_v16f16_v16f32 = call <16 x float> @llvm.vp.fpext.v16half.v16float(<16 x half> undef, <16 x i1> undef, i32 undef)
- %vp_v16f16_v16f64 = call <16 x double> @llvm.vp.fpext.v16half.v16double(<16 x half> undef, <16 x i1> undef, i32 undef)
- %vp_v16f32_v16f64 = call <16 x double> @llvm.vp.fpext.v16float.v16double(<16 x float> undef, <16 x i1> undef, i32 undef)
+ %vp_v16f16_v16f32 = call <16 x float> @llvm.vp.fpext.v16f32.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef)
+ %vp_v16f16_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f16(<16 x half> undef, <16 x i1> undef, i32 undef)
+ %vp_v16f32_v16f64 = call <16 x double> @llvm.vp.fpext.v16f64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)

 %v32f16_v32f32 = fpext <32 x half> undef to <32 x float>
 %v32f16_v32f64 = fpext <32 x half> undef to <32 x double>
 %v32f32_v32f64 = fpext <32 x float> undef to <32 x double>

- %vp_v32f16_v32f32 = call <32 x float> @llvm.vp.fpext.v32half.v32float(<32 x half> undef, <32 x i1> undef, i32 undef)
- %vp_v32f16_v32f64 = call <32 x double> @llvm.vp.fpext.v32half.v32double(<32 x half> undef, <32 x i1> undef, i32 undef)
- %vp_v32f32_v32f64 = call <32 x double> @llvm.vp.fpext.v32float.v32double(<32 x float> undef, <32 x i1> undef, i32 undef)
+ %vp_v32f16_v32f32 = call <32 x float> @llvm.vp.fpext.v32f32.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef)
+ %vp_v32f16_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f16(<32 x half> undef, <32 x i1> undef, i32 undef)
+ %vp_v32f32_v32f64 = call <32 x double> @llvm.vp.fpext.v32f64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)

 %v64f16_v64f32 = fpext <64 x half> undef to <64 x float>
 %v64f16_v64f64 = fpext <64 x half> undef to <64 x double>
 %v64f32_v64f64 = fpext <64 x float> undef to <64 x double>

- %vp_v64f16_v64f32 = call <64 x float> @llvm.vp.fpext.v64half.v64float(<64 x half> undef, <64 x i1> undef, i32 undef)
- %vp_v64f16_v64f64 = call <64 x double> @llvm.vp.fpext.v64half.v64double(<64 x half> undef, <64 x i1> undef, i32 undef)
- %vp_v64f32_v64f64 = call <64 x double> @llvm.vp.fpext.v64float.v64double(<64 x float> undef, <64 x i1> undef, i32 undef)
+ %vp_v64f16_v64f32 = call <64 x float> @llvm.vp.fpext.v64f32.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef)
+ %vp_v64f16_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f16(<64 x half> undef, <64 x i1> undef, i32 undef)
+ %vp_v64f32_v64f64 = call <64 x double> @llvm.vp.fpext.v64f64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)

 %v128f16_v128f32 = fpext <128 x half> undef to <128 x float>
 %v128f16_v128f64 = fpext <128 x half> undef to <128 x double>
 %v128f32_v128f64 = fpext <128 x float> undef to <128 x double>

- %vp_v128f16_v128f32 = call <128 x float> @llvm.vp.fpext.v128half.v128float(<128 x half> undef, <128 x i1> undef, i32 undef)
- %vp_v128f16_v128f64 = call <128 x double> @llvm.vp.fpext.v128half.v128double(<128 x half> undef, <128 x i1> undef, i32 undef)
- %vp_v128f32_v128f64 = call <128 x double> @llvm.vp.fpext.v128float.v128double(<128 x float> undef, <128 x i1> undef, i32 undef)
+ %vp_v128f16_v128f32 = call <128 x float> @llvm.vp.fpext.v128f32.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef)
+ %vp_v128f16_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f16(<128 x half> undef, <128 x i1> undef, i32 undef)
+ %vp_v128f32_v128f64 = call <128 x double> @llvm.vp.fpext.v128f64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)

 %nxv1f16_nxv1f32 = fpext <vscale x 1 x half> undef to <vscale x 1 x float>
 %nxv1f16_nxv1f64 = fpext <vscale x 1 x half> undef to <vscale x 1 x double>
 %nxv1f32_nxv1f64 = fpext <vscale x 1 x float> undef to <vscale x 1 x double>

- %vp_nxv1f16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.fpext.nxv1half.nxv1float(<vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1f16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.fpext.nxv1half.nxv1double(<vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1f32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.fpext.nxv1float.nxv1double(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f16_nxv1f32 = call <vscale x 1 x float> @llvm.vp.fpext.nxv1f32.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f16_nxv1f64 = call <vscale x 1 x double> @llvm.vp.fpext.nxv1f64.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f32_nxv1f64 = call <vscale x 1 x double> @llvm.vp.fpext.nxv1f64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)

 %nxv2f16_nxv2f32 = fpext <vscale x 2 x half> undef to <vscale x 2 x float>
 %nxv2f16_nxv2f64 = fpext <vscale x 2 x half> undef to <vscale x 2 x double>
 %nxv2f32_nxv2f64 = fpext <vscale x 2 x float> undef to <vscale x 2 x double>

- %vp_nxv2f16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.fpext.nxv2half.nxv2float(<vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2f16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.fpext.nxv2half.nxv2double(<vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2f32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.fpext.nxv2float.nxv2double(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f16_nxv2f32 = call <vscale x 2 x float> @llvm.vp.fpext.nxv2f32.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f16_nxv2f64 = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f32_nxv2f64 = call <vscale x 2 x double> @llvm.vp.fpext.nxv2f64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)

 %nxv4f16_nxv4f32 = fpext <vscale x 4 x half> undef to <vscale x 4 x float>
 %nxv4f16_nxv4f64 = fpext <vscale x 4 x half> undef to <vscale x 4 x double>
 %nxv4f32_nxv4f64 = fpext <vscale x 4 x float> undef to <vscale x 4 x double>

- %vp_nxv4f16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.fpext.nxv4half.nxv4float(<vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4f16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.fpext.nxv4half.nxv4double(<vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4f32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.fpext.nxv4float.nxv4double(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f16_nxv4f32 = call <vscale x 4 x float> @llvm.vp.fpext.nxv4f32.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f16_nxv4f64 = call <vscale x 4 x double> @llvm.vp.fpext.nxv4f64.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f32_nxv4f64 = call <vscale x 4 x double> @llvm.vp.fpext.nxv4f64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)

 %nxv8f16_nxv8f32 = fpext <vscale x 8 x half> undef to <vscale x 8 x float>
 %nxv8f16_nxv8f64 = fpext <vscale x 8 x half> undef to <vscale x 8 x double>
 %nxv8f32_nxv8f64 = fpext <vscale x 8 x float> undef to <vscale x 8 x double>

- %vp_nxv8f16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.fpext.nxv8half.nxv8float(<vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8f16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.fpext.nxv8half.nxv8double(<vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8f32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.fpext.nxv8float.nxv8double(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f16_nxv8f32 = call <vscale x 8 x float> @llvm.vp.fpext.nxv8f32.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f16_nxv8f64 = call <vscale x 8 x double> @llvm.vp.fpext.nxv8f64.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f32_nxv8f64 = call <vscale x 8 x double> @llvm.vp.fpext.nxv8f64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)

 %nxv16f16_nxv16f32 = fpext <vscale x 16 x half> undef to <vscale x 16 x float>
 %nxv16f16_nxv16f64 = fpext <vscale x 16 x half> undef to <vscale x 16 x double>
 %nxv16f32_nxv16f64 = fpext <vscale x 16 x float> undef to <vscale x 16 x double>

- %vp_nxv16f16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.fpext.nxv16half.nxv16float(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16f16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16half.nxv16double(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16f32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16float.nxv16double(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f16_nxv16f32 = call <vscale x 16 x float> @llvm.vp.fpext.nxv16f32.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f16_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16f64.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f32_nxv16f64 = call <vscale x 16 x double> @llvm.vp.fpext.nxv16f64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)

 %nxv32f16_nxv32f32 = fpext <vscale x 32 x half> undef to <vscale x 32 x float>
 %nxv32f16_nxv32f64 = fpext <vscale x 32 x half> undef to <vscale x 32 x double>
 %nxv32f32_nxv32f64 = fpext <vscale x 32 x float> undef to <vscale x 32 x double>

- %vp_nxv32f16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.fpext.nxv32half.nxv32float(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32f16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32half.nxv32double(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32f32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32float.nxv32double(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f16_nxv32f32 = call <vscale x 32 x float> @llvm.vp.fpext.nxv32f32.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f16_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32f64.nxv32f16(<vscale x 32 x half> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f32_nxv32f64 = call <vscale x 32 x double> @llvm.vp.fpext.nxv32f64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)

 %nxv64f16_nxv64f32 = fpext <vscale x 64 x half> undef to <vscale x 64 x float>
 %nxv64f16_nxv64f64 = fpext <vscale x 64 x half> undef to <vscale x 64 x double>
 %nxv64f32_nxv64f64 = fpext <vscale x 64 x float> undef to <vscale x 64 x double>

- %vp_nxv64f16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.fpext.nxv64half.nxv64float(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64f16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64half.nxv64double(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64f32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64float.nxv64double(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f16_nxv64f32 = call <vscale x 64 x float> @llvm.vp.fpext.nxv64f32.nxv64f16(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f16_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64f64.nxv64f16(<vscale x 64 x half> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f32_nxv64f64 = call <vscale x 64 x double> @llvm.vp.fpext.nxv64f64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)

 ret void
 }
@@ -3274,113 +3274,113 @@ define void @fptrunc() {
 %v2f64_v2f16 = fptrunc <2 x double> undef to <2 x half>
 %v2f64_v2f32 = fptrunc <2 x double> undef to <2 x float>

- %vp_v2f32_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2float.v2half(<2 x float> undef, <2 x i1> undef, i32 undef)
- %vp_v2f64_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2double.v2half(<2 x double> undef, <2 x i1> undef, i32 undef)
- %vp_v2f64_v2f32 = call <2 x float> @llvm.vp.fptrunc.v2double.v2float(<2 x double> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f32_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f64_v2f16 = call <2 x half> @llvm.vp.fptrunc.v2f16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f64_v2f32 = call <2 x float> @llvm.vp.fptrunc.v2f32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)

 %v4f32_v4f16 = fptrunc <4 x float> undef to <4 x half>
 %v4f64_v4f16 = fptrunc <4 x double> undef to <4 x half>
 %v4f64_v4f32 = fptrunc <4 x double> undef to <4 x float>

- %vp_v4f32_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4float.v4half(<4 x float> undef, <4 x i1> undef, i32 undef)
- %vp_v4f64_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4double.v4half(<4 x double> undef, <4 x i1> undef, i32 undef)
- %vp_v4f64_v4f32 = call <4 x float> @llvm.vp.fptrunc.v4double.v4float(<4 x double> undef, <4 x i1> undef, i32 undef)
+ %vp_v4f32_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef)
+ %vp_v4f64_v4f16 = call <4 x half> @llvm.vp.fptrunc.v4f16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)
+ %vp_v4f64_v4f32 = call <4 x float> @llvm.vp.fptrunc.v4f32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef)

 %v8f32_v8f16 = fptrunc <8 x float> undef to <8 x half>
 %v8f64_v8f16 = fptrunc <8 x double> undef to <8 x half>
 %v8f64_v8f32 = fptrunc <8 x double> undef to <8 x float>

- %vp_v8f32_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8float.v8half(<8 x float> undef, <8 x i1> undef, i32 undef)
- %vp_v8f64_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8double.v8half(<8 x double> undef, <8 x i1> undef, i32 undef)
- %vp_v8f64_v8f32 = call <8 x float> @llvm.vp.fptrunc.v8double.v8float(<8 x double> undef, <8 x i1> undef, i32 undef)
+ %vp_v8f32_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef)
+ %vp_v8f64_v8f16 = call <8 x half> @llvm.vp.fptrunc.v8f16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)
+ %vp_v8f64_v8f32 = call <8 x float> @llvm.vp.fptrunc.v8f32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef)

 %v16f32_v16f16 = fptrunc <16 x float> undef to <16 x half>
 %v16f64_v16f16 = fptrunc <16 x double> undef to <16 x half>
 %v16f64_v16f32 = fptrunc <16 x double> undef to <16 x float>

- %vp_v16f32_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16float.v16half(<16 x float> undef, <16 x i1> undef, i32 undef)
- %vp_v16f64_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16double.v16half(<16 x double> undef, <16 x i1> undef, i32 undef)
- %vp_v16f64_v16f32 = call <16 x float> @llvm.vp.fptrunc.v16double.v16float(<16 x double> undef, <16 x i1> undef, i32 undef)
+ %vp_v16f32_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef)
+ %vp_v16f64_v16f16 = call <16 x half> @llvm.vp.fptrunc.v16f16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)
+ %vp_v16f64_v16f32 = call <16 x float> @llvm.vp.fptrunc.v16f32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef)

 %v32f32_v32f16 = fptrunc <32 x float> undef to <32 x half>
 %v32f64_v32f16 = fptrunc <32 x double> undef to <32 x half>
 %v32f64_v32f32 = fptrunc <32 x double> undef to <32 x float>

- %vp_v32f32_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32float.v32half(<32 x float> undef, <32 x i1> undef, i32 undef)
- %vp_v32f64_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32double.v32half(<32 x double> undef, <32 x i1> undef, i32 undef)
- %vp_v32f64_v32f32 = call <32 x float> @llvm.vp.fptrunc.v32double.v32float(<32 x double> undef, <32 x i1> undef, i32 undef)
+ %vp_v32f32_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef)
+ %vp_v32f64_v32f16 = call <32 x half> @llvm.vp.fptrunc.v32f16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)
+ %vp_v32f64_v32f32 = call <32 x float> @llvm.vp.fptrunc.v32f32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef)

 %v64f32_v64f16 = fptrunc <64 x float> undef to <64 x half>
 %v64f64_v64f16 = fptrunc <64 x double> undef to <64 x half>
 %v64f64_v64f32 = fptrunc <64 x double> undef to <64 x float>

- %vp_v64f32_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64float.v64half(<64 x float> undef, <64 x i1> undef, i32 undef)
- %vp_v64f64_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64double.v64half(<64 x double> undef, <64 x i1> undef, i32 undef)
- %vp_v64f64_v64f32 = call <64 x float> @llvm.vp.fptrunc.v64double.v64float(<64 x double> undef, <64 x i1> undef, i32 undef)
+ %vp_v64f32_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef)
+ %vp_v64f64_v64f16 = call <64 x half> @llvm.vp.fptrunc.v64f16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)
+ %vp_v64f64_v64f32 = call <64 x float> @llvm.vp.fptrunc.v64f32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef)

 %v128f32_v128f16 = fptrunc <128 x float> undef to <128 x half>
 %v128f64_v128f16 = fptrunc <128 x double> undef to <128 x half>
 %v128f64_v128f32 = fptrunc <128 x double> undef to <128 x float>

- %vp_v128f32_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128float.v128half(<128 x float> undef, <128 x i1> undef, i32 undef)
- %vp_v128f64_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128double.v128half(<128 x double> undef, <128 x i1> undef, i32 undef)
- %vp_v128f64_v128f32 = call <128 x float> @llvm.vp.fptrunc.v128double.v128float(<128 x double> undef, <128 x i1> undef, i32 undef)
+ %vp_v128f32_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+ %vp_v128f64_v128f16 = call <128 x half> @llvm.vp.fptrunc.v128f16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+ %vp_v128f64_v128f32 = call <128 x float> @llvm.vp.fptrunc.v128f32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)

 %nxv1f32_nxv1f16 = fptrunc <vscale x 1 x float> undef to <vscale x 1 x half>
 %nxv1f64_nxv1f16 = fptrunc <vscale x 1 x double> undef to <vscale x 1 x half>
 %nxv1f64_nxv1f32 = fptrunc <vscale x 1 x double> undef to <vscale x 1 x float>

- %vp_nxv1f32_nxv1f16 = call <vscale x 1 x half> @llvm.vp.fptrunc.nxv1float.nxv1half(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1f64_nxv1f16 = call <vscale x 1 x half> @llvm.vp.fptrunc.nxv1double.nxv1half(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1f64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.fptrunc.nxv1double.nxv1float(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f32_nxv1f16 = call <vscale x 1 x half> @llvm.vp.fptrunc.nxv1f16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f64_nxv1f16 = call <vscale x 1 x half> @llvm.vp.fptrunc.nxv1f16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f64_nxv1f32 = call <vscale x 1 x float> @llvm.vp.fptrunc.nxv1f32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)

 %nxv2f32_nxv1f16 = fptrunc <vscale x 2 x float> undef to <vscale x 2 x half>
 %nxv2f64_nxv1f16 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x half>
 %nxv2f64_nxv1f32 = fptrunc <vscale x 2 x double> undef to <vscale x 2 x float>

- %vp_nxv2f32_nxv2f16 = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2float.nxv2half(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2f64_nxv2f16 = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2double.nxv2half(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2f64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.fptrunc.nxv2double.nxv2float(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f32_nxv2f16 = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2f16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f64_nxv2f16 = call <vscale x 2 x half> @llvm.vp.fptrunc.nxv2f16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f64_nxv2f32 = call <vscale x 2 x float> @llvm.vp.fptrunc.nxv2f32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)

 %nxv4f32_nxv4f16 = fptrunc <vscale x 4 x float> undef to <vscale x 4 x half>
 %nxv4f64_nxv4f16 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x half>
 %nxv4f64_nxv4f32 = fptrunc <vscale x 4 x double> undef to <vscale x 4 x float>

- %vp_nxv4f32_nxv4f16 = call <vscale x 4 x half> @llvm.vp.fptrunc.nxv4float.nxv4half(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4f64_nxv4f16 = call <vscale x 4 x half> @llvm.vp.fptrunc.nxv4double.nxv4half(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4f64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.fptrunc.nxv4double.nxv4float(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f32_nxv4f16 = call <vscale x 4 x half> @llvm.vp.fptrunc.nxv4f16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f64_nxv4f16 = call <vscale x 4 x half> @llvm.vp.fptrunc.nxv4f16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f64_nxv4f32 = call <vscale x 4 x float> @llvm.vp.fptrunc.nxv4f32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)

 %nxv8f32_nxv8f16 = fptrunc <vscale x 8 x float> undef to <vscale x 8 x half>
 %nxv8f64_nxv8f16 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x half>
 %nxv8f64_nxv8f32 = fptrunc <vscale x 8 x double> undef to <vscale x 8 x float>

- %vp_nxv8f32_nxv8f16 = call <vscale x 8 x half> @llvm.vp.fptrunc.nxv8float.nxv8half(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8f64_nxv8f16 = call <vscale x 8 x half> @llvm.vp.fptrunc.nxv8double.nxv8half(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8f64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.fptrunc.nxv8double.nxv8float(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f32_nxv8f16 = call <vscale x 8 x half> @llvm.vp.fptrunc.nxv8f16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f64_nxv8f16 = call <vscale x 8 x half> @llvm.vp.fptrunc.nxv8f16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f64_nxv8f32 = call <vscale x 8 x float> @llvm.vp.fptrunc.nxv8f32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)

 %nxv16f32_nxv16f16 = fptrunc <vscale x 16 x float> undef to <vscale x 16 x half>
 %nxv16f64_nxv16f16 = fptrunc <vscale x 16 x double> undef to <vscale x 16 x half>
 %nxv16f64_nxv16f32 = fptrunc <vscale x 16 x double> undef to <vscale x 16 x float>

- %vp_nxv16f32_nxv16f16 = call <vscale x 16 x half> @llvm.vp.fptrunc.nxv16float.nxv16half(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16f64_nxv16f16 = call <vscale x 16 x half> @llvm.vp.fptrunc.nxv16double.nxv16half(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16f64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.fptrunc.nxv16double.nxv16float(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f32_nxv16f16 = call <vscale x 16 x half> @llvm.vp.fptrunc.nxv16f16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f64_nxv16f16 = call <vscale x 16 x half> @llvm.vp.fptrunc.nxv16f16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f64_nxv16f32 = call <vscale x 16 x float> @llvm.vp.fptrunc.nxv16f32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)

 %nxv32f32_nxv32f16 = fptrunc <vscale x 32 x float> undef to <vscale x 32 x half>
 %nxv32f64_nxv32f16 = fptrunc <vscale x 32 x double> undef to <vscale x 32 x half>
 %nxv32f64_nxv32f32 = fptrunc <vscale x 32 x double> undef to <vscale x 32 x float>

- %vp_nxv32f32_nxv32f16 = call <vscale x 32 x half> @llvm.vp.fptrunc.nxv32float.nxv32half(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32f64_nxv32f16 = call <vscale x 32 x half> @llvm.vp.fptrunc.nxv32double.nxv32half(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32f64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.fptrunc.nxv32double.nxv32float(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f32_nxv32f16 = call <vscale x 32 x half> @llvm.vp.fptrunc.nxv32f16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f64_nxv32f16 = call <vscale x 32 x half> @llvm.vp.fptrunc.nxv32f16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f64_nxv32f32 = call <vscale x 32 x float> @llvm.vp.fptrunc.nxv32f32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)

 %nxv64f32_nxv64f16 = fptrunc <vscale x 64 x float> undef to <vscale x 64 x half>
 %nxv64f64_nxv64f16 = fptrunc <vscale x 64 x double> undef to <vscale x 64 x half>
 %nxv64f64_nxv64f32 = fptrunc <vscale x 64 x double> undef to <vscale x 64 x float>

- %vp_nxv64f32_nxv64f16 = call <vscale x 64 x half> @llvm.vp.fptrunc.nxv64float.nxv64half(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64f64_nxv64f16 = call <vscale x 64 x half> @llvm.vp.fptrunc.nxv64double.nxv64half(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64f64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.fptrunc.nxv64double.nxv64float(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f32_nxv64f16 = call <vscale x 64 x half> @llvm.vp.fptrunc.nxv64f16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f64_nxv64f16 = call <vscale x 64 x half> @llvm.vp.fptrunc.nxv64f16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f64_nxv64f32 = call <vscale x 64 x float> @llvm.vp.fptrunc.nxv64f32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)

 ret void
 }
@@ -3963,16 +3963,16 @@ define void @fptosi() {
 %v2f32_v2i1 = fptosi <2 x float> undef to <2 x i1>
 %v2f64_v2i1 = fptosi <2 x double> undef to <2 x i1>

- %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2float.v2i8(<2 x float> undef, <2 x i1> undef, i32 undef)
- %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2double.v2i8(<2 x double> undef, <2 x i1> undef, i32 undef)
- %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2float.v2i16(<2 x float> undef, <2 x i1> undef, i32 undef)
- %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2double.v2i16(<2 x double> undef, <2 x i1> undef, i32 undef)
- %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2float.v2i32(<2 x float> undef, <2 x i1> undef, i32 undef)
- %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2double.v2i32(<2 x double> undef, <2 x i1> undef, i32 undef)
- %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2float.v2i64(<2 x float> undef, <2 x i1> undef, i32 undef)
- %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2double.v2i64(<2 x double> undef, <2 x i1> undef, i32 undef)
- %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2float.v2i1(<2 x float> undef, <2 x i1> undef, i32 undef)
- %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2double.v2i1(<2 x double> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptosi.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptosi.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptosi.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptosi.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptosi.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)

 %v4f32_v4i8 = fptosi <4 x float> undef to <4 x i8>
 %v4f64_v4i8 = fptosi <4 x double> undef to <4 x i8>
@@ -3985,16 +3985,16 @@ define void @fptosi() {
 %v4f32_v4i1 = fptosi <4 x float> undef to <4 x i1>
 %v4f64_v4i1 = fptosi <4 x double> undef to <4 x i1>

- %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4float.v4i8(<4 x float> undef, <4 x i1> undef, i32 undef)
- %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4double.v4i8(<4 x
double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4float.v4i16(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4double.v4i16(<4 x double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4float.v4i32(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4double.v4i32(<4 x double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4float.v4i64(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4double.v4i64(<4 x double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4float.v4i1(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4double.v4i1(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptosi.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptosi.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptosi.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptosi.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptosi.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) %v8f32_v8i8 = fptosi <8 x float> undef to <8 x i8> %v8f64_v8i8 = fptosi <8 x double> undef to <8 x i8> @@ -4007,16 +4007,16 @@ define void @fptosi() { %v8f32_v8i1 = fptosi <8 x float> undef to <8 x i1> %v8f64_v8i1 = fptosi <8 x double> undef to <8 x i1> - %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8float.v8i8(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8double.v8i8(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8float.v8i16(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8double.v8i16(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8float.v8i32(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8double.v8i32(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8float.v8i64(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8double.v8i64(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8float.v8i1(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8double.v8i1(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptosi.v8i8.v8f64(<8 x 
double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptosi.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptosi.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptosi.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptosi.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) %v16f32_v16i8 = fptosi <16 x float> undef to <16 x i8> %v16f64_v16i8 = fptosi <16 x double> undef to <16 x i8> @@ -4029,16 +4029,16 @@ define void @fptosi() { %v16f32_v16i1 = fptosi <16 x float> undef to <16 x i1> %v16f64_v16i1 = fptosi <16 x double> undef to <16 x i1> - %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16float.v16i8(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16double.v16i8(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16float.v16i16(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16double.v16i16(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16float.v16i32(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16double.v16i32(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16float.v16i64(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16double.v16i64(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16float.v16i1(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16double.v16i1(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptosi.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptosi.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptosi.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptosi.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptosi.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) %v32f32_v32i8 = fptosi <32 x float> undef 
to <32 x i8> %v32f64_v32i8 = fptosi <32 x double> undef to <32 x i8> @@ -4051,16 +4051,16 @@ define void @fptosi() { %v32f32_v32i1 = fptosi <32 x float> undef to <32 x i1> %v32f64_v32i1 = fptosi <32 x double> undef to <32 x i1> - %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32float.v32i8(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32double.v32i8(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32float.v32i16(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32double.v32i16(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32float.v32i32(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32double.v32i32(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32float.v32i64(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32double.v32i64(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32float.v32i1(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32double.v32i1(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptosi.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptosi.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptosi.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptosi.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptosi.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) %v64f32_v64i8 = fptosi <64 x float> undef to <64 x i8> %v64f64_v64i8 = fptosi <64 x double> undef to <64 x i8> @@ -4073,16 +4073,16 @@ define void @fptosi() { %v64f32_v64i1 = fptosi <64 x float> undef to <64 x i1> %v64f64_v64i1 = fptosi <64 x double> undef to <64 x i1> - %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64float.v64i8(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64double.v64i8(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64float.v64i16(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64double.v64i16(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64float.v64i32(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64double.v64i32(<64 x double> undef, <64 x i1> undef, i32 undef) - 
%vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64float.v64i64(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64double.v64i64(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64float.v64i1(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64double.v64i1(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptosi.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptosi.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptosi.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptosi.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptosi.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) %v128f32_v128i8 = fptosi <128 x float> undef to <128 x i8> %v128f64_v128i8 = fptosi <128 x double> undef to <128 x i8> @@ -4095,16 +4095,16 @@ define void @fptosi() { %v128f32_v128i1 = fptosi <128 x float> undef to <128 x i1> %v128f64_v128i1 = fptosi <128 x double> undef to <128 x i1> - %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128float.v128i8(<128 x float> undef, <128 x i1> undef, i32 undef) - %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128double.v128i8(<128 x double> undef, <128 x i1> undef, i32 undef) - %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128float.v128i16(<128 x float> undef, <128 x i1> undef, i32 undef) - %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128double.v128i16(<128 x double> undef, <128 x i1> undef, i32 undef) - %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128float.v128i32(<128 x float> undef, <128 x i1> undef, i32 undef) - %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128double.v128i32(<128 x double> undef, <128 x i1> undef, i32 undef) - %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128float.v128i64(<128 x float> undef, <128 x i1> undef, i32 undef) - %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128double.v128i64(<128 x double> undef, <128 x i1> undef, i32 undef) - %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128float.v128i1(<128 x float> undef, <128 x i1> undef, i32 undef) - %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128double.v128i1(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptosi.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptosi.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i16 = call <128 x 
i16> @llvm.vp.fptosi.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+ %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+ %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptosi.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+ %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+ %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptosi.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)
+ %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef)
+ %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptosi.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef)

 %nxv1f32_nxv1i8 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i8>
 %nxv1f64_nxv1i8 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i8>
@@ -4117,16 +4117,16 @@ define void @fptosi() {
 %nxv1f32_nxv1i1 = fptosi <vscale x 1 x float> undef to <vscale x 1 x i1>
 %nxv1f64_nxv1i1 = fptosi <vscale x 1 x double> undef to <vscale x 1 x i1>

- %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1float.nxv1i8(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1double.nxv1i8(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1float.nxv1i16(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1double.nxv1i16(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1float.nxv1i32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1double.nxv1i32(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1float.nxv1i64(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1double.nxv1i64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1float.nxv1i1(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
- %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1double.nxv1i1(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f32_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f64_nxv1i8 = call <vscale x 1 x i8> @llvm.vp.fptosi.nxv1i8.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f32_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f64_nxv1i16 = call <vscale x 1 x i16> @llvm.vp.fptosi.nxv1i16.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f32_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f64_nxv1i32 = call <vscale x 1 x i32> @llvm.vp.fptosi.nxv1i32.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f32_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f64_nxv1i64 = call <vscale x 1 x i64> @llvm.vp.fptosi.nxv1i64.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f32_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f32(<vscale x 1 x float> undef, <vscale x 1 x i1> undef, i32 undef)
+ %vp_nxv1f64_nxv1i1 = call <vscale x 1 x i1> @llvm.vp.fptosi.nxv1i1.nxv1f64(<vscale x 1 x double> undef, <vscale x 1 x i1> undef, i32 undef)

 %nxv2f32_nxv2i8 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i8>
 %nxv2f64_nxv2i8 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i8>
@@ -4139,16 +4139,16 @@ define void @fptosi() {
 %nxv2f32_nxv2i1 = fptosi <vscale x 2 x float> undef to <vscale x 2 x i1>
 %nxv2f64_nxv2i1 = fptosi <vscale x 2 x double> undef to <vscale x 2 x i1>

- %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2float.nxv2i8(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2double.nxv2i8(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2float.nxv2i16(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2double.nxv2i16(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2float.nxv2i32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2double.nxv2i32(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2float.nxv2i64(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2double.nxv2i64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2float.nxv2i1(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
- %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2double.nxv2i1(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f32_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f64_nxv2i8 = call <vscale x 2 x i8> @llvm.vp.fptosi.nxv2i8.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f32_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f64_nxv2i16 = call <vscale x 2 x i16> @llvm.vp.fptosi.nxv2i16.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f32_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f64_nxv2i32 = call <vscale x 2 x i32> @llvm.vp.fptosi.nxv2i32.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f32_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f64_nxv2i64 = call <vscale x 2 x i64> @llvm.vp.fptosi.nxv2i64.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f32_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f32(<vscale x 2 x float> undef, <vscale x 2 x i1> undef, i32 undef)
+ %vp_nxv2f64_nxv2i1 = call <vscale x 2 x i1> @llvm.vp.fptosi.nxv2i1.nxv2f64(<vscale x 2 x double> undef, <vscale x 2 x i1> undef, i32 undef)

 %nxv4f32_nxv4i8 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i8>
 %nxv4f64_nxv4i8 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i8>
@@ -4161,16 +4161,16 @@ define void @fptosi() {
 %nxv4f32_nxv4i1 = fptosi <vscale x 4 x float> undef to <vscale x 4 x i1>
 %nxv4f64_nxv4i1 = fptosi <vscale x 4 x double> undef to <vscale x 4 x i1>

- %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4float.nxv4i8(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4double.nxv4i8(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4float.nxv4i16(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4double.nxv4i16(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4float.nxv4i32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4double.nxv4i32(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4float.nxv4i64(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4double.nxv4i64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4float.nxv4i1(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
- %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4double.nxv4i1(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f32_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f64_nxv4i8 = call <vscale x 4 x i8> @llvm.vp.fptosi.nxv4i8.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f32_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f64_nxv4i16 = call <vscale x 4 x i16> @llvm.vp.fptosi.nxv4i16.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f32_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f64_nxv4i32 = call <vscale x 4 x i32> @llvm.vp.fptosi.nxv4i32.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f32_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f64_nxv4i64 = call <vscale x 4 x i64> @llvm.vp.fptosi.nxv4i64.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f32_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f32(<vscale x 4 x float> undef, <vscale x 4 x i1> undef, i32 undef)
+ %vp_nxv4f64_nxv4i1 = call <vscale x 4 x i1> @llvm.vp.fptosi.nxv4i1.nxv4f64(<vscale x 4 x double> undef, <vscale x 4 x i1> undef, i32 undef)

 %nxv8f32_nxv8i8 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i8>
 %nxv8f64_nxv8i8 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i8>
@@ -4183,16 +4183,16 @@ define void @fptosi() {
 %nxv8f32_nxv8i1 = fptosi <vscale x 8 x float> undef to <vscale x 8 x i1>
 %nxv8f64_nxv8i1 = fptosi <vscale x 8 x double> undef to <vscale x 8 x i1>

- %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8float.nxv8i8(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8double.nxv8i8(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8float.nxv8i16(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8double.nxv8i16(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8float.nxv8i32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8double.nxv8i32(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8float.nxv8i64(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8double.nxv8i64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8float.nxv8i1(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
- %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8double.nxv8i1(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f32_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f64_nxv8i8 = call <vscale x 8 x i8> @llvm.vp.fptosi.nxv8i8.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f32_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f64_nxv8i16 = call <vscale x 8 x i16> @llvm.vp.fptosi.nxv8i16.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f32_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f64_nxv8i32 = call <vscale x 8 x i32> @llvm.vp.fptosi.nxv8i32.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f32_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f64_nxv8i64 = call <vscale x 8 x i64> @llvm.vp.fptosi.nxv8i64.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f32_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f32(<vscale x 8 x float> undef, <vscale x 8 x i1> undef, i32 undef)
+ %vp_nxv8f64_nxv8i1 = call <vscale x 8 x i1> @llvm.vp.fptosi.nxv8i1.nxv8f64(<vscale x 8 x double> undef, <vscale x 8 x i1> undef, i32 undef)

 %nxv16f32_nxv16i8 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i8>
 %nxv16f64_nxv16i8 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i8>
@@ -4205,16 +4205,16 @@ define void @fptosi() {
 %nxv16f32_nxv16i1 = fptosi <vscale x 16 x float> undef to <vscale x 16 x i1>
 %nxv16f64_nxv16i1 = fptosi <vscale x 16 x double> undef to <vscale x 16 x i1>

- %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16float.nxv16i8(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16double.nxv16i8(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16float.nxv16i16(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16double.nxv16i16(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16float.nxv16i32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16double.nxv16i32(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16float.nxv16i64(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16double.nxv16i64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16float.nxv16i1(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
- %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16double.nxv16i1(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f32_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f64_nxv16i8 = call <vscale x 16 x i8> @llvm.vp.fptosi.nxv16i8.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f32_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f64_nxv16i16 = call <vscale x 16 x i16> @llvm.vp.fptosi.nxv16i16.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f32_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f64_nxv16i32 = call <vscale x 16 x i32> @llvm.vp.fptosi.nxv16i32.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f32_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f64_nxv16i64 = call <vscale x 16 x i64> @llvm.vp.fptosi.nxv16i64.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f32_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f32(<vscale x 16 x float> undef, <vscale x 16 x i1> undef, i32 undef)
+ %vp_nxv16f64_nxv16i1 = call <vscale x 16 x i1> @llvm.vp.fptosi.nxv16i1.nxv16f64(<vscale x 16 x double> undef, <vscale x 16 x i1> undef, i32 undef)

 %nxv32f32_nxv32i8 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i8>
 %nxv32f64_nxv32i8 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i8>
@@ -4227,16 +4227,16 @@ define void @fptosi() {
 %nxv32f32_nxv32i1 = fptosi <vscale x 32 x float> undef to <vscale x 32 x i1>
 %nxv32f64_nxv32i1 = fptosi <vscale x 32 x double> undef to <vscale x 32 x i1>

- %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32float.nxv32i8(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32double.nxv32i8(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32float.nxv32i16(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32double.nxv32i16(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32float.nxv32i32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32double.nxv32i32(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32float.nxv32i64(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32double.nxv32i64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32float.nxv32i1(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
- %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32double.nxv32i1(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f32_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f64_nxv32i8 = call <vscale x 32 x i8> @llvm.vp.fptosi.nxv32i8.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f32_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f64_nxv32i16 = call <vscale x 32 x i16> @llvm.vp.fptosi.nxv32i16.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f32_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f64_nxv32i32 = call <vscale x 32 x i32> @llvm.vp.fptosi.nxv32i32.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f32_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f64_nxv32i64 = call <vscale x 32 x i64> @llvm.vp.fptosi.nxv32i64.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f32_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f32(<vscale x 32 x float> undef, <vscale x 32 x i1> undef, i32 undef)
+ %vp_nxv32f64_nxv32i1 = call <vscale x 32 x i1> @llvm.vp.fptosi.nxv32i1.nxv32f64(<vscale x 32 x double> undef, <vscale x 32 x i1> undef, i32 undef)

 %nxv64f32_nxv64i8 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i8>
 %nxv64f64_nxv64i8 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i8>
@@ -4249,16 +4249,16 @@ define void @fptosi() {
 %nxv64f32_nxv64i1 = fptosi <vscale x 64 x float> undef to <vscale x 64 x i1>
 %nxv64f64_nxv64i1 = fptosi <vscale x 64 x double> undef to <vscale x 64 x i1>

- %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64float.nxv64i8(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64double.nxv64i8(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64float.nxv64i16(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64double.nxv64i16(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64float.nxv64i32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64double.nxv64i32(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64float.nxv64i64(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64double.nxv64i64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64float.nxv64i1(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
- %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64double.nxv64i1(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f32_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f64_nxv64i8 = call <vscale x 64 x i8> @llvm.vp.fptosi.nxv64i8.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f32_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f64_nxv64i16 = call <vscale x 64 x i16> @llvm.vp.fptosi.nxv64i16.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f32_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f64_nxv64i32 = call <vscale x 64 x i32> @llvm.vp.fptosi.nxv64i32.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f32_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f64_nxv64i64 = call <vscale x 64 x i64> @llvm.vp.fptosi.nxv64i64.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f32_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f32(<vscale x 64 x float> undef, <vscale x 64 x i1> undef, i32 undef)
+ %vp_nxv64f64_nxv64i1 = call <vscale x 64 x i1> @llvm.vp.fptosi.nxv64i1.nxv64f64(<vscale x 64 x double> undef, <vscale x 64 x i1> undef, i32 undef)

 ret void
 }
@@ -4841,16 +4841,16 @@ define void @fptoui() {
 %v2f32_v2i1 = fptoui <2 x float> undef to <2 x i1>
 %v2f64_v2i1 = fptoui <2 x double> undef to <2 x i1>

- %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2float.v2i8(<2 x float> undef, <2 x i1> undef, i32 undef)
- %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2double.v2i8(<2 x double> undef, <2 x i1> undef, i32 undef)
- %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2float.v2i16(<2 x float> undef, <2 x i1> undef, i32 undef)
- %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2double.v2i16(<2 x double> undef, <2 x i1> undef, i32 undef)
- %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2float.v2i32(<2 x float> undef, <2 x i1> undef, i32 undef)
- %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2double.v2i32(<2 x double> undef, <2 x i1> undef, i32 undef)
- %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2float.v2i64(<2 x float> undef, <2 x i1> undef, i32 undef)
- %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2double.v2i64(<2 x double> undef, <2 x i1> undef, i32 undef)
- %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2float.v2i1(<2 x float> undef, <2 x i1> undef, i32 undef)
- %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2double.v2i1(<2 x double> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f32_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f64_v2i8 = call <2 x i8> @llvm.vp.fptoui.v2i8.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f32_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f64_v2i16 = call <2 x i16> @llvm.vp.fptoui.v2i16.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f32_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f64_v2i32 = call <2 x i32> @llvm.vp.fptoui.v2i32.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f32_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f64_v2i64 = call <2 x i64> @llvm.vp.fptoui.v2i64.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f32_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f32(<2 x float> undef, <2 x i1> undef, i32 undef)
+ %vp_v2f64_v2i1 = call <2 x i1> @llvm.vp.fptoui.v2i1.v2f64(<2 x double> undef, <2 x i1> undef, i32 undef)

 %v4f32_v4i8 = fptoui <4 x float> undef to <4 x i8>
 %v4f64_v4i8 = fptoui <4 x double> undef to <4 x i8>
@@ -4863,16 +4863,16 @@ define void @fptoui() {
 %v4f32_v4i1 = fptoui <4 x float> undef to <4 x i1>
 %v4f64_v4i1 = fptoui <4 x double> undef to <4 x i1>

- %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4float.v4i8(<4 x float> undef, <4 x i1> undef, i32 undef)
- %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4double.v4i8(<4 x double> undef, <4 x i1> undef, i32 undef)
- %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4float.v4i16(<4 x float> undef, <4 x i1> undef, i32 undef)
- %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4double.v4i16(<4 x double> undef, <4 x i1> undef, i32 undef)
- %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4float.v4i32(<4 x float> undef,
<4 x i1> undef, i32 undef) - %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4double.v4i32(<4 x double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4float.v4i64(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4double.v4i64(<4 x double> undef, <4 x i1> undef, i32 undef) - %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4float.v4i1(<4 x float> undef, <4 x i1> undef, i32 undef) - %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4double.v4i1(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i8 = call <4 x i8> @llvm.vp.fptoui.v4i8.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i16 = call <4 x i16> @llvm.vp.fptoui.v4i16.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i32 = call <4 x i32> @llvm.vp.fptoui.v4i32.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i64 = call <4 x i64> @llvm.vp.fptoui.v4i64.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) + %vp_v4f32_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f32(<4 x float> undef, <4 x i1> undef, i32 undef) + %vp_v4f64_v4i1 = call <4 x i1> @llvm.vp.fptoui.v4i1.v4f64(<4 x double> undef, <4 x i1> undef, i32 undef) %v8f32_v8i8 = fptoui <8 x float> undef to <8 x i8> %v8f64_v8i8 = fptoui <8 x double> undef to <8 x i8> @@ -4885,16 +4885,16 @@ define void @fptoui() { %v8f32_v8i1 = fptoui <8 x float> undef to <8 x i1> %v8f64_v8i1 = fptoui <8 x double> undef to <8 x i1> - %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8float.v8i8(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8double.v8i8(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8float.v8i16(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8double.v8i16(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8float.v8i32(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8double.v8i32(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8float.v8i64(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8double.v8i64(<8 x double> undef, <8 x i1> undef, i32 undef) - %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8float.v8i1(<8 x float> undef, <8 x i1> undef, i32 undef) - %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8double.v8i1(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i8 = call <8 x i8> @llvm.vp.fptoui.v8i8.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i16 = call <8 x i16> @llvm.vp.fptoui.v8i16.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f32(<8 x float> undef, <8 x i1> 
undef, i32 undef) + %vp_v8f64_v8i32 = call <8 x i32> @llvm.vp.fptoui.v8i32.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i64 = call <8 x i64> @llvm.vp.fptoui.v8i64.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) + %vp_v8f32_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f32(<8 x float> undef, <8 x i1> undef, i32 undef) + %vp_v8f64_v8i1 = call <8 x i1> @llvm.vp.fptoui.v8i1.v8f64(<8 x double> undef, <8 x i1> undef, i32 undef) %v16f32_v16i8 = fptoui <16 x float> undef to <16 x i8> %v16f64_v16i8 = fptoui <16 x double> undef to <16 x i8> @@ -4907,16 +4907,16 @@ define void @fptoui() { %v16f32_v16i1 = fptoui <16 x float> undef to <16 x i1> %v16f64_v16i1 = fptoui <16 x double> undef to <16 x i1> - %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16float.v16i8(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16double.v16i8(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16float.v16i16(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16double.v16i16(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16float.v16i32(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16double.v16i32(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16float.v16i64(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16double.v16i64(<16 x double> undef, <16 x i1> undef, i32 undef) - %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16float.v16i1(<16 x float> undef, <16 x i1> undef, i32 undef) - %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16double.v16i1(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i8 = call <16 x i8> @llvm.vp.fptoui.v16i8.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i16 = call <16 x i16> @llvm.vp.fptoui.v16i16.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i32 = call <16 x i32> @llvm.vp.fptoui.v16i32.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i64 = call <16 x i64> @llvm.vp.fptoui.v16i64.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) + %vp_v16f32_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f32(<16 x float> undef, <16 x i1> undef, i32 undef) + %vp_v16f64_v16i1 = call <16 x i1> @llvm.vp.fptoui.v16i1.v16f64(<16 x double> undef, <16 x i1> undef, i32 undef) %v32f32_v32i8 = fptoui <32 x float> undef to <32 x i8> %v32f64_v32i8 = fptoui <32 x double> undef to <32 x i8> @@ -4929,16 +4929,16 @@ define void @fptoui() { %v32f32_v32i1 = fptoui <32 x float> undef to <32 x i1> %v32f64_v32i1 = fptoui <32 x double> undef to <32 x i1> - %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32float.v32i8(<32 x float> undef, <32 x i1> undef, i32 undef) - 
%vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32double.v32i8(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32float.v32i16(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32double.v32i16(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32float.v32i32(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32double.v32i32(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32float.v32i64(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32double.v32i64(<32 x double> undef, <32 x i1> undef, i32 undef) - %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32float.v32i1(<32 x float> undef, <32 x i1> undef, i32 undef) - %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32double.v32i1(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i8 = call <32 x i8> @llvm.vp.fptoui.v32i8.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i16 = call <32 x i16> @llvm.vp.fptoui.v32i16.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i32 = call <32 x i32> @llvm.vp.fptoui.v32i32.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i64 = call <32 x i64> @llvm.vp.fptoui.v32i64.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) + %vp_v32f32_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f32(<32 x float> undef, <32 x i1> undef, i32 undef) + %vp_v32f64_v32i1 = call <32 x i1> @llvm.vp.fptoui.v32i1.v32f64(<32 x double> undef, <32 x i1> undef, i32 undef) %v64f32_v64i8 = fptoui <64 x float> undef to <64 x i8> %v64f64_v64i8 = fptoui <64 x double> undef to <64 x i8> @@ -4951,16 +4951,16 @@ define void @fptoui() { %v64f32_v64i1 = fptoui <64 x float> undef to <64 x i1> %v64f64_v64i1 = fptoui <64 x double> undef to <64 x i1> - %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64float.v64i8(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64double.v64i8(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64float.v64i16(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64double.v64i16(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64float.v64i32(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64double.v64i32(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64float.v64i64(<64 x float> undef, <64 x i1> undef, i32 undef) - %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64double.v64i64(<64 x double> undef, <64 x i1> undef, i32 undef) - %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64float.v64i1(<64 x float> undef, <64 x i1> undef, i32 undef) - 
%vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64double.v64i1(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i8 = call <64 x i8> @llvm.vp.fptoui.v64i8.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i16 = call <64 x i16> @llvm.vp.fptoui.v64i16.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i32 = call <64 x i32> @llvm.vp.fptoui.v64i32.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i64 = call <64 x i64> @llvm.vp.fptoui.v64i64.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) + %vp_v64f32_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f32(<64 x float> undef, <64 x i1> undef, i32 undef) + %vp_v64f64_v64i1 = call <64 x i1> @llvm.vp.fptoui.v64i1.v64f64(<64 x double> undef, <64 x i1> undef, i32 undef) %v128f32_v128i8 = fptoui <128 x float> undef to <128 x i8> %v128f64_v128i8 = fptoui <128 x double> undef to <128 x i8> @@ -4973,16 +4973,16 @@ define void @fptoui() { %v128f32_v128i1 = fptoui <128 x float> undef to <128 x i1> %v128f64_v128i1 = fptoui <128 x double> undef to <128 x i1> - %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128float.v128i8(<128 x float> undef, <128 x i1> undef, i32 undef) - %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128double.v128i8(<128 x double> undef, <128 x i1> undef, i32 undef) - %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128float.v128i16(<128 x float> undef, <128 x i1> undef, i32 undef) - %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128double.v128i16(<128 x double> undef, <128 x i1> undef, i32 undef) - %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128float.v128i32(<128 x float> undef, <128 x i1> undef, i32 undef) - %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128double.v128i32(<128 x double> undef, <128 x i1> undef, i32 undef) - %vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128float.v128i64(<128 x float> undef, <128 x i1> undef, i32 undef) - %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128double.v128i64(<128 x double> undef, <128 x i1> undef, i32 undef) - %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128float.v128i1(<128 x float> undef, <128 x i1> undef, i32 undef) - %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128double.v128i1(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i8 = call <128 x i8> @llvm.vp.fptoui.v128i8.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i16 = call <128 x i16> @llvm.vp.fptoui.v128i16.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i32 = call <128 x i32> @llvm.vp.fptoui.v128i32.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) + 
%vp_v128f32_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i64 = call <128 x i64> @llvm.vp.fptoui.v128i64.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) + %vp_v128f32_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f32(<128 x float> undef, <128 x i1> undef, i32 undef) + %vp_v128f64_v128i1 = call <128 x i1> @llvm.vp.fptoui.v128i1.v128f64(<128 x double> undef, <128 x i1> undef, i32 undef) %nxv1f32_nxv1i8 = fptoui undef to %nxv1f64_nxv1i8 = fptoui undef to @@ -4995,16 +4995,16 @@ define void @fptoui() { %nxv1f32_nxv1i1 = fptoui undef to %nxv1f64_nxv1i1 = fptoui undef to - %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptoui.nxv1float.nxv1i8( undef, undef, i32 undef) - %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptoui.nxv1double.nxv1i8( undef, undef, i32 undef) - %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptoui.nxv1float.nxv1i16( undef, undef, i32 undef) - %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptoui.nxv1double.nxv1i16( undef, undef, i32 undef) - %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptoui.nxv1float.nxv1i32( undef, undef, i32 undef) - %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptoui.nxv1double.nxv1i32( undef, undef, i32 undef) - %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptoui.nxv1float.nxv1i64( undef, undef, i32 undef) - %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptoui.nxv1double.nxv1i64( undef, undef, i32 undef) - %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptoui.nxv1float.nxv1i1( undef, undef, i32 undef) - %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptoui.nxv1double.nxv1i1( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f32( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i8 = call @llvm.vp.fptoui.nxv1i8.nxv1f64( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f32( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i16 = call @llvm.vp.fptoui.nxv1i16.nxv1f64( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f32( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i32 = call @llvm.vp.fptoui.nxv1i32.nxv1f64( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f32( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i64 = call @llvm.vp.fptoui.nxv1i64.nxv1f64( undef, undef, i32 undef) + %vp_nxv1f32_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f32( undef, undef, i32 undef) + %vp_nxv1f64_nxv1i1 = call @llvm.vp.fptoui.nxv1i1.nxv1f64( undef, undef, i32 undef) %nxv2f32_nxv2i8 = fptoui undef to %nxv2f64_nxv2i8 = fptoui undef to @@ -5017,16 +5017,16 @@ define void @fptoui() { %nxv2f32_nxv2i1 = fptoui undef to %nxv2f64_nxv2i1 = fptoui undef to - %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptoui.nxv2float.nxv2i8( undef, undef, i32 undef) - %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptoui.nxv2double.nxv2i8( undef, undef, i32 undef) - %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptoui.nxv2float.nxv2i16( undef, undef, i32 undef) - %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptoui.nxv2double.nxv2i16( undef, undef, i32 undef) - %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptoui.nxv2float.nxv2i32( undef, undef, i32 undef) - %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptoui.nxv2double.nxv2i32( undef, undef, i32 undef) - %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptoui.nxv2float.nxv2i64( undef, undef, i32 undef) - %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptoui.nxv2double.nxv2i64( undef, undef, i32 undef) - %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptoui.nxv2float.nxv2i1( undef, undef, i32 undef) - %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptoui.nxv2double.nxv2i1( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f32( 
undef, undef, i32 undef) + %vp_nxv2f64_nxv2i8 = call @llvm.vp.fptoui.nxv2i8.nxv2f64( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f32( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i16 = call @llvm.vp.fptoui.nxv2i16.nxv2f64( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f32( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i32 = call @llvm.vp.fptoui.nxv2i32.nxv2f64( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f32( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i64 = call @llvm.vp.fptoui.nxv2i64.nxv2f64( undef, undef, i32 undef) + %vp_nxv2f32_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f32( undef, undef, i32 undef) + %vp_nxv2f64_nxv2i1 = call @llvm.vp.fptoui.nxv2i1.nxv2f64( undef, undef, i32 undef) %nxv4f32_nxv4i8 = fptoui undef to %nxv4f64_nxv4i8 = fptoui undef to @@ -5039,16 +5039,16 @@ define void @fptoui() { %nxv4f32_nxv4i1 = fptoui undef to %nxv4f64_nxv4i1 = fptoui undef to - %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptoui.nxv4float.nxv4i8( undef, undef, i32 undef) - %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptoui.nxv4double.nxv4i8( undef, undef, i32 undef) - %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptoui.nxv4float.nxv4i16( undef, undef, i32 undef) - %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptoui.nxv4double.nxv4i16( undef, undef, i32 undef) - %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptoui.nxv4float.nxv4i32( undef, undef, i32 undef) - %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptoui.nxv4double.nxv4i32( undef, undef, i32 undef) - %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptoui.nxv4float.nxv4i64( undef, undef, i32 undef) - %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptoui.nxv4double.nxv4i64( undef, undef, i32 undef) - %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptoui.nxv4float.nxv4i1( undef, undef, i32 undef) - %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptoui.nxv4double.nxv4i1( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f32( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i8 = call @llvm.vp.fptoui.nxv4i8.nxv4f64( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f32( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i16 = call @llvm.vp.fptoui.nxv4i16.nxv4f64( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f32( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i32 = call @llvm.vp.fptoui.nxv4i32.nxv4f64( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f32( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i64 = call @llvm.vp.fptoui.nxv4i64.nxv4f64( undef, undef, i32 undef) + %vp_nxv4f32_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f32( undef, undef, i32 undef) + %vp_nxv4f64_nxv4i1 = call @llvm.vp.fptoui.nxv4i1.nxv4f64( undef, undef, i32 undef) %nxv8f32_nxv8i8 = fptoui undef to %nxv8f64_nxv8i8 = fptoui undef to @@ -5061,16 +5061,16 @@ define void @fptoui() { %nxv8f32_nxv8i1 = fptoui undef to %nxv8f64_nxv8i1 = fptoui undef to - %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptoui.nxv8float.nxv8i8( undef, undef, i32 undef) - %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptoui.nxv8double.nxv8i8( undef, undef, i32 undef) - %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptoui.nxv8float.nxv8i16( undef, undef, i32 undef) - %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptoui.nxv8double.nxv8i16( undef, undef, i32 undef) - %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptoui.nxv8float.nxv8i32( undef, undef, i32 undef) - %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptoui.nxv8double.nxv8i32( undef, undef, i32 undef) - %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptoui.nxv8float.nxv8i64( undef, undef, i32 undef) - 
%vp_nxv8f64_nxv8i64 = call @llvm.vp.fptoui.nxv8double.nxv8i64( undef, undef, i32 undef) - %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptoui.nxv8float.nxv8i1( undef, undef, i32 undef) - %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptoui.nxv8double.nxv8i1( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f32( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i8 = call @llvm.vp.fptoui.nxv8i8.nxv8f64( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f32( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i16 = call @llvm.vp.fptoui.nxv8i16.nxv8f64( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f32( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i32 = call @llvm.vp.fptoui.nxv8i32.nxv8f64( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f32( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i64 = call @llvm.vp.fptoui.nxv8i64.nxv8f64( undef, undef, i32 undef) + %vp_nxv8f32_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f32( undef, undef, i32 undef) + %vp_nxv8f64_nxv8i1 = call @llvm.vp.fptoui.nxv8i1.nxv8f64( undef, undef, i32 undef) %nxv16f32_nxv16i8 = fptoui undef to %nxv16f64_nxv16i8 = fptoui undef to @@ -5083,16 +5083,16 @@ define void @fptoui() { %nxv16f32_nxv16i1 = fptoui undef to %nxv16f64_nxv16i1 = fptoui undef to - %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptoui.nxv16float.nxv16i8( undef, undef, i32 undef) - %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptoui.nxv16double.nxv16i8( undef, undef, i32 undef) - %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptoui.nxv16float.nxv16i16( undef, undef, i32 undef) - %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptoui.nxv16double.nxv16i16( undef, undef, i32 undef) - %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptoui.nxv16float.nxv16i32( undef, undef, i32 undef) - %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptoui.nxv16double.nxv16i32( undef, undef, i32 undef) - %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptoui.nxv16float.nxv16i64( undef, undef, i32 undef) - %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptoui.nxv16double.nxv16i64( undef, undef, i32 undef) - %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptoui.nxv16float.nxv16i1( undef, undef, i32 undef) - %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptoui.nxv16double.nxv16i1( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f32( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i8 = call @llvm.vp.fptoui.nxv16i8.nxv16f64( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f32( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i16 = call @llvm.vp.fptoui.nxv16i16.nxv16f64( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f32( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i32 = call @llvm.vp.fptoui.nxv16i32.nxv16f64( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f32( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i64 = call @llvm.vp.fptoui.nxv16i64.nxv16f64( undef, undef, i32 undef) + %vp_nxv16f32_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f32( undef, undef, i32 undef) + %vp_nxv16f64_nxv16i1 = call @llvm.vp.fptoui.nxv16i1.nxv16f64( undef, undef, i32 undef) %nxv32f32_nxv32i8 = fptoui undef to %nxv32f64_nxv32i8 = fptoui undef to @@ -5105,16 +5105,16 @@ define void @fptoui() { %nxv32f32_nxv32i1 = fptoui undef to %nxv32f64_nxv32i1 = fptoui undef to - %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptoui.nxv32float.nxv32i8( undef, undef, i32 undef) - %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptoui.nxv32double.nxv32i8( undef, undef, i32 undef) - %vp_nxv32f32_nxv32i16 = call 
@llvm.vp.fptoui.nxv32float.nxv32i16( undef, undef, i32 undef) - %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptoui.nxv32double.nxv32i16( undef, undef, i32 undef) - %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptoui.nxv32float.nxv32i32( undef, undef, i32 undef) - %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptoui.nxv32double.nxv32i32( undef, undef, i32 undef) - %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptoui.nxv32float.nxv32i64( undef, undef, i32 undef) - %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptoui.nxv32double.nxv32i64( undef, undef, i32 undef) - %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptoui.nxv32float.nxv32i1( undef, undef, i32 undef) - %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptoui.nxv32double.nxv32i1( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f32( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i8 = call @llvm.vp.fptoui.nxv32i8.nxv32f64( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f32( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i16 = call @llvm.vp.fptoui.nxv32i16.nxv32f64( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f32( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i32 = call @llvm.vp.fptoui.nxv32i32.nxv32f64( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f32( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i64 = call @llvm.vp.fptoui.nxv32i64.nxv32f64( undef, undef, i32 undef) + %vp_nxv32f32_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f32( undef, undef, i32 undef) + %vp_nxv32f64_nxv32i1 = call @llvm.vp.fptoui.nxv32i1.nxv32f64( undef, undef, i32 undef) %nxv64f32_nxv64i8 = fptoui undef to %nxv64f64_nxv64i8 = fptoui undef to @@ -5127,16 +5127,16 @@ define void @fptoui() { %nxv64f32_nxv64i1 = fptoui undef to %nxv64f64_nxv64i1 = fptoui undef to - %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptoui.nxv64float.nxv64i8( undef, undef, i32 undef) - %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptoui.nxv64double.nxv64i8( undef, undef, i32 undef) - %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptoui.nxv64float.nxv64i16( undef, undef, i32 undef) - %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptoui.nxv64double.nxv64i16( undef, undef, i32 undef) - %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptoui.nxv64float.nxv64i32( undef, undef, i32 undef) - %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptoui.nxv64double.nxv64i32( undef, undef, i32 undef) - %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptoui.nxv64float.nxv64i64( undef, undef, i32 undef) - %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptoui.nxv64double.nxv64i64( undef, undef, i32 undef) - %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptoui.nxv64float.nxv64i1( undef, undef, i32 undef) - %vp_nxv64f64_nxv64i1 = call @llvm.vp.fptoui.nxv64double.nxv64i1( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i8 = call @llvm.vp.fptoui.nxv64i8.nxv64f64( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i16 = call @llvm.vp.fptoui.nxv64i16.nxv64f64( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i32 = call @llvm.vp.fptoui.nxv64i32.nxv64f64( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f32( undef, undef, i32 undef) + %vp_nxv64f64_nxv64i64 = call @llvm.vp.fptoui.nxv64i64.nxv64f64( undef, undef, i32 undef) + %vp_nxv64f32_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f32( undef, undef, i32 undef) + 
%vp_nxv64f64_nxv64i1 = call @llvm.vp.fptoui.nxv64i1.nxv64f64( undef, undef, i32 undef) ret void } @@ -5719,16 +5719,16 @@ define void @sitofp() { %v2i1_v2f32 = sitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = sitofp <2 x i1> undef to <2 x double> - %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i8.v2float(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i8.v2double(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i16.v2float(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i16.v2double(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i32.v2float(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i32.v2double(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i64.v2float(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i64.v2double(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2i1.v2float(<2 x i1> undef, <2 x i1> undef, i32 undef) - %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2i1.v2double(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.sitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.sitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) %v4i8_v4f32 = sitofp <4 x i8> undef to <4 x float> %v4i8_v4f64 = sitofp <4 x i8> undef to <4 x double> @@ -5741,16 +5741,16 @@ define void @sitofp() { %v4i1_v4f32 = sitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = sitofp <4 x i1> undef to <4 x double> - %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i8.v4float(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i8.v4double(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i16.v4float(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i16.v4double(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i32.v4float(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i32.v4double(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i64.v4float(<4 x i64> undef, <4 x i1> undef, i32 undef) - %vp_v4fi64_v4f64 = call <4 x double> 
@llvm.vp.sitofp.v4i64.v4double(<4 x i64> undef, <4 x i1> undef, i32 undef) - %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4i1.v4float(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4i1.v4double(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.sitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.sitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) %v8i8_v8f32 = sitofp <8 x i8> undef to <8 x float> %v8i8_v8f64 = sitofp <8 x i8> undef to <8 x double> @@ -5763,16 +5763,16 @@ define void @sitofp() { %v8i1_v8f32 = sitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = sitofp <8 x i1> undef to <8 x double> - %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i8.v8float(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i8.v8double(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i16.v8float(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i16.v8double(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i32.v8float(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i32.v8double(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i64.v8float(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i64.v8double(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8i1.v8float(<8 x i1> undef, <8 x i1> undef, i32 undef) - %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8i1.v8double(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f64 = call 
<8 x double> @llvm.vp.sitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.sitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.sitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) %v16i8_v16f32 = sitofp <16 x i8> undef to <16 x float> %v16i8_v16f64 = sitofp <16 x i8> undef to <16 x double> @@ -5785,16 +5785,16 @@ define void @sitofp() { %v16i1_v16f32 = sitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = sitofp <16 x i1> undef to <16 x double> - %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i8.v16float(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i8.v16double(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i16.v16float(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i16.v16double(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i32.v16float(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i32.v16double(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i64.v16float(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i64.v16double(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16i1.v16float(<16 x i1> undef, <16 x i1> undef, i32 undef) - %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16i1.v16double(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.sitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.sitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) %v32i8_v32f32 = sitofp <32 x i8> undef to <32 x float> %v32i8_v32f64 = sitofp <32 x i8> undef to <32 x double> @@ -5807,16 +5807,16 @@ define void @sitofp() { %v32i1_v32f32 = sitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = sitofp <32 x i1> undef to <32 x double> - %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i8.v32float(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i8.v32double(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i16.v32float(<32 x i16> undef, <32 x i1> undef, i32 undef) - 
%vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i16.v32double(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i32.v32float(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i32.v32double(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i64.v32float(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i64.v32double(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32i1.v32float(<32 x i1> undef, <32 x i1> undef, i32 undef) - %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32i1.v32double(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.sitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.sitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) %v64i8_v64f32 = sitofp <64 x i8> undef to <64 x float> %v64i8_v64f64 = sitofp <64 x i8> undef to <64 x double> @@ -5829,16 +5829,16 @@ define void @sitofp() { %v64i1_v64f32 = sitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = sitofp <64 x i1> undef to <64 x double> - %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i8.v64float(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i8.v64double(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i16.v64float(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i16.v64double(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i32.v64float(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i32.v64double(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i64.v64float(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i64.v64double(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64i1.v64float(<64 x i1> undef, <64 x i1> undef, i32 undef) - %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64i1.v64double(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> 
undef, i32 undef) + %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.sitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.sitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) %v128i8_v128f32 = sitofp <128 x i8> undef to <128 x float> %v128i8_v128f64 = sitofp <128 x i8> undef to <128 x double> @@ -5851,16 +5851,16 @@ define void @sitofp() { %v128i1_v128f32 = sitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = sitofp <128 x i1> undef to <128 x double> - %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i8.v128float(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i8.v128double(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i16.v128float(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i16.v128double(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i32.v128float(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i32.v128double(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i64.v128float(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i64.v128double(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128i1.v128float(<128 x i1> undef, <128 x i1> undef, i32 undef) - %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128i1.v128double(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f64 = call <128 x double> 
@llvm.vp.sitofp.v128f64.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.sitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.sitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) %nxv1i8_nxv1f32 = sitofp undef to %nxv1i8_nxv1f64 = sitofp undef to @@ -5873,16 +5873,16 @@ define void @sitofp() { %nxv1i1_nxv1f32 = sitofp undef to %nxv1i1_nxv1f64 = sitofp undef to - %vp_nxv1fi8_nxv1f32 = call @llvm.vp.sitofp.nxv1i8.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi8_nxv1f64 = call @llvm.vp.sitofp.nxv1i8.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi16_nxv1f32 = call @llvm.vp.sitofp.nxv1i16.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi16_nxv1f64 = call @llvm.vp.sitofp.nxv1i16.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi32_nxv1f32 = call @llvm.vp.sitofp.nxv1i32.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi32_nxv1f64 = call @llvm.vp.sitofp.nxv1i32.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi64_nxv1f32 = call @llvm.vp.sitofp.nxv1i64.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi64_nxv1f64 = call @llvm.vp.sitofp.nxv1i64.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi1_nxv1f32 = call @llvm.vp.sitofp.nxv1i1.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi1_nxv1f64 = call @llvm.vp.sitofp.nxv1i1.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi8_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) + %vp_nxv1fi8_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) %nxv2i8_nxv2f32 = sitofp undef to %nxv2i8_nxv2f64 = sitofp undef to @@ -5895,16 +5895,16 @@ define void @sitofp() { %nxv2i1_nxv2f32 = sitofp undef to %nxv2i1_nxv2f64 = sitofp undef to - %vp_nxv2fi8_nxv2f32 = call @llvm.vp.sitofp.nxv2i8.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi8_nxv2f64 = call @llvm.vp.sitofp.nxv2i8.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi16_nxv2f32 = call @llvm.vp.sitofp.nxv2i16.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi16_nxv2f64 = call @llvm.vp.sitofp.nxv2i16.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi32_nxv2f32 = call @llvm.vp.sitofp.nxv2i32.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi32_nxv2f64 = call @llvm.vp.sitofp.nxv2i32.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi64_nxv2f32 = call @llvm.vp.sitofp.nxv2i64.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi64_nxv2f64 = call @llvm.vp.sitofp.nxv2i64.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi1_nxv2f32 = call @llvm.vp.sitofp.nxv2i1.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi1_nxv2f64 = call @llvm.vp.sitofp.nxv2i1.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi8_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) + %vp_nxv2fi8_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f32 = 
call @llvm.vp.sitofp.nxv2f32.nxv2i16( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f32 = call @llvm.vp.sitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f64 = call @llvm.vp.sitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) %nxv4i8_nxv4f32 = sitofp undef to %nxv4i8_nxv4f64 = sitofp undef to @@ -5917,16 +5917,16 @@ define void @sitofp() { %nxv4i1_nxv4f32 = sitofp undef to %nxv4i1_nxv4f64 = sitofp undef to - %vp_nxv4fi8_nxv4f32 = call @llvm.vp.sitofp.nxv4i8.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi8_nxv4f64 = call @llvm.vp.sitofp.nxv4i8.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi16_nxv4f32 = call @llvm.vp.sitofp.nxv4i16.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi16_nxv4f64 = call @llvm.vp.sitofp.nxv4i16.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi32_nxv4f32 = call @llvm.vp.sitofp.nxv4i32.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi32_nxv4f64 = call @llvm.vp.sitofp.nxv4i32.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi64_nxv4f32 = call @llvm.vp.sitofp.nxv4i64.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi64_nxv4f64 = call @llvm.vp.sitofp.nxv4i64.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi1_nxv4f32 = call @llvm.vp.sitofp.nxv4i1.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi1_nxv4f64 = call @llvm.vp.sitofp.nxv4i1.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi8_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) + %vp_nxv4fi8_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f32 = call @llvm.vp.sitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f64 = call @llvm.vp.sitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) %nxv8i8_nxv8f32 = sitofp undef to %nxv8i8_nxv8f64 = sitofp undef to @@ -5939,16 +5939,16 @@ define void @sitofp() { %nxv8i1_nxv8f32 = sitofp undef to %nxv8i1_nxv8f64 = sitofp undef to - %vp_nxv8fi8_nxv8f32 = call @llvm.vp.sitofp.nxv8i8.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi8_nxv8f64 = call @llvm.vp.sitofp.nxv8i8.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi16_nxv8f32 = call @llvm.vp.sitofp.nxv8i16.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi16_nxv8f64 = call @llvm.vp.sitofp.nxv8i16.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi32_nxv8f32 = call @llvm.vp.sitofp.nxv8i32.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi32_nxv8f64 = call @llvm.vp.sitofp.nxv8i32.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi64_nxv8f32 = call @llvm.vp.sitofp.nxv8i64.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi64_nxv8f64 = call @llvm.vp.sitofp.nxv8i64.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi1_nxv8f32 
= call @llvm.vp.sitofp.nxv8i1.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi1_nxv8f64 = call @llvm.vp.sitofp.nxv8i1.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi8_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) + %vp_nxv8fi8_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f32 = call @llvm.vp.sitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f64 = call @llvm.vp.sitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) %nxv16i8_nxv16f32 = sitofp undef to %nxv16i8_nxv16f64 = sitofp undef to @@ -5961,16 +5961,16 @@ define void @sitofp() { %nxv16i1_nxv16f32 = sitofp undef to %nxv16i1_nxv16f64 = sitofp undef to - %vp_nxv16fi8_nxv16f32 = call @llvm.vp.sitofp.nxv16i8.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi8_nxv16f64 = call @llvm.vp.sitofp.nxv16i8.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi16_nxv16f32 = call @llvm.vp.sitofp.nxv16i16.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi16_nxv16f64 = call @llvm.vp.sitofp.nxv16i16.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi32_nxv16f32 = call @llvm.vp.sitofp.nxv16i32.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi32_nxv16f64 = call @llvm.vp.sitofp.nxv16i32.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi64_nxv16f32 = call @llvm.vp.sitofp.nxv16i64.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi64_nxv16f64 = call @llvm.vp.sitofp.nxv16i64.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi1_nxv16f32 = call @llvm.vp.sitofp.nxv16i1.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi1_nxv16f64 = call @llvm.vp.sitofp.nxv16i1.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi8_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) + %vp_nxv16fi8_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f32 = call @llvm.vp.sitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f64 = call @llvm.vp.sitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) %nxv32i8_nxv32f32 = sitofp undef to %nxv32i8_nxv32f64 = sitofp undef to @@ -5983,16 +5983,16 @@ define void @sitofp() { %nxv32i1_nxv32f32 = sitofp undef to %nxv32i1_nxv32f64 = sitofp undef to - %vp_nxv32fi8_nxv32f32 = call @llvm.vp.sitofp.nxv32i8.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi8_nxv32f64 = call @llvm.vp.sitofp.nxv32i8.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi16_nxv32f32 = call @llvm.vp.sitofp.nxv32i16.nxv32float( undef, undef, i32 undef) - 
%vp_nxv32fi16_nxv32f64 = call @llvm.vp.sitofp.nxv32i16.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi32_nxv32f32 = call @llvm.vp.sitofp.nxv32i32.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi32_nxv32f64 = call @llvm.vp.sitofp.nxv32i32.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi64_nxv32f32 = call @llvm.vp.sitofp.nxv32i64.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi64_nxv32f64 = call @llvm.vp.sitofp.nxv32i64.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi1_nxv32f32 = call @llvm.vp.sitofp.nxv32i1.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi1_nxv32f64 = call @llvm.vp.sitofp.nxv32i1.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi8_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) + %vp_nxv32fi8_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f32 = call @llvm.vp.sitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f64 = call @llvm.vp.sitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) %nxv64i8_nxv64f32 = sitofp undef to %nxv64i8_nxv64f64 = sitofp undef to @@ -6005,16 +6005,16 @@ define void @sitofp() { %nxv64i1_nxv64f32 = sitofp undef to %nxv64i1_nxv64f64 = sitofp undef to - %vp_nxv64fi8_nxv64f32 = call @llvm.vp.sitofp.nxv64i8.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi8_nxv64f64 = call @llvm.vp.sitofp.nxv64i8.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi16_nxv64f32 = call @llvm.vp.sitofp.nxv64i16.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi16_nxv64f64 = call @llvm.vp.sitofp.nxv64i16.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi32_nxv64f32 = call @llvm.vp.sitofp.nxv64i32.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi32_nxv64f64 = call @llvm.vp.sitofp.nxv64i32.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi64_nxv64f32 = call @llvm.vp.sitofp.nxv64i64.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi64_nxv64f64 = call @llvm.vp.sitofp.nxv64i64.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi1_nxv64f32 = call @llvm.vp.sitofp.nxv64i1.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi1_nxv64f64 = call @llvm.vp.sitofp.nxv64i1.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi8_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) + %vp_nxv64fi8_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f64 = call @llvm.vp.sitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f32 = call @llvm.vp.sitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f64 = call 
@llvm.vp.sitofp.nxv64f64.nxv64i1( undef, undef, i32 undef) ret void } @@ -6597,16 +6597,16 @@ define void @uitofp() { %v2i1_v2f32 = uitofp <2 x i1> undef to <2 x float> %v2i1_v2f64 = uitofp <2 x i1> undef to <2 x double> - %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i8.v2float(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i8.v2double(<2 x i8> undef, <2 x i1> undef, i32 undef) - %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i16.v2float(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i16.v2double(<2 x i16> undef, <2 x i1> undef, i32 undef) - %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i32.v2float(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i32.v2double(<2 x i32> undef, <2 x i1> undef, i32 undef) - %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i64.v2float(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i64.v2double(<2 x i64> undef, <2 x i1> undef, i32 undef) - %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2i1.v2float(<2 x i1> undef, <2 x i1> undef, i32 undef) - %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2i1.v2double(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2fi8_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi8_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i8(<2 x i8> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi16_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i16(<2 x i16> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi32_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i32(<2 x i32> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi64_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i64(<2 x i64> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f32 = call <2 x float> @llvm.vp.uitofp.v2f32.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) + %vp_v2fi1_v2f64 = call <2 x double> @llvm.vp.uitofp.v2f64.v2i1(<2 x i1> undef, <2 x i1> undef, i32 undef) %v4i8_v4f32 = uitofp <4 x i8> undef to <4 x float> %v4i8_v4f64 = uitofp <4 x i8> undef to <4 x double> @@ -6619,16 +6619,16 @@ define void @uitofp() { %v4i1_v4f32 = uitofp <4 x i1> undef to <4 x float> %v4i1_v4f64 = uitofp <4 x i1> undef to <4 x double> - %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i8.v4float(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i8.v4double(<4 x i8> undef, <4 x i1> undef, i32 undef) - %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i16.v4float(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i16.v4double(<4 x i16> undef, <4 x i1> undef, i32 undef) - %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i32.v4float(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i32.v4double(<4 x i32> undef, <4 x i1> undef, i32 undef) - %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i64.v4float(<4 x i64> undef, <4 x i1> undef, i32 undef) - %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i64.v4double(<4 x i64> 
undef, <4 x i1> undef, i32 undef) - %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4i1.v4float(<4 x i1> undef, <4 x i1> undef, i32 undef) - %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4i1.v4double(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4fi8_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi8_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i8(<4 x i8> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi16_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i16(<4 x i16> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi32_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i32(<4 x i32> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi64_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i64(<4 x i64> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f32 = call <4 x float> @llvm.vp.uitofp.v4f32.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) + %vp_v4fi1_v4f64 = call <4 x double> @llvm.vp.uitofp.v4f64.v4i1(<4 x i1> undef, <4 x i1> undef, i32 undef) %v8i8_v8f32 = uitofp <8 x i8> undef to <8 x float> %v8i8_v8f64 = uitofp <8 x i8> undef to <8 x double> @@ -6641,16 +6641,16 @@ define void @uitofp() { %v8i1_v8f32 = uitofp <8 x i1> undef to <8 x float> %v8i1_v8f64 = uitofp <8 x i1> undef to <8 x double> - %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i8.v8float(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i8.v8double(<8 x i8> undef, <8 x i1> undef, i32 undef) - %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i16.v8float(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i16.v8double(<8 x i16> undef, <8 x i1> undef, i32 undef) - %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i32.v8float(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i32.v8double(<8 x i32> undef, <8 x i1> undef, i32 undef) - %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i64.v8float(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8fi64_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i64.v8double(<8 x i64> undef, <8 x i1> undef, i32 undef) - %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8i1.v8float(<8 x i1> undef, <8 x i1> undef, i32 undef) - %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8i1.v8double(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8fi8_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi8_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i8(<8 x i8> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi16_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i16(<8 x i16> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi32_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i32(<8 x i32> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi64_v8f64 = call <8 x double> 
@llvm.vp.uitofp.v8f64.v8i64(<8 x i64> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f32 = call <8 x float> @llvm.vp.uitofp.v8f32.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) + %vp_v8fi1_v8f64 = call <8 x double> @llvm.vp.uitofp.v8f64.v8i1(<8 x i1> undef, <8 x i1> undef, i32 undef) %v16i8_v16f32 = uitofp <16 x i8> undef to <16 x float> %v16i8_v16f64 = uitofp <16 x i8> undef to <16 x double> @@ -6663,16 +6663,16 @@ define void @uitofp() { %v16i1_v16f32 = uitofp <16 x i1> undef to <16 x float> %v16i1_v16f64 = uitofp <16 x i1> undef to <16 x double> - %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i8.v16float(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i8.v16double(<16 x i8> undef, <16 x i1> undef, i32 undef) - %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i16.v16float(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i16.v16double(<16 x i16> undef, <16 x i1> undef, i32 undef) - %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i32.v16float(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i32.v16double(<16 x i32> undef, <16 x i1> undef, i32 undef) - %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i64.v16float(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i64.v16double(<16 x i64> undef, <16 x i1> undef, i32 undef) - %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16i1.v16float(<16 x i1> undef, <16 x i1> undef, i32 undef) - %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16i1.v16double(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16fi8_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi8_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i8(<16 x i8> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi16_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i16(<16 x i16> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi32_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i32(<16 x i32> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi64_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i64(<16 x i64> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f32 = call <16 x float> @llvm.vp.uitofp.v16f32.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) + %vp_v16fi1_v16f64 = call <16 x double> @llvm.vp.uitofp.v16f64.v16i1(<16 x i1> undef, <16 x i1> undef, i32 undef) %v32i8_v32f32 = uitofp <32 x i8> undef to <32 x float> %v32i8_v32f64 = uitofp <32 x i8> undef to <32 x double> @@ -6685,16 +6685,16 @@ define void @uitofp() { %v32i1_v32f32 = uitofp <32 x i1> undef to <32 x float> %v32i1_v32f64 = uitofp <32 x i1> undef to <32 x double> - %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i8.v32float(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i8.v32double(<32 x i8> undef, <32 x i1> undef, i32 undef) - %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i16.v32float(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32fi16_v32f64 = 
call <32 x double> @llvm.vp.uitofp.v32i16.v32double(<32 x i16> undef, <32 x i1> undef, i32 undef) - %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i32.v32float(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i32.v32double(<32 x i32> undef, <32 x i1> undef, i32 undef) - %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i64.v32float(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i64.v32double(<32 x i64> undef, <32 x i1> undef, i32 undef) - %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32i1.v32float(<32 x i1> undef, <32 x i1> undef, i32 undef) - %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32i1.v32double(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32fi8_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi8_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i8(<32 x i8> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi16_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i16(<32 x i16> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi32_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i32(<32 x i32> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi64_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i64(<32 x i64> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f32 = call <32 x float> @llvm.vp.uitofp.v32f32.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) + %vp_v32fi1_v32f64 = call <32 x double> @llvm.vp.uitofp.v32f64.v32i1(<32 x i1> undef, <32 x i1> undef, i32 undef) %v64i8_v64f32 = uitofp <64 x i8> undef to <64 x float> %v64i8_v64f64 = uitofp <64 x i8> undef to <64 x double> @@ -6707,16 +6707,16 @@ define void @uitofp() { %v64i1_v64f32 = uitofp <64 x i1> undef to <64 x float> %v64i1_v64f64 = uitofp <64 x i1> undef to <64 x double> - %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i8.v64float(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i8.v64double(<64 x i8> undef, <64 x i1> undef, i32 undef) - %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i16.v64float(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i16.v64double(<64 x i16> undef, <64 x i1> undef, i32 undef) - %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i32.v64float(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i32.v64double(<64 x i32> undef, <64 x i1> undef, i32 undef) - %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i64.v64float(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i64.v64double(<64 x i64> undef, <64 x i1> undef, i32 undef) - %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64i1.v64float(<64 x i1> undef, <64 x i1> undef, i32 undef) - %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64i1.v64double(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64fi8_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + 
%vp_v64fi8_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i8(<64 x i8> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi16_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i16(<64 x i16> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi32_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i32(<64 x i32> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi64_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i64(<64 x i64> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f32 = call <64 x float> @llvm.vp.uitofp.v64f32.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) + %vp_v64fi1_v64f64 = call <64 x double> @llvm.vp.uitofp.v64f64.v64i1(<64 x i1> undef, <64 x i1> undef, i32 undef) %v128i8_v128f32 = uitofp <128 x i8> undef to <128 x float> %v128i8_v128f64 = uitofp <128 x i8> undef to <128 x double> @@ -6729,16 +6729,16 @@ define void @uitofp() { %v128i1_v128f32 = uitofp <128 x i1> undef to <128 x float> %v128i1_v128f64 = uitofp <128 x i1> undef to <128 x double> - %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i8.v128float(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i8.v128double(<128 x i8> undef, <128 x i1> undef, i32 undef) - %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i16.v128float(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i16.v128double(<128 x i16> undef, <128 x i1> undef, i32 undef) - %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i32.v128float(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i32.v128double(<128 x i32> undef, <128 x i1> undef, i32 undef) - %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i64.v128float(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i64.v128double(<128 x i64> undef, <128 x i1> undef, i32 undef) - %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128i1.v128float(<128 x i1> undef, <128 x i1> undef, i32 undef) - %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128i1.v128double(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128fi8_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi8_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i8(<128 x i8> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi16_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i16(<128 x i16> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi32_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i32(<128 x i32> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i64(<128 x i64> undef, <128 x i1> undef, i32 undef) + %vp_v128fi64_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i64(<128 x i64> undef, 
<128 x i1> undef, i32 undef) + %vp_v128fi1_v128f32 = call <128 x float> @llvm.vp.uitofp.v128f32.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) + %vp_v128fi1_v128f64 = call <128 x double> @llvm.vp.uitofp.v128f64.v128i1(<128 x i1> undef, <128 x i1> undef, i32 undef) %nxv1i8_nxv1f32 = uitofp undef to %nxv1i8_nxv1f64 = uitofp undef to @@ -6751,16 +6751,16 @@ define void @uitofp() { %nxv1i1_nxv1f32 = uitofp undef to %nxv1i1_nxv1f64 = uitofp undef to - %vp_nxv1fi8_nxv1f32 = call @llvm.vp.uitofp.nxv1i8.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi8_nxv1f64 = call @llvm.vp.uitofp.nxv1i8.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi16_nxv1f32 = call @llvm.vp.uitofp.nxv1i16.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi16_nxv1f64 = call @llvm.vp.uitofp.nxv1i16.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi32_nxv1f32 = call @llvm.vp.uitofp.nxv1i32.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi32_nxv1f64 = call @llvm.vp.uitofp.nxv1i32.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi64_nxv1f32 = call @llvm.vp.uitofp.nxv1i64.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi64_nxv1f64 = call @llvm.vp.uitofp.nxv1i64.nxv1double( undef, undef, i32 undef) - %vp_nxv1fi1_nxv1f32 = call @llvm.vp.uitofp.nxv1i1.nxv1float( undef, undef, i32 undef) - %vp_nxv1fi1_nxv1f64 = call @llvm.vp.uitofp.nxv1i1.nxv1double( undef, undef, i32 undef) + %vp_nxv1fi8_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i8( undef, undef, i32 undef) + %vp_nxv1fi8_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i8( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i16( undef, undef, i32 undef) + %vp_nxv1fi16_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i16( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i32( undef, undef, i32 undef) + %vp_nxv1fi32_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i32( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i64( undef, undef, i32 undef) + %vp_nxv1fi64_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i64( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f32 = call @llvm.vp.uitofp.nxv1f32.nxv1i1( undef, undef, i32 undef) + %vp_nxv1fi1_nxv1f64 = call @llvm.vp.uitofp.nxv1f64.nxv1i1( undef, undef, i32 undef) %nxv2i8_nxv2f32 = uitofp undef to %nxv2i8_nxv2f64 = uitofp undef to @@ -6773,16 +6773,16 @@ define void @uitofp() { %nxv2i1_nxv2f32 = uitofp undef to %nxv2i1_nxv2f64 = uitofp undef to - %vp_nxv2fi8_nxv2f32 = call @llvm.vp.uitofp.nxv2i8.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi8_nxv2f64 = call @llvm.vp.uitofp.nxv2i8.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi16_nxv2f32 = call @llvm.vp.uitofp.nxv2i16.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi16_nxv2f64 = call @llvm.vp.uitofp.nxv2i16.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi32_nxv2f32 = call @llvm.vp.uitofp.nxv2i32.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi32_nxv2f64 = call @llvm.vp.uitofp.nxv2i32.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi64_nxv2f32 = call @llvm.vp.uitofp.nxv2i64.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi64_nxv2f64 = call @llvm.vp.uitofp.nxv2i64.nxv2double( undef, undef, i32 undef) - %vp_nxv2fi1_nxv2f32 = call @llvm.vp.uitofp.nxv2i1.nxv2float( undef, undef, i32 undef) - %vp_nxv2fi1_nxv2f64 = call @llvm.vp.uitofp.nxv2i1.nxv2double( undef, undef, i32 undef) + %vp_nxv2fi8_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i8( undef, undef, i32 undef) + %vp_nxv2fi8_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i8( undef, undef, i32 undef) + %vp_nxv2fi16_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i16( undef, undef, 
i32 undef) + %vp_nxv2fi16_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i16( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i32( undef, undef, i32 undef) + %vp_nxv2fi32_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i32( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i64( undef, undef, i32 undef) + %vp_nxv2fi64_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i64( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f32 = call @llvm.vp.uitofp.nxv2f32.nxv2i1( undef, undef, i32 undef) + %vp_nxv2fi1_nxv2f64 = call @llvm.vp.uitofp.nxv2f64.nxv2i1( undef, undef, i32 undef) %nxv4i8_nxv4f32 = uitofp undef to %nxv4i8_nxv4f64 = uitofp undef to @@ -6795,16 +6795,16 @@ define void @uitofp() { %nxv4i1_nxv4f32 = uitofp undef to %nxv4i1_nxv4f64 = uitofp undef to - %vp_nxv4fi8_nxv4f32 = call @llvm.vp.uitofp.nxv4i8.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi8_nxv4f64 = call @llvm.vp.uitofp.nxv4i8.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi16_nxv4f32 = call @llvm.vp.uitofp.nxv4i16.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi16_nxv4f64 = call @llvm.vp.uitofp.nxv4i16.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi32_nxv4f32 = call @llvm.vp.uitofp.nxv4i32.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi32_nxv4f64 = call @llvm.vp.uitofp.nxv4i32.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi64_nxv4f32 = call @llvm.vp.uitofp.nxv4i64.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi64_nxv4f64 = call @llvm.vp.uitofp.nxv4i64.nxv4double( undef, undef, i32 undef) - %vp_nxv4fi1_nxv4f32 = call @llvm.vp.uitofp.nxv4i1.nxv4float( undef, undef, i32 undef) - %vp_nxv4fi1_nxv4f64 = call @llvm.vp.uitofp.nxv4i1.nxv4double( undef, undef, i32 undef) + %vp_nxv4fi8_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i8( undef, undef, i32 undef) + %vp_nxv4fi8_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i8( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i16( undef, undef, i32 undef) + %vp_nxv4fi16_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i16( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i32( undef, undef, i32 undef) + %vp_nxv4fi32_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i32( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i64( undef, undef, i32 undef) + %vp_nxv4fi64_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i64( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f32 = call @llvm.vp.uitofp.nxv4f32.nxv4i1( undef, undef, i32 undef) + %vp_nxv4fi1_nxv4f64 = call @llvm.vp.uitofp.nxv4f64.nxv4i1( undef, undef, i32 undef) %nxv8i8_nxv8f32 = uitofp undef to %nxv8i8_nxv8f64 = uitofp undef to @@ -6817,16 +6817,16 @@ define void @uitofp() { %nxv8i1_nxv8f32 = uitofp undef to %nxv8i1_nxv8f64 = uitofp undef to - %vp_nxv8fi8_nxv8f32 = call @llvm.vp.uitofp.nxv8i8.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi8_nxv8f64 = call @llvm.vp.uitofp.nxv8i8.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi16_nxv8f32 = call @llvm.vp.uitofp.nxv8i16.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi16_nxv8f64 = call @llvm.vp.uitofp.nxv8i16.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi32_nxv8f32 = call @llvm.vp.uitofp.nxv8i32.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi32_nxv8f64 = call @llvm.vp.uitofp.nxv8i32.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi64_nxv8f32 = call @llvm.vp.uitofp.nxv8i64.nxv8float( undef, undef, i32 undef) - %vp_nxv8fi64_nxv8f64 = call @llvm.vp.uitofp.nxv8i64.nxv8double( undef, undef, i32 undef) - %vp_nxv8fi1_nxv8f32 = call @llvm.vp.uitofp.nxv8i1.nxv8float( undef, 
undef, i32 undef) - %vp_nxv8fi1_nxv8f64 = call @llvm.vp.uitofp.nxv8i1.nxv8double( undef, undef, i32 undef) + %vp_nxv8fi8_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i8( undef, undef, i32 undef) + %vp_nxv8fi8_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i8( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i16( undef, undef, i32 undef) + %vp_nxv8fi16_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i16( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i32( undef, undef, i32 undef) + %vp_nxv8fi32_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i32( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i64( undef, undef, i32 undef) + %vp_nxv8fi64_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i64( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f32 = call @llvm.vp.uitofp.nxv8f32.nxv8i1( undef, undef, i32 undef) + %vp_nxv8fi1_nxv8f64 = call @llvm.vp.uitofp.nxv8f64.nxv8i1( undef, undef, i32 undef) %nxv16i8_nxv16f32 = uitofp undef to %nxv16i8_nxv16f64 = uitofp undef to @@ -6839,16 +6839,16 @@ define void @uitofp() { %nxv16i1_nxv16f32 = uitofp undef to %nxv16i1_nxv16f64 = uitofp undef to - %vp_nxv16fi8_nxv16f32 = call @llvm.vp.uitofp.nxv16i8.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi8_nxv16f64 = call @llvm.vp.uitofp.nxv16i8.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi16_nxv16f32 = call @llvm.vp.uitofp.nxv16i16.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi16_nxv16f64 = call @llvm.vp.uitofp.nxv16i16.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi32_nxv16f32 = call @llvm.vp.uitofp.nxv16i32.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi32_nxv16f64 = call @llvm.vp.uitofp.nxv16i32.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi64_nxv16f32 = call @llvm.vp.uitofp.nxv16i64.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi64_nxv16f64 = call @llvm.vp.uitofp.nxv16i64.nxv16double( undef, undef, i32 undef) - %vp_nxv16fi1_nxv16f32 = call @llvm.vp.uitofp.nxv16i1.nxv16float( undef, undef, i32 undef) - %vp_nxv16fi1_nxv16f64 = call @llvm.vp.uitofp.nxv16i1.nxv16double( undef, undef, i32 undef) + %vp_nxv16fi8_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i8( undef, undef, i32 undef) + %vp_nxv16fi8_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i8( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i16( undef, undef, i32 undef) + %vp_nxv16fi16_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i16( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i32( undef, undef, i32 undef) + %vp_nxv16fi32_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i32( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i64( undef, undef, i32 undef) + %vp_nxv16fi64_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i64( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f32 = call @llvm.vp.uitofp.nxv16f32.nxv16i1( undef, undef, i32 undef) + %vp_nxv16fi1_nxv16f64 = call @llvm.vp.uitofp.nxv16f64.nxv16i1( undef, undef, i32 undef) %nxv32i8_nxv32f32 = uitofp undef to %nxv32i8_nxv32f64 = uitofp undef to @@ -6861,16 +6861,16 @@ define void @uitofp() { %nxv32i1_nxv32f32 = uitofp undef to %nxv32i1_nxv32f64 = uitofp undef to - %vp_nxv32fi8_nxv32f32 = call @llvm.vp.uitofp.nxv32i8.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi8_nxv32f64 = call @llvm.vp.uitofp.nxv32i8.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi16_nxv32f32 = call @llvm.vp.uitofp.nxv32i16.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi16_nxv32f64 = call 
@llvm.vp.uitofp.nxv32i16.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi32_nxv32f32 = call @llvm.vp.uitofp.nxv32i32.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi32_nxv32f64 = call @llvm.vp.uitofp.nxv32i32.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi64_nxv32f32 = call @llvm.vp.uitofp.nxv32i64.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi64_nxv32f64 = call @llvm.vp.uitofp.nxv32i64.nxv32double( undef, undef, i32 undef) - %vp_nxv32fi1_nxv32f32 = call @llvm.vp.uitofp.nxv32i1.nxv32float( undef, undef, i32 undef) - %vp_nxv32fi1_nxv32f64 = call @llvm.vp.uitofp.nxv32i1.nxv32double( undef, undef, i32 undef) + %vp_nxv32fi8_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i8( undef, undef, i32 undef) + %vp_nxv32fi8_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i8( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i16( undef, undef, i32 undef) + %vp_nxv32fi16_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i16( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i32( undef, undef, i32 undef) + %vp_nxv32fi32_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i32( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i64( undef, undef, i32 undef) + %vp_nxv32fi64_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i64( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f32 = call @llvm.vp.uitofp.nxv32f32.nxv32i1( undef, undef, i32 undef) + %vp_nxv32fi1_nxv32f64 = call @llvm.vp.uitofp.nxv32f64.nxv32i1( undef, undef, i32 undef) %nxv64i8_nxv64f32 = uitofp undef to %nxv64i8_nxv64f64 = uitofp undef to @@ -6883,16 +6883,16 @@ define void @uitofp() { %nxv64i1_nxv64f32 = uitofp undef to %nxv64i1_nxv64f64 = uitofp undef to - %vp_nxv64fi8_nxv64f32 = call @llvm.vp.uitofp.nxv64i8.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi8_nxv64f64 = call @llvm.vp.uitofp.nxv64i8.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi16_nxv64f32 = call @llvm.vp.uitofp.nxv64i16.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi16_nxv64f64 = call @llvm.vp.uitofp.nxv64i16.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi32_nxv64f32 = call @llvm.vp.uitofp.nxv64i32.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi32_nxv64f64 = call @llvm.vp.uitofp.nxv64i32.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi64_nxv64f32 = call @llvm.vp.uitofp.nxv64i64.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi64_nxv64f64 = call @llvm.vp.uitofp.nxv64i64.nxv64double( undef, undef, i32 undef) - %vp_nxv64fi1_nxv64f32 = call @llvm.vp.uitofp.nxv64i1.nxv64float( undef, undef, i32 undef) - %vp_nxv64fi1_nxv64f64 = call @llvm.vp.uitofp.nxv64i1.nxv64double( undef, undef, i32 undef) + %vp_nxv64fi8_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i8( undef, undef, i32 undef) + %vp_nxv64fi8_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i8( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i16( undef, undef, i32 undef) + %vp_nxv64fi16_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i16( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i32( undef, undef, i32 undef) + %vp_nxv64fi32_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i32( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i64( undef, undef, i32 undef) + %vp_nxv64fi64_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i64( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f32 = call @llvm.vp.uitofp.nxv64f32.nxv64i1( undef, undef, i32 undef) + %vp_nxv64fi1_nxv64f64 = call @llvm.vp.uitofp.nxv64f64.nxv64i1( 
undef, undef, i32 undef) ret void } diff --git a/llvm/test/Analysis/CostModel/RISCV/cmp.ll b/llvm/test/Analysis/CostModel/RISCV/cmp.ll index 40938e000b64..b820baf5acf8 100644 --- a/llvm/test/Analysis/CostModel/RISCV/cmp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/cmp.ll @@ -578,83 +578,83 @@ define void @fcmp() { ; fcmp olt <2 x float> undef, undef fcmp olt <2 x double> undef, undef - call <2 x i1> @llvm.vp.fcmp.v2float(<2 x float> undef, <2 x float> undef, metadata !"olt", <2 x i1> undef, i32 undef) - call <2 x i1> @llvm.vp.fcmp.v2double(<2 x double> undef, <2 x double> undef, metadata !"olt", <2 x i1> undef, i32 undef) + call <2 x i1> @llvm.vp.fcmp.v2f32(<2 x float> undef, <2 x float> undef, metadata !"olt", <2 x i1> undef, i32 undef) + call <2 x i1> @llvm.vp.fcmp.v2f64(<2 x double> undef, <2 x double> undef, metadata !"olt", <2 x i1> undef, i32 undef) fcmp olt <4 x float> undef, undef fcmp olt <4 x double> undef, undef - call <4 x i1> @llvm.vp.fcmp.v4float(<4 x float> undef, <4 x float> undef, metadata !"olt", <4 x i1> undef, i32 undef) - call <4 x i1> @llvm.vp.fcmp.v4double(<4 x double> undef, <4 x double> undef, metadata !"olt", <4 x i1> undef, i32 undef) + call <4 x i1> @llvm.vp.fcmp.v4f32(<4 x float> undef, <4 x float> undef, metadata !"olt", <4 x i1> undef, i32 undef) + call <4 x i1> @llvm.vp.fcmp.v4f64(<4 x double> undef, <4 x double> undef, metadata !"olt", <4 x i1> undef, i32 undef) fcmp olt <8 x float> undef, undef fcmp olt <8 x double> undef, undef - call <8 x i1> @llvm.vp.fcmp.v8float(<8 x float> undef, <8 x float> undef, metadata !"olt", <8 x i1> undef, i32 undef) - call <8 x i1> @llvm.vp.fcmp.v8double(<8 x double> undef, <8 x double> undef, metadata !"olt", <8 x i1> undef, i32 undef) + call <8 x i1> @llvm.vp.fcmp.v8f32(<8 x float> undef, <8 x float> undef, metadata !"olt", <8 x i1> undef, i32 undef) + call <8 x i1> @llvm.vp.fcmp.v8f64(<8 x double> undef, <8 x double> undef, metadata !"olt", <8 x i1> undef, i32 undef) fcmp olt <16 x float> undef, undef fcmp olt <16 x double> undef, undef - call <16 x i1> @llvm.vp.fcmp.v16float(<16 x float> undef, <16 x float> undef, metadata !"olt", <16 x i1> undef, i32 undef) - call <16 x i1> @llvm.vp.fcmp.v16double(<16 x double> undef, <16 x double> undef, metadata !"olt", <16 x i1> undef, i32 undef) + call <16 x i1> @llvm.vp.fcmp.v16f32(<16 x float> undef, <16 x float> undef, metadata !"olt", <16 x i1> undef, i32 undef) + call <16 x i1> @llvm.vp.fcmp.v16f64(<16 x double> undef, <16 x double> undef, metadata !"olt", <16 x i1> undef, i32 undef) fcmp olt <32 x float> undef, undef fcmp olt <32 x double> undef, undef - call <32 x i1> @llvm.vp.fcmp.v32float(<32 x float> undef, <32 x float> undef, metadata !"olt", <32 x i1> undef, i32 undef) - call <32 x i1> @llvm.vp.fcmp.v32double(<32 x double> undef, <32 x double> undef, metadata !"olt", <32 x i1> undef, i32 undef) + call <32 x i1> @llvm.vp.fcmp.v32f32(<32 x float> undef, <32 x float> undef, metadata !"olt", <32 x i1> undef, i32 undef) + call <32 x i1> @llvm.vp.fcmp.v32f64(<32 x double> undef, <32 x double> undef, metadata !"olt", <32 x i1> undef, i32 undef) fcmp olt <64 x float> undef, undef fcmp olt <64 x double> undef, undef - call <64 x i1> @llvm.vp.fcmp.v64float(<64 x float> undef, <64 x float> undef, metadata !"olt", <64 x i1> undef, i32 undef) - call <64 x i1> @llvm.vp.fcmp.v64double(<64 x double> undef, <64 x double> undef, metadata !"olt", <64 x i1> undef, i32 undef) + call <64 x i1> @llvm.vp.fcmp.v64f32(<64 x float> undef, <64 x float> undef, metadata !"olt", <64 x i1> undef, i32 
undef) + call <64 x i1> @llvm.vp.fcmp.v64f64(<64 x double> undef, <64 x double> undef, metadata !"olt", <64 x i1> undef, i32 undef) fcmp olt <128 x float> undef, undef fcmp olt <128 x double> undef, undef - call <128 x i1> @llvm.vp.fcmp.v128float(<128 x float> undef, <128 x float> undef, metadata !"olt", <128 x i1> undef, i32 undef) - call <128 x i1> @llvm.vp.fcmp.v128double(<128 x double> undef, <128 x double> undef, metadata !"olt", <128 x i1> undef, i32 undef) + call <128 x i1> @llvm.vp.fcmp.v128f32(<128 x float> undef, <128 x float> undef, metadata !"olt", <128 x i1> undef, i32 undef) + call <128 x i1> @llvm.vp.fcmp.v128f64(<128 x double> undef, <128 x double> undef, metadata !"olt", <128 x i1> undef, i32 undef) fcmp olt <256 x float> undef, undef fcmp olt <256 x double> undef, undef - call <256 x i1> @llvm.vp.fcmp.v256float(<256 x float> undef, <256 x float> undef, metadata !"olt", <256 x i1> undef, i32 undef) - call <256 x i1> @llvm.vp.fcmp.v256double(<256 x double> undef, <256 x double> undef, metadata !"olt", <256 x i1> undef, i32 undef) + call <256 x i1> @llvm.vp.fcmp.v256f32(<256 x float> undef, <256 x float> undef, metadata !"olt", <256 x i1> undef, i32 undef) + call <256 x i1> @llvm.vp.fcmp.v256f64(<256 x double> undef, <256 x double> undef, metadata !"olt", <256 x i1> undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv1float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv1double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv1f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv1f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv2float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv2double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv2f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv2f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv4float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv4double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv4f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv4f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv8float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv8double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv8f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv8f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv16float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv16double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv16f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv16f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv32float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv32double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv32f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv32f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt 
undef, undef - call @llvm.vp.fcmp.nxv64float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv64double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv64f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv64f64( undef, undef, metadata !"olt", undef, i32 undef) fcmp olt undef, undef fcmp olt undef, undef - call @llvm.vp.fcmp.nxv128float( undef, undef, metadata !"olt", undef, i32 undef) - call @llvm.vp.fcmp.nxv128double( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv128f32( undef, undef, metadata !"olt", undef, i32 undef) + call @llvm.vp.fcmp.nxv128f64( undef, undef, metadata !"olt", undef, i32 undef) ret void } -- GitLab From 3860e29e0e743c5f411c3023396d1ea07c28da7d Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 16 Oct 2024 06:10:18 +0100 Subject: [PATCH 066/329] [VPlan] Mark VPVectorPointerRecipe as not having sideeffects. VectorPointer doesn't read from memory or have any sideeffects. Mark it accordingly. --- llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 3 +++ .../Transforms/LoopVectorize/AArch64/induction-costs.ll | 2 -- .../RISCV/first-order-recurrence-scalable-vf1.ll | 1 - llvm/test/Transforms/LoopVectorize/X86/cost-model.ll | 6 +----- llvm/test/Transforms/LoopVectorize/dead_instructions.ll | 3 --- llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll | 6 ------ 6 files changed, 4 insertions(+), 17 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 0f90166528a4..6fe30356e8c9 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -87,6 +87,7 @@ bool VPRecipeBase::mayWriteToMemory() const { case VPBlendSC: case VPReductionEVLSC: case VPReductionSC: + case VPVectorPointerSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: case VPWidenGEPSC: @@ -132,6 +133,7 @@ bool VPRecipeBase::mayReadFromMemory() const { case VPBlendSC: case VPReductionEVLSC: case VPReductionSC: + case VPVectorPointerSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: case VPWidenGEPSC: @@ -170,6 +172,7 @@ bool VPRecipeBase::mayHaveSideEffects() const { case VPReductionEVLSC: case VPReductionSC: case VPScalarIVStepsSC: + case VPVectorPointerSC: case VPWidenCanonicalIVSC: case VPWidenCastSC: case VPWidenGEPSC: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll index 8080c3a9ba0a..36eee8d0c98c 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/induction-costs.ll @@ -110,7 +110,6 @@ define i64 @pointer_induction_only(ptr %start, ptr %end) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP4]] -; CHECK-NEXT: [[TMP6:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 0 ; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 2 ; CHECK-NEXT: [[WIDE_LOAD4:%.*]] = load <2 x i32>, ptr [[TMP7]], align 1 ; CHECK-NEXT: [[TMP9]] = zext <2 x i32> [[WIDE_LOAD4]] to <2 x i64> @@ -169,7 +168,6 @@ define i64 @int_and_pointer_iv(ptr %start, i32 %N) { ; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP2:%.*]] = getelementptr i32, 
ptr [[NEXT_GEP]], i32 0 ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr i32, ptr [[NEXT_GEP]], i32 4 ; CHECK-NEXT: [[WIDE_LOAD3:%.*]] = load <4 x i32>, ptr [[TMP3]], align 4 ; CHECK-NEXT: [[TMP5]] = zext <4 x i32> [[WIDE_LOAD3]] to <4 x i64> diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll index 2fd00d67a43e..f4dfdacac1b3 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/first-order-recurrence-scalable-vf1.ll @@ -18,7 +18,6 @@ define i64 @pr97452_scalable_vf1_for(ptr %src) #0 { ; CHECK-NEXT: [[VECTOR_RECUR:%.*]] = phi <4 x i64> [ , %[[VECTOR_PH]] ], [ [[WIDE_LOAD1:%.*]], %[[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 ; CHECK-NEXT: [[TMP2:%.*]] = getelementptr inbounds i64, ptr [[SRC]], i64 [[TMP0]] -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 0 ; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i64, ptr [[TMP2]], i32 4 ; CHECK-NEXT: [[WIDE_LOAD1]] = load <4 x i64>, ptr [[TMP5]], align 8 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 diff --git a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll index 8e5bf27acc64..73647919aac3 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/cost-model.ll @@ -858,10 +858,6 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP11:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[VEC_PHI1:%.*]] = phi <4 x i32> [ , [[VECTOR_PH]] ], [ [[TMP12:%.*]], [[VECTOR_BODY]] ] -; CHECK-NEXT: [[TMP5:%.*]] = add i32 [[INDEX]], 0 -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[TMP5]] -; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 0 -; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP7]], i32 4 ; CHECK-NEXT: [[TMP11]] = and <4 x i32> [[VEC_PHI]], [[TMP2]] ; CHECK-NEXT: [[TMP12]] = and <4 x i32> [[VEC_PHI1]], [[TMP2]] ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 8 @@ -879,7 +875,7 @@ define void @reduction_store(ptr noalias %src, ptr %dst, i1 %x) #2 { ; CHECK: loop: ; CHECK-NEXT: [[RED:%.*]] = phi i32 [ [[BC_MERGE_RDX]], [[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], [[LOOP]] ] ; CHECK-NEXT: [[IV:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] -; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC]], i32 [[IV]] +; CHECK-NEXT: [[GEP_SRC:%.*]] = getelementptr inbounds i32, ptr [[SRC:%.*]], i32 [[IV]] ; CHECK-NEXT: [[L:%.*]] = load i32, ptr [[GEP_SRC]], align 4 ; CHECK-NEXT: [[L_AND:%.*]] = and i32 [[L]], 3 ; CHECK-NEXT: store i32 [[L_AND]], ptr [[DST]], align 4 diff --git a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll index aae9393bbe0d..8b5dd4211d85 100644 --- a/llvm/test/Transforms/LoopVectorize/dead_instructions.ll +++ b/llvm/test/Transforms/LoopVectorize/dead_instructions.ll @@ -159,9 +159,6 @@ define void @dead_load_and_vector_pointer(ptr %a, ptr %b) { ; CHECK-NEXT: [[TMP7:%.*]] = add <2 x i32> [[WIDE_LOAD2]], ; CHECK-NEXT: store <2 x i32> [[TMP6]], ptr [[TMP4]], align 4, !alias.scope [[META6]], !noalias 
[[META9]]
 ; CHECK-NEXT: store <2 x i32> [[TMP7]], ptr [[TMP5]], align 4, !alias.scope [[META6]], !noalias [[META9]]
-; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[B]], i32 [[TMP0]]
-; CHECK-NEXT: [[TMP10:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0
-; CHECK-NEXT: [[TMP11:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 2
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP12:%.*]] = icmp eq i32 [[INDEX_NEXT]], 128
 ; CHECK-NEXT: br i1 [[TMP12]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP11:![0-9]+]]
diff --git a/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll
index 3d2c2e5e9b57..7a5a8bfb1a99 100644
--- a/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll
+++ b/llvm/test/Transforms/LoopVectorize/use-iv-start-value.ll
@@ -19,12 +19,6 @@ define i64 @foo(ptr %p1, ptr %p2, i64 %start, i64 %end) {
 ; CHECK-NEXT: br label %[[VECTOR_BODY:.*]]
 ; CHECK: [[VECTOR_BODY]]:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.*]] = add i64 [[START2]], [[INDEX]]
-; CHECK-NEXT: [[IND:%.*]] = add i64 [[OFFSET_IDX]], 0
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[IND]]
-; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds i32, ptr [[ARRAYIDX1]], i32 0
-; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[IND]]
-; CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds i32, ptr [[TMP4]], i32 0
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4
 ; CHECK-NEXT: [[TMP6:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]]
 ; CHECK-NEXT: br i1 [[TMP6]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]]
--
GitLab


From 00cd1a06daa7f950cf0954c7f9fafc371c255639 Mon Sep 17 00:00:00 2001
From: Thomas Fransham
Date: Wed, 16 Oct 2024 06:41:33 +0100
Subject: [PATCH 067/329] Update llvm::Registry to work for LLVM shared library builds on windows (#109024)

This is part of the effort to support enabling plugins on Windows by
adding better support for building LLVM and Clang as a DLL. Since
Windows doesn't implicitly import and merge exported symbols across
shared libraries the way other platforms do, we need to explicitly add
an extern template declaration for each instantiation of llvm::Registry
to force the registry symbols to be dllimport'ed.

I've added a new visibility macro that doesn't switch between dllimport
and dllexport on Windows, since the existing macro would be in the wrong
mode for the llvm::Registry instantiations declared in Clang.
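As a minimal sketch of the intended usage (with a hypothetical MyPluginInfo
entry type that is not part of this change): the header declares the
instantiation through the import/export macro, and exactly one translation
unit inside the DLL actually instantiates it:

    // Header shipped with the DLL: the extern template declaration forces
    // clients to dllimport the DLL's single Registry instantiation instead
    // of emitting their own copy of its static state.
    #include "llvm/Support/Registry.h"

    struct MyPluginInfo { const char *Name; }; // hypothetical entry type
    using MyPluginRegistry = llvm::Registry<MyPluginInfo>;

    namespace llvm {
    extern template class LLVM_TEMPLATE_ABI Registry<MyPluginInfo>;
    } // namespace llvm

    // One .cpp compiled into the DLL: defines and exports the registry's
    // static Head/Tail nodes and member functions.
    LLVM_INSTANTIATE_REGISTRY(MyPluginRegistry)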
This PR also depends on the Clang symbol visibility macros that will be
added by #108276.

---------

Co-authored-by: Saleem Abdulrasool
---
 clang/include/clang/Basic/ParsedAttrInfo.h    |  5 ++
 .../clang/Frontend/FrontendPluginRegistry.h   |  5 ++
 clang/include/clang/Lex/Preprocessor.h        |  5 ++
 .../CompilationDatabasePluginRegistry.h       |  6 ++
 .../Tooling/ToolExecutorPluginRegistry.h      |  6 ++
 llvm/include/llvm/CodeGen/GCMetadataPrinter.h |  2 +
 llvm/include/llvm/IR/GCStrategy.h             |  2 +
 llvm/include/llvm/Support/Compiler.h          | 11 ++++
 llvm/include/llvm/Support/Registry.h          | 64 ++++++++++---------
 9 files changed, 76 insertions(+), 30 deletions(-)

diff --git a/clang/include/clang/Basic/ParsedAttrInfo.h b/clang/include/clang/Basic/ParsedAttrInfo.h
index fab5c6f1377d..3b5f5d3c3f92 100644
--- a/clang/include/clang/Basic/ParsedAttrInfo.h
+++ b/clang/include/clang/Basic/ParsedAttrInfo.h
@@ -17,6 +17,7 @@
 #include "clang/Basic/AttrSubjectMatchRules.h"
 #include "clang/Basic/AttributeCommonInfo.h"
+#include "clang/Support/Compiler.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/Support/Registry.h"
 #include <list>
@@ -175,4 +176,8 @@ const std::list<std::unique_ptr<ParsedAttrInfo>> &getAttributePluginInstances();
 
 } // namespace clang
 
+namespace llvm {
+extern template class CLANG_TEMPLATE_ABI Registry<clang::ParsedAttrInfo>;
+} // namespace llvm
+
 #endif // LLVM_CLANG_BASIC_PARSEDATTRINFO_H
diff --git a/clang/include/clang/Frontend/FrontendPluginRegistry.h b/clang/include/clang/Frontend/FrontendPluginRegistry.h
index 810578534acb..5eea9c2fd89a 100644
--- a/clang/include/clang/Frontend/FrontendPluginRegistry.h
+++ b/clang/include/clang/Frontend/FrontendPluginRegistry.h
@@ -14,6 +14,7 @@
 #define LLVM_CLANG_FRONTEND_FRONTENDPLUGINREGISTRY_H
 
 #include "clang/Frontend/FrontendAction.h"
+#include "clang/Support/Compiler.h"
 #include "llvm/Support/Registry.h"
 
 namespace clang {
@@ -23,4 +24,8 @@ using FrontendPluginRegistry = llvm::Registry<PluginASTAction>;
 
 } // namespace clang
 
+namespace llvm {
+extern template class CLANG_TEMPLATE_ABI Registry<clang::PluginASTAction>;
+} // namespace llvm
+
 #endif // LLVM_CLANG_FRONTEND_FRONTENDPLUGINREGISTRY_H
diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h
index 4643b0213815..92749e4de44b 100644
--- a/clang/include/clang/Lex/Preprocessor.h
+++ b/clang/include/clang/Lex/Preprocessor.h
@@ -32,6 +32,7 @@
 #include "clang/Lex/PPEmbedParameters.h"
 #include "clang/Lex/Token.h"
 #include "clang/Lex/TokenLexer.h"
+#include "clang/Support/Compiler.h"
 #include "llvm/ADT/APSInt.h"
 #include "llvm/ADT/ArrayRef.h"
 #include "llvm/ADT/DenseMap.h"
@@ -3060,4 +3061,8 @@ using PragmaHandlerRegistry = llvm::Registry<PragmaHandler>;
 
 } // namespace clang
 
+namespace llvm {
+extern template class CLANG_TEMPLATE_ABI Registry<clang::PragmaHandler>;
+} // namespace llvm
+
 #endif // LLVM_CLANG_LEX_PREPROCESSOR_H
diff --git a/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h b/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h
index 8c58ad926a40..e6bcac542b0e 100644
--- a/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h
+++ b/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h
@@ -9,6 +9,7 @@
 #ifndef LLVM_CLANG_TOOLING_COMPILATIONDATABASEPLUGINREGISTRY_H
 #define LLVM_CLANG_TOOLING_COMPILATIONDATABASEPLUGINREGISTRY_H
 
+#include "clang/Support/Compiler.h"
 #include "clang/Tooling/CompilationDatabase.h"
 #include "llvm/Support/Registry.h"
 
@@ -42,4 +43,9 @@ using CompilationDatabasePluginRegistry =
 
 } // namespace tooling
 } // namespace clang
 
+namespace llvm {
+extern template class CLANG_TEMPLATE_ABI
+    Registry<clang::tooling::CompilationDatabasePlugin>;
+} // namespace llvm
+
 #endif // LLVM_CLANG_TOOLING_COMPILATIONDATABASEPLUGINREGISTRY_H
LLVM_CLANG_TOOLING_COMPILATIONDATABASEPLUGINREGISTRY_H diff --git a/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h b/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h index 5304ff26252d..8d5458323468 100644 --- a/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h +++ b/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h @@ -9,6 +9,7 @@ #ifndef LLVM_CLANG_TOOLING_TOOLEXECUTORPLUGINREGISTRY_H #define LLVM_CLANG_TOOLING_TOOLEXECUTORPLUGINREGISTRY_H +#include "clang/Support/Compiler.h" #include "clang/Tooling/Execution.h" #include "llvm/Support/Registry.h" @@ -20,4 +21,9 @@ using ToolExecutorPluginRegistry = llvm::Registry; } // namespace tooling } // namespace clang +namespace llvm { +extern template class CLANG_TEMPLATE_ABI + Registry; +} // namespace llvm + #endif // LLVM_CLANG_TOOLING_TOOLEXECUTORPLUGINREGISTRY_H diff --git a/llvm/include/llvm/CodeGen/GCMetadataPrinter.h b/llvm/include/llvm/CodeGen/GCMetadataPrinter.h index f9527c9f8752..9d421be8313f 100644 --- a/llvm/include/llvm/CodeGen/GCMetadataPrinter.h +++ b/llvm/include/llvm/CodeGen/GCMetadataPrinter.h @@ -34,6 +34,8 @@ class StackMaps; /// defaults from Registry. using GCMetadataPrinterRegistry = Registry; +extern template class LLVM_TEMPLATE_ABI Registry; + /// GCMetadataPrinter - Emits GC metadata as assembly code. Instances are /// created, managed, and owned by the AsmPrinter. class GCMetadataPrinter { diff --git a/llvm/include/llvm/IR/GCStrategy.h b/llvm/include/llvm/IR/GCStrategy.h index 3186465f0018..cbfbe23aaa06 100644 --- a/llvm/include/llvm/IR/GCStrategy.h +++ b/llvm/include/llvm/IR/GCStrategy.h @@ -141,6 +141,8 @@ public: /// GCMetadataPrinterRegistery as well. using GCRegistry = Registry; +extern template class LLVM_TEMPLATE_ABI Registry; + /// Lookup the GCStrategy object associated with the given gc name. std::unique_ptr getGCStrategy(const StringRef Name); diff --git a/llvm/include/llvm/Support/Compiler.h b/llvm/include/llvm/Support/Compiler.h index 1d2d751d4dc1..ab0cbff43d74 100644 --- a/llvm/include/llvm/Support/Compiler.h +++ b/llvm/include/llvm/Support/Compiler.h @@ -153,6 +153,12 @@ /// exported when llvm is built as a shared library with everything else that is /// unannotated will have internal visibility. /// +/// LLVM_ABI_EXPORT is for the special case for things like plugin symbol +/// declarations or definitions where we don't want the macro to be switching +/// between dllexport and dllimport on windows based on what codebase is being +/// built, it will only be dllexport. For non windows platforms this macro +/// behaves the same as LLVM_ABI. +/// /// LLVM_EXPORT_TEMPLATE is used on explicit template instantiations in source /// files that were declared extern in a header. This macro is only set as a /// compiler export attribute on windows, on other platforms it does nothing. 
@@ -179,6 +185,7 @@ #define LLVM_ABI #define LLVM_TEMPLATE_ABI #define LLVM_EXPORT_TEMPLATE +#define LLVM_ABI_EXPORT #elif defined(_WIN32) && !defined(__MINGW32__) #if defined(LLVM_EXPORTS) #define LLVM_ABI __declspec(dllexport) @@ -189,19 +196,23 @@ #define LLVM_TEMPLATE_ABI __declspec(dllimport) #define LLVM_EXPORT_TEMPLATE #endif +#define LLVM_ABI_EXPORT __declspec(dllexport) #elif defined(__ELF__) || defined(__MINGW32__) || defined(_AIX) #define LLVM_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT #define LLVM_TEMPLATE_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT #define LLVM_EXPORT_TEMPLATE +#define LLVM_ABI_EXPORT LLVM_ATTRIBUTE_VISIBILITY_DEFAULT #elif defined(__MACH__) || defined(__WASM__) #define LLVM_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT #define LLVM_TEMPLATE_ABI #define LLVM_EXPORT_TEMPLATE +#define LLVM_ABI_EXPORT LLVM_ATTRIBUTE_VISIBILITY_DEFAULT #endif #else #define LLVM_ABI #define LLVM_TEMPLATE_ABI #define LLVM_EXPORT_TEMPLATE +#define LLVM_ABI_EXPORT #endif #define LLVM_C_ABI LLVM_ABI #endif diff --git a/llvm/include/llvm/Support/Registry.h b/llvm/include/llvm/Support/Registry.h index 5bb6a254a47f..ff9226c39359 100644 --- a/llvm/include/llvm/Support/Registry.h +++ b/llvm/include/llvm/Support/Registry.h @@ -53,7 +53,13 @@ namespace llvm { Registry() = delete; friend class node; - static node *Head, *Tail; + // These must be must two separate declarations to workaround a 20 year + // old MSVC bug with dllexport and multiple static fields in the same + // declaration causing error C2487 "member of dll interface class may not + // be declared with dll interface". + // https://developercommunity.visualstudio.com/t/c2487-in-dllexport-class-with-static-members/69878 + static node *Head; + static node *Tail; public: /// Node in linked list of entries. @@ -76,7 +82,13 @@ namespace llvm { /// add a node to the executable's registry. Therefore it's not defined here /// to avoid it being instantiated in the plugin and is instead defined in /// the executable (see LLVM_INSTANTIATE_REGISTRY below). - static void add_node(node *N); + static void add_node(node *N) { + if (Tail) + Tail->Next = N; + else + Head = N; + Tail = N; + } /// Iterators for registry entries. /// @@ -95,7 +107,7 @@ namespace llvm { // begin is not defined here in order to avoid usage of an undefined static // data member, instead it's instantiated by LLVM_INSTANTIATE_REGISTRY. - static iterator begin(); + static iterator begin() { return iterator(Head); } static iterator end() { return iterator(nullptr); } static iterator_range entries() { @@ -124,36 +136,28 @@ namespace llvm { } }; }; + } // end namespace llvm +#ifdef _WIN32 /// Instantiate a registry class. -/// -/// This provides template definitions of add_node, begin, and the Head and Tail -/// pointers, then explicitly instantiates them. We could explicitly specialize -/// them, instead of the two-step process of define then instantiate, but -/// strictly speaking that's not allowed by the C++ standard (we would need to -/// have explicit specialization declarations in all translation units where the -/// specialization is used) so we don't. 
-#define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS) \ - namespace llvm { \ - template typename Registry::node *Registry::Head = nullptr;\ - template typename Registry::node *Registry::Tail = nullptr;\ - template \ - void Registry::add_node(typename Registry::node *N) { \ - if (Tail) \ - Tail->Next = N; \ - else \ - Head = N; \ - Tail = N; \ - } \ - template typename Registry::iterator Registry::begin() { \ - return iterator(Head); \ - } \ - template REGISTRY_CLASS::node *Registry::Head; \ - template REGISTRY_CLASS::node *Registry::Tail; \ - template \ - void Registry::add_node(REGISTRY_CLASS::node*); \ - template REGISTRY_CLASS::iterator Registry::begin(); \ +#define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS) \ + namespace llvm { \ + template \ + typename Registry::node *Registry::Head = nullptr; \ + template \ + typename Registry::node *Registry::Tail = nullptr; \ + template class LLVM_ABI_EXPORT Registry; \ + } +#else +#define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS) \ + namespace llvm { \ + template \ + typename Registry::node *Registry::Head = nullptr; \ + template \ + typename Registry::node *Registry::Tail = nullptr; \ + template class Registry; \ } +#endif #endif // LLVM_SUPPORT_REGISTRY_H -- GitLab From 682925ef43902282173dd9e27cc4a5cc7b794821 Mon Sep 17 00:00:00 2001 From: Fangrui Song Date: Tue, 15 Oct 2024 22:58:07 -0700 Subject: [PATCH 068/329] [ELF] Pass Ctx & to Partition --- lld/ELF/Driver.cpp | 4 +- lld/ELF/InputSection.h | 2 +- lld/ELF/Relocations.cpp | 4 +- lld/ELF/SyntheticSections.cpp | 78 +++++++++++++++++------------------ lld/ELF/SyntheticSections.h | 4 +- lld/ELF/Writer.cpp | 4 +- 6 files changed, 48 insertions(+), 48 deletions(-) diff --git a/lld/ELF/Driver.cpp b/lld/ELF/Driver.cpp index afacbbfe9099..fb77e67e9fc5 100644 --- a/lld/ELF/Driver.cpp +++ b/lld/ELF/Driver.cpp @@ -2471,7 +2471,7 @@ static void readSymbolPartitionSection(Ctx &ctx, InputSectionBase *s) { StringRef partName = reinterpret_cast(s->content().data()); for (Partition &part : ctx.partitions) { if (part.name == partName) { - sym->partition = part.getNumber(); + sym->partition = part.getNumber(ctx); return; } } @@ -2500,7 +2500,7 @@ static void readSymbolPartitionSection(Ctx &ctx, InputSectionBase *s) { ctx.partitions.emplace_back(ctx); Partition &newPart = ctx.partitions.back(); newPart.name = partName; - sym->partition = newPart.getNumber(); + sym->partition = newPart.getNumber(ctx); } static void markBuffersAsDontNeed(Ctx &ctx, bool skipLinkedOutput) { diff --git a/lld/ELF/InputSection.h b/lld/ELF/InputSection.h index 1a5bc629d8b0..dfcc7c8bc852 100644 --- a/lld/ELF/InputSection.h +++ b/lld/ELF/InputSection.h @@ -89,7 +89,7 @@ public: // The 1-indexed partition that this section is assigned to by the garbage // collector, or 0 if this section is dead. Normally there is only one // partition, so this will either be 0 or 1. - elf::Partition &getPartition() const; + elf::Partition &getPartition(Ctx &) const; // These corresponds to the fields in Elf_Shdr. 
uint64_t flags; diff --git a/lld/ELF/Relocations.cpp b/lld/ELF/Relocations.cpp index 73c19c8385a8..0188d658f921 100644 --- a/lld/ELF/Relocations.cpp +++ b/lld/ELF/Relocations.cpp @@ -878,7 +878,7 @@ template static void addRelativeReloc(Ctx &ctx, InputSectionBase &isec, uint64_t offsetInSec, Symbol &sym, int64_t addend, RelExpr expr, RelType type) { - Partition &part = isec.getPartition(); + Partition &part = isec.getPartition(ctx); if (sym.isTagged()) { std::lock_guard lock(relocMutex); @@ -1159,7 +1159,7 @@ void RelocationScanner::processAux(RelExpr expr, RelType type, uint64_t offset, if (ctx.arg.emachine == EM_MIPS && rel == ctx.target->symbolicRel) rel = ctx.target->relativeRel; std::lock_guard lock(relocMutex); - Partition &part = sec->getPartition(); + Partition &part = sec->getPartition(ctx); if (ctx.arg.emachine == EM_AARCH64 && type == R_AARCH64_AUTH_ABS64) { // For a preemptible symbol, we can't use a relative relocation. For an // undefined symbol, we can't compute offset at link-time and use a diff --git a/lld/ELF/SyntheticSections.cpp b/lld/ELF/SyntheticSections.cpp index d99aeac50ca5..e18e7a32df86 100644 --- a/lld/ELF/SyntheticSections.cpp +++ b/lld/ELF/SyntheticSections.cpp @@ -563,7 +563,7 @@ SmallVector EhFrameSection::getFdeData() const { uint8_t *buf = ctx.bufferStart + getParent()->offset + outSecOff; SmallVector ret; - uint64_t va = getPartition().ehFrameHdr->getVA(); + uint64_t va = getPartition(ctx).ehFrameHdr->getVA(); for (CieRecord *rec : cieRecords) { uint8_t enc = getFdeEncoding(rec->cie); for (EhSectionPiece *fde : rec->fdes) { @@ -650,8 +650,8 @@ void EhFrameSection::writeTo(uint8_t *buf) { for (EhInputSection *s : sections) ctx.target->relocateAlloc(*s, buf); - if (getPartition().ehFrameHdr && getPartition().ehFrameHdr->getParent()) - getPartition().ehFrameHdr->write(); + if (getPartition(ctx).ehFrameHdr && getPartition(ctx).ehFrameHdr->getParent()) + getPartition(ctx).ehFrameHdr->write(); } GotSection::GotSection(Ctx &ctx) @@ -1325,7 +1325,7 @@ static uint64_t addPltRelSz(Ctx &ctx) { return ctx.in.relaPlt->getSize(); } template std::vector> DynamicSection::computeContents() { - elf::Partition &part = getPartition(); + elf::Partition &part = getPartition(ctx); bool isMain = part.name.empty(); std::vector> entries; @@ -1586,7 +1586,7 @@ DynamicSection::computeContents() { } template void DynamicSection::finalizeContents() { - if (OutputSection *sec = getPartition().dynStrTab->getParent()) + if (OutputSection *sec = getPartition(ctx).dynStrTab->getParent()) getParent()->link = sec->sectionIndex; this->size = computeContents().size() * this->entsize; } @@ -1688,7 +1688,7 @@ void RelocationBaseSection::partitionRels() { } void RelocationBaseSection::finalizeContents() { - SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); + SymbolTableBaseSection *symTab = getPartition(ctx).dynSymTab.get(); // When linking glibc statically, .rel{,a}.plt contains R_*_IRELATIVE // relocations due to IFUNC (e.g. strcpy). 
sh_link will be set to 0 in that @@ -1712,7 +1712,7 @@ void DynamicReloc::computeRaw(Ctx &ctx, SymbolTableBaseSection *symt) { } void RelocationBaseSection::computeRels() { - SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); + SymbolTableBaseSection *symTab = getPartition(ctx).dynSymTab.get(); parallelForEach(relocs, [&ctx = ctx, symTab](DynamicReloc &rel) { rel.computeRaw(ctx, symTab); }); @@ -1852,7 +1852,7 @@ bool AndroidPackedRelocationSection::updateAllocSize(Ctx &ctx) { for (const DynamicReloc &rel : relocs) { Elf_Rela r; r.r_offset = rel.getOffset(); - r.setSymbolAndType(rel.getSymIndex(getPartition().dynSymTab.get()), + r.setSymbolAndType(rel.getSymIndex(getPartition(ctx).dynSymTab.get()), rel.type, false); r.r_addend = ctx.arg.isRela ? rel.computeAddend(ctx) : 0; @@ -2162,9 +2162,9 @@ void SymbolTableBaseSection::finalizeContents() { // Because the first symbol entry is a null entry, 1 is the first. getParent()->info = 1; - if (getPartition().gnuHashTab) { + if (getPartition(ctx).gnuHashTab) { // NB: It also sorts Symbols to meet the GNU hash table requirements. - getPartition().gnuHashTab->addSymbols(symbols); + getPartition(ctx).gnuHashTab->addSymbols(symbols); } else if (ctx.arg.emachine == EM_MIPS) { sortMipsSymbols(ctx, symbols); } @@ -2416,7 +2416,7 @@ GnuHashTableSection::GnuHashTableSection(Ctx &ctx) ".gnu.hash") {} void GnuHashTableSection::finalizeContents() { - if (OutputSection *sec = getPartition().dynSymTab->getParent()) + if (OutputSection *sec = getPartition(ctx).dynSymTab->getParent()) getParent()->link = sec->sectionIndex; // Computes bloom filter size in word size. We want to allocate 12 @@ -2438,7 +2438,7 @@ void GnuHashTableSection::writeTo(uint8_t *buf) { // Write a header. write32(ctx, buf, nBuckets); write32(ctx, buf + 4, - getPartition().dynSymTab->getNumSymbols() - symbols.size()); + getPartition(ctx).dynSymTab->getNumSymbols() - symbols.size()); write32(ctx, buf + 8, maskWords); write32(ctx, buf + 12, Shift2); buf += 16; @@ -2474,7 +2474,7 @@ void GnuHashTableSection::writeTo(uint8_t *buf) { // Write a hash bucket. Hash buckets contain indices in the following hash // value table. 
write32(ctx, buckets + i->bucketIdx, - getPartition().dynSymTab->getSymbolIndex(*i->sym)); + getPartition(ctx).dynSymTab->getSymbolIndex(*i->sym)); oldBucket = i->bucketIdx; } } @@ -2527,7 +2527,7 @@ HashTableSection::HashTableSection(Ctx &ctx) } void HashTableSection::finalizeContents() { - SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); + SymbolTableBaseSection *symTab = getPartition(ctx).dynSymTab.get(); if (OutputSection *sec = symTab->getParent()) getParent()->link = sec->sectionIndex; @@ -2541,7 +2541,7 @@ void HashTableSection::finalizeContents() { } void HashTableSection::writeTo(uint8_t *buf) { - SymbolTableBaseSection *symTab = getPartition().dynSymTab.get(); + SymbolTableBaseSection *symTab = getPartition(ctx).dynSymTab.get(); unsigned numSymbols = symTab->getNumSymbols(); uint32_t *p = reinterpret_cast(buf); @@ -3667,14 +3667,14 @@ void EhFrameHeader::writeTo(uint8_t *buf) { void EhFrameHeader::write() { uint8_t *buf = ctx.bufferStart + getParent()->offset + outSecOff; using FdeData = EhFrameSection::FdeData; - SmallVector fdes = getPartition().ehFrame->getFdeData(); + SmallVector fdes = getPartition(ctx).ehFrame->getFdeData(); buf[0] = 1; buf[1] = DW_EH_PE_pcrel | DW_EH_PE_sdata4; buf[2] = DW_EH_PE_udata4; buf[3] = DW_EH_PE_datarel | DW_EH_PE_sdata4; write32(ctx, buf + 4, - getPartition().ehFrame->getParent()->addr - this->getVA() - 4); + getPartition(ctx).ehFrame->getParent()->addr - this->getVA() - 4); write32(ctx, buf + 8, fdes.size()); buf += 12; @@ -3687,11 +3687,11 @@ void EhFrameHeader::write() { size_t EhFrameHeader::getSize() const { // .eh_frame_hdr has a 12 bytes header followed by an array of FDEs. - return 12 + getPartition().ehFrame->numFdes * 8; + return 12 + getPartition(ctx).ehFrame->numFdes * 8; } bool EhFrameHeader::isNeeded() const { - return isLive() && getPartition().ehFrame->isNeeded(); + return isLive() && getPartition(ctx).ehFrame->isNeeded(); } VersionDefinitionSection::VersionDefinitionSection(Ctx &ctx) @@ -3699,19 +3699,19 @@ VersionDefinitionSection::VersionDefinitionSection(Ctx &ctx) ".gnu.version_d") {} StringRef VersionDefinitionSection::getFileDefName() { - if (!getPartition().name.empty()) - return getPartition().name; + if (!getPartition(ctx).name.empty()) + return getPartition(ctx).name; if (!ctx.arg.soName.empty()) return ctx.arg.soName; return ctx.arg.outputFile; } void VersionDefinitionSection::finalizeContents() { - fileDefNameOff = getPartition().dynStrTab->addString(getFileDefName()); + fileDefNameOff = getPartition(ctx).dynStrTab->addString(getFileDefName()); for (const VersionDefinition &v : namedVersionDefs(ctx)) - verDefNameOffs.push_back(getPartition().dynStrTab->addString(v.name)); + verDefNameOffs.push_back(getPartition(ctx).dynStrTab->addString(v.name)); - if (OutputSection *sec = getPartition().dynStrTab->getParent()) + if (OutputSection *sec = getPartition(ctx).dynStrTab->getParent()) getParent()->link = sec->sectionIndex; // sh_info should be set to the number of definitions. This fact is missed in @@ -3765,16 +3765,16 @@ VersionTableSection::VersionTableSection(Ctx &ctx) void VersionTableSection::finalizeContents() { // At the moment of june 2016 GNU docs does not mention that sh_link field // should be set, but Sun docs do. Also readelf relies on this field. 
- getParent()->link = getPartition().dynSymTab->getParent()->sectionIndex; + getParent()->link = getPartition(ctx).dynSymTab->getParent()->sectionIndex; } size_t VersionTableSection::getSize() const { - return (getPartition().dynSymTab->getSymbols().size() + 1) * 2; + return (getPartition(ctx).dynSymTab->getSymbols().size() + 1) * 2; } void VersionTableSection::writeTo(uint8_t *buf) { buf += 2; - for (const SymbolTableEntry &s : getPartition().dynSymTab->getSymbols()) { + for (const SymbolTableEntry &s : getPartition(ctx).dynSymTab->getSymbols()) { // For an unextracted lazy symbol (undefined weak), it must have been // converted to Undefined and have VER_NDX_GLOBAL version here. assert(!s.sym->isLazy()); @@ -3785,7 +3785,7 @@ void VersionTableSection::writeTo(uint8_t *buf) { bool VersionTableSection::isNeeded() const { return isLive() && - (getPartition().verDef || getPartition().verNeed->isNeeded()); + (getPartition(ctx).verDef || getPartition(ctx).verNeed->isNeeded()); } void elf::addVerneed(Ctx &ctx, Symbol &ss) { @@ -3817,7 +3817,7 @@ template void VersionNeedSection::finalizeContents() { continue; verneeds.emplace_back(); Verneed &vn = verneeds.back(); - vn.nameStrTab = getPartition().dynStrTab->addString(f->soName); + vn.nameStrTab = getPartition(ctx).dynStrTab->addString(f->soName); bool isLibc = ctx.arg.relrGlibc && f->soName.starts_with("libc.so."); bool isGlibc2 = false; for (unsigned i = 0; i != f->vernauxs.size(); ++i) { @@ -3829,17 +3829,17 @@ template void VersionNeedSection::finalizeContents() { if (isLibc && ver.starts_with("GLIBC_2.")) isGlibc2 = true; vn.vernauxs.push_back({verdef->vd_hash, f->vernauxs[i], - getPartition().dynStrTab->addString(ver)}); + getPartition(ctx).dynStrTab->addString(ver)}); } if (isGlibc2) { const char *ver = "GLIBC_ABI_DT_RELR"; vn.vernauxs.push_back({hashSysV(ver), ++SharedFile::vernauxNum + getVerDefNum(ctx), - getPartition().dynStrTab->addString(ver)}); + getPartition(ctx).dynStrTab->addString(ver)}); } } - if (OutputSection *sec = getPartition().dynStrTab->getParent()) + if (OutputSection *sec = getPartition(ctx).dynStrTab->getParent()) getParent()->link = sec->sectionIndex; getParent()->info = verneeds.size(); } @@ -3995,7 +3995,7 @@ template void elf::splitSections(Ctx &ctx) { void elf::combineEhSections(Ctx &ctx) { llvm::TimeTraceScope timeScope("Combine EH sections"); for (EhInputSection *sec : ctx.ehInputSections) { - EhFrameSection &eh = *sec->getPartition().ehFrame; + EhFrameSection &eh = *sec->getPartition(ctx).ehFrame; sec->parent = &eh; eh.addralign = std::max(eh.addralign, sec->addralign); eh.sections.push_back(sec); @@ -4004,12 +4004,12 @@ void elf::combineEhSections(Ctx &ctx) { if (!ctx.mainPart->armExidx) return; - llvm::erase_if(ctx.inputSections, [](InputSectionBase *s) { + llvm::erase_if(ctx.inputSections, [&](InputSectionBase *s) { // Ignore dead sections and the partition end marker (.part.end), // whose partition number is out of bounds. if (!s->isLive() || s->partition == 255) return false; - Partition &part = s->getPartition(); + Partition &part = s->getPartition(ctx); return s->kind() == SectionBase::Regular && part.armExidx && part.armExidx->addSection(cast(s)); }); @@ -4447,7 +4447,7 @@ size_t PartitionElfHeaderSection::getSize() const { template void PartitionElfHeaderSection::writeTo(uint8_t *buf) { - writeEhdr(ctx, buf, getPartition()); + writeEhdr(ctx, buf, getPartition(ctx)); // Loadable partitions are always ET_DYN. 
auto *eHdr = reinterpret_cast(buf); @@ -4460,12 +4460,12 @@ PartitionProgramHeadersSection::PartitionProgramHeadersSection(Ctx &ctx) template size_t PartitionProgramHeadersSection::getSize() const { - return sizeof(typename ELFT::Phdr) * getPartition().phdrs.size(); + return sizeof(typename ELFT::Phdr) * getPartition(ctx).phdrs.size(); } template void PartitionProgramHeadersSection::writeTo(uint8_t *buf) { - writePhdrs(buf, getPartition()); + writePhdrs(buf, getPartition(ctx)); } PartitionIndexSection::PartitionIndexSection(Ctx &ctx) @@ -4747,7 +4747,7 @@ template void elf::createSyntheticSections(Ctx &ctx) { const unsigned threadCount = ctx.arg.threadCount; for (Partition &part : ctx.partitions) { auto add = [&](SyntheticSection &sec) { - sec.partition = part.getNumber(); + sec.partition = part.getNumber(ctx); ctx.inputSections.push_back(&sec); }; diff --git a/lld/ELF/SyntheticSections.h b/lld/ELF/SyntheticSections.h index 7ddbaff5573f..d64c4aad8c55 100644 --- a/lld/ELF/SyntheticSections.h +++ b/lld/ELF/SyntheticSections.h @@ -1475,10 +1475,10 @@ struct Partition { std::unique_ptr verSym; Partition(Ctx &ctx) : ctx(ctx) {} - unsigned getNumber() const { return this - &ctx.partitions[0] + 1; } + unsigned getNumber(Ctx &ctx) const { return this - &ctx.partitions[0] + 1; } }; -inline Partition &SectionBase::getPartition() const { +inline Partition &SectionBase::getPartition(Ctx &ctx) const { assert(isLive()); return ctx.partitions[partition - 1]; } diff --git a/lld/ELF/Writer.cpp b/lld/ELF/Writer.cpp index 49b6ab4a48b5..2cd4478d00cf 100644 --- a/lld/ELF/Writer.cpp +++ b/lld/ELF/Writer.cpp @@ -2191,7 +2191,7 @@ SmallVector Writer::createPhdrs(Partition &part) { return ret.back(); }; - unsigned partNo = part.getNumber(); + unsigned partNo = part.getNumber(ctx); bool isMain = partNo == 1; // Add the first PT_LOAD segment for regular output sections. @@ -2381,7 +2381,7 @@ SmallVector Writer::createPhdrs(Partition &part) { template void Writer::addPhdrForSection(Partition &part, unsigned shType, unsigned pType, unsigned pFlags) { - unsigned partNo = part.getNumber(); + unsigned partNo = part.getNumber(ctx); auto i = llvm::find_if(ctx.outputSections, [=](OutputSection *cmd) { return cmd->partition == partNo && cmd->type == shType; }); -- GitLab From b8882be26f00d2a053269948ee6ecaeff8db8eb8 Mon Sep 17 00:00:00 2001 From: Vassil Vassilev Date: Wed, 16 Oct 2024 06:03:45 +0000 Subject: [PATCH 069/329] Revert "Update llvm::Registry to work for LLVM shared library builds on windows (#109024)" This reverts commit 00cd1a06daa7f950cf0954c7f9fafc371c255639. 
This effectively reverts llvm/llvm-project#109024 --- clang/include/clang/Basic/ParsedAttrInfo.h | 5 -- .../clang/Frontend/FrontendPluginRegistry.h | 5 -- clang/include/clang/Lex/Preprocessor.h | 5 -- .../CompilationDatabasePluginRegistry.h | 6 -- .../Tooling/ToolExecutorPluginRegistry.h | 6 -- llvm/include/llvm/CodeGen/GCMetadataPrinter.h | 2 - llvm/include/llvm/IR/GCStrategy.h | 2 - llvm/include/llvm/Support/Compiler.h | 11 ---- llvm/include/llvm/Support/Registry.h | 64 +++++++++---------- 9 files changed, 30 insertions(+), 76 deletions(-) diff --git a/clang/include/clang/Basic/ParsedAttrInfo.h b/clang/include/clang/Basic/ParsedAttrInfo.h index 3b5f5d3c3f92..fab5c6f1377d 100644 --- a/clang/include/clang/Basic/ParsedAttrInfo.h +++ b/clang/include/clang/Basic/ParsedAttrInfo.h @@ -17,7 +17,6 @@ #include "clang/Basic/AttrSubjectMatchRules.h" #include "clang/Basic/AttributeCommonInfo.h" -#include "clang/Support/Compiler.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/Support/Registry.h" #include @@ -176,8 +175,4 @@ const std::list> &getAttributePluginInstances(); } // namespace clang -namespace llvm { -extern template class CLANG_TEMPLATE_ABI Registry; -} // namespace llvm - #endif // LLVM_CLANG_BASIC_PARSEDATTRINFO_H diff --git a/clang/include/clang/Frontend/FrontendPluginRegistry.h b/clang/include/clang/Frontend/FrontendPluginRegistry.h index 5eea9c2fd89a..810578534acb 100644 --- a/clang/include/clang/Frontend/FrontendPluginRegistry.h +++ b/clang/include/clang/Frontend/FrontendPluginRegistry.h @@ -14,7 +14,6 @@ #define LLVM_CLANG_FRONTEND_FRONTENDPLUGINREGISTRY_H #include "clang/Frontend/FrontendAction.h" -#include "clang/Support/Compiler.h" #include "llvm/Support/Registry.h" namespace clang { @@ -24,8 +23,4 @@ using FrontendPluginRegistry = llvm::Registry; } // namespace clang -namespace llvm { -extern template class CLANG_TEMPLATE_ABI Registry; -} // namespace llvm - #endif // LLVM_CLANG_FRONTEND_FRONTENDPLUGINREGISTRY_H diff --git a/clang/include/clang/Lex/Preprocessor.h b/clang/include/clang/Lex/Preprocessor.h index 92749e4de44b..4643b0213815 100644 --- a/clang/include/clang/Lex/Preprocessor.h +++ b/clang/include/clang/Lex/Preprocessor.h @@ -32,7 +32,6 @@ #include "clang/Lex/PPEmbedParameters.h" #include "clang/Lex/Token.h" #include "clang/Lex/TokenLexer.h" -#include "clang/Support/Compiler.h" #include "llvm/ADT/APSInt.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" @@ -3061,8 +3060,4 @@ using PragmaHandlerRegistry = llvm::Registry; } // namespace clang -namespace llvm { -extern template class CLANG_TEMPLATE_ABI Registry; -} // namespace llvm - #endif // LLVM_CLANG_LEX_PREPROCESSOR_H diff --git a/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h b/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h index e6bcac542b0e..8c58ad926a40 100644 --- a/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h +++ b/clang/include/clang/Tooling/CompilationDatabasePluginRegistry.h @@ -9,7 +9,6 @@ #ifndef LLVM_CLANG_TOOLING_COMPILATIONDATABASEPLUGINREGISTRY_H #define LLVM_CLANG_TOOLING_COMPILATIONDATABASEPLUGINREGISTRY_H -#include "clang/Support/Compiler.h" #include "clang/Tooling/CompilationDatabase.h" #include "llvm/Support/Registry.h" @@ -43,9 +42,4 @@ using CompilationDatabasePluginRegistry = } // namespace tooling } // namespace clang -namespace llvm { -extern template class CLANG_TEMPLATE_ABI - Registry; -} // namespace llvm - #endif // LLVM_CLANG_TOOLING_COMPILATIONDATABASEPLUGINREGISTRY_H diff --git 
a/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h b/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h index 8d5458323468..5304ff26252d 100644 --- a/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h +++ b/clang/include/clang/Tooling/ToolExecutorPluginRegistry.h @@ -9,7 +9,6 @@ #ifndef LLVM_CLANG_TOOLING_TOOLEXECUTORPLUGINREGISTRY_H #define LLVM_CLANG_TOOLING_TOOLEXECUTORPLUGINREGISTRY_H -#include "clang/Support/Compiler.h" #include "clang/Tooling/Execution.h" #include "llvm/Support/Registry.h" @@ -21,9 +20,4 @@ using ToolExecutorPluginRegistry = llvm::Registry; } // namespace tooling } // namespace clang -namespace llvm { -extern template class CLANG_TEMPLATE_ABI - Registry; -} // namespace llvm - #endif // LLVM_CLANG_TOOLING_TOOLEXECUTORPLUGINREGISTRY_H diff --git a/llvm/include/llvm/CodeGen/GCMetadataPrinter.h b/llvm/include/llvm/CodeGen/GCMetadataPrinter.h index 9d421be8313f..f9527c9f8752 100644 --- a/llvm/include/llvm/CodeGen/GCMetadataPrinter.h +++ b/llvm/include/llvm/CodeGen/GCMetadataPrinter.h @@ -34,8 +34,6 @@ class StackMaps; /// defaults from Registry. using GCMetadataPrinterRegistry = Registry; -extern template class LLVM_TEMPLATE_ABI Registry; - /// GCMetadataPrinter - Emits GC metadata as assembly code. Instances are /// created, managed, and owned by the AsmPrinter. class GCMetadataPrinter { diff --git a/llvm/include/llvm/IR/GCStrategy.h b/llvm/include/llvm/IR/GCStrategy.h index cbfbe23aaa06..3186465f0018 100644 --- a/llvm/include/llvm/IR/GCStrategy.h +++ b/llvm/include/llvm/IR/GCStrategy.h @@ -141,8 +141,6 @@ public: /// GCMetadataPrinterRegistery as well. using GCRegistry = Registry; -extern template class LLVM_TEMPLATE_ABI Registry; - /// Lookup the GCStrategy object associated with the given gc name. std::unique_ptr getGCStrategy(const StringRef Name); diff --git a/llvm/include/llvm/Support/Compiler.h b/llvm/include/llvm/Support/Compiler.h index ab0cbff43d74..1d2d751d4dc1 100644 --- a/llvm/include/llvm/Support/Compiler.h +++ b/llvm/include/llvm/Support/Compiler.h @@ -153,12 +153,6 @@ /// exported when llvm is built as a shared library with everything else that is /// unannotated will have internal visibility. /// -/// LLVM_ABI_EXPORT is for the special case for things like plugin symbol -/// declarations or definitions where we don't want the macro to be switching -/// between dllexport and dllimport on windows based on what codebase is being -/// built, it will only be dllexport. For non windows platforms this macro -/// behaves the same as LLVM_ABI. -/// /// LLVM_EXPORT_TEMPLATE is used on explicit template instantiations in source /// files that were declared extern in a header. This macro is only set as a /// compiler export attribute on windows, on other platforms it does nothing. 
@@ -185,7 +179,6 @@ #define LLVM_ABI #define LLVM_TEMPLATE_ABI #define LLVM_EXPORT_TEMPLATE -#define LLVM_ABI_EXPORT #elif defined(_WIN32) && !defined(__MINGW32__) #if defined(LLVM_EXPORTS) #define LLVM_ABI __declspec(dllexport) @@ -196,23 +189,19 @@ #define LLVM_TEMPLATE_ABI __declspec(dllimport) #define LLVM_EXPORT_TEMPLATE #endif -#define LLVM_ABI_EXPORT __declspec(dllexport) #elif defined(__ELF__) || defined(__MINGW32__) || defined(_AIX) #define LLVM_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT #define LLVM_TEMPLATE_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT #define LLVM_EXPORT_TEMPLATE -#define LLVM_ABI_EXPORT LLVM_ATTRIBUTE_VISIBILITY_DEFAULT #elif defined(__MACH__) || defined(__WASM__) #define LLVM_ABI LLVM_ATTRIBUTE_VISIBILITY_DEFAULT #define LLVM_TEMPLATE_ABI #define LLVM_EXPORT_TEMPLATE -#define LLVM_ABI_EXPORT LLVM_ATTRIBUTE_VISIBILITY_DEFAULT #endif #else #define LLVM_ABI #define LLVM_TEMPLATE_ABI #define LLVM_EXPORT_TEMPLATE -#define LLVM_ABI_EXPORT #endif #define LLVM_C_ABI LLVM_ABI #endif diff --git a/llvm/include/llvm/Support/Registry.h b/llvm/include/llvm/Support/Registry.h index ff9226c39359..5bb6a254a47f 100644 --- a/llvm/include/llvm/Support/Registry.h +++ b/llvm/include/llvm/Support/Registry.h @@ -53,13 +53,7 @@ namespace llvm { Registry() = delete; friend class node; - // These must be must two separate declarations to workaround a 20 year - // old MSVC bug with dllexport and multiple static fields in the same - // declaration causing error C2487 "member of dll interface class may not - // be declared with dll interface". - // https://developercommunity.visualstudio.com/t/c2487-in-dllexport-class-with-static-members/69878 - static node *Head; - static node *Tail; + static node *Head, *Tail; public: /// Node in linked list of entries. @@ -82,13 +76,7 @@ namespace llvm { /// add a node to the executable's registry. Therefore it's not defined here /// to avoid it being instantiated in the plugin and is instead defined in /// the executable (see LLVM_INSTANTIATE_REGISTRY below). - static void add_node(node *N) { - if (Tail) - Tail->Next = N; - else - Head = N; - Tail = N; - } + static void add_node(node *N); /// Iterators for registry entries. /// @@ -107,7 +95,7 @@ namespace llvm { // begin is not defined here in order to avoid usage of an undefined static // data member, instead it's instantiated by LLVM_INSTANTIATE_REGISTRY. - static iterator begin() { return iterator(Head); } + static iterator begin(); static iterator end() { return iterator(nullptr); } static iterator_range entries() { @@ -136,28 +124,36 @@ namespace llvm { } }; }; - } // end namespace llvm -#ifdef _WIN32 /// Instantiate a registry class. -#define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS) \ - namespace llvm { \ - template \ - typename Registry::node *Registry::Head = nullptr; \ - template \ - typename Registry::node *Registry::Tail = nullptr; \ - template class LLVM_ABI_EXPORT Registry; \ - } -#else -#define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS) \ - namespace llvm { \ - template \ - typename Registry::node *Registry::Head = nullptr; \ - template \ - typename Registry::node *Registry::Tail = nullptr; \ - template class Registry; \ +/// +/// This provides template definitions of add_node, begin, and the Head and Tail +/// pointers, then explicitly instantiates them. 
We could explicitly specialize +/// them, instead of the two-step process of define then instantiate, but +/// strictly speaking that's not allowed by the C++ standard (we would need to +/// have explicit specialization declarations in all translation units where the +/// specialization is used) so we don't. +#define LLVM_INSTANTIATE_REGISTRY(REGISTRY_CLASS) \ + namespace llvm { \ + template typename Registry::node *Registry::Head = nullptr;\ + template typename Registry::node *Registry::Tail = nullptr;\ + template \ + void Registry::add_node(typename Registry::node *N) { \ + if (Tail) \ + Tail->Next = N; \ + else \ + Head = N; \ + Tail = N; \ + } \ + template typename Registry::iterator Registry::begin() { \ + return iterator(Head); \ + } \ + template REGISTRY_CLASS::node *Registry::Head; \ + template REGISTRY_CLASS::node *Registry::Tail; \ + template \ + void Registry::add_node(REGISTRY_CLASS::node*); \ + template REGISTRY_CLASS::iterator Registry::begin(); \ } -#endif #endif // LLVM_SUPPORT_REGISTRY_H -- GitLab From 282ab2f1895450707c9f8fc6a46634620165d1c9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Tue, 15 Oct 2024 23:11:30 -0700 Subject: [PATCH 070/329] [lldb] Avoid repeated hash lookups (NFC) (#112471) --- lldb/source/Core/Progress.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/lldb/source/Core/Progress.cpp b/lldb/source/Core/Progress.cpp index 27774ce7a552..c9a556472c06 100644 --- a/lldb/source/Core/Progress.cpp +++ b/lldb/source/Core/Progress.cpp @@ -151,10 +151,11 @@ void ProgressManager::Decrement(const Progress::ProgressData &progress_data) { std::lock_guard lock(m_entries_mutex); llvm::StringRef key = progress_data.title; - if (!m_entries.contains(key)) + auto it = m_entries.find(key); + if (it == m_entries.end()) return; - Entry &entry = m_entries[key]; + Entry &entry = it->second; entry.refcount--; if (entry.refcount == 0) { -- GitLab From cc5b5ca34b93e05199527c969a04e44f13653620 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 16 Oct 2024 07:13:37 +0100 Subject: [PATCH 071/329] [LV] Add test where interleave group start pointer is incorrect. Test case from https://github.com/llvm/llvm-project/pull/106431. --- ...aved-accesses-different-insert-position.ll | 88 +++++++++++++++++++ 1 file changed, 88 insertions(+) diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll index 953c93756fef..665fd1b9aeac 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll @@ -154,6 +154,92 @@ loop.latch: exit: ret void } + +; FIXME: Currently the start address of the interleav group is computed +; incorrectly. 
+define i64 @interleave_group_load_pointer_type(ptr %start, ptr %end) { +; CHECK-LABEL: define i64 @interleave_group_load_pointer_type( +; CHECK-SAME: ptr [[START:%.*]], ptr [[END:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*]]: +; CHECK-NEXT: [[START2:%.*]] = ptrtoint ptr [[START]] to i64 +; CHECK-NEXT: [[END1:%.*]] = ptrtoint ptr [[END]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = sub i64 [[END1]], [[START2]] +; CHECK-NEXT: [[TMP1:%.*]] = udiv i64 [[TMP0]], 24 +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label %[[SCALAR_PH:.*]], label %[[VECTOR_PH:.*]] +; CHECK: [[VECTOR_PH]]: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: [[TMP5:%.*]] = mul i64 [[N_VEC]], 24 +; CHECK-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP5]] +; CHECK-NEXT: br label %[[VECTOR_BODY:.*]] +; CHECK: [[VECTOR_BODY]]: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, %[[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[VEC_PHI:%.*]] = phi <4 x i64> [ zeroinitializer, %[[VECTOR_PH]] ], [ [[TMP12:%.*]], %[[VECTOR_BODY]] ] +; CHECK-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 24 +; CHECK-NEXT: [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0 +; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0 +; CHECK-NEXT: [[WIDE_VEC:%.*]] = load <12 x ptr>, ptr [[TMP8]], align 8 +; CHECK-NEXT: [[STRIDED_VEC:%.*]] = shufflevector <12 x ptr> [[WIDE_VEC]], <12 x ptr> poison, <4 x i32> +; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x ptr> [[WIDE_VEC]], <12 x ptr> poison, <4 x i32> +; CHECK-NEXT: [[TMP9:%.*]] = ptrtoint <4 x ptr> [[STRIDED_VEC3]] to <4 x i64> +; CHECK-NEXT: [[TMP10:%.*]] = ptrtoint <4 x ptr> [[STRIDED_VEC]] to <4 x i64> +; CHECK-NEXT: [[TMP11:%.*]] = or <4 x i64> [[TMP9]], [[TMP10]] +; CHECK-NEXT: [[TMP12]] = or <4 x i64> [[TMP11]], [[VEC_PHI]] +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP13]], label %[[MIDDLE_BLOCK:.*]], label %[[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; CHECK: [[MIDDLE_BLOCK]]: +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vector.reduce.or.v4i64(<4 x i64> [[TMP12]]) +; CHECK-NEXT: br label %[[SCALAR_PH]] +; CHECK: [[SCALAR_PH]]: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], %[[MIDDLE_BLOCK]] ], [ [[START]], %[[ENTRY]] ] +; CHECK-NEXT: [[BC_MERGE_RDX:%.*]] = phi i64 [ [[TMP14]], %[[MIDDLE_BLOCK]] ], [ 0, %[[ENTRY]] ] +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[PTR_IV:%.*]] = phi ptr [ [[BC_RESUME_VAL]], %[[SCALAR_PH]] ], [ [[PTR_IV_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[BC_MERGE_RDX]], %[[SCALAR_PH]] ], [ [[RED_NEXT:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[GEP_16:%.*]] = getelementptr i8, ptr [[PTR_IV]], i64 16 +; CHECK-NEXT: [[L_16:%.*]] = load ptr, ptr [[GEP_16]], align 8 +; CHECK-NEXT: [[P_16:%.*]] = ptrtoint ptr [[L_16]] to i64 +; CHECK-NEXT: [[GEP_8:%.*]] = getelementptr i8, ptr [[PTR_IV]], i64 8 +; CHECK-NEXT: [[L_8:%.*]] = load ptr, ptr [[GEP_8]], align 8 +; CHECK-NEXT: [[P_8:%.*]] = ptrtoint ptr [[L_8]] to i64 
+; CHECK-NEXT: [[OR_1:%.*]] = or i64 [[P_16]], [[P_8]] +; CHECK-NEXT: [[RED_NEXT]] = or i64 [[OR_1]], [[RED]] +; CHECK-NEXT: [[PTR_IV_NEXT]] = getelementptr nusw i8, ptr [[PTR_IV]], i64 24 +; CHECK-NEXT: [[EC:%.*]] = icmp eq ptr [[PTR_IV]], [[END]] +; CHECK-NEXT: br i1 [[EC]], label %[[EXIT:.*]], label %[[LOOP]], !llvm.loop [[LOOP7:![0-9]+]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], %[[LOOP]] ] +; CHECK-NEXT: ret i64 [[RED_NEXT_LCSSA]] +; +entry: + br label %loop + +loop: + %ptr.iv = phi ptr [ %start, %entry ], [ %ptr.iv.next, %loop ] + %red = phi i64 [ 0, %entry ], [ %red.next, %loop ] + %gep.16 = getelementptr i8, ptr %ptr.iv, i64 16 + %l.16 = load ptr, ptr %gep.16, align 8 + %p.16 = ptrtoint ptr %l.16 to i64 + %gep.8 = getelementptr i8, ptr %ptr.iv, i64 8 + %l.8 = load ptr, ptr %gep.8, align 8 + %p.8 = ptrtoint ptr %l.8 to i64 + %or.1 = or i64 %p.16, %p.8 + %red.next = or i64 %or.1, %red + %ptr.iv.next = getelementptr nusw i8, ptr %ptr.iv, i64 24 + %ec = icmp eq ptr %ptr.iv, %end + br i1 %ec, label %exit, label %loop + +exit: + ret i64 %red.next +} ;. ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]} ; CHECK: [[META1]] = !{!"llvm.loop.isvectorized", i32 1} @@ -161,4 +247,6 @@ exit: ; CHECK: [[LOOP3]] = distinct !{[[LOOP3]], [[META2]], [[META1]]} ; CHECK: [[LOOP4]] = distinct !{[[LOOP4]], [[META1]], [[META2]]} ; CHECK: [[LOOP5]] = distinct !{[[LOOP5]], [[META2]], [[META1]]} +; CHECK: [[LOOP6]] = distinct !{[[LOOP6]], [[META1]], [[META2]]} +; CHECK: [[LOOP7]] = distinct !{[[LOOP7]], [[META2]], [[META1]]} ;. -- GitLab From 49de1541655cc71cfedbee10d6b4a4c12fc7e13b Mon Sep 17 00:00:00 2001 From: Aiden Grossman Date: Tue, 15 Oct 2024 23:15:05 -0700 Subject: [PATCH 072/329] [CMake] Do not set CMP0114 explicitly to old (#90384) CMP0114 was originally set to old to get rid of warnings. However, this behavior is now set to new by default with the minimum CMake version that LLVM requires, so it does not produce any warnings, and setting it explicitly to old does produce a warning in newer CMake versions. Due to these reasons, remove this check for now. This is split off from removing the CMP0116 check just in case something breaks. Partially fixes #83727. --- cmake/Modules/CMakePolicy.cmake | 5 ----- 1 file changed, 5 deletions(-) diff --git a/cmake/Modules/CMakePolicy.cmake b/cmake/Modules/CMakePolicy.cmake index 665af01d43bd..f19dfd716571 100644 --- a/cmake/Modules/CMakePolicy.cmake +++ b/cmake/Modules/CMakePolicy.cmake @@ -1,10 +1,5 @@ # CMake policy settings shared between LLVM projects -# CMP0114: ExternalProject step targets fully adopt their steps. -# New in CMake 3.19: https://cmake.org/cmake/help/latest/policy/CMP0114.html -if(POLICY CMP0114) - cmake_policy(SET CMP0114 OLD) -endif() # CMP0116: Ninja generators transform `DEPFILE`s from `add_custom_command()` # New in CMake 3.20. https://cmake.org/cmake/help/latest/policy/CMP0116.html if(POLICY CMP0116) -- GitLab From b8882be26f00d2a053269948ee6ecaeff8db8eb8 Mon Sep 17 00:00:00 2001 From: Sirui Mu Date: Wed, 16 Oct 2024 14:14:06 +0800 Subject: [PATCH 073/329] Revert "[mlir][LLVMIR] Add operand bundle support for llvm.intr.assume (#112143)" This reverts commit d8fadad07c952c4aea967aefb0900e4e43ad0555.
The commit breaks the following CI builds: - ppc64le-mlir-rhel-clang: https://lab.llvm.org/buildbot/#/builders/129/builds/7685 - ppc64le-flang-rhel-clang: https://lab.llvm.org/buildbot/#/builders/157/builds/10338 --- .../Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td | 1 - .../mlir/Dialect/LLVMIR/LLVMDialect.td | 2 - .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 44 ++------- .../include/mlir/Dialect/LLVMIR/LLVMOpBase.td | 25 ++--- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 18 +++- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 2 +- .../include/mlir/Target/LLVMIR/ModuleImport.h | 2 - mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 96 +++++++------------ .../LLVMIR/LLVMIRToLLVMTranslation.cpp | 6 -- .../LLVMIR/LLVMToLLVMIRTranslation.cpp | 16 +--- .../Dialect/NVVM/LLVMIRToNVVMTranslation.cpp | 6 -- mlir/lib/Target/LLVMIR/ModuleImport.cpp | 32 +------ mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 37 +------ .../expand-then-convert-to-llvm.mlir | 2 +- .../MemRefToLLVM/memref-to-llvm.mlir | 4 +- mlir/test/Dialect/LLVMIR/inlining.mlir | 4 +- mlir/test/Dialect/LLVMIR/roundtrip.mlir | 27 ------ mlir/test/Target/LLVMIR/Import/intrinsic.ll | 12 +-- .../test/Target/LLVMIR/llvmir-intrinsics.mlir | 15 --- mlir/test/Target/LLVMIR/llvmir-invalid.mlir | 2 +- 20 files changed, 77 insertions(+), 276 deletions(-) diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td index e81db32bcaad..0e38325f9891 100644 --- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td @@ -71,7 +71,6 @@ class ArmSME_IntrOp immArgPositions=*/immArgPositions, /*list immArgAttrNames=*/immArgAttrNames>; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td index ea82f7f7b8e1..27a2b418aadb 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td @@ -59,8 +59,6 @@ def LLVM_Dialect : Dialect { static StringRef getStructRetAttrName() { return "llvm.sret"; } static StringRef getWriteOnlyAttrName() { return "llvm.writeonly"; } static StringRef getZExtAttrName() { return "llvm.zeroext"; } - static StringRef getOpBundleSizesAttrName() { return "op_bundle_sizes"; } - static StringRef getOpBundleTagsAttrName() { return "op_bundle_tags"; } // TODO Restrict the usage of this to parameter attributes once there is an // alternative way of modeling memory effects on FunctionOpInterface. 
/// Name of the attribute that will cause the creation of a readnone memory diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index 845c88b1be77..ab40c8ec4b65 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -120,8 +120,7 @@ def LLVM_Log2Op : LLVM_UnaryIntrOpF<"log2">; def LLVM_LogOp : LLVM_UnaryIntrOpF<"log">; def LLVM_Prefetch : LLVM_ZeroResultIntrOp<"prefetch", [0], /*traits=*/[], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0, - /*requiresOpBundles=*/0, /*immArgPositions=*/[1, 2, 3], - /*immArgAttrNames=*/["rw", "hint", "cache"] + /*immArgPositions=*/[1, 2, 3], /*immArgAttrNames=*/["rw", "hint", "cache"] > { let arguments = (ins LLVM_AnyPointer:$addr, I32Attr:$rw, I32Attr:$hint, I32Attr:$cache); } @@ -177,8 +176,7 @@ class LLVM_MemcpyIntrOpBase : DeclareOpInterfaceMethods, DeclareOpInterfaceMethods], /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1, - /*requiresOpBundles=*/0, /*immArgPositions=*/[3], - /*immArgAttrNames=*/["isVolatile"]> { + /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> { dag args = (ins Arg:$dst, Arg:$src, AnySignlessInteger:$len, I1Attr:$isVolatile); @@ -208,8 +206,7 @@ def LLVM_MemcpyInlineOp : DeclareOpInterfaceMethods, DeclareOpInterfaceMethods], /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1, - /*requiresOpBundles=*/0, /*immArgPositions=*/[2, 3], - /*immArgAttrNames=*/["len", "isVolatile"]> { + /*immArgPositions=*/[2, 3], /*immArgAttrNames=*/["len", "isVolatile"]> { dag args = (ins Arg:$dst, Arg:$src, APIntAttr:$len, I1Attr:$isVolatile); @@ -235,8 +232,7 @@ def LLVM_MemsetOp : LLVM_ZeroResultIntrOp<"memset", [0, 2], DeclareOpInterfaceMethods, DeclareOpInterfaceMethods], /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1, - /*requiresOpBundles=*/0, /*immArgPositions=*/[3], - /*immArgAttrNames=*/["isVolatile"]> { + /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> { dag args = (ins Arg:$dst, I8:$val, AnySignlessInteger:$len, I1Attr:$isVolatile); // Append the alias attributes defined by LLVM_IntrOpBase. 
@@ -290,8 +286,7 @@ def LLVM_NoAliasScopeDeclOp class LLVM_LifetimeBaseOp : LLVM_ZeroResultIntrOp], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0, - /*requiresOpBundles=*/0, /*immArgPositions=*/[0], - /*immArgAttrNames=*/["size"]> { + /*immArgPositions=*/[0], /*immArgAttrNames=*/["size"]> { let arguments = (ins I64Attr:$size, LLVM_AnyPointer:$ptr); let assemblyFormat = "$size `,` $ptr attr-dict `:` qualified(type($ptr))"; } @@ -311,8 +306,7 @@ def LLVM_InvariantStartOp : LLVM_OneResultIntrOp<"invariant.start", [], [1], def LLVM_InvariantEndOp : LLVM_ZeroResultIntrOp<"invariant.end", [2], [DeclareOpInterfaceMethods], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0, - /*requiresOpBundles=*/0, /*immArgPositions=*/[1], - /*immArgAttrNames=*/["size"]> { + /*immArgPositions=*/[1], /*immArgAttrNames=*/["size"]> { let arguments = (ins LLVM_DefaultPointer:$start, I64Attr:$size, LLVM_AnyPointer:$ptr); @@ -374,7 +368,7 @@ class LLVM_ConstrainedIntr mlirOperands; SmallVector mlirAttrs; if (failed(moduleImport.convertIntrinsicArguments( - llvmOperands.take_front( }] # numArgs # [{), {}, false, + llvmOperands.take_front( }] # numArgs # [{), {}, {}, mlirOperands, mlirAttrs))) { return failure(); } @@ -435,26 +429,7 @@ def LLVM_USHLSat : LLVM_BinarySameArgsIntrOpI<"ushl.sat">; // def LLVM_AssumeOp - : LLVM_ZeroResultIntrOp<"assume", /*overloadedOperands=*/[], /*traits=*/[], - /*requiresAccessGroup=*/0, - /*requiresAliasAnalysis=*/0, - /*requiresOpBundles=*/1> { - dag args = (ins I1:$cond); - let arguments = !con(args, opBundleArgs); - - let assemblyFormat = [{ - $cond - ( custom($op_bundle_operands, type($op_bundle_operands), - $op_bundle_tags)^ )? - `:` type($cond) attr-dict - }]; - - let builders = [ - OpBuilder<(ins "Value":$cond)> - ]; - - let hasVerifier = 1; -} + : LLVM_ZeroResultIntrOp<"assume", []>, Arguments<(ins I1:$cond)>; def LLVM_SSACopyOp : LLVM_OneResultIntrOp<"ssa.copy", [], [0], [Pure, SameOperandsAndResultType]> { @@ -1017,8 +992,7 @@ def LLVM_DebugTrap : LLVM_ZeroResultIntrOp<"debugtrap">; def LLVM_UBSanTrap : LLVM_ZeroResultIntrOp<"ubsantrap", /*overloadedOperands=*/[], /*traits=*/[], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0, - /*requiresOpBundles=*/0, /*immArgPositions=*/[0], - /*immArgAttrNames=*/["failureKind"]> { + /*immArgPositions=*/[0], /*immArgAttrNames=*/["failureKind"]> { let arguments = (ins I8Attr:$failureKind); } diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td index a38dafa4d9cf..c3d352d8d0dd 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td @@ -291,7 +291,7 @@ class LLVM_IntrOpBase overloadedResults, list overloadedOperands, list traits, int numResults, bit requiresAccessGroup = 0, bit requiresAliasAnalysis = 0, - bit requiresFastmath = 0, bit requiresOpBundles = 0, + bit requiresFastmath = 0, list immArgPositions = [], list immArgAttrNames = []> : LLVM_OpBase:$noalias_scopes, OptionalAttr:$tbaa), (ins ))); - dag opBundleArgs = !if(!gt(requiresOpBundles, 0), - (ins VariadicOfVariadic:$op_bundle_operands, - DenseI32ArrayAttr:$op_bundle_sizes, - OptionalAttr:$op_bundle_tags), - (ins )); string llvmEnumName = enumName; string overloadedResultsCpp = "{" # !interleave(overloadedResults, ", ") # "}"; string overloadedOperandsCpp = "{" # !interleave(overloadedOperands, ", ") # "}"; @@ -342,8 +336,6 @@ class LLVM_IntrOpBase mlirAttrs; if (failed(moduleImport.convertIntrinsicArguments( llvmOperands, - llvmOpBundles, - }] # 
!if(!gt(requiresOpBundles, 0), "true", "false") # [{, }] # immArgPositionsCpp # [{, }] # immArgAttrNamesCpp # [{, mlirOperands, @@ -389,14 +381,12 @@ class LLVM_IntrOp overloadedResults, list overloadedOperands, list traits, int numResults, bit requiresAccessGroup = 0, bit requiresAliasAnalysis = 0, bit requiresFastmath = 0, - bit requiresOpBundles = 0, list immArgPositions = [], list immArgAttrNames = []> : LLVM_IntrOpBase; + requiresFastmath, immArgPositions, immArgAttrNames>; // Base class for LLVM intrinsic operations returning no results. Places the // intrinsic into the LLVM dialect and prefixes its name with "intr.". @@ -416,13 +406,11 @@ class LLVM_ZeroResultIntrOp overloadedOperands = [], list traits = [], bit requiresAccessGroup = 0, bit requiresAliasAnalysis = 0, - bit requiresOpBundles = 0, list immArgPositions = [], list immArgAttrNames = []> : LLVM_IntrOp; + /*requiresFastMath=*/0, immArgPositions, immArgAttrNames>; // Base class for LLVM intrinsic operations returning one result. Places the // intrinsic into the LLVM dialect and prefixes its name with "intr.". This is @@ -434,12 +422,11 @@ class LLVM_OneResultIntrOp overloadedResults = [], list overloadedOperands = [], list traits = [], bit requiresFastmath = 0, - list immArgPositions = [], - list immArgAttrNames = []> + list immArgPositions = [], + list immArgAttrNames = []> : LLVM_IntrOp; + requiresFastmath, immArgPositions, immArgAttrNames>; def LLVM_OneResultOpBuilder : OpBuilder<(ins "Type":$resultType, "ValueRange":$operands, diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index d5def510a904..bbca7bc7286a 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -559,7 +559,11 @@ def LLVM_InvokeOp : LLVM_Op<"invoke", [ VariadicOfVariadic:$op_bundle_operands, DenseI32ArrayAttr:$op_bundle_sizes, - OptionalAttr:$op_bundle_tags); + DefaultValuedProperty< + ArrayProperty, + "ArrayRef{}", + "SmallVector{}" + >:$op_bundle_tags); let results = (outs Optional:$result); let successors = (successor AnySuccessor:$normalDest, AnySuccessor:$unwindDest); @@ -674,7 +678,11 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call", VariadicOfVariadic:$op_bundle_operands, DenseI32ArrayAttr:$op_bundle_sizes, - OptionalAttr:$op_bundle_tags); + DefaultValuedProperty< + ArrayProperty, + "ArrayRef{}", + "SmallVector{}" + >:$op_bundle_tags); // Append the aliasing related attributes defined in LLVM_MemAccessOpBase. 
let arguments = !con(args, aliasAttrs); let results = (outs Optional:$result); @@ -1922,7 +1930,11 @@ def LLVM_CallIntrinsicOp VariadicOfVariadic:$op_bundle_operands, DenseI32ArrayAttr:$op_bundle_sizes, - OptionalAttr:$op_bundle_tags); + DefaultValuedProperty< + ArrayProperty, + "ArrayRef{}", + "SmallVector{}" + >:$op_bundle_tags); let results = (outs Optional:$results); let llvmBuilder = [{ return convertCallLLVMIntrinsicOp(op, builder, moduleTranslation); diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index 3695708439d9..c40ae4b1016b 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -98,7 +98,7 @@ class ROCDL_IntrOp overloadedResults, LLVM_IntrOpBase; + requiresAliasAnalysis, 0, immArgPositions, immArgAttrNames>; //===----------------------------------------------------------------------===// // ROCDL special register op definitions diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h index bbb7af58d273..9f300bcafea5 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h @@ -243,8 +243,6 @@ public: /// corresponding MLIR attribute names. LogicalResult convertIntrinsicArguments(ArrayRef values, - ArrayRef opBundles, - bool requiresOpBundles, ArrayRef immArgPositions, ArrayRef immArgAttrNames, SmallVectorImpl &valuesOut, diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index cc73878a64ff..12ed8cc88ae7 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -241,18 +241,13 @@ static void printOneOpBundle(OpAsmPrinter &p, OperandRange operands, static void printOpBundles(OpAsmPrinter &p, Operation *op, OperandRangeRange opBundleOperands, TypeRangeRange opBundleOperandTypes, - std::optional opBundleTags) { - if (opBundleOperands.empty()) - return; - assert(opBundleTags && "expect operand bundle tags"); - + ArrayRef opBundleTags) { p << "["; llvm::interleaveComma( - llvm::zip(opBundleOperands, opBundleOperandTypes, *opBundleTags), p, + llvm::zip(opBundleOperands, opBundleOperandTypes, opBundleTags), p, [&p](auto bundle) { - auto bundleTag = cast(std::get<2>(bundle)).getValue(); printOneOpBundle(p, std::get<0>(bundle), std::get<1>(bundle), - bundleTag); + std::get<2>(bundle)); }); p << "]"; } @@ -261,7 +256,7 @@ static ParseResult parseOneOpBundle( OpAsmParser &p, SmallVector> &opBundleOperands, SmallVector> &opBundleOperandTypes, - SmallVector &opBundleTags) { + SmallVector &opBundleTags) { SMLoc currentParserLoc = p.getCurrentLocation(); SmallVector operands; SmallVector types; @@ -281,7 +276,7 @@ static ParseResult parseOneOpBundle( opBundleOperands.push_back(std::move(operands)); opBundleOperandTypes.push_back(std::move(types)); - opBundleTags.push_back(StringAttr::get(p.getContext(), tag)); + opBundleTags.push_back(std::move(tag)); return success(); } @@ -290,17 +285,16 @@ static std::optional parseOpBundles( OpAsmParser &p, SmallVector> &opBundleOperands, SmallVector> &opBundleOperandTypes, - ArrayAttr &opBundleTags) { + SmallVector &opBundleTags) { if (p.parseOptionalLSquare()) return std::nullopt; if (succeeded(p.parseOptionalRSquare())) return success(); - SmallVector opBundleTagAttrs; auto bundleParser = [&] { return parseOneOpBundle(p, opBundleOperands, opBundleOperandTypes, - opBundleTagAttrs); + opBundleTags); }; if 
(p.parseCommaSeparatedList(bundleParser)) return failure(); @@ -308,8 +302,6 @@ static std::optional parseOpBundles( if (p.parseRSquare()) return failure(); - opBundleTags = ArrayAttr::get(p.getContext(), opBundleTagAttrs); - return success(); } @@ -1047,7 +1039,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results, /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } @@ -1074,7 +1066,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } @@ -1087,7 +1079,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr, /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } @@ -1100,7 +1092,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func, /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr, /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } @@ -1200,20 +1192,12 @@ LogicalResult verifyCallOpVarCalleeType(OpTy callOp) { template static LogicalResult verifyOperandBundles(OpType &op) { OperandRangeRange opBundleOperands = op.getOpBundleOperands(); - std::optional opBundleTags = op.getOpBundleTags(); + ArrayRef opBundleTags = op.getOpBundleTags(); - auto isStringAttr = [](Attribute tagAttr) { - return isa(tagAttr); - }; - if (opBundleTags && !llvm::all_of(*opBundleTags, isStringAttr)) - return op.emitError("operand bundle tag must be a StringAttr"); - - size_t numOpBundles = opBundleOperands.size(); - size_t numOpBundleTags = opBundleTags ? 
opBundleTags->size() : 0; - if (numOpBundles != numOpBundleTags) + if (opBundleTags.size() != opBundleOperands.size()) return op.emitError("expected ") - << numOpBundles << " operand bundle tags, but actually got " - << numOpBundleTags; + << opBundleOperands.size() + << " operand bundle tags, but actually got " << opBundleTags.size(); return success(); } @@ -1345,8 +1329,7 @@ void CallOp::print(OpAsmPrinter &p) { {getCalleeAttrName(), getTailCallKindAttrName(), getVarCalleeTypeAttrName(), getCConvAttrName(), getOperandSegmentSizesAttrName(), - getOpBundleSizesAttrName(), - getOpBundleTagsAttrName()}); + getOpBundleSizesAttrName()}); p << " : "; if (!isDirect) @@ -1454,7 +1437,7 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) { SmallVector operands; SmallVector> opBundleOperands; SmallVector> opBundleOperandTypes; - ArrayAttr opBundleTags; + SmallVector opBundleTags; // Default to C Calling Convention if no keyword is provided. result.addAttribute( @@ -1500,9 +1483,9 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) { parser, opBundleOperands, opBundleOperandTypes, opBundleTags); result && failed(*result)) return failure(); - if (opBundleTags && !opBundleTags.empty()) - result.addAttribute(CallOp::getOpBundleTagsAttrName(result.name).getValue(), - opBundleTags); + if (!opBundleTags.empty()) + result.getOrAddProperties().op_bundle_tags = + std::move(opBundleTags); if (parser.parseOptionalAttrDict(result.attributes)) return failure(); @@ -1542,7 +1525,8 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func, auto calleeType = func.getFunctionType(); build(builder, state, getCallOpResultTypes(calleeType), getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), ops, - normalOps, unwindOps, nullptr, nullptr, {}, {}, normal, unwind); + normalOps, unwindOps, nullptr, nullptr, {}, std::nullopt, normal, + unwind); } void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys, @@ -1551,7 +1535,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys, ValueRange unwindOps) { build(builder, state, tys, /*var_callee_type=*/nullptr, callee, ops, normalOps, unwindOps, nullptr, - nullptr, {}, {}, normal, unwind); + nullptr, {}, std::nullopt, normal, unwind); } void InvokeOp::build(OpBuilder &builder, OperationState &state, @@ -1560,7 +1544,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, Block *unwind, ValueRange unwindOps) { build(builder, state, getCallOpResultTypes(calleeType), getCallOpVarCalleeType(calleeType), callee, ops, normalOps, unwindOps, - nullptr, nullptr, {}, {}, normal, unwind); + nullptr, nullptr, {}, std::nullopt, normal, unwind); } SuccessorOperands InvokeOp::getSuccessorOperands(unsigned index) { @@ -1650,8 +1634,7 @@ void InvokeOp::print(OpAsmPrinter &p) { p.printOptionalAttrDict((*this)->getAttrs(), {getCalleeAttrName(), getOperandSegmentSizeAttr(), getCConvAttrName(), getVarCalleeTypeAttrName(), - getOpBundleSizesAttrName(), - getOpBundleTagsAttrName()}); + getOpBundleSizesAttrName()}); p << " : "; if (!isDirect) @@ -1674,7 +1657,7 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) { TypeAttr varCalleeType; SmallVector> opBundleOperands; SmallVector> opBundleOperandTypes; - ArrayAttr opBundleTags; + SmallVector opBundleTags; Block *normalDest, *unwindDest; SmallVector normalOperands, unwindOperands; Builder &builder = parser.getBuilder(); @@ -1720,10 +1703,9 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, 
OperationState &result) { parser, opBundleOperands, opBundleOperandTypes, opBundleTags); result && failed(*result)) return failure(); - if (opBundleTags && !opBundleTags.empty()) - result.addAttribute( - InvokeOp::getOpBundleTagsAttrName(result.name).getValue(), - opBundleTags); + if (!opBundleTags.empty()) + result.getOrAddProperties().op_bundle_tags = + std::move(opBundleTags); if (parser.parseOptionalAttrDict(result.attributes)) return failure(); @@ -3351,7 +3333,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, mlir::StringAttr intrin, mlir::ValueRange args) { build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args, FastmathFlagsAttr{}, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}); + /*op_bundle_operands=*/{}); } void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, @@ -3359,14 +3341,14 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, mlir::LLVM::FastmathFlagsAttr fastMathFlags) { build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args, fastMathFlags, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}); + /*op_bundle_operands=*/{}); } void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, mlir::Type resultType, mlir::StringAttr intrin, mlir::ValueRange args) { build(builder, state, {resultType}, intrin, args, FastmathFlagsAttr{}, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}); + /*op_bundle_operands=*/{}); } void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, @@ -3374,7 +3356,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, mlir::StringAttr intrin, mlir::ValueRange args, mlir::LLVM::FastmathFlagsAttr fastMathFlags) { build(builder, state, resultTypes, intrin, args, fastMathFlags, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}); + /*op_bundle_operands=*/{}); } //===----------------------------------------------------------------------===// @@ -3431,18 +3413,6 @@ void InlineAsmOp::getEffects( } } -//===----------------------------------------------------------------------===// -// AssumeOp (intrinsic) -//===----------------------------------------------------------------------===// - -void LLVM::AssumeOp::build(OpBuilder &builder, OperationState &state, - mlir::Value cond) { - return build(builder, state, cond, /*op_bundle_operands=*/{}, - /*op_bundle_tags=*/{}); -} - -LogicalResult LLVM::AssumeOp::verify() { return verifyOperandBundles(*this); } - //===----------------------------------------------------------------------===// // masked_gather (intrinsic) //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp index 4fd043c7c93e..d034e576dfc5 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp @@ -68,12 +68,6 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder, if (isConvertibleIntrinsic(intrinsicID)) { SmallVector args(inst->args()); ArrayRef llvmOperands(args); - - SmallVector llvmOpBundles; - llvmOpBundles.reserve(inst->getNumOperandBundles()); - for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i) - llvmOpBundles.push_back(inst->getOperandBundleAt(i)); - #include "mlir/Dialect/LLVMIR/LLVMIntrinsicFromLLVMIRConversions.inc" } diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp 
b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index 2084e527773c..a8595d14ccf2 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -114,27 +114,17 @@ convertOperandBundle(OperandRange bundleOperands, StringRef bundleTag, } static SmallVector -convertOperandBundles(OperandRangeRange bundleOperands, ArrayAttr bundleTags, +convertOperandBundles(OperandRangeRange bundleOperands, + ArrayRef bundleTags, LLVM::ModuleTranslation &moduleTranslation) { SmallVector bundles; bundles.reserve(bundleOperands.size()); - for (auto [operands, tagAttr] : llvm::zip_equal(bundleOperands, bundleTags)) { - StringRef tag = cast(tagAttr).getValue(); + for (auto [operands, tag] : llvm::zip_equal(bundleOperands, bundleTags)) bundles.push_back(convertOperandBundle(operands, tag, moduleTranslation)); - } return bundles; } -static SmallVector -convertOperandBundles(OperandRangeRange bundleOperands, - std::optional bundleTags, - LLVM::ModuleTranslation &moduleTranslation) { - if (!bundleTags) - return {}; - return convertOperandBundles(bundleOperands, *bundleTags, moduleTranslation); -} - /// Builder for LLVM_CallIntrinsicOp static LogicalResult convertCallLLVMIntrinsicOp(CallIntrinsicOp op, llvm::IRBuilderBase &builder, diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp index 2c0b665ad0d8..bc830a77f3c5 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp @@ -50,12 +50,6 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder, if (isConvertibleIntrinsic(intrinsicID)) { SmallVector args(inst->args()); ArrayRef llvmOperands(args); - - SmallVector llvmOpBundles; - llvmOpBundles.reserve(inst->getNumOperandBundles()); - for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i) - llvmOpBundles.push_back(inst->getOperandBundleAt(i)); - #include "mlir/Dialect/LLVMIR/NVVMFromLLVMIRConversions.inc" } diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index 6e97b2a50af8..bd861f3a69e5 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1311,8 +1311,7 @@ ModuleImport::convertValues(ArrayRef values) { } LogicalResult ModuleImport::convertIntrinsicArguments( - ArrayRef values, ArrayRef opBundles, - bool requiresOpBundles, ArrayRef immArgPositions, + ArrayRef values, ArrayRef immArgPositions, ArrayRef immArgAttrNames, SmallVectorImpl &valuesOut, SmallVectorImpl &attrsOut) { assert(immArgPositions.size() == immArgAttrNames.size() && @@ -1342,35 +1341,6 @@ LogicalResult ModuleImport::convertIntrinsicArguments( valuesOut.push_back(*mlirValue); } - SmallVector opBundleSizes; - SmallVector opBundleTagAttrs; - if (requiresOpBundles) { - opBundleSizes.reserve(opBundles.size()); - opBundleTagAttrs.reserve(opBundles.size()); - - for (const llvm::OperandBundleUse &bundle : opBundles) { - opBundleSizes.push_back(bundle.Inputs.size()); - opBundleTagAttrs.push_back(StringAttr::get(context, bundle.getTagName())); - - for (const llvm::Use &opBundleOperand : bundle.Inputs) { - auto operandMlirValue = convertValue(opBundleOperand.get()); - if (failed(operandMlirValue)) - return failure(); - valuesOut.push_back(*operandMlirValue); - } - } - - auto opBundleSizesAttr = DenseI32ArrayAttr::get(context, opBundleSizes); - auto 
opBundleSizesAttrNameAttr = - StringAttr::get(context, LLVMDialect::getOpBundleSizesAttrName()); - attrsOut.push_back({opBundleSizesAttrNameAttr, opBundleSizesAttr}); - - auto opBundleTagsAttr = ArrayAttr::get(context, opBundleTagAttrs); - auto opBundleTagsAttrNameAttr = - StringAttr::get(context, LLVMDialect::getOpBundleTagsAttrName()); - attrsOut.push_back({opBundleTagsAttrNameAttr, opBundleTagsAttr}); - } - return success(); } diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index e4c097c0daed..6e005f9ec5df 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -55,7 +55,6 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ModuleUtils.h" -#include #include #define DEBUG_TYPE "llvm-dialect-to-llvm-ir" @@ -855,40 +854,8 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall( "LLVM `immArgPositions` and MLIR `immArgAttrNames` should have equal " "length"); - SmallVector opBundles; - size_t numOpBundleOperands = 0; - auto opBundleSizesAttr = cast_if_present( - intrOp->getAttr(LLVMDialect::getOpBundleSizesAttrName())); - auto opBundleTagsAttr = cast_if_present( - intrOp->getAttr(LLVMDialect::getOpBundleTagsAttrName())); - - if (opBundleSizesAttr && opBundleTagsAttr) { - ArrayRef opBundleSizes = opBundleSizesAttr.asArrayRef(); - assert(opBundleSizes.size() == opBundleTagsAttr.size() && - "operand bundles and tags do not match"); - - numOpBundleOperands = - std::reduce(opBundleSizes.begin(), opBundleSizes.end()); - assert(numOpBundleOperands <= intrOp->getNumOperands() && - "operand bundle operands is more than the number of operands"); - - ValueRange operands = intrOp->getOperands().take_back(numOpBundleOperands); - size_t nextOperandIdx = 0; - opBundles.reserve(opBundleSizesAttr.size()); - - for (auto [opBundleTagAttr, bundleSize] : - llvm::zip(opBundleTagsAttr, opBundleSizes)) { - auto bundleTag = cast(opBundleTagAttr).str(); - auto bundleOperands = moduleTranslation.lookupValues( - operands.slice(nextOperandIdx, bundleSize)); - opBundles.emplace_back(std::move(bundleTag), std::move(bundleOperands)); - nextOperandIdx += bundleSize; - } - } - // Map operands and attributes to LLVM values. 
- auto opOperands = intrOp->getOperands().drop_back(numOpBundleOperands); - auto operands = moduleTranslation.lookupValues(opOperands); + auto operands = moduleTranslation.lookupValues(intrOp->getOperands()); SmallVector args(immArgPositions.size() + operands.size()); for (auto [immArgPos, immArgName] : llvm::zip(immArgPositions, immArgAttrNames)) { @@ -923,7 +890,7 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall( llvm::Function *llvmIntr = llvm::Intrinsic::getOrInsertDeclaration( module, intrinsic, overloadedTypes); - return builder.CreateCall(llvmIntr, args, opBundles); + return builder.CreateCall(llvmIntr, args); } /// Given a single MLIR operation, create the corresponding LLVM IR operation diff --git a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir index 55b1bc9c545a..b86103422b07 100644 --- a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir @@ -684,7 +684,7 @@ func.func @collapse_static_shape_with_non_identity_layout(%arg: memref<1x1x8x8xf // CHECK: %[[INT_TO_PTR:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64 // CHECK: %[[AND:.*]] = llvm.and %[[INT_TO_PTR]], {{.*}} : i64 // CHECK: %[[CMP:.*]] = llvm.icmp "eq" %[[AND]], {{.*}} : i64 -// CHECK: llvm.intr.assume %[[CMP]] : i1 +// CHECK: "llvm.intr.assume"(%[[CMP]]) : (i1) -> () // CHECK: %[[LD_ADDR:.*]] = llvm.getelementptr %[[BUFF_ADDR]][%{{.*}}] : (!llvm.ptr, i64) -> !llvm.ptr, f32 // CHECK: %[[VAL:.*]] = llvm.load %[[LD_ADDR]] : !llvm.ptr -> f32 // CHECK: return %[[VAL]] : f32 diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir index 48dc9079333d..9dc22abf143b 100644 --- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir @@ -160,7 +160,7 @@ func.func @assume_alignment(%0 : memref<4x4xf16>) { // CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[PTR]] : !llvm.ptr to i64 // CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64 // CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64 - // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1 + // CHECK-NEXT: "llvm.intr.assume"(%[[CONDITION]]) : (i1) -> () memref.assume_alignment %0, 16 : memref<4x4xf16> return } @@ -177,7 +177,7 @@ func.func @assume_alignment_w_offset(%0 : memref<4x4xf16, strided<[?, ?], offset // CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64 // CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64 // CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64 - // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1 + // CHECK-NEXT: "llvm.intr.assume"(%[[CONDITION]]) : (i1) -> () memref.assume_alignment %0, 16 : memref<4x4xf16, strided<[?, ?], offset: ?>> return } diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir index 0b7ca3f2bb04..f9551e311df5 100644 --- a/mlir/test/Dialect/LLVMIR/inlining.mlir +++ b/mlir/test/Dialect/LLVMIR/inlining.mlir @@ -18,7 +18,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 { "llvm.intr.memset"(%ptr, %byte, %0) <{isVolatile = true}> : (!llvm.ptr, i8, i32) -> () "llvm.intr.memmove"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> () "llvm.intr.memcpy"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> () - llvm.intr.assume %true 
: i1 + "llvm.intr.assume"(%true) : (i1) -> () llvm.fence release %2 = llvm.atomicrmw add %ptr, %0 monotonic : !llvm.ptr, i32 %3 = llvm.cmpxchg %ptr, %0, %1 acq_rel monotonic : !llvm.ptr, i32 @@ -44,7 +44,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 { // CHECK: "llvm.intr.memset"(%[[PTR]] // CHECK: "llvm.intr.memmove"(%[[PTR]], %[[PTR]] // CHECK: "llvm.intr.memcpy"(%[[PTR]], %[[PTR]] -// CHECK: llvm.intr.assume +// CHECK: "llvm.intr.assume" // CHECK: llvm.fence release // CHECK: llvm.atomicrmw add %[[PTR]], %[[CST]] monotonic // CHECK: llvm.cmpxchg %[[PTR]], %[[CST]], %[[RES]] acq_rel monotonic diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index b8ce7db795a1..3062cdc38c0a 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -836,30 +836,3 @@ llvm.func @test_call_intrin_with_opbundle(%arg0 : !llvm.ptr) { llvm.call_intrinsic "llvm.assume"(%0) ["align"(%arg0, %1 : !llvm.ptr, i32)] : (i1) -> () llvm.return } - -// CHECK-LABEL: @test_assume_intr_no_opbundle -llvm.func @test_assume_intr_no_opbundle(%arg0 : !llvm.ptr) { - %0 = llvm.mlir.constant(1 : i1) : i1 - // CHECK: llvm.intr.assume %0 : i1 - llvm.intr.assume %0 : i1 - llvm.return -} - -// CHECK-LABEL: @test_assume_intr_empty_opbundle -llvm.func @test_assume_intr_empty_opbundle(%arg0 : !llvm.ptr) { - %0 = llvm.mlir.constant(1 : i1) : i1 - // CHECK: llvm.intr.assume %0 : i1 - llvm.intr.assume %0 [] : i1 - llvm.return -} - -// CHECK-LABEL: @test_assume_intr_with_opbundles -llvm.func @test_assume_intr_with_opbundles(%arg0 : !llvm.ptr) { - %0 = llvm.mlir.constant(1 : i1) : i1 - %1 = llvm.mlir.constant(2 : i32) : i32 - %2 = llvm.mlir.constant(3 : i32) : i32 - %3 = llvm.mlir.constant(4 : i32) : i32 - // CHECK: llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1 - llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1 - llvm.return -} diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index 606b11175f57..28a1bd21c82a 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -630,21 +630,11 @@ define void @va_intrinsics_test(ptr %0, ptr %1, ...) 
{
 ; CHECK-LABEL: @assume
 ; CHECK-SAME:  %[[TRUE:[a-zA-Z0-9]+]]
 define void @assume(i1 %true) {
-  ; CHECK: llvm.intr.assume %[[TRUE]] : i1
+  ; CHECK: "llvm.intr.assume"(%[[TRUE]]) : (i1) -> ()
   call void @llvm.assume(i1 %true)
   ret void
 }
 
-; CHECK-LABEL: @assume_with_opbundles
-; CHECK-SAME:  %[[TRUE:[a-zA-Z0-9]+]]
-; CHECK-SAME:  %[[PTR:[a-zA-Z0-9]+]]
-define void @assume_with_opbundles(i1 %true, ptr %p) {
-  ; CHECK: %[[ALIGN:.+]] = llvm.mlir.constant(8 : i32) : i32
-  ; CHECK: llvm.intr.assume %[[TRUE]] ["align"(%[[PTR]], %[[ALIGN]] : !llvm.ptr, i32)] : i1
-  call void @llvm.assume(i1 %true) ["align"(ptr %p, i32 8)]
-  ret void
-}
-
 ; CHECK-LABEL: @is_constant
 ; CHECK-SAME:  %[[VAL:[a-zA-Z0-9]+]]
 define void @is_constant(i32 %0) {
diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
index cb712eb4e126..0634a7ba907f 100644
--- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir
@@ -363,21 +363,6 @@ llvm.func @umin_test(%arg0: i32, %arg1: i32, %arg2: vector<8xi32>, %arg3: vector
   llvm.return
 }
 
-// CHECK-LABEL: @assume_without_opbundles
-llvm.func @assume_without_opbundles(%cond: i1) {
-  // CHECK: call void @llvm.assume(i1 %{{.+}})
-  llvm.intr.assume %cond : i1
-  llvm.return
-}
-
-// CHECK-LABEL: @assume_with_opbundles
-llvm.func @assume_with_opbundles(%cond: i1, %p: !llvm.ptr) {
-  %0 = llvm.mlir.constant(8 : i32) : i32
-  // CHECK: call void @llvm.assume(i1 %{{.+}}) [ "align"(ptr %{{.+}}, i32 8) ]
-  llvm.intr.assume %cond ["align"(%p, %0 : !llvm.ptr, i32)] : i1
-  llvm.return
-}
-
 // CHECK-LABEL: @vector_reductions
 llvm.func @vector_reductions(%arg0: f32, %arg1: vector<8xf32>, %arg2: vector<8xi32>) {
   // CHECK: call i32 @llvm.vector.reduce.add.v8i32
diff --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
index 15658ea60681..af0981440a17 100644
--- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
+++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir
@@ -188,7 +188,7 @@ llvm.func @sadd_overflow_intr_wrong_type(%arg0 : i32, %arg1 : f32) -> !llvm.stru
 llvm.func @assume_intr_wrong_type(%cond : i16) {
   // expected-error @below{{op operand #0 must be 1-bit signless integer, but got 'i16'}}
-  llvm.intr.assume %cond : i16
+  "llvm.intr.assume"(%cond) : (i16) -> ()
   llvm.return
 }
--
GitLab


From bbff5b8891c0ce929d6ace2d86ea6891425042e2 Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Wed, 16 Oct 2024 07:21:57 +0100
Subject: [PATCH 074/329] [VPlan] Use alloc-type to compute interleave group
 offset.

Use getTypeAllocSize to compute the offset to the start of interleave
groups instead of getScalarSizeInBits, which may return 0 for pointers.

This is in line with the analysis building the interleave groups and
fixes a mis-compile reported for
https://github.com/llvm/llvm-project/pull/106431.
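As a standalone illustration of the failure mode (a minimal sketch, assuming
opaque pointers and a hypothetical 64-bit data layout; the member index and
all names below are illustrative, not taken from the patch):

#include "llvm/IR/DataLayout.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/LLVMContext.h"
#include <cstdio>

using namespace llvm;

int main() {
  LLVMContext Ctx;
  // Hypothetical 64-bit layout: pointers are 8 bytes.
  DataLayout DL("e-p:64:64");
  Type *PtrTy = PointerType::get(Ctx, /*AddressSpace=*/0);
  unsigned Index = 2; // assumed member index within the interleave group

  // Old computation: getScalarSizeInBits() is 0 for pointer types, so the
  // byte offset silently collapses to 0 for any member index.
  uint64_t OldOffset = PtrTy->getScalarSizeInBits() / 8 * Index; // 0, wrong

  // New computation: the DataLayout's alloc size matches the spacing that
  // the interleave-group analysis assumed, giving 8 * 2 = 16 bytes here.
  uint64_t NewOffset = DL.getTypeAllocSize(PtrTy).getFixedValue() * Index;

  printf("old=%llu new=%llu\n", (unsigned long long)OldOffset,
         (unsigned long long)NewOffset);
  return 0;
}

This mirrors the replaced expression in createInterleaveGroups below, and is
why the adjusted test now expects a getelementptr offset of -8 instead of 0.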
---
 llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp     | 3 ++-
 .../interleaved-accesses-different-insert-position.ll | 2 +-
 2 files changed, 3 insertions(+), 2 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
index 4443a7be4ad4..faec08cac187 100644
--- a/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlanTransforms.cpp
@@ -1646,8 +1646,9 @@ void VPlanTransforms::createInterleaveGroups(
       // zero.
       assert(IG->getIndex(IRInsertPos) != 0 &&
              "index of insert position shouldn't be zero");
+      auto &DL = IRInsertPos->getDataLayout();
       APInt Offset(32,
-                   getLoadStoreType(IRInsertPos)->getScalarSizeInBits() / 8 *
+                   DL.getTypeAllocSize(getLoadStoreType(IRInsertPos)) *
                        IG->getIndex(IRInsertPos),
                    /*IsSigned=*/true);
       VPValue *OffsetVPV = Plan.getOrAddLiveIn(
diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
index 665fd1b9aeac..5913bae082f1 100644
--- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
+++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses-different-insert-position.ll
@@ -183,7 +183,7 @@ define i64 @interleave_group_load_pointer_type(ptr %start, ptr %end) {
 ; CHECK-NEXT:    [[TMP6:%.*]] = add i64 [[OFFSET_IDX]], 0
 ; CHECK-NEXT:    [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[START]], i64 [[TMP6]]
 ; CHECK-NEXT:    [[TMP7:%.*]] = getelementptr i8, ptr [[NEXT_GEP]], i64 16
-; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 0
+; CHECK-NEXT:    [[TMP8:%.*]] = getelementptr i8, ptr [[TMP7]], i32 -8
 ; CHECK-NEXT:    [[WIDE_VEC:%.*]] = load <12 x ptr>, ptr [[TMP8]], align 8
 ; CHECK-NEXT:    [[STRIDED_VEC:%.*]] = shufflevector <12 x ptr> [[WIDE_VEC]], <12 x ptr> poison, <4 x i32>
 ; CHECK-NEXT:    [[STRIDED_VEC3:%.*]] = shufflevector <12 x ptr> [[WIDE_VEC]], <12 x ptr> poison, <4 x i32>
--
GitLab


From e55869ae8a4ef1ae2e898ff5cd66fb8ae6e099b8 Mon Sep 17 00:00:00 2001
From: Piotr Fusik
Date: Wed, 16 Oct 2024 08:30:22 +0200
Subject: [PATCH 075/329] [LV][NFC] Fix typos (#111971)

---
 .../Transforms/Vectorize/LoopVectorize.cpp | 31 +++++++++----------
 1 file changed, 15 insertions(+), 16 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
index c8cf137816d3..8bf92f348062 100644
--- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
+++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp
@@ -809,8 +809,7 @@ protected:
 };
 } // end namespace llvm
 
-/// Look for a meaningful debug location on the instruction or it's
-/// operands.
+/// Look for a meaningful debug location on the instruction or its operands.
 static DebugLoc getDebugLocFromInstOrOperands(Instruction *I) {
   if (!I)
     return DebugLoc();
@@ -1798,7 +1797,7 @@ public:
   /// Generate runtime checks in SCEVCheckBlock and MemCheckBlock, so we can
   /// accurately estimate the cost of the runtime checks. The blocks are
-  /// un-linked from the IR and is added back during vector code generation. If
+  /// un-linked from the IR and are added back during vector code generation. If
   /// there is no vector code generation, the check blocks are removed
   /// completely.
   void create(Loop *L, const LoopAccessInfo &LAI,
@@ -2581,7 +2580,7 @@ PHINode *InnerLoopVectorizer::createInductionResumeValue(
     }
   }
 
-  // Create phi nodes to merge from the backedge-taken check block.
+  // Create phi nodes to merge from the backedge-taken check block.
   PHINode *BCResumeVal =
       PHINode::Create(OrigPhi->getType(), 3, "bc.resume.val",
                       LoopScalarPreHeader->getFirstNonPHIIt());
@@ -3002,7 +3001,8 @@ void InnerLoopVectorizer::sinkScalarOperands(Instruction *PredInst) {
     // We can't sink an instruction if it is a phi node, is not in the loop,
     // may have side effects or may read from memory.
-    // TODO Could dor more granular checking to allow sinking a load past non-store instructions.
+ // TODO: Could do more granular checking to allow sinking + // a load past non-store instructions. if (!I || isa(I) || !VectorLoop->contains(I) || I->mayHaveSideEffects() || I->mayReadFromMemory()) continue; @@ -3140,9 +3140,8 @@ void LoopVectorizationCostModel::collectLoopScalars(ElementCount VF) { // (2) Add to the worklist all bitcast and getelementptr instructions used by // memory accesses requiring a scalar use. The pointer operands of loads and - // stores will be scalar as long as the memory accesses is not a gather or - // scatter operation. The value operand of a store will remain scalar if the - // store is scalarized. + // stores will be scalar unless the operation is a gather or scatter. + // The value operand of a store will remain scalar if the store is scalarized. for (auto *BB : TheLoop->blocks()) for (auto &I : *BB) { if (auto *Load = dyn_cast(&I)) { @@ -3415,7 +3414,7 @@ bool LoopVectorizationCostModel::interleavedAccessCanBeWidened( assert(Group && "Must have a group."); unsigned InterleaveFactor = Group->getFactor(); - // If the instruction's allocated size doesn't equal it's type size, it + // If the instruction's allocated size doesn't equal its type size, it // requires padding and will be scalarized. auto &DL = I->getDataLayout(); auto *ScalarTy = getLoadStoreType(I); @@ -3515,11 +3514,11 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { assert(VF.isVector() && !Uniforms.contains(VF) && "This function should not be visited twice for the same VF"); - // Visit the list of Uniforms. If we'll not find any uniform value, we'll - // not analyze again. Uniforms.count(VF) will return 1. + // Visit the list of Uniforms. If we find no uniform value, we won't + // analyze again. Uniforms.count(VF) will return 1. Uniforms[VF].clear(); - // We now know that the loop is vectorizable! + // Now we know that the loop is vectorizable! // Collect instructions inside the loop that will remain uniform after // vectorization. @@ -3566,7 +3565,7 @@ void LoopVectorizationCostModel::collectLoopUniforms(ElementCount VF) { auto PrevVF = VF.divideCoefficientBy(2); // Return true if all lanes perform the same memory operation, and we can - // thus chose to execute only one. + // thus choose to execute only one. auto IsUniformMemOpUse = [&](Instruction *I) { // If the value was already known to not be uniform for the previous // (smaller VF), it cannot be uniform for the larger VF. @@ -3957,7 +3956,7 @@ FixedScalableVFPair LoopVectorizationCostModel::computeFeasibleMaxVF( FixedScalableVFPair LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { if (Legal->getRuntimePointerChecking()->Need && TTI.hasBranchDivergence()) { - // TODO: It may by useful to do since it's still likely to be dynamically + // TODO: It may be useful to do since it's still likely to be dynamically // uniform if the target can skip. reportVectorizationFailure( "Not inserting runtime ptr check for divergent target", @@ -4031,7 +4030,7 @@ LoopVectorizationCostModel::computeMaxVF(ElementCount UserVF, unsigned UserIC) { assert(WideningDecisions.empty() && Uniforms.empty() && Scalars.empty() && "No decisions should have been taken at this point"); // Note: There is no need to invalidate any cost modeling decisions here, as - // non where taken so far. + // none were taken so far. 
InterleaveInfo.invalidateGroupsRequiringScalarEpilogue(); } @@ -7940,7 +7939,7 @@ EpilogueVectorizerEpilogueLoop::emitMinimumVectorEpilogueIterCountCheck( BasicBlock *Bypass, BasicBlock *Insert) { assert(EPI.TripCount && - "Expected trip count to have been safed in the first pass."); + "Expected trip count to have been saved in the first pass."); assert( (!isa(EPI.TripCount) || DT->dominates(cast(EPI.TripCount)->getParent(), Insert)) && -- GitLab From 24423107abc23a24d465189ba05e51d1bc31bbf2 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Wed, 16 Oct 2024 07:40:04 +0100 Subject: [PATCH 076/329] [LV] Add additional trip count expansion tests for #92177. Extra tests for https://github.com/llvm/llvm-project/pull/92177, split off the PR. --- .../trip-count-expansion-may-introduce-ub.ll | 310 ++++++++++++++++-- 1 file changed, 289 insertions(+), 21 deletions(-) diff --git a/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll b/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll index c63dc9979bce..7dfd80a688f3 100644 --- a/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll +++ b/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll @@ -459,6 +459,7 @@ exit: ret i64 %p } +; FIXME: currently the expansion of the loop bounds may introduce UB through the division. define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch(ptr %dst, i64 %N) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { @@ -526,6 +527,152 @@ exit: ret i64 %p } +declare void @foo() + +; FIXME: currently the expansion of the loop bounds may introduce UB through the division. +define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_call_before_loop(ptr %dst, i64 %N) { +; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_call_before_loop( +; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: call void @foo() +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze i64 [[TMP2]] +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[SMAX]]) +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP6]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store <4 x i32> , ptr [[TMP9]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: 
[[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[GEP]], align 4 +; CHECK-NEXT: [[C_0:%.*]] = icmp slt i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[EXIT:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[D:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK: exit: +; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] +; CHECK-NEXT: ret i64 [[P]] +; +entry: + call void @foo() + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 1, ptr %gep + %c.0 = icmp slt i64 %iv, %N + br i1 %c.0, label %loop.latch, label %exit + +loop.latch: + %iv.next = add i64 %iv, 1 + %d = udiv i64 42, %N + %c.1 = icmp slt i64 %iv, %d + br i1 %c.1, label %loop.header, label %exit + +exit: + %p = phi i64 [ 1, %loop.header ], [ 0, %loop.latch] + ret i64 %p +} + +; FIXME: currently the expansion of the loop bounds may introduce UB through the division. +define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_loop_may_not_execute(ptr %dst, i64 %N, i1 %c) { +; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_loop_may_not_execute( +; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i1 [[C:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER_PREHEADER:%.*]], label [[EXIT:%.*]] +; CHECK: loop.header.preheader: +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[TMP3:%.*]] = freeze i64 [[TMP2]] +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[SMAX]]) +; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP4]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP4]], 4 +; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = select i1 [[TMP5]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP4]], [[TMP6]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP7:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP8:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP7]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i32, ptr [[TMP8]], i32 0 +; CHECK-NEXT: store <4 x i32> , ptr [[TMP9]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[LOOP_HEADER_PREHEADER]] ] +; CHECK-NEXT: 
br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ], [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[GEP]], align 4 +; CHECK-NEXT: [[C_0:%.*]] = icmp slt i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[EXIT_LOOPEXIT:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[D:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT_LOOPEXIT]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK: exit.loopexit: +; CHECK-NEXT: [[P_PH:%.*]] = phi i64 [ 0, [[LOOP_LATCH]] ], [ 1, [[LOOP_HEADER]] ] +; CHECK-NEXT: br label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[P:%.*]] = phi i64 [ 2, [[ENTRY:%.*]] ], [ [[P_PH]], [[EXIT_LOOPEXIT]] ] +; CHECK-NEXT: ret i64 [[P]] +; +entry: + br i1 %c, label %loop.header, label %exit + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 1, ptr %gep + %c.0 = icmp slt i64 %iv, %N + br i1 %c.0, label %loop.latch, label %exit + +loop.latch: + %iv.next = add i64 %iv, 1 + %d = udiv i64 42, %N + %c.1 = icmp slt i64 %iv, %d + br i1 %c.1, label %loop.header, label %exit + +exit: + %p = phi i64 [ 1, %loop.header ], [ 0, %loop.latch], [ 2, %entry ] + ret i64 %p +} + +; FIXME: currently the expansion of the loop bounds may introduce UB through the division. define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds(ptr %dst, i64 %N, i64 %M) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[M:%.*]]) { @@ -551,7 +698,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds ; CHECK-NEXT: store <4 x i32> , ptr [[TMP9]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP10:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP10]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -567,7 +714,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[D:%.*]] = udiv i64 42, [[M]] ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP13:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] ; CHECK-NEXT: ret i64 [[P]] @@ -593,16 +740,16 @@ exit: ret i64 %p } - +; FIXME: currently the expansion of the loop bounds may introduce UB through the division. 
define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch(ptr %dst, i64 %N) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[FR_N:%.*]] = freeze i64 [[N]] -; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[FR_N]] -; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]] +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[FR_N]] +; CHECK-NEXT: [[TMP10:%.*]] = freeze i64 [[TMP2]] ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) -; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SMAX]]) +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP10]], i64 [[SMAX]]) ; CHECK-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[UMIN]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP3]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -620,7 +767,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch(ptr %dst, ; CHECK-NEXT: store <4 x i32> , ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -636,7 +783,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch(ptr %dst, ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[D:%.*]] = udiv i64 42, [[FR_N]] ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] ; CHECK-NEXT: ret i64 [[P]] @@ -688,7 +835,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_constant_in_latch(ptr %dst, i64 ; CHECK-NEXT: store <4 x i32> , ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -704,7 +851,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_constant_in_latch(ptr %dst, i64 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1 ; CHECK-NEXT: [[D:%.*]] = udiv i64 [[N]], 42 ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV1]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER1]] ], [ 0, [[LOOP_LATCH]] ] ; CHECK-NEXT: ret i64 [[P]] @@ -750,7 +897,7 @@ define void @single_exit_tc_with_udiv(ptr %dst, i64 %N) { ; CHECK-NEXT: store <4 x i32> , ptr [[TMP4]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP5]], label 
[[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[TMP1]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] @@ -764,7 +911,7 @@ define void @single_exit_tc_with_udiv(ptr %dst, i64 %N) { ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[D:%.*]] = udiv i64 42, [[N]] ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP]], label [[EXIT]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: ret void ; @@ -784,6 +931,7 @@ exit: ret void } +; FIXME: currently the expansion of the loop bounds may introduce UB through the division. define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch(ptr %dst, i64 %N) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { @@ -812,7 +960,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch(ptr %dst, i64 %N ; CHECK-NEXT: store <4 x i32> , ptr [[TMP8]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP9:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP9]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -828,7 +976,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch(ptr %dst, i64 %N ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 ; CHECK-NEXT: [[D:%.*]] = urem i64 42, [[N]] ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] ; CHECK-NEXT: ret i64 [[P]] @@ -854,6 +1002,7 @@ exit: ret i64 %p } +; FIXME: currently the expansion of the loop bounds may introduce UB through the division. 
define i64 @multi_exit_4_exit_count_with_urem_by_constant_in_latch(ptr %dst, i64 %N) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_urem_by_constant_in_latch( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { @@ -879,7 +1028,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_constant_in_latch(ptr %dst, i64 ; CHECK-NEXT: store <4 x i32> , ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[LOOP_HEADER]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -895,7 +1044,7 @@ define i64 @multi_exit_4_exit_count_with_urem_by_constant_in_latch(ptr %dst, i64 ; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV1]], 1 ; CHECK-NEXT: [[D:%.*]] = urem i64 [[N]], 42 ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV1]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER1]], label [[EXIT]], !llvm.loop [[LOOP27:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER1]] ], [ 0, [[LOOP_LATCH]] ] ; CHECK-NEXT: ret i64 [[P]] @@ -1007,10 +1156,10 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1(ptr %dst, i64 % ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[N]] -; CHECK-NEXT: [[TMP8:%.*]] = freeze i64 [[TMP0]] +; CHECK-NEXT: [[TMP9:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[TMP10:%.*]] = freeze i64 [[TMP9]] ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) -; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP8]], i64 [[SMAX]]) +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP10]], i64 [[SMAX]]) ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[UMIN]], 1 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP1]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] @@ -1028,7 +1177,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1(ptr %dst, i64 % ; CHECK-NEXT: store <4 x i32> , ptr [[TMP6]], align 4 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP28:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br label [[SCALAR_PH]] ; CHECK: scalar.ph: @@ -1045,7 +1194,7 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1(ptr %dst, i64 % ; CHECK-NEXT: [[D:%.*]] = udiv i64 42, [[N]] ; CHECK-NEXT: [[X:%.*]] = sub i64 100, [[D]] ; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] -; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP29:![0-9]+]] ; CHECK: exit: ; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] ; CHECK-NEXT: ret i64 [[P]] @@ -1072,6 +1221,119 @@ exit: ret i64 %p } +define i64 @multi_exit_exit_count_with_udiv_by_0_in_latch(ptr %dst, i64 %N) { +; 
CHECK-LABEL: define i64 @multi_exit_exit_count_with_udiv_by_0_in_latch( +; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; CHECK: loop.header: +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ] +; CHECK-NEXT: [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]] +; CHECK-NEXT: store i32 1, ptr [[GEP]], align 4 +; CHECK-NEXT: [[C_0:%.*]] = icmp slt i64 [[IV]], [[N]] +; CHECK-NEXT: br i1 [[C_0]], label [[LOOP_LATCH]], label [[EXIT:%.*]] +; CHECK: loop.latch: +; CHECK-NEXT: [[IV_NEXT]] = add i64 [[IV]], 1 +; CHECK-NEXT: [[D:%.*]] = udiv i64 42, 0 +; CHECK-NEXT: [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]] +; CHECK-NEXT: br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]] +; CHECK: exit: +; CHECK-NEXT: [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ] +; CHECK-NEXT: ret i64 [[P]] +; +entry: + br label %loop.header + +loop.header: + %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ] + %gep = getelementptr inbounds i32, ptr %dst, i64 %iv + store i32 1, ptr %gep + %c.0 = icmp slt i64 %iv, %N + br i1 %c.0, label %loop.latch, label %exit + +loop.latch: + %iv.next = add i64 %iv, 1 + %d = udiv i64 42, 0 + %c.1 = icmp slt i64 %iv, %d + br i1 %c.1, label %loop.header, label %exit + +exit: + %p = phi i64 [ 1, %loop.header ], [ 0, %loop.latch] + ret i64 %p +} + +; FIXME: currently the expansion of the loop bounds may introduce UB through the division. +define i64 @multi_exit_count_with_udiv_by_value_in_latch_different_bounds_divisor_non_zero_may_be_poison(ptr %dst, i64 %N, i64 %M) { +; CHECK-LABEL: define i64 @multi_exit_count_with_udiv_by_value_in_latch_different_bounds_divisor_non_zero_may_be_poison( +; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[M:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[M_1:%.*]] = call i64 @llvm.umax.i64(i64 [[M]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[M_1]] +; CHECK-NEXT: [[TMP1:%.*]] = freeze i64 [[TMP0]] +; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) +; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SMAX]]) +; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[UMIN]], 1 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ule i64 [[TMP2]], 4 +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; CHECK: vector.ph: +; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], 4 +; CHECK-NEXT: [[TMP3:%.*]] = icmp eq i64 [[N_MOD_VF]], 0 +; CHECK-NEXT: [[TMP4:%.*]] = select i1 [[TMP3]], i64 4, i64 [[N_MOD_VF]] +; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[TMP4]] +; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] +; CHECK: vector.body: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] +; CHECK-NEXT: [[TMP5:%.*]] = add i64 [[INDEX]], 0 +; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[TMP5]] +; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i32, ptr [[TMP6]], i32 0 +; CHECK-NEXT: store <4 x i32> , ptr [[TMP7]], align 4 +; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 4 +; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP30:![0-9]+]] +; CHECK: middle.block: +; CHECK-NEXT: br label [[SCALAR_PH]] +; CHECK: scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: br label [[LOOP_HEADER:%.*]] +; 
CHECK: loop.header:
+; CHECK-NEXT:    [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP_LATCH:%.*]] ]
+; CHECK-NEXT:    [[GEP:%.*]] = getelementptr inbounds i32, ptr [[DST]], i64 [[IV]]
+; CHECK-NEXT:    store i32 1, ptr [[GEP]], align 4
+; CHECK-NEXT:    [[C_0:%.*]] = icmp slt i64 [[IV]], [[N]]
+; CHECK-NEXT:    br i1 [[C_0]], label [[LOOP_LATCH]], label [[EXIT:%.*]]
+; CHECK:       loop.latch:
+; CHECK-NEXT:    [[IV_NEXT]] = add i64 [[IV]], 1
+; CHECK-NEXT:    [[D:%.*]] = udiv i64 42, [[M_1]]
+; CHECK-NEXT:    [[C_1:%.*]] = icmp slt i64 [[IV]], [[D]]
+; CHECK-NEXT:    br i1 [[C_1]], label [[LOOP_HEADER]], label [[EXIT]], !llvm.loop [[LOOP31:![0-9]+]]
+; CHECK:       exit:
+; CHECK-NEXT:    [[P:%.*]] = phi i64 [ 1, [[LOOP_HEADER]] ], [ 0, [[LOOP_LATCH]] ]
+; CHECK-NEXT:    ret i64 [[P]]
+;
+entry:
+  %M.1 = call i64 @llvm.umax.i64(i64 %M, i64 1)
+  br label %loop.header
+
+loop.header:
+  %iv = phi i64 [ 0, %entry ], [ %iv.next, %loop.latch ]
+  %gep = getelementptr inbounds i32, ptr %dst, i64 %iv
+  store i32 1, ptr %gep
+  %c.0 = icmp slt i64 %iv, %N
+  br i1 %c.0, label %loop.latch, label %exit
+
+loop.latch:
+  %iv.next = add i64 %iv, 1
+  %d = udiv i64 42, %M.1
+  %c.1 = icmp slt i64 %iv, %d
+  br i1 %c.1, label %loop.header, label %exit
+
+exit:
+  %p = phi i64 [ 1, %loop.header ], [ 0, %loop.latch]
+  ret i64 %p
+}
+
+declare i64 @llvm.umax.i64(i64, i64)
+
 ;.
 ; CHECK: [[LOOP0]] = distinct !{[[LOOP0]], [[META1:![0-9]+]], [[META2:![0-9]+]]}
@@ -1100,4 +1362,10 @@ exit:
 ; CHECK: [[LOOP23]] = distinct !{[[LOOP23]], [[META2]], [[META1]]}
 ; CHECK: [[LOOP24]] = distinct !{[[LOOP24]], [[META1]], [[META2]]}
 ; CHECK: [[LOOP25]] = distinct !{[[LOOP25]], [[META2]], [[META1]]}
+; CHECK: [[LOOP26]] = distinct !{[[LOOP26]], [[META1]], [[META2]]}
+; CHECK: [[LOOP27]] = distinct !{[[LOOP27]], [[META2]], [[META1]]}
+; CHECK: [[LOOP28]] = distinct !{[[LOOP28]], [[META1]], [[META2]]}
+; CHECK: [[LOOP29]] = distinct !{[[LOOP29]], [[META2]], [[META1]]}
+; CHECK: [[LOOP30]] = distinct !{[[LOOP30]], [[META1]], [[META2]]}
+; CHECK: [[LOOP31]] = distinct !{[[LOOP31]], [[META2]], [[META1]]}
 ;.
--
GitLab


From 37ad65ffb6b8b8867e5d58f05ba676211d0da233 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Andrzej=20Warzy=C5=84ski?=
Date: Wed, 16 Oct 2024 07:43:49 +0100
Subject: [PATCH 077/329] [mlir][arith] Remove some e2e tests (#112012)

I am removing the recently added integration tests for various Arith
Ops. These operations and their lowerings are effectively already
verified by the Arith-to-LLVM conversion tests in:
  * "mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir"

I've noticed that a few variants of `arith.cmpi` were missing in that
file - those are added here as well.
This is a follow-up for this discussion: * https://github.com/llvm/llvm-project/pull/92272 See also the recent update to our guidelines on e2e tests in MLIR: * https://github.com/llvm/mlir-www/pull/203 --- .../Conversion/ArithToLLVM/arith-to-llvm.mlir | 40 ++-- .../Dialect/Arith/CPU/addition.mlir | 88 --------- .../Dialect/Arith/CPU/comparison.mlir | 174 ------------------ .../Dialect/Arith/CPU/multiplication.mlir | 119 ------------ 4 files changed, 26 insertions(+), 395 deletions(-) delete mode 100644 mlir/test/Integration/Dialect/Arith/CPU/addition.mlir delete mode 100644 mlir/test/Integration/Dialect/Arith/CPU/comparison.mlir delete mode 100644 mlir/test/Integration/Dialect/Arith/CPU/multiplication.mlir diff --git a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir index d3bdbe89a548..64c40f1aba43 100644 --- a/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir +++ b/mlir/test/Conversion/ArithToLLVM/arith-to-llvm.mlir @@ -46,33 +46,45 @@ func.func @ops(f32, f32, i32, i32, f64) -> (f32, i32) { %1 = arith.subi %arg2, %arg3: i32 // CHECK: = llvm.icmp "slt" %arg2, %1 : i32 %2 = arith.cmpi slt, %arg2, %1 : i32 +// CHECK: = llvm.icmp "sle" %arg2, %1 : i32 + %3 = arith.cmpi sle, %arg2, %1 : i32 +// CHECK: = llvm.icmp "sgt" %arg2, %1 : i32 + %4 = arith.cmpi sgt, %arg2, %1 : i32 +// CHECK: = llvm.icmp "ult" %arg2, %1 : i32 + %5 = arith.cmpi ult, %arg2, %1 : i32 +// CHECK: = llvm.icmp "ule" %arg2, %1 : i32 + %6 = arith.cmpi ule, %arg2, %1 : i32 +// CHECK: = llvm.icmp "ugt" %arg2, %1 : i32 + %7 = arith.cmpi ugt, %arg2, %1 : i32 +// CHECK: = llvm.icmp "eq" %arg2, %1 : i32 + %8 = arith.cmpi eq, %arg2, %1 : i32 // CHECK: = llvm.sdiv %arg2, %arg3 : i32 - %3 = arith.divsi %arg2, %arg3 : i32 + %9 = arith.divsi %arg2, %arg3 : i32 // CHECK: = llvm.udiv %arg2, %arg3 : i32 - %4 = arith.divui %arg2, %arg3 : i32 + %10 = arith.divui %arg2, %arg3 : i32 // CHECK: = llvm.srem %arg2, %arg3 : i32 - %5 = arith.remsi %arg2, %arg3 : i32 + %11 = arith.remsi %arg2, %arg3 : i32 // CHECK: = llvm.urem %arg2, %arg3 : i32 - %6 = arith.remui %arg2, %arg3 : i32 + %12 = arith.remui %arg2, %arg3 : i32 // CHECK: = llvm.fdiv %arg0, %arg1 : f32 - %8 = arith.divf %arg0, %arg1 : f32 + %13 = arith.divf %arg0, %arg1 : f32 // CHECK: = llvm.frem %arg0, %arg1 : f32 - %9 = arith.remf %arg0, %arg1 : f32 + %14 = arith.remf %arg0, %arg1 : f32 // CHECK: = llvm.and %arg2, %arg3 : i32 - %10 = arith.andi %arg2, %arg3 : i32 + %15 = arith.andi %arg2, %arg3 : i32 // CHECK: = llvm.or %arg2, %arg3 : i32 - %11 = arith.ori %arg2, %arg3 : i32 + %16 = arith.ori %arg2, %arg3 : i32 // CHECK: = llvm.xor %arg2, %arg3 : i32 - %12 = arith.xori %arg2, %arg3 : i32 + %17 = arith.xori %arg2, %arg3 : i32 // CHECK: = llvm.mlir.constant(7.900000e-01 : f64) : f64 - %15 = arith.constant 7.9e-01 : f64 + %18 = arith.constant 7.9e-01 : f64 // CHECK: = llvm.shl %arg2, %arg3 : i32 - %16 = arith.shli %arg2, %arg3 : i32 + %19 = arith.shli %arg2, %arg3 : i32 // CHECK: = llvm.ashr %arg2, %arg3 : i32 - %17 = arith.shrsi %arg2, %arg3 : i32 + %20 = arith.shrsi %arg2, %arg3 : i32 // CHECK: = llvm.lshr %arg2, %arg3 : i32 - %18 = arith.shrui %arg2, %arg3 : i32 - return %0, %4 : f32, i32 + %21 = arith.shrui %arg2, %arg3 : i32 + return %0, %10 : f32, i32 } // Checking conversion of index types to integers using i1, assuming no target diff --git a/mlir/test/Integration/Dialect/Arith/CPU/addition.mlir b/mlir/test/Integration/Dialect/Arith/CPU/addition.mlir deleted file mode 100644 index b6acfd53c1f5..000000000000 --- 
a/mlir/test/Integration/Dialect/Arith/CPU/addition.mlir +++ /dev/null @@ -1,88 +0,0 @@ -// RUN: mlir-opt %s --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \ -// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \ -// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ -// RUN: --shared-libs=%mlir_c_runner_utils | \ -// RUN: FileCheck %s --match-full-lines - -func.func @addi_i1(%v1 : i1, %v2 : i1) { - vector.print str "@addi_i1\n" - %res = arith.addi %v1, %v2 : i1 - vector.print %res : i1 - return -} - -func.func @addi() { - // ------------------------------------------------ - // Test i1 - // ------------------------------------------------ - - // addi on i1 - // addi(0, 1) : i1 = 1 : i1; addi(0, -1) : i1 = 1 - %false = arith.constant 0 : i1 - %true = arith.constant 1 : i1 - - // CHECK-LABEL: @addi_i1 - // CHECK-NEXT: 1 - func.call @addi_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @addi_i1 - // CHECK-NEXT: 1 - %true_based_on_non_zero_val = arith.constant -1 : i1 - func.call @addi_i1(%false, %true_based_on_non_zero_val) : (i1, i1) -> () - - // ------------------------------------------------ - // TODO: Test i8, i16 etc.. - // ------------------------------------------------ - - return -} - -func.func @addui_extended_i1(%v1 : i1, %v2 : i1) { - vector.print str "@addui_extended_i1\n" - %res, %overflow = arith.addui_extended %v1, %v2 : i1, i1 - vector.print %res : i1 - vector.print %overflow : i1 - return -} - -func.func @addi_extended() { - // ------------------------------------------------ - // Test i1 - // ------------------------------------------------ - - // addui_extended on i1 - // addui_extended 1 1 : i1 = 0, 1 - %true = arith.constant 1 : i1 - %false = arith.constant 0 : i1 - - // CHECK-LABEL: @addui_extended_i1 - // CHECK-NEXT: 0 - // CHECK-NEXT: 1 - func.call @addui_extended_i1(%true, %true) : (i1, i1) -> () - - // CHECK-LABEL: @addui_extended_i1 - // CHECK-NEXT: 1 - // CHECK-NEXT: 0 - func.call @addui_extended_i1(%true, %false) : (i1, i1) -> () - - // CHECK-LABEL: @addui_extended_i1 - // CHECK-NEXT: 1 - // CHECK-NEXT: 0 - func.call @addui_extended_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @addui_extended_i1 - // CHECK-NEXT: 0 - // CHECK-NEXT: 0 - func.call @addui_extended_i1(%false, %false) : (i1, i1) -> () - - // ------------------------------------------------ - // TODO: Test i8, i16 etc.. 
- // ------------------------------------------------ - return -} - -func.func @entry() { - func.call @addi() : () -> () - func.call @addi_extended() : () -> () - return -} diff --git a/mlir/test/Integration/Dialect/Arith/CPU/comparison.mlir b/mlir/test/Integration/Dialect/Arith/CPU/comparison.mlir deleted file mode 100644 index 418fbb0c0a94..000000000000 --- a/mlir/test/Integration/Dialect/Arith/CPU/comparison.mlir +++ /dev/null @@ -1,174 +0,0 @@ -// RUN: mlir-opt %s --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \ -// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \ -// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ -// RUN: --shared-libs=%mlir_c_runner_utils | \ -// RUN: FileCheck %s --match-full-lines - -func.func @cmpi_eq_i1(%v1 : i1, %v2 : i1) { - vector.print str "@cmpi_eq_i1\n" - %res = arith.cmpi eq, %v1, %v2 : i1 - vector.print %res : i1 - return -} - -func.func @cmpi_slt_i1(%v1 : i1, %v2 : i1) { - vector.print str "@cmpi_slt_i1\n" - %res = arith.cmpi slt, %v1, %v2 : i1 - vector.print %res : i1 - return -} - -func.func @cmpi_sle_i1(%v1 : i1, %v2 : i1) { - vector.print str "@cmpi_sle_i1\n" - %res = arith.cmpi sle, %v1, %v2 : i1 - vector.print %res : i1 - return -} - -func.func @cmpi_sgt_i1(%v1 : i1, %v2 : i1) { - vector.print str "@cmpi_sgt_i1\n" - %res = arith.cmpi sgt, %v1, %v2 : i1 - vector.print %res : i1 - return -} - -func.func @cmpi_sge_i1(%v1 : i1, %v2 : i1) { - vector.print str "@cmpi_sge_i1\n" - %res = arith.cmpi sge, %v1, %v2 : i1 - vector.print %res : i1 - return -} - -func.func @cmpi_eq() { - // ------------------------------------------------ - // Test i1 - // ------------------------------------------------ - %false_i1 = arith.constant 0 : i1 - %true_i1 = arith.constant 1 : i1 - %true_i1_n1 = arith.constant -1 : i1 - - // int values 1 and -1 are represented with the same bitvector (`0b1`) - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 1 - func.call @cmpi_eq_i1(%true_i1, %true_i1_n1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 0 - func.call @cmpi_eq_i1(%false_i1, %true_i1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 0 - func.call @cmpi_eq_i1(%true_i1, %false_i1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 1 - func.call @cmpi_eq_i1(%true_i1, %true_i1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 1 - func.call @cmpi_eq_i1(%false_i1, %false_i1) : (i1, i1) -> () - - %false = arith.constant false - %true = arith.constant true - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 1 - func.call @cmpi_eq_i1(%true, %true_i1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 1 - func.call @cmpi_eq_i1(%false, %false_i1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_eq_i1 - // CHECK-NEXT: 1 - func.call @cmpi_eq_i1(%true, %true_i1_n1) : (i1, i1) -> () - - // ------------------------------------------------ - // TODO: Test i8, i16 etc.. 
- // ------------------------------------------------ - return -} - -func.func @cmpi_signed() { - // ------------------------------------------------ - // Test i1 - // ------------------------------------------------ - %false_i1 = arith.constant 0 : i1 - %true_i1 = arith.constant 1 : i1 - %true_i1_n1 = arith.constant -1 : i1 - - // int values 1 and -1 are represented with the same bitvector (`0b1`) - // But, bitvector `1` is interpreted as int value -1 in signed comparison - - // CHECK-LABEL: @cmpi_sge_i1 - // CHECK-NEXT: 1 - func.call @cmpi_sge_i1(%false_i1, %true_i1_n1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_sge_i1 - // CHECK-NEXT: 1 - func.call @cmpi_sge_i1(%false_i1, %true_i1) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_sge_i1 - // CHECK-NEXT: 0 - func.call @cmpi_sge_i1(%true_i1, %false_i1) : (i1, i1) -> () - - %false = arith.constant false - %true = arith.constant true - - // CHECK-LABEL: @cmpi_slt_i1 - // CHECK-NEXT: 0 - func.call @cmpi_slt_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_sle_i1 - // CHECK-NEXT: 0 - func.call @cmpi_sle_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_sgt_i1 - // CHECK-NEXT: 1 - func.call @cmpi_sgt_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_sge_i1 - // CHECK-NEXT: 1 - func.call @cmpi_sge_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @cmpi_sge_i1 - // CHECK-NEXT: 0 - func.call @cmpi_sge_i1(%true, %false) : (i1, i1) -> () - - // ------------------------------------------------ - // TODO: Test i8, i16 etc.. - // ------------------------------------------------ - return -} - -func.func @cmpi_ult_index(%v1 : index, %v2 : index) { - vector.print str "@cmpi_ult_index\n" - %res = arith.cmpi ult, %v1, %v2 : index - vector.print %res : i1 - return -} - -func.func @cmpi_unsigned() { - // ------------------------------------------------ - // Test index - // ------------------------------------------------ - // 0 `ult` -2^63 = true - %zero = arith.constant 0 : index - %index_min = arith.constant -9223372036854775808 : index - - // CHECK-LABEL: @cmpi_ult_index - // CHECK-NEXT: 1 - func.call @cmpi_ult_index(%zero, %index_min) : (index, index) -> () - - // ------------------------------------------------ - // TODO: i1, i8, i16, uge, ule etc.. 
- // ------------------------------------------------ - return -} - -func.func @entry() { - func.call @cmpi_eq() : () -> () - func.call @cmpi_signed() : () -> () - func.call @cmpi_unsigned() : () -> () - return -} diff --git a/mlir/test/Integration/Dialect/Arith/CPU/multiplication.mlir b/mlir/test/Integration/Dialect/Arith/CPU/multiplication.mlir deleted file mode 100644 index 21fd81678843..000000000000 --- a/mlir/test/Integration/Dialect/Arith/CPU/multiplication.mlir +++ /dev/null @@ -1,119 +0,0 @@ -// RUN: mlir-opt %s --convert-scf-to-cf --convert-cf-to-llvm --convert-vector-to-llvm \ -// RUN: --convert-func-to-llvm --convert-arith-to-llvm | \ -// RUN: mlir-cpu-runner -e entry -entry-point-result=void \ -// RUN: --shared-libs=%mlir_c_runner_utils | \ -// RUN: FileCheck %s --match-full-lines - -func.func @mulsi_extended_i1(%v1 : i1, %v2 : i1) { - vector.print str "@mulsi_extended_i1\n" - %low, %high = arith.mulsi_extended %v1, %v2 : i1 - vector.print %low : i1 - vector.print %high : i1 - return -} - -func.func @mulsi_extended_i8(%v1 : i8, %v2 : i8) { - vector.print str "@mulsi_extended_i8\n" - %low, %high = arith.mulsi_extended %v1, %v2 : i8 - vector.print %low : i8 - vector.print %high : i8 - return -} - -func.func @mulsi_extended() { - // ------------------------------------------------ - // Test i1 - // ------------------------------------------------ - - // mulsi_extended on i1, tests for overflow bit - // mulsi_extended 1, 1 : i1 = (1, 0) - %true = arith.constant true - %false = arith.constant false - - // CHECK-LABEL: @mulsi_extended_i1 - // CHECK-NEXT: 1 - // CHECK-NEXT: 0 - func.call @mulsi_extended_i1(%true, %true) : (i1, i1) -> () - - // CHECK-LABEL: @mulsi_extended_i1 - // CHECK-NEXT: 0 - // CHECK-NEXT: 0 - func.call @mulsi_extended_i1(%true, %false) : (i1, i1) -> () - - // CHECK-LABEL: @mulsi_extended_i1 - // CHECK-NEXT: 0 - // CHECK-NEXT: 0 - func.call @mulsi_extended_i1(%false, %true) : (i1, i1) -> () - - // CHECK-LABEL: @mulsi_extended_i1 - // CHECK-NEXT: 0 - // CHECK-NEXT: 0 - func.call @mulsi_extended_i1(%false, %false) : (i1, i1) -> () - - // ------------------------------------------------ - // Test i8 - // ------------------------------------------------ - // mulsi extended versions, with overflow - %c_100_i8 = arith.constant -100 : i8 - - // mulsi_extended -100, -100 : i8 = (16, 39) - // CHECK-LABEL: @mulsi_extended_i8 - // CHECK-NEXT: 16 - // CHECK-NEXT: 39 - func.call @mulsi_extended_i8(%c_100_i8, %c_100_i8) : (i8, i8) -> () - - // ------------------------------------------------ - // TODO: Test i16, i32 etc.. - // ------------------------------------------------ - return -} - -func.func @mului_extended_i8(%v1 : i8, %v2 : i8) { - vector.print str "@mului_extended_i8\n" - %low, %high = arith.mului_extended %v1, %v2 : i8 - vector.print %low : i8 - vector.print %high : i8 - return -} - -func.func @mului_extended() { - // ------------------------------------------------ - // Test i8 - // ------------------------------------------------ - %c_n100_i8 = arith.constant -100 : i8 - %c_156_i8 = arith.constant 156 : i8 - - // mului_extended -100, -100 : i8 = (16, 95) - // and on equivalent representations (e.g. 
156 === -100 (mod 256))
-
- // CHECK-LABEL: @mului_extended_i8
- // CHECK-NEXT: 16
- // CHECK-NEXT: 95
- func.call @mului_extended_i8(%c_n100_i8, %c_n100_i8) : (i8, i8) -> ()
-
- // CHECK-LABEL: @mului_extended_i8
- // CHECK-NEXT: 16
- // CHECK-NEXT: 95
- func.call @mului_extended_i8(%c_n100_i8, %c_156_i8) : (i8, i8) -> ()
-
- // CHECK-LABEL: @mului_extended_i8
- // CHECK-NEXT: 16
- // CHECK-NEXT: 95
- func.call @mului_extended_i8(%c_156_i8, %c_n100_i8) : (i8, i8) -> ()
-
- // CHECK-LABEL: @mului_extended_i8
- // CHECK-NEXT: 16
- // CHECK-NEXT: 95
- func.call @mului_extended_i8(%c_156_i8, %c_156_i8) : (i8, i8) -> ()
-
- // ------------------------------------------------
- // TODO: Test i1, i16, i32 etc..
- // ------------------------------------------------
- return
-}
-
-func.func @entry() {
- func.call @mulsi_extended() : () -> ()
- func.call @mului_extended() : () -> ()
- return
-}
--
GitLab

From e1d205a3855898b413978ee457f37e361ae981bd Mon Sep 17 00:00:00 2001
From: Kazu Hirata
Date: Wed, 16 Oct 2024 00:09:12 -0700
Subject: [PATCH 078/329] [SCCP] Simplify code with DenseMap::operator[] (NFC) (#112473)

---
 llvm/lib/Transforms/Utils/SCCPSolver.cpp | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/llvm/lib/Transforms/Utils/SCCPSolver.cpp b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
index 101d60525f41..c65710ea7551 100644
--- a/llvm/lib/Transforms/Utils/SCCPSolver.cpp
+++ b/llvm/lib/Transforms/Utils/SCCPSolver.cpp
@@ -630,10 +630,7 @@ private:
 }
 // Add U as additional user of V.
- void addAdditionalUser(Value *V, User *U) {
- auto Iter = AdditionalUsers.insert({V, {}});
- Iter.first->second.insert(U);
- }
+ void addAdditionalUser(Value *V, User *U) { AdditionalUsers[V].insert(U); }
 // Mark I's users as changed, including AdditionalUsers.
 void markUsersAsChanged(Value *I) {
--
GitLab

From 14d006c53c67ded7da00e7880c58f2c7e25ee1f1 Mon Sep 17 00:00:00 2001
From: Petar Avramovic
Date: Wed, 16 Oct 2024 09:43:16 +0200
Subject: [PATCH 079/329] AMDGPU/GlobalISel: Run redundant_and combine in RegBankCombiner (#112353)

This combine is needed to clear redundant ANDs with 1 that reg-bank-select
will create to clean up high bits in registers.

Also fix replaceRegWith from CombinerHelper: if a copy has to be inserted,
first create the copy, then delete MI. If MI is deleted first, the insertion
point is no longer valid.
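A minimal sketch of the insertion-point hazard being fixed (an illustrative
fragment, not the actual LLVM sources; the register names follow the
applyCombineCopy code in the diff below):

  // Broken ordering: once MI is erased, the builder no longer has a valid
  // insertion point for the COPY that replaceRegWith may need to build.
  MI.eraseFromParent();
  replaceRegWith(MRI, DstReg, SrcReg); // may call Builder.buildCopy(...)

  // Fixed ordering: replace all uses first, while MI still anchors the
  // insertion point, and only then erase MI.
  replaceRegWith(MRI, DstReg, SrcReg);
  MI.eraseFromParent();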
--- .../lib/CodeGen/GlobalISel/CombinerHelper.cpp | 12 +- llvm/lib/Target/AMDGPU/AMDGPUCombine.td | 3 +- .../GlobalISel/artifact-combiner-asserts.ll | 4 +- .../regbankcombiner-redundant-and.mir | 28 +++ llvm/test/CodeGen/AMDGPU/fptoi.i128.ll | 12 +- llvm/test/CodeGen/AMDGPU/itofp.i128.ll | 166 +++++++++--------- 6 files changed, 125 insertions(+), 100 deletions(-) create mode 100644 llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir diff --git a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp index 14e94d48bf83..f9b1621955c2 100644 --- a/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp +++ b/llvm/lib/CodeGen/GlobalISel/CombinerHelper.cpp @@ -178,7 +178,7 @@ void CombinerHelper::replaceRegWith(MachineRegisterInfo &MRI, Register FromReg, if (MRI.constrainRegAttrs(ToReg, FromReg)) MRI.replaceRegWith(FromReg, ToReg); else - Builder.buildCopy(ToReg, FromReg); + Builder.buildCopy(FromReg, ToReg); Observer.finishedChangingAllUsesOfReg(); } @@ -229,8 +229,8 @@ bool CombinerHelper::matchCombineCopy(MachineInstr &MI) { void CombinerHelper::applyCombineCopy(MachineInstr &MI) { Register DstReg = MI.getOperand(0).getReg(); Register SrcReg = MI.getOperand(1).getReg(); - MI.eraseFromParent(); replaceRegWith(MRI, DstReg, SrcReg); + MI.eraseFromParent(); } bool CombinerHelper::matchFreezeOfSingleMaybePoisonOperand( @@ -379,8 +379,8 @@ void CombinerHelper::applyCombineConcatVectors(MachineInstr &MI, Builder.buildUndef(NewDstReg); else Builder.buildBuildVector(NewDstReg, Ops); - MI.eraseFromParent(); replaceRegWith(MRI, DstReg, NewDstReg); + MI.eraseFromParent(); } bool CombinerHelper::matchCombineShuffleConcat(MachineInstr &MI, @@ -559,8 +559,8 @@ void CombinerHelper::applyCombineShuffleVector(MachineInstr &MI, else Builder.buildMergeLikeInstr(NewDstReg, Ops); - MI.eraseFromParent(); replaceRegWith(MRI, DstReg, NewDstReg); + MI.eraseFromParent(); } bool CombinerHelper::matchShuffleToExtract(MachineInstr &MI) { @@ -2825,8 +2825,8 @@ void CombinerHelper::replaceSingleDefInstWithOperand(MachineInstr &MI, Register OldReg = MI.getOperand(0).getReg(); Register Replacement = MI.getOperand(OpIdx).getReg(); assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?"); - MI.eraseFromParent(); replaceRegWith(MRI, OldReg, Replacement); + MI.eraseFromParent(); } void CombinerHelper::replaceSingleDefInstWithReg(MachineInstr &MI, @@ -2834,8 +2834,8 @@ void CombinerHelper::replaceSingleDefInstWithReg(MachineInstr &MI, assert(MI.getNumExplicitDefs() == 1 && "Expected one explicit def?"); Register OldReg = MI.getOperand(0).getReg(); assert(canReplaceReg(OldReg, Replacement, MRI) && "Cannot replace register?"); - MI.eraseFromParent(); replaceRegWith(MRI, OldReg, Replacement); + MI.eraseFromParent(); } bool CombinerHelper::matchConstantLargerBitWidth(MachineInstr &MI, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td index b2a3f9392157..985fa8f1deff 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUCombine.td +++ b/llvm/lib/Target/AMDGPU/AMDGPUCombine.td @@ -169,5 +169,6 @@ def AMDGPURegBankCombiner : GICombiner< "AMDGPURegBankCombinerImpl", [unmerge_merge, unmerge_cst, unmerge_undef, zext_trunc_fold, int_minmax_to_med3, ptr_add_immed_chain, - fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp]> { + fp_minmax_to_clamp, fp_minmax_to_med3, fmed3_intrinsic_to_clamp, + redundant_and]> { } diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll 
b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll index 6dce6c1852af..6e4fb2678b38 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/artifact-combiner-asserts.ll @@ -27,10 +27,8 @@ define hidden <2 x i64> @icmp_v2i32_zext_to_v2i64(<2 x i32> %arg) { ; CHECK-NEXT: v_mov_b32_e32 v3, 0 ; CHECK-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo ; CHECK-NEXT: v_cmp_eq_u32_e32 vcc_lo, 0, v1 -; CHECK-NEXT: v_and_b32_e32 v0, 1, v0 -; CHECK-NEXT: v_cndmask_b32_e64 v1, 0, 1, vcc_lo -; CHECK-NEXT: v_and_b32_e32 v2, 1, v1 ; CHECK-NEXT: v_mov_b32_e32 v1, 0 +; CHECK-NEXT: v_cndmask_b32_e64 v2, 0, 1, vcc_lo ; CHECK-NEXT: s_setpc_b64 s[30:31] %cmp = icmp eq <2 x i32> %arg, zeroinitializer %sext = zext <2 x i1> %cmp to <2 x i64> diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir new file mode 100644 index 000000000000..f87a253dcb43 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/regbankcombiner-redundant-and.mir @@ -0,0 +1,28 @@ +# NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py +# RUN: llc -mtriple=amdgcn-amd-mesa3d -mcpu=gfx1010 -run-pass=amdgpu-regbank-combiner -verify-machineinstrs %s -o - | FileCheck %s + +--- +name: replaceRegWith_requires_copy +tracksRegLiveness: true +body: | + bb.0: + liveins: $sgpr0, $vgpr0_vgpr1 + + ; CHECK-LABEL: name: replaceRegWith_requires_copy + ; CHECK: liveins: $sgpr0, $vgpr0_vgpr1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:sgpr(p1) = COPY $vgpr0_vgpr1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:sgpr(s32) = COPY $sgpr0 + ; CHECK-NEXT: [[C:%[0-9]+]]:sgpr(s32) = G_CONSTANT i32 1 + ; CHECK-NEXT: [[ICMP:%[0-9]+]]:sreg_32(s32) = G_ICMP intpred(ne), [[COPY1]](s32), [[C]] + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:sgpr(s32) = COPY [[ICMP]](s32) + ; CHECK-NEXT: G_STORE [[COPY2]](s32), [[COPY]](p1) :: (store (s32), addrspace 1) + ; CHECK-NEXT: S_ENDPGM 0 + %0:sgpr(p1) = COPY $vgpr0_vgpr1 + %1:sgpr(s32) = COPY $sgpr0 + %2:sgpr(s32) = G_CONSTANT i32 1 + %3:sreg_32(s32) = G_ICMP intpred(ne), %1, %2 + %4:sgpr(s32) = G_AND %3, %2 + G_STORE %4(s32), %0(p1) :: (store (s32), addrspace 1) + S_ENDPGM 0 +... 
diff --git a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll index 6e8e6c072178..786fe0316469 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoi.i128.ll @@ -136,12 +136,12 @@ define i128 @fptosi_f64_to_i128(double %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5 -; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0 +; GISEL-NEXT: v_lshrrev_b32_e32 v2, 20, v5 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff ; GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v2 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 @@ -508,12 +508,12 @@ define i128 @fptoui_f64_to_i128(double %x) { ; GISEL-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) ; GISEL-NEXT: v_mov_b32_e32 v5, v1 ; GISEL-NEXT: v_mov_b32_e32 v4, v0 -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 20, v5 -; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v0 +; GISEL-NEXT: v_lshrrev_b32_e32 v2, 20, v5 ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff ; GISEL-NEXT: s_mov_b64 s[4:5], 0 -; GISEL-NEXT: v_mov_b32_e32 v1, 0 ; GISEL-NEXT: v_mov_b32_e32 v7, 0 +; GISEL-NEXT: v_mov_b32_e32 v1, 0 +; GISEL-NEXT: v_and_b32_e32 v6, 0x7ff, v2 ; GISEL-NEXT: v_cmp_ge_u64_e32 vcc, v[6:7], v[0:1] ; GISEL-NEXT: s_mov_b64 s[6:7], s[4:5] ; GISEL-NEXT: v_mov_b32_e32 v0, s4 diff --git a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll index 38d928a006fb..2999ddb83158 100644 --- a/llvm/test/CodeGen/AMDGPU/itofp.i128.ll +++ b/llvm/test/CodeGen/AMDGPU/itofp.i128.ll @@ -673,38 +673,38 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_ashrrev_i32_e32 v6, 31, v3 ; GISEL-NEXT: v_xor_b32_e32 v0, v6, v4 ; GISEL-NEXT: v_xor_b32_e32 v1, v6, v5 -; GISEL-NEXT: v_sub_co_u32_e32 v0, vcc, v0, v6 -; GISEL-NEXT: v_xor_b32_e32 v2, v6, v2 -; GISEL-NEXT: v_subb_co_u32_e32 v1, vcc, v1, v6, vcc -; GISEL-NEXT: v_xor_b32_e32 v3, v6, v3 -; GISEL-NEXT: v_subb_co_u32_e32 v2, vcc, v2, v6, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v5, v0 -; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v3, v6, vcc -; GISEL-NEXT: v_ffbh_u32_e32 v4, v1 -; GISEL-NEXT: v_add_u32_e32 v5, 32, v5 -; GISEL-NEXT: v_ffbh_u32_e32 v7, v2 -; GISEL-NEXT: v_min_u32_e32 v4, v4, v5 -; GISEL-NEXT: v_ffbh_u32_e32 v5, v3 +; GISEL-NEXT: v_xor_b32_e32 v4, v6, v2 +; GISEL-NEXT: v_sub_co_u32_e32 v2, vcc, v0, v6 +; GISEL-NEXT: v_xor_b32_e32 v5, v6, v3 +; GISEL-NEXT: v_subb_co_u32_e32 v3, vcc, v1, v6, vcc +; GISEL-NEXT: v_subb_co_u32_e32 v4, vcc, v4, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v1, v2 +; GISEL-NEXT: v_subb_co_u32_e32 v5, vcc, v5, v6, vcc +; GISEL-NEXT: v_ffbh_u32_e32 v0, v3 +; GISEL-NEXT: v_add_u32_e32 v1, 32, v1 +; GISEL-NEXT: v_ffbh_u32_e32 v7, v4 +; GISEL-NEXT: v_min_u32_e32 v0, v0, v1 +; GISEL-NEXT: v_ffbh_u32_e32 v1, v5 ; GISEL-NEXT: v_add_u32_e32 v7, 32, v7 -; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[2:3] -; GISEL-NEXT: v_add_u32_e32 v4, 64, v4 -; GISEL-NEXT: v_min_u32_e32 v5, v5, v7 -; GISEL-NEXT: v_cndmask_b32_e32 v9, v5, v4, vcc +; GISEL-NEXT: v_cmp_eq_u64_e32 vcc, 0, v[4:5] +; GISEL-NEXT: v_add_u32_e32 v0, 64, v0 +; GISEL-NEXT: v_min_u32_e32 v1, v1, v7 +; GISEL-NEXT: v_cndmask_b32_e32 v9, v1, v0, vcc ; GISEL-NEXT: v_sub_u32_e32 v8, 0x80, v9 ; GISEL-NEXT: v_sub_u32_e32 v7, 0x7f, v9 ; GISEL-NEXT: v_cmp_ge_i32_e32 vcc, 53, v8 ; GISEL-NEXT: ; implicit-def: 
$vgpr10 -; GISEL-NEXT: ; implicit-def: $vgpr4_vgpr5 +; GISEL-NEXT: ; implicit-def: $vgpr0_vgpr1 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: s_xor_b64 s[4:5], exec, s[4:5] ; GISEL-NEXT: ; %bb.2: ; %itofp-if-else -; GISEL-NEXT: v_add_u32_e32 v2, 0xffffffb5, v9 -; GISEL-NEXT: v_lshlrev_b64 v[0:1], v2, v[0:1] -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v2 -; GISEL-NEXT: v_cndmask_b32_e32 v4, 0, v0, vcc +; GISEL-NEXT: v_add_u32_e32 v4, 0xffffffb5, v9 +; GISEL-NEXT: v_lshlrev_b64 v[0:1], v4, v[2:3] +; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v4 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc ; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v1, vcc ; GISEL-NEXT: ; implicit-def: $vgpr8 -; GISEL-NEXT: ; implicit-def: $vgpr0 +; GISEL-NEXT: ; implicit-def: $vgpr2 ; GISEL-NEXT: ; implicit-def: $vgpr9 ; GISEL-NEXT: ; %bb.3: ; %Flow3 ; GISEL-NEXT: s_andn2_saveexec_b64 s[8:9], s[4:5] @@ -721,89 +721,88 @@ define double @sitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: ; %bb.6: ; %itofp-sw-default ; GISEL-NEXT: v_sub_u32_e32 v14, 0x49, v9 ; GISEL-NEXT: v_sub_u32_e32 v10, 64, v14 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], v14, v[0:1] -; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[2:3] +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v14, v[2:3] +; GISEL-NEXT: v_lshlrev_b64 v[10:11], v10, v[4:5] ; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14 -; GISEL-NEXT: v_or_b32_e32 v10, v4, v10 -; GISEL-NEXT: v_or_b32_e32 v11, v5, v11 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], v15, v[2:3] -; GISEL-NEXT: v_lshrrev_b64 v[12:13], v14, v[2:3] +; GISEL-NEXT: v_lshrrev_b64 v[12:13], v14, v[4:5] +; GISEL-NEXT: v_or_b32_e32 v10, v0, v10 +; GISEL-NEXT: v_or_b32_e32 v11, v1, v11 +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v15, v[4:5] ; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 +; GISEL-NEXT: v_add_u32_e32 v9, 55, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v0, v0, v10, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, v1, v11, vcc ; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 -; GISEL-NEXT: v_add_u32_e32 v14, 55, v9 -; GISEL-NEXT: v_cndmask_b32_e32 v4, v4, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v5, v5, v11, vcc -; GISEL-NEXT: v_sub_u32_e32 v11, 64, v14 -; GISEL-NEXT: v_cndmask_b32_e64 v13, v4, v0, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v4, v5, v1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e32 v5, 0, v12, vcc -; GISEL-NEXT: v_lshrrev_b64 v[9:10], v14, -1 -; GISEL-NEXT: v_lshlrev_b64 v[11:12], v11, -1 -; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v14 -; GISEL-NEXT: v_or_b32_e32 v16, v9, v11 -; GISEL-NEXT: v_or_b32_e32 v17, v10, v12 -; GISEL-NEXT: v_lshrrev_b64 v[11:12], v15, -1 -; GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v11, v11, v16, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v17, vcc -; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v14 -; GISEL-NEXT: v_cndmask_b32_e32 v9, 0, v9, vcc -; GISEL-NEXT: v_cndmask_b32_e32 v10, 0, v10, vcc -; GISEL-NEXT: v_cndmask_b32_e64 v11, v11, -1, s[4:5] -; GISEL-NEXT: v_cndmask_b32_e64 v12, v12, -1, s[4:5] -; GISEL-NEXT: v_and_b32_e32 v2, v9, v2 -; GISEL-NEXT: v_and_b32_e32 v3, v10, v3 -; GISEL-NEXT: v_and_or_b32 v0, v11, v0, v2 -; GISEL-NEXT: v_and_or_b32 v1, v12, v1, v3 +; GISEL-NEXT: v_cndmask_b32_e32 v11, 0, v12, vcc +; GISEL-NEXT: v_sub_u32_e32 v12, 64, v9 +; GISEL-NEXT: v_cndmask_b32_e64 v14, v0, v2, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v10, v1, v3, s[4:5] +; GISEL-NEXT: v_lshrrev_b64 v[0:1], v9, -1 +; GISEL-NEXT: v_lshlrev_b64 v[12:13], v12, -1 +; GISEL-NEXT: v_subrev_u32_e32 v15, 64, v9 +; GISEL-NEXT: v_or_b32_e32 v16, v0, v12 +; GISEL-NEXT: v_or_b32_e32 v17, v1, v13 +; GISEL-NEXT: v_lshrrev_b64 v[12:13], v15, -1 +; 
GISEL-NEXT: v_cmp_gt_u32_e32 vcc, 64, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v12, v12, v16, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v13, v13, v17, vcc +; GISEL-NEXT: v_cmp_eq_u32_e64 s[4:5], 0, v9 +; GISEL-NEXT: v_cndmask_b32_e32 v0, 0, v0, vcc +; GISEL-NEXT: v_cndmask_b32_e32 v1, 0, v1, vcc +; GISEL-NEXT: v_cndmask_b32_e64 v9, v12, -1, s[4:5] +; GISEL-NEXT: v_cndmask_b32_e64 v12, v13, -1, s[4:5] +; GISEL-NEXT: v_and_b32_e32 v0, v0, v4 +; GISEL-NEXT: v_and_b32_e32 v1, v1, v5 +; GISEL-NEXT: v_and_or_b32 v0, v9, v2, v0 +; GISEL-NEXT: v_and_or_b32 v1, v12, v3, v1 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[0:1] ; GISEL-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc -; GISEL-NEXT: v_or_b32_e32 v3, v13, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v3 -; GISEL-NEXT: v_mov_b32_e32 v1, v4 -; GISEL-NEXT: v_mov_b32_e32 v2, v5 -; GISEL-NEXT: v_mov_b32_e32 v3, v6 +; GISEL-NEXT: v_or_b32_e32 v9, v14, v0 +; GISEL-NEXT: v_mov_b32_e32 v2, v9 +; GISEL-NEXT: v_mov_b32_e32 v3, v10 +; GISEL-NEXT: v_mov_b32_e32 v4, v11 +; GISEL-NEXT: v_mov_b32_e32 v5, v12 ; GISEL-NEXT: .LBB2_7: ; %Flow1 ; GISEL-NEXT: s_or_b64 exec, exec, s[12:13] ; GISEL-NEXT: .LBB2_8: ; %Flow2 ; GISEL-NEXT: s_andn2_saveexec_b64 s[4:5], s[10:11] ; GISEL-NEXT: s_cbranch_execz .LBB2_10 ; GISEL-NEXT: ; %bb.9: ; %itofp-sw-bb -; GISEL-NEXT: v_lshlrev_b64 v[9:10], 1, v[0:1] -; GISEL-NEXT: v_lshlrev_b64 v[2:3], 1, v[2:3] -; GISEL-NEXT: v_lshrrev_b32_e32 v0, 31, v1 -; GISEL-NEXT: v_or_b32_e32 v11, v2, v0 -; GISEL-NEXT: v_mov_b32_e32 v0, v9 -; GISEL-NEXT: v_mov_b32_e32 v1, v10 -; GISEL-NEXT: v_mov_b32_e32 v2, v11 -; GISEL-NEXT: v_mov_b32_e32 v3, v12 +; GISEL-NEXT: v_lshlrev_b64 v[4:5], 1, v[4:5] +; GISEL-NEXT: v_lshlrev_b64 v[0:1], 1, v[2:3] +; GISEL-NEXT: v_lshrrev_b32_e32 v2, 31, v3 +; GISEL-NEXT: v_or_b32_e32 v2, v4, v2 +; GISEL-NEXT: v_mov_b32_e32 v5, v3 +; GISEL-NEXT: v_mov_b32_e32 v4, v2 +; GISEL-NEXT: v_mov_b32_e32 v3, v1 +; GISEL-NEXT: v_mov_b32_e32 v2, v0 ; GISEL-NEXT: .LBB2_10: ; %itofp-sw-epilog ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] -; GISEL-NEXT: v_bfe_u32 v3, v0, 2, 1 -; GISEL-NEXT: v_or_b32_e32 v0, v0, v3 -; GISEL-NEXT: v_add_co_u32_e32 v0, vcc, 1, v0 -; GISEL-NEXT: v_addc_co_u32_e32 v1, vcc, 0, v1, vcc -; GISEL-NEXT: v_addc_co_u32_e32 v2, vcc, 0, v2, vcc -; GISEL-NEXT: v_lshrrev_b64 v[4:5], 2, v[0:1] +; GISEL-NEXT: v_bfe_u32 v0, v2, 2, 1 +; GISEL-NEXT: v_or_b32_e32 v0, v2, v0 +; GISEL-NEXT: v_add_co_u32_e32 v2, vcc, 1, v0 +; GISEL-NEXT: v_addc_co_u32_e32 v3, vcc, 0, v3, vcc +; GISEL-NEXT: v_addc_co_u32_e32 v4, vcc, 0, v4, vcc +; GISEL-NEXT: v_lshrrev_b64 v[0:1], 2, v[2:3] ; GISEL-NEXT: v_mov_b32_e32 v9, 0 -; GISEL-NEXT: v_and_b32_e32 v10, 0x800000, v1 +; GISEL-NEXT: v_and_b32_e32 v10, 0x800000, v3 ; GISEL-NEXT: v_cmp_ne_u64_e32 vcc, 0, v[9:10] -; GISEL-NEXT: v_lshl_or_b32 v10, v2, 30, v5 +; GISEL-NEXT: v_lshl_or_b32 v10, v4, 30, v1 ; GISEL-NEXT: s_and_saveexec_b64 s[4:5], vcc ; GISEL-NEXT: ; %bb.11: ; %itofp-if-then20 -; GISEL-NEXT: v_lshrrev_b64 v[4:5], 3, v[0:1] +; GISEL-NEXT: v_lshrrev_b64 v[0:1], 3, v[2:3] ; GISEL-NEXT: v_mov_b32_e32 v7, v8 -; GISEL-NEXT: v_lshl_or_b32 v10, v2, 29, v5 +; GISEL-NEXT: v_lshl_or_b32 v10, v4, 29, v1 ; GISEL-NEXT: ; %bb.12: ; %Flow ; GISEL-NEXT: s_or_b64 exec, exec, s[4:5] ; GISEL-NEXT: .LBB2_13: ; %Flow4 ; GISEL-NEXT: s_or_b64 exec, exec, s[8:9] -; GISEL-NEXT: v_and_b32_e32 v0, 0x80000000, v6 -; GISEL-NEXT: v_mov_b32_e32 v1, 0x3ff00000 -; GISEL-NEXT: v_mov_b32_e32 v2, 0xfffff -; GISEL-NEXT: v_lshl_add_u32 v1, v7, 20, v1 -; GISEL-NEXT: v_and_or_b32 v2, v10, v2, v0 -; GISEL-NEXT: v_and_or_b32 v0, v4, -1, 0 -; GISEL-NEXT: 
v_or3_b32 v1, v2, v1, 0 +; GISEL-NEXT: v_and_b32_e32 v1, 0x80000000, v6 +; GISEL-NEXT: v_mov_b32_e32 v2, 0x3ff00000 +; GISEL-NEXT: v_mov_b32_e32 v3, 0xfffff +; GISEL-NEXT: v_lshl_add_u32 v2, v7, 20, v2 +; GISEL-NEXT: v_and_or_b32 v1, v10, v3, v1 +; GISEL-NEXT: v_or3_b32 v1, v1, v2, 0 ; GISEL-NEXT: .LBB2_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] ; GISEL-NEXT: s_setpc_b64 s[30:31] @@ -1083,7 +1082,6 @@ define double @uitofp_i128_to_f64(i128 %x) { ; GISEL-NEXT: v_mov_b32_e32 v0, 0x3ff00000 ; GISEL-NEXT: v_lshl_add_u32 v0, v6, 20, v0 ; GISEL-NEXT: v_and_b32_e32 v1, 0xfffff, v9 -; GISEL-NEXT: v_and_or_b32 v4, v4, -1, 0 ; GISEL-NEXT: v_or3_b32 v5, v1, v0, 0 ; GISEL-NEXT: .LBB3_14: ; %Flow5 ; GISEL-NEXT: s_or_b64 exec, exec, s[6:7] -- GitLab From eccf4d44d346eee498b0ff709e625e3104448751 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Wed, 16 Oct 2024 13:16:55 +0530 Subject: [PATCH 080/329] [CodeGen] Remove unused MachineBranchProbabilityInfo from MachineTraceMetrics pass(NFC). (#108506) --- llvm/lib/CodeGen/MachineTraceMetrics.cpp | 7 ++----- 1 file changed, 2 insertions(+), 5 deletions(-) diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp index bf3add010574..5a1670953ddd 100644 --- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -14,7 +14,6 @@ #include "llvm/ADT/SmallVector.h" #include "llvm/ADT/SparseSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" -#include "llvm/CodeGen/MachineBranchProbabilityInfo.h" #include "llvm/CodeGen/MachineFunction.h" #include "llvm/CodeGen/MachineInstr.h" #include "llvm/CodeGen/MachineLoopInfo.h" @@ -44,9 +43,8 @@ char MachineTraceMetrics::ID = 0; char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID; -INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE, - "Machine Trace Metrics", false, true) -INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass) +INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE, "Machine Trace Metrics", + false, true) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) INITIALIZE_PASS_END(MachineTraceMetrics, DEBUG_TYPE, "Machine Trace Metrics", false, true) @@ -57,7 +55,6 @@ MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) { void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); - AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } -- GitLab From 732b804e5f0fd3d5e267c7f39fedc6525ebda3ba Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Wed, 16 Oct 2024 13:19:55 +0530 Subject: [PATCH 081/329] [CodeGen][NewPM] Port machine trace metrics analysis to new pass manager. 
(#108507) --- .../llvm/CodeGen/MachineTraceMetrics.h | 65 +++++++++++++--- llvm/include/llvm/InitializePasses.h | 2 +- .../llvm/Passes/MachinePassRegistry.def | 4 +- llvm/lib/CodeGen/EarlyIfConversion.cpp | 8 +- llvm/lib/CodeGen/MachineCombiner.cpp | 8 +- llvm/lib/CodeGen/MachineTraceMetrics.cpp | 78 +++++++++++++------ llvm/lib/Passes/PassBuilder.cpp | 1 + .../AArch64/AArch64ConditionalCompares.cpp | 8 +- .../AArch64/AArch64StorePairSuppress.cpp | 6 +- 9 files changed, 130 insertions(+), 50 deletions(-) diff --git a/llvm/include/llvm/CodeGen/MachineTraceMetrics.h b/llvm/include/llvm/CodeGen/MachineTraceMetrics.h index c7d97597d551..d51de24d64e8 100644 --- a/llvm/include/llvm/CodeGen/MachineTraceMetrics.h +++ b/llvm/include/llvm/CodeGen/MachineTraceMetrics.h @@ -46,12 +46,13 @@ #ifndef LLVM_CODEGEN_MACHINETRACEMETRICS_H #define LLVM_CODEGEN_MACHINETRACEMETRICS_H -#include "llvm/ADT/SparseSet.h" #include "llvm/ADT/ArrayRef.h" #include "llvm/ADT/DenseMap.h" #include "llvm/ADT/SmallVector.h" +#include "llvm/ADT/SparseSet.h" #include "llvm/CodeGen/MachineBasicBlock.h" #include "llvm/CodeGen/MachineFunctionPass.h" +#include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/TargetSchedule.h" namespace llvm { @@ -93,7 +94,7 @@ enum class MachineTraceStrategy { TS_NumStrategies }; -class MachineTraceMetrics : public MachineFunctionPass { +class MachineTraceMetrics { const MachineFunction *MF = nullptr; const TargetInstrInfo *TII = nullptr; const TargetRegisterInfo *TRI = nullptr; @@ -102,19 +103,25 @@ class MachineTraceMetrics : public MachineFunctionPass { TargetSchedModel SchedModel; public: + friend class MachineTraceMetricsWrapperPass; friend class Ensemble; friend class Trace; class Ensemble; - static char ID; + // For legacy pass. + MachineTraceMetrics() = default; + + explicit MachineTraceMetrics(MachineFunction &MF, const MachineLoopInfo &LI) { + init(MF, LI); + } - MachineTraceMetrics(); + MachineTraceMetrics(MachineTraceMetrics &&) = default; - void getAnalysisUsage(AnalysisUsage&) const override; - bool runOnMachineFunction(MachineFunction&) override; - void releaseMemory() override; - void verifyAnalysis() const override; + ~MachineTraceMetrics(); + + void init(MachineFunction &Func, const MachineLoopInfo &LI); + void clear(); /// Per-basic block information that doesn't depend on the trace through the /// block. @@ -400,6 +407,12 @@ public: /// Call Ensemble::getTrace() again to update any trace handles. void invalidate(const MachineBasicBlock *MBB); + /// Handle invalidation explicitly. + bool invalidate(MachineFunction &, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &); + + void verifyAnalysis() const; + private: // One entry per basic block, indexed by block number. SmallVector BlockInfo; @@ -412,8 +425,8 @@ private: SmallVector ProcReleaseAtCycles; // One ensemble per strategy. - Ensemble - *Ensembles[static_cast(MachineTraceStrategy::TS_NumStrategies)]; + std::unique_ptr + Ensembles[static_cast(MachineTraceStrategy::TS_NumStrategies)]; // Convert scaled resource usage to a cycle count that can be compared with // latencies. @@ -435,6 +448,38 @@ inline raw_ostream &operator<<(raw_ostream &OS, return OS; } +class MachineTraceMetricsAnalysis + : public AnalysisInfoMixin { + friend AnalysisInfoMixin; + static AnalysisKey Key; + +public: + using Result = MachineTraceMetrics; + Result run(MachineFunction &MF, MachineFunctionAnalysisManager &MFAM); +}; + +/// Verifier pass for \c MachineTraceMetrics. 
+struct MachineTraceMetricsVerifierPass + : PassInfoMixin { + PreservedAnalyses run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM); + static bool isRequired() { return true; } +}; + +class MachineTraceMetricsWrapperPass : public MachineFunctionPass { +public: + static char ID; + MachineTraceMetrics MTM; + + MachineTraceMetricsWrapperPass(); + + void getAnalysisUsage(AnalysisUsage &) const override; + bool runOnMachineFunction(MachineFunction &) override; + void releaseMemory() override { MTM.clear(); } + void verifyAnalysis() const override { MTM.verifyAnalysis(); } + MachineTraceMetrics &getMTM() { return MTM; } +}; + } // end namespace llvm #endif // LLVM_CODEGEN_MACHINETRACEMETRICS_H diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h index 6a75dc0285cc..5ed0ad98a2a7 100644 --- a/llvm/include/llvm/InitializePasses.h +++ b/llvm/include/llvm/InitializePasses.h @@ -209,7 +209,7 @@ void initializeMachineRegionInfoPassPass(PassRegistry &); void initializeMachineSanitizerBinaryMetadataPass(PassRegistry &); void initializeMachineSchedulerPass(PassRegistry &); void initializeMachineSinkingPass(PassRegistry &); -void initializeMachineTraceMetricsPass(PassRegistry &); +void initializeMachineTraceMetricsWrapperPassPass(PassRegistry &); void initializeMachineUniformityInfoPrinterPassPass(PassRegistry &); void initializeMachineUniformityAnalysisPassPass(PassRegistry &); void initializeMachineVerifierLegacyPassPass(PassRegistry &); diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def index 2aa5f4fc176a..1d7084354455 100644 --- a/llvm/include/llvm/Passes/MachinePassRegistry.def +++ b/llvm/include/llvm/Passes/MachinePassRegistry.def @@ -106,6 +106,7 @@ MACHINE_FUNCTION_ANALYSIS("machine-opt-remark-emitter", MachineOptimizationRemarkEmitterAnalysis()) MACHINE_FUNCTION_ANALYSIS("machine-post-dom-tree", MachinePostDominatorTreeAnalysis()) +MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics", MachineTraceMetricsAnalysis()) MACHINE_FUNCTION_ANALYSIS("pass-instrumentation", PassInstrumentationAnalysis(PIC)) MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis()) // MACHINE_FUNCTION_ANALYSIS("live-stacks", LiveStacksPass()) @@ -119,8 +120,6 @@ MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis()) // MachinePostDominatorTreeAnalysis()) // MACHINE_FUNCTION_ANALYSIS("machine-region-info", // MachineRegionInfoPassAnalysis()) -// MACHINE_FUNCTION_ANALYSIS("machine-trace-metrics", -// MachineTraceMetricsAnalysis()) MACHINE_FUNCTION_ANALYSIS("reaching-def", // ReachingDefAnalysisAnalysis()) MACHINE_FUNCTION_ANALYSIS("live-reg-matrix", // LiveRegMatrixAnalysis()) MACHINE_FUNCTION_ANALYSIS("gc-analysis", // GCMachineCodeAnalysisPass()) @@ -156,6 +155,7 @@ MACHINE_FUNCTION_PASS("stack-coloring", StackColoringPass()) MACHINE_FUNCTION_PASS("trigger-verifier-error", TriggerVerifierErrorPass()) MACHINE_FUNCTION_PASS("two-address-instruction", TwoAddressInstructionPass()) MACHINE_FUNCTION_PASS("verify", MachineVerifierPass()) +MACHINE_FUNCTION_PASS("verify", MachineTraceMetricsVerifierPass()) #undef MACHINE_FUNCTION_PASS #ifndef MACHINE_FUNCTION_PASS_WITH_PARAMS diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp index 8d9813edd7e5..53cf6a516979 100644 --- a/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -792,7 +792,7 @@ INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE, "Early If Converter", false, false) 
INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) +INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass) INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE, "Early If Converter", false, false) @@ -802,8 +802,8 @@ void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -1093,7 +1093,7 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) { MRI = &MF.getRegInfo(); DomTree = &getAnalysis().getDomTree(); Loops = &getAnalysis().getLI(); - Traces = &getAnalysis(); + Traces = &getAnalysis().getMTM(); MinInstr = nullptr; bool Changed = false; diff --git a/llvm/lib/CodeGen/MachineCombiner.cpp b/llvm/lib/CodeGen/MachineCombiner.cpp index 1a19e053d30f..5bfc1d63ac37 100644 --- a/llvm/lib/CodeGen/MachineCombiner.cpp +++ b/llvm/lib/CodeGen/MachineCombiner.cpp @@ -133,7 +133,7 @@ char &llvm::MachineCombinerID = MachineCombiner::ID; INITIALIZE_PASS_BEGIN(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner", false, false) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) +INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass) INITIALIZE_PASS_END(MachineCombiner, DEBUG_TYPE, "Machine InstCombiner", false, false) @@ -142,8 +142,8 @@ void MachineCombiner::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); AU.addRequired(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); @@ -727,7 +727,7 @@ bool MachineCombiner::runOnMachineFunction(MachineFunction &MF) { TSchedModel.init(STI); MRI = &MF.getRegInfo(); MLI = &getAnalysis().getLI(); - Traces = &getAnalysis(); + Traces = &getAnalysis().getMTM(); PSI = &getAnalysis().getPSI(); MBFI = (PSI && PSI->hasProfileSummary()) ? 
&getAnalysis().getBFI() : diff --git a/llvm/lib/CodeGen/MachineTraceMetrics.cpp b/llvm/lib/CodeGen/MachineTraceMetrics.cpp index 5a1670953ddd..92df6b9ab48d 100644 --- a/llvm/lib/CodeGen/MachineTraceMetrics.cpp +++ b/llvm/lib/CodeGen/MachineTraceMetrics.cpp @@ -39,47 +39,66 @@ using namespace llvm; #define DEBUG_TYPE "machine-trace-metrics" -char MachineTraceMetrics::ID = 0; +AnalysisKey MachineTraceMetricsAnalysis::Key; -char &llvm::MachineTraceMetricsID = MachineTraceMetrics::ID; +MachineTraceMetricsAnalysis::Result +MachineTraceMetricsAnalysis::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + return Result(MF, MFAM.getResult(MF)); +} + +PreservedAnalyses +MachineTraceMetricsVerifierPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + MFAM.getResult(MF).verifyAnalysis(); + return PreservedAnalyses::all(); +} -INITIALIZE_PASS_BEGIN(MachineTraceMetrics, DEBUG_TYPE, "Machine Trace Metrics", - false, true) +char MachineTraceMetricsWrapperPass::ID = 0; + +char &llvm::MachineTraceMetricsID = MachineTraceMetricsWrapperPass::ID; + +INITIALIZE_PASS_BEGIN(MachineTraceMetricsWrapperPass, DEBUG_TYPE, + "Machine Trace Metrics", false, true) INITIALIZE_PASS_DEPENDENCY(MachineLoopInfoWrapperPass) -INITIALIZE_PASS_END(MachineTraceMetrics, DEBUG_TYPE, +INITIALIZE_PASS_END(MachineTraceMetricsWrapperPass, DEBUG_TYPE, "Machine Trace Metrics", false, true) -MachineTraceMetrics::MachineTraceMetrics() : MachineFunctionPass(ID) { - std::fill(std::begin(Ensembles), std::end(Ensembles), nullptr); -} +MachineTraceMetricsWrapperPass::MachineTraceMetricsWrapperPass() + : MachineFunctionPass(ID) {} -void MachineTraceMetrics::getAnalysisUsage(AnalysisUsage &AU) const { +void MachineTraceMetricsWrapperPass::getAnalysisUsage(AnalysisUsage &AU) const { AU.setPreservesAll(); AU.addRequired(); MachineFunctionPass::getAnalysisUsage(AU); } -bool MachineTraceMetrics::runOnMachineFunction(MachineFunction &Func) { +void MachineTraceMetrics::init(MachineFunction &Func, + const MachineLoopInfo &LI) { MF = &Func; const TargetSubtargetInfo &ST = MF->getSubtarget(); TII = ST.getInstrInfo(); TRI = ST.getRegisterInfo(); MRI = &MF->getRegInfo(); - Loops = &getAnalysis().getLI(); + Loops = &LI; SchedModel.init(&ST); BlockInfo.resize(MF->getNumBlockIDs()); ProcReleaseAtCycles.resize(MF->getNumBlockIDs() * SchedModel.getNumProcResourceKinds()); +} + +bool MachineTraceMetricsWrapperPass::runOnMachineFunction(MachineFunction &MF) { + MTM.init(MF, getAnalysis().getLI()); return false; } -void MachineTraceMetrics::releaseMemory() { +MachineTraceMetrics::~MachineTraceMetrics() { clear(); } + +void MachineTraceMetrics::clear() { MF = nullptr; BlockInfo.clear(); - for (Ensemble *&E : Ensembles) { - delete E; - E = nullptr; - } + for (auto &E : Ensembles) + E.reset(); } //===----------------------------------------------------------------------===// @@ -395,35 +414,50 @@ MachineTraceMetrics::Ensemble * MachineTraceMetrics::getEnsemble(MachineTraceStrategy strategy) { assert(strategy < MachineTraceStrategy::TS_NumStrategies && "Invalid trace strategy enum"); - Ensemble *&E = Ensembles[static_cast(strategy)]; + std::unique_ptr &E = + Ensembles[static_cast(strategy)]; if (E) - return E; + return E.get(); // Allocate new Ensemble on demand. 
switch (strategy) { case MachineTraceStrategy::TS_MinInstrCount: - return (E = new MinInstrCountEnsemble(this)); + E = std::make_unique(MinInstrCountEnsemble(this)); + break; case MachineTraceStrategy::TS_Local: - return (E = new LocalEnsemble(this)); + E = std::make_unique(LocalEnsemble(this)); + break; default: llvm_unreachable("Invalid trace strategy enum"); } + return E.get(); } void MachineTraceMetrics::invalidate(const MachineBasicBlock *MBB) { LLVM_DEBUG(dbgs() << "Invalidate traces through " << printMBBReference(*MBB) << '\n'); BlockInfo[MBB->getNumber()].invalidate(); - for (Ensemble *E : Ensembles) + for (auto &E : Ensembles) if (E) E->invalidate(MBB); } +bool MachineTraceMetrics::invalidate( + MachineFunction &, const PreservedAnalyses &PA, + MachineFunctionAnalysisManager::Invalidator &) { + // Check whether the analysis, all analyses on machine functions, or the + // machine function's CFG have been preserved. + auto PAC = PA.getChecker(); + return !PAC.preserved() && + !PAC.preservedSet>() && + !PAC.preservedSet(); +} + void MachineTraceMetrics::verifyAnalysis() const { if (!MF) return; #ifndef NDEBUG assert(BlockInfo.size() == MF->getNumBlockIDs() && "Outdated BlockInfo size"); - for (Ensemble *E : Ensembles) + for (auto &E : Ensembles) if (E) E->verify(); #endif diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 36c0cea36131..08cc4ddbe343 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -109,6 +109,7 @@ #include "llvm/CodeGen/MachinePassManager.h" #include "llvm/CodeGen/MachinePostDominators.h" #include "llvm/CodeGen/MachineRegisterInfo.h" +#include "llvm/CodeGen/MachineTraceMetrics.h" #include "llvm/CodeGen/MachineVerifier.h" #include "llvm/CodeGen/PHIElimination.h" #include "llvm/CodeGen/PreISelIntrinsicLowering.h" diff --git a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp index 9669a393bc2b..0301032e8497 100644 --- a/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp +++ b/llvm/lib/Target/AArch64/AArch64ConditionalCompares.cpp @@ -795,7 +795,7 @@ INITIALIZE_PASS_BEGIN(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) -INITIALIZE_PASS_DEPENDENCY(MachineTraceMetrics) +INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass) INITIALIZE_PASS_END(AArch64ConditionalCompares, "aarch64-ccmp", "AArch64 CCMP Pass", false, false) @@ -809,8 +809,8 @@ void AArch64ConditionalCompares::getAnalysisUsage(AnalysisUsage &AU) const { AU.addPreserved(); AU.addRequired(); AU.addPreserved(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } @@ -937,7 +937,7 @@ bool AArch64ConditionalCompares::runOnMachineFunction(MachineFunction &MF) { DomTree = &getAnalysis().getDomTree(); Loops = &getAnalysis().getLI(); MBPI = &getAnalysis().getMBPI(); - Traces = &getAnalysis(); + Traces = &getAnalysis().getMTM(); MinInstr = nullptr; MinSize = MF.getFunction().hasMinSize(); diff --git a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp index 047e38261a6f..d8c8b17565ab 100644 --- a/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp +++ b/llvm/lib/Target/AArch64/AArch64StorePairSuppress.cpp @@ -53,8 +53,8 @@ private: void getAnalysisUsage(AnalysisUsage &AU) const override { 
AU.setPreservesCFG(); - AU.addRequired(); - AU.addPreserved(); + AU.addRequired(); + AU.addPreserved(); MachineFunctionPass::getAnalysisUsage(AU); } }; @@ -139,7 +139,7 @@ bool AArch64StorePairSuppress::runOnMachineFunction(MachineFunction &MF) { TRI = ST.getRegisterInfo(); MRI = &MF.getRegInfo(); SchedModel.init(&ST); - Traces = &getAnalysis(); + Traces = &getAnalysis().getMTM(); MinInstr = nullptr; LLVM_DEBUG(dbgs() << "*** " << getPassName() << ": " << MF.getName() << '\n'); -- GitLab From e36b22f3bf45a23d31b569e53d22b98714cf00e3 Mon Sep 17 00:00:00 2001 From: Howard Roark Date: Wed, 16 Oct 2024 10:50:48 +0300 Subject: [PATCH 082/329] Revert "[PGO] Preserve analysis results when nothing was instrumented (#93421)" This reverts commit 23c64beeccc03c6a8329314ecd75864e09bb6d97. --- .../Instrumentation/PGOInstrumentation.cpp | 4 +- .../PGOInstrumentationTest.cpp | 68 +++++-------------- 2 files changed, 19 insertions(+), 53 deletions(-) diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index dbe908bb5e72..e6e474ed3760 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -1916,7 +1916,6 @@ static bool InstrumentAllFunctions( std::unordered_multimap ComdatMembers; collectComdatMembers(M, ComdatMembers); - bool AnythingInstrumented = false; for (auto &F : M) { if (skipPGOGen(F)) continue; @@ -1926,9 +1925,8 @@ static bool InstrumentAllFunctions( FunctionInstrumenter FI(M, F, TLI, ComdatMembers, BPI, BFI, InstrumentationType); FI.instrument(); - AnythingInstrumented = true; } - return AnythingInstrumented; + return true; } PreservedAnalyses diff --git a/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp b/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp index a4c076a8752f..9ccb13934cbd 100644 --- a/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp +++ b/llvm/unittests/Transforms/Instrumentation/PGOInstrumentationTest.cpp @@ -103,13 +103,9 @@ public: ModuleAnalysisManager::Invalidator &)); }; -template struct PGOTestName { - std::string operator()(const TestParamInfo &Info) const { - return std::get<1>(Info.param).str(); - } -}; - -struct PGOInstrumentationGenTest : public Test { +struct PGOInstrumentationGenTest + : public Test, + WithParamInterface> { ModulePassManager MPM; PassBuilder PB; MockModuleAnalysisHandle MMAHandle; @@ -145,47 +141,12 @@ struct PGOInstrumentationGenTest : public Test { } }; -struct PGOInstrumentationGenInstrumentTest - : PGOInstrumentationGenTest, - WithParamInterface> {}; - static constexpr StringRef CodeWithFuncDefs = R"( define i32 @f(i32 %n) { entry: ret i32 0 })"; -INSTANTIATE_TEST_SUITE_P( - PGOInstrumetationGenTestSuite, PGOInstrumentationGenInstrumentTest, - Values(std::make_tuple(CodeWithFuncDefs, "instrument_function_defs")), - PGOTestName()); - -TEST_P(PGOInstrumentationGenInstrumentTest, Instrumented) { - const StringRef Code = std::get<0>(GetParam()); - parseAssembly(Code); - - ASSERT_THAT(M, NotNull()); - - Sequence PassSequence; - EXPECT_CALL(MMAHandle, run(Ref(*M), _)) - .InSequence(PassSequence) - .WillOnce(DoDefault()); - EXPECT_CALL(MMAHandle, invalidate(Ref(*M), _, _)) - .InSequence(PassSequence) - .WillOnce(DoDefault()); - - MPM.run(*M, MAM); - - const auto *IRInstrVar = - M->getNamedGlobal(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)); - ASSERT_THAT(IRInstrVar, NotNull()); - EXPECT_FALSE(IRInstrVar->isDeclaration()); -} - 
-struct PGOInstrumentationGenIgnoreTest - : PGOInstrumentationGenTest, - WithParamInterface> {}; - static constexpr StringRef CodeWithFuncDecls = R"( declare i32 @f(i32); )"; @@ -196,26 +157,33 @@ static constexpr StringRef CodeWithGlobals = R"( )"; INSTANTIATE_TEST_SUITE_P( - PGOInstrumetationGenIgnoreTestSuite, PGOInstrumentationGenIgnoreTest, - Values(std::make_tuple(CodeWithFuncDecls, "instrument_function_decls"), + PGOInstrumetationGenTestSuite, PGOInstrumentationGenTest, + Values(std::make_tuple(CodeWithFuncDefs, "instrument_function_defs"), + std::make_tuple(CodeWithFuncDecls, "instrument_function_decls"), std::make_tuple(CodeWithGlobals, "instrument_globals")), - PGOTestName()); + [](const TestParamInfo &Info) { + return std::get<1>(Info.param).str(); + }); -TEST_P(PGOInstrumentationGenIgnoreTest, NotInstrumented) { +TEST_P(PGOInstrumentationGenTest, Instrumented) { const StringRef Code = std::get<0>(GetParam()); - parseAssembly(Code); ASSERT_THAT(M, NotNull()); - EXPECT_CALL(MMAHandle, run(Ref(*M), _)).WillOnce(DoDefault()); - EXPECT_CALL(MMAHandle, invalidate(Ref(*M), _, _)).Times(0); + Sequence PassSequence; + EXPECT_CALL(MMAHandle, run(Ref(*M), _)) + .InSequence(PassSequence) + .WillOnce(DoDefault()); + EXPECT_CALL(MMAHandle, invalidate(Ref(*M), _, _)) + .InSequence(PassSequence) + .WillOnce(DoDefault()); MPM.run(*M, MAM); const auto *IRInstrVar = M->getNamedGlobal(INSTR_PROF_QUOTE(INSTR_PROF_RAW_VERSION_VAR)); - ASSERT_THAT(IRInstrVar, NotNull()); + EXPECT_THAT(IRInstrVar, NotNull()); EXPECT_FALSE(IRInstrVar->isDeclaration()); } -- GitLab From 488d3924dd28d0402a4c32a6386865adc936d368 Mon Sep 17 00:00:00 2001 From: Christudasan Devadasan Date: Wed, 16 Oct 2024 13:22:57 +0530 Subject: [PATCH 083/329] [CodeGen][NewPM] Port EarlyIfConversion pass to NPM. (#108508) --- llvm/include/llvm/CodeGen/EarlyIfConversion.h | 24 ++++++ llvm/include/llvm/CodeGen/Passes.h | 2 +- llvm/include/llvm/InitializePasses.h | 2 +- llvm/include/llvm/Passes/CodeGenPassBuilder.h | 1 + .../llvm/Passes/MachinePassRegistry.def | 2 +- llvm/lib/CodeGen/CodeGen.cpp | 2 +- llvm/lib/CodeGen/EarlyIfConversion.cpp | 79 ++++++++++++++----- llvm/lib/CodeGen/TargetPassConfig.cpp | 4 +- llvm/lib/Passes/PassBuilder.cpp | 1 + .../Target/AArch64/AArch64TargetMachine.cpp | 2 +- .../lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 2 +- llvm/lib/Target/PowerPC/PPCTargetMachine.cpp | 2 +- .../Target/SystemZ/SystemZTargetMachine.cpp | 2 +- llvm/lib/Target/X86/X86TargetMachine.cpp | 2 +- .../early-ifcvt-likely-predictable.mir | 1 + .../AArch64/early-ifcvt-regclass-mismatch.mir | 1 + .../AArch64/early-ifcvt-same-value.mir | 1 + .../CodeGen/PowerPC/early-ifcvt-no-isel.mir | 2 + 18 files changed, 102 insertions(+), 30 deletions(-) create mode 100644 llvm/include/llvm/CodeGen/EarlyIfConversion.h diff --git a/llvm/include/llvm/CodeGen/EarlyIfConversion.h b/llvm/include/llvm/CodeGen/EarlyIfConversion.h new file mode 100644 index 000000000000..78bf12ade02c --- /dev/null +++ b/llvm/include/llvm/CodeGen/EarlyIfConversion.h @@ -0,0 +1,24 @@ +//===- llvm/CodeGen/EarlyIfConversion.h -------------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_CODEGEN_EARLYIFCONVERSION_H
+#define LLVM_CODEGEN_EARLYIFCONVERSION_H
+
+#include "llvm/CodeGen/MachinePassManager.h"
+
+namespace llvm {
+
+class EarlyIfConverterPass : public PassInfoMixin<EarlyIfConverterPass> {
+public:
+  PreservedAnalyses run(MachineFunction &MF,
+                        MachineFunctionAnalysisManager &MFAM);
+};
+
+} // namespace llvm
+
+#endif // LLVM_CODEGEN_EARLYIFCONVERSION_H
diff --git a/llvm/include/llvm/CodeGen/Passes.h b/llvm/include/llvm/CodeGen/Passes.h
index 99421bdf769f..bbbf99626098 100644
--- a/llvm/include/llvm/CodeGen/Passes.h
+++ b/llvm/include/llvm/CodeGen/Passes.h
@@ -273,7 +273,7 @@ namespace llvm {
 
   /// EarlyIfConverter - This pass performs if-conversion on SSA form by
   /// inserting cmov instructions.
-  extern char &EarlyIfConverterID;
+  extern char &EarlyIfConverterLegacyID;
 
   /// EarlyIfPredicator - This pass performs if-conversion on SSA form by
   /// predicating if/else block and insert select at the join point.
diff --git a/llvm/include/llvm/InitializePasses.h b/llvm/include/llvm/InitializePasses.h
index 5ed0ad98a2a7..1374880b6a71 100644
--- a/llvm/include/llvm/InitializePasses.h
+++ b/llvm/include/llvm/InitializePasses.h
@@ -98,7 +98,7 @@ void initializeDominatorTreeWrapperPassPass(PassRegistry &);
 void initializeDwarfEHPrepareLegacyPassPass(PassRegistry &);
 void initializeEarlyCSELegacyPassPass(PassRegistry &);
 void initializeEarlyCSEMemSSALegacyPassPass(PassRegistry &);
-void initializeEarlyIfConverterPass(PassRegistry &);
+void initializeEarlyIfConverterLegacyPass(PassRegistry &);
 void initializeEarlyIfPredicatorPass(PassRegistry &);
 void initializeEarlyMachineLICMPass(PassRegistry &);
 void initializeEarlyTailDuplicatePass(PassRegistry &);
diff --git a/llvm/include/llvm/Passes/CodeGenPassBuilder.h b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
index 0d45df08cb0c..9ef6e39dbb1c 100644
--- a/llvm/include/llvm/Passes/CodeGenPassBuilder.h
+++ b/llvm/include/llvm/Passes/CodeGenPassBuilder.h
@@ -27,6 +27,7 @@
 #include "llvm/CodeGen/CodeGenPrepare.h"
 #include "llvm/CodeGen/DeadMachineInstructionElim.h"
 #include "llvm/CodeGen/DwarfEHPrepare.h"
+#include "llvm/CodeGen/EarlyIfConversion.h"
 #include "llvm/CodeGen/ExpandLargeDivRem.h"
 #include "llvm/CodeGen/ExpandLargeFpConvert.h"
 #include "llvm/CodeGen/ExpandMemCmp.h"
diff --git a/llvm/include/llvm/Passes/MachinePassRegistry.def b/llvm/include/llvm/Passes/MachinePassRegistry.def
index 1d7084354455..4e44d0312ede 100644
--- a/llvm/include/llvm/Passes/MachinePassRegistry.def
+++ b/llvm/include/llvm/Passes/MachinePassRegistry.def
@@ -129,6 +129,7 @@ MACHINE_FUNCTION_ANALYSIS("slot-indexes", SlotIndexesAnalysis())
 #define MACHINE_FUNCTION_PASS(NAME, CREATE_PASS)
 #endif
 MACHINE_FUNCTION_PASS("dead-mi-elimination", DeadMachineInstructionElimPass())
+MACHINE_FUNCTION_PASS("early-ifcvt", EarlyIfConverterPass())
 MACHINE_FUNCTION_PASS("early-machinelicm", EarlyMachineLICMPass())
 MACHINE_FUNCTION_PASS("finalize-isel", FinalizeISelPass())
 MACHINE_FUNCTION_PASS("localstackalloc", LocalStackSlotAllocationPass())
@@ -205,7 +206,6 @@ DUMMY_MACHINE_FUNCTION_PASS("cfi-fixup", CFIFixupPass)
 DUMMY_MACHINE_FUNCTION_PASS("cfi-instr-inserter", CFIInstrInserterPass)
 DUMMY_MACHINE_FUNCTION_PASS("detect-dead-lanes", DetectDeadLanesPass)
 DUMMY_MACHINE_FUNCTION_PASS("dot-machine-cfg", MachineCFGPrinter)
-DUMMY_MACHINE_FUNCTION_PASS("early-ifcvt", EarlyIfConverterPass)
DUMMY_MACHINE_FUNCTION_PASS("early-tailduplication", EarlyTailDuplicatePass) DUMMY_MACHINE_FUNCTION_PASS("fentry-insert", FEntryInserterPass) DUMMY_MACHINE_FUNCTION_PASS("fixup-statepoint-caller-saved", FixupStatepointCallerSavedPass) diff --git a/llvm/lib/CodeGen/CodeGen.cpp b/llvm/lib/CodeGen/CodeGen.cpp index 48cc21ee20f0..2d7f351de54e 100644 --- a/llvm/lib/CodeGen/CodeGen.cpp +++ b/llvm/lib/CodeGen/CodeGen.cpp @@ -35,7 +35,7 @@ void llvm::initializeCodeGen(PassRegistry &Registry) { initializeDebugifyMachineModulePass(Registry); initializeDetectDeadLanesPass(Registry); initializeDwarfEHPrepareLegacyPassPass(Registry); - initializeEarlyIfConverterPass(Registry); + initializeEarlyIfConverterLegacyPass(Registry); initializeEarlyIfPredicatorPass(Registry); initializeEarlyMachineLICMPass(Registry); initializeEarlyTailDuplicatePass(Registry); diff --git a/llvm/lib/CodeGen/EarlyIfConversion.cpp b/llvm/lib/CodeGen/EarlyIfConversion.cpp index 53cf6a516979..3e7399584617 100644 --- a/llvm/lib/CodeGen/EarlyIfConversion.cpp +++ b/llvm/lib/CodeGen/EarlyIfConversion.cpp @@ -15,6 +15,7 @@ // //===----------------------------------------------------------------------===// +#include "llvm/CodeGen/EarlyIfConversion.h" #include "llvm/ADT/BitVector.h" #include "llvm/ADT/PostOrderIterator.h" #include "llvm/ADT/SmallPtrSet.h" @@ -760,7 +761,7 @@ void SSAIfConv::convertIf(SmallVectorImpl &RemoveBlocks, //===----------------------------------------------------------------------===// namespace { -class EarlyIfConverter : public MachineFunctionPass { +class EarlyIfConverter { const TargetInstrInfo *TII = nullptr; const TargetRegisterInfo *TRI = nullptr; MCSchedModel SchedModel; @@ -772,31 +773,41 @@ class EarlyIfConverter : public MachineFunctionPass { SSAIfConv IfConv; public: - static char ID; - EarlyIfConverter() : MachineFunctionPass(ID) {} - void getAnalysisUsage(AnalysisUsage &AU) const override; - bool runOnMachineFunction(MachineFunction &MF) override; - StringRef getPassName() const override { return "Early If-Conversion"; } + EarlyIfConverter(MachineDominatorTree &DT, MachineLoopInfo &LI, + MachineTraceMetrics &MTM) + : DomTree(&DT), Loops(&LI), Traces(&MTM) {} + EarlyIfConverter() = delete; + + bool run(MachineFunction &MF); private: bool tryConvertIf(MachineBasicBlock *); void invalidateTraces(); bool shouldConvertIf(); }; + +class EarlyIfConverterLegacy : public MachineFunctionPass { +public: + static char ID; + EarlyIfConverterLegacy() : MachineFunctionPass(ID) {} + void getAnalysisUsage(AnalysisUsage &AU) const override; + bool runOnMachineFunction(MachineFunction &MF) override; + StringRef getPassName() const override { return "Early If-Conversion"; } +}; } // end anonymous namespace -char EarlyIfConverter::ID = 0; -char &llvm::EarlyIfConverterID = EarlyIfConverter::ID; +char EarlyIfConverterLegacy::ID = 0; +char &llvm::EarlyIfConverterLegacyID = EarlyIfConverterLegacy::ID; -INITIALIZE_PASS_BEGIN(EarlyIfConverter, DEBUG_TYPE, - "Early If Converter", false, false) +INITIALIZE_PASS_BEGIN(EarlyIfConverterLegacy, DEBUG_TYPE, "Early If Converter", + false, false) INITIALIZE_PASS_DEPENDENCY(MachineBranchProbabilityInfoWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineDominatorTreeWrapperPass) INITIALIZE_PASS_DEPENDENCY(MachineTraceMetricsWrapperPass) -INITIALIZE_PASS_END(EarlyIfConverter, DEBUG_TYPE, - "Early If Converter", false, false) +INITIALIZE_PASS_END(EarlyIfConverterLegacy, DEBUG_TYPE, "Early If Converter", + false, false) -void EarlyIfConverter::getAnalysisUsage(AnalysisUsage &AU) const { +void 
EarlyIfConverterLegacy::getAnalysisUsage(AnalysisUsage &AU) const { AU.addRequired(); AU.addRequired(); AU.addPreserved(); @@ -1076,11 +1087,9 @@ bool EarlyIfConverter::tryConvertIf(MachineBasicBlock *MBB) { return Changed; } -bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) { +bool EarlyIfConverter::run(MachineFunction &MF) { LLVM_DEBUG(dbgs() << "********** EARLY IF-CONVERSION **********\n" << "********** Function: " << MF.getName() << '\n'); - if (skipFunction(MF.getFunction())) - return false; // Only run if conversion if the target wants it. const TargetSubtargetInfo &STI = MF.getSubtarget(); @@ -1091,9 +1100,6 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) { TRI = STI.getRegisterInfo(); SchedModel = STI.getSchedModel(); MRI = &MF.getRegInfo(); - DomTree = &getAnalysis().getDomTree(); - Loops = &getAnalysis().getLI(); - Traces = &getAnalysis().getMTM(); MinInstr = nullptr; bool Changed = false; @@ -1110,6 +1116,41 @@ bool EarlyIfConverter::runOnMachineFunction(MachineFunction &MF) { return Changed; } +PreservedAnalyses +EarlyIfConverterPass::run(MachineFunction &MF, + MachineFunctionAnalysisManager &MFAM) { + if (MF.getFunction().hasOptNone()) + return PreservedAnalyses::all(); + + MachineDominatorTree &MDT = MFAM.getResult(MF); + MachineLoopInfo &LI = MFAM.getResult(MF); + MachineTraceMetrics &MTM = MFAM.getResult(MF); + + EarlyIfConverter Impl(MDT, LI, MTM); + bool Changed = Impl.run(MF); + if (!Changed) + return PreservedAnalyses::all(); + + auto PA = getMachineFunctionPassPreservedAnalyses(); + PA.preserve(); + PA.preserve(); + PA.preserve(); + return PA; +} + +bool EarlyIfConverterLegacy::runOnMachineFunction(MachineFunction &MF) { + if (skipFunction(MF.getFunction())) + return false; + + MachineDominatorTree &MDT = + getAnalysis().getDomTree(); + MachineLoopInfo &LI = getAnalysis().getLI(); + MachineTraceMetrics &MTM = + getAnalysis().getMTM(); + + return EarlyIfConverter(MDT, LI, MTM).run(MF); +} + //===----------------------------------------------------------------------===// // EarlyIfPredicator Pass //===----------------------------------------------------------------------===// diff --git a/llvm/lib/CodeGen/TargetPassConfig.cpp b/llvm/lib/CodeGen/TargetPassConfig.cpp index cf9d63df2515..02c3a8526975 100644 --- a/llvm/lib/CodeGen/TargetPassConfig.cpp +++ b/llvm/lib/CodeGen/TargetPassConfig.cpp @@ -305,7 +305,7 @@ static IdentifyingPassPtr overridePass(AnalysisID StandardID, if (StandardID == &DeadMachineInstructionElimID) return applyDisable(TargetID, DisableMachineDCE); - if (StandardID == &EarlyIfConverterID) + if (StandardID == &EarlyIfConverterLegacyID) return applyDisable(TargetID, DisableEarlyIfConversion); if (StandardID == &EarlyMachineLICMID) @@ -521,7 +521,7 @@ void llvm::registerCodeGenCallback(PassInstrumentationCallbacks &PIC, DISABLE_PASS(DisableBlockPlacement, MachineBlockPlacementPass) DISABLE_PASS(DisableBranchFold, BranchFolderPass) DISABLE_PASS(DisableCopyProp, MachineCopyPropagationPass) - DISABLE_PASS(DisableEarlyIfConversion, EarlyIfConverterPass) + DISABLE_PASS(DisableEarlyIfConversion, EarlyIfConverterLegacyPass) DISABLE_PASS(DisableEarlyTailDup, EarlyTailDuplicatePass) DISABLE_PASS(DisableMachineCSE, MachineCSELegacyPass) DISABLE_PASS(DisableMachineDCE, DeadMachineInstructionElimPass) diff --git a/llvm/lib/Passes/PassBuilder.cpp b/llvm/lib/Passes/PassBuilder.cpp index 08cc4ddbe343..ebad3507eb5e 100644 --- a/llvm/lib/Passes/PassBuilder.cpp +++ b/llvm/lib/Passes/PassBuilder.cpp @@ -82,6 +82,7 @@ #include 
"llvm/CodeGen/CodeGenPrepare.h" #include "llvm/CodeGen/DeadMachineInstructionElim.h" #include "llvm/CodeGen/DwarfEHPrepare.h" +#include "llvm/CodeGen/EarlyIfConversion.h" #include "llvm/CodeGen/ExpandLargeDivRem.h" #include "llvm/CodeGen/ExpandLargeFpConvert.h" #include "llvm/CodeGen/ExpandMemCmp.h" diff --git a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp index 21b86f5fe5d9..c7bd0390b656 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetMachine.cpp @@ -784,7 +784,7 @@ bool AArch64PassConfig::addILPOpts() { if (EnableCondBrTuning) addPass(createAArch64CondBrTuning()); if (EnableEarlyIfConversion) - addPass(&EarlyIfConverterID); + addPass(&EarlyIfConverterLegacyID); if (EnableStPairSuppress) addPass(createAArch64StorePairSuppressPass()); addPass(createAArch64SIMDInstrOptPass()); diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp index 23ee0c3e896e..7cc65c3996fa 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp @@ -1335,7 +1335,7 @@ void GCNPassConfig::addMachineSSAOptimization() { bool GCNPassConfig::addILPOpts() { if (EnableEarlyIfConversion) - addPass(&EarlyIfConverterID); + addPass(&EarlyIfConverterLegacyID); TargetPassConfig::addILPOpts(); return false; diff --git a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp index 7d0455942923..cd188304595e 100644 --- a/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp +++ b/llvm/lib/Target/PowerPC/PPCTargetMachine.cpp @@ -521,7 +521,7 @@ bool PPCPassConfig::addPreISel() { } bool PPCPassConfig::addILPOpts() { - addPass(&EarlyIfConverterID); + addPass(&EarlyIfConverterLegacyID); if (EnableMachineCombinerPass) addPass(&MachineCombinerID); diff --git a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp index 53ed46f14f14..f76f41768e88 100644 --- a/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp +++ b/llvm/lib/Target/SystemZ/SystemZTargetMachine.cpp @@ -257,7 +257,7 @@ bool SystemZPassConfig::addInstSelector() { } bool SystemZPassConfig::addILPOpts() { - addPass(&EarlyIfConverterID); + addPass(&EarlyIfConverterLegacyID); if (EnableMachineCombinerPass) addPass(&MachineCombinerID); diff --git a/llvm/lib/Target/X86/X86TargetMachine.cpp b/llvm/lib/Target/X86/X86TargetMachine.cpp index ceb87a62dba0..4ba0ac11d209 100644 --- a/llvm/lib/Target/X86/X86TargetMachine.cpp +++ b/llvm/lib/Target/X86/X86TargetMachine.cpp @@ -536,7 +536,7 @@ bool X86PassConfig::addGlobalInstructionSelect() { } bool X86PassConfig::addILPOpts() { - addPass(&EarlyIfConverterID); + addPass(&EarlyIfConverterLegacyID); if (EnableMachineCombinerPass) addPass(&MachineCombinerID); addPass(createX86CmovConverterPass()); diff --git a/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir index 425a23214871..ab5e320725d5 100644 --- a/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir +++ b/llvm/test/CodeGen/AArch64/early-ifcvt-likely-predictable.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=arm64-apple-ios -mcpu=apple-m1 -run-pass=early-ifcvt -o - %s | FileCheck %s +# RUN: llc -mtriple=arm64-apple-ios -mcpu=apple-m1 -passes=early-ifcvt -o - %s | FileCheck %s --- | define void @test_cond_is_load_with_invariant_ops() { diff --git 
a/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir index 318bdceeaef4..a7f67f8b682c 100644 --- a/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir +++ b/llvm/test/CodeGen/AArch64/early-ifcvt-regclass-mismatch.mir @@ -1,4 +1,5 @@ # RUN: llc -mtriple=aarch64-unknown-unknown -run-pass=early-ifcvt -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-unknown-unknown -passes=early-ifcvt -verify-each %s -o - | FileCheck %s --- | target datalayout = "e-m:o-i64:64-i128:128-n32:64-S128" target triple = "arm64-apple-ios13.3.0" diff --git a/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir b/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir index b9298608e192..16d5dfc78f56 100644 --- a/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir +++ b/llvm/test/CodeGen/AArch64/early-ifcvt-same-value.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=aarch64-- -run-pass=early-ifcvt -stress-early-ifcvt -verify-machineinstrs %s -o - | FileCheck %s +# RUN: llc -mtriple=aarch64-- -passes=early-ifcvt -stress-early-ifcvt %s -o - | FileCheck %s --- name: fmov0 diff --git a/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir b/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir index 99a3f80ff81b..794480bfc6ce 100644 --- a/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir +++ b/llvm/test/CodeGen/PowerPC/early-ifcvt-no-isel.mir @@ -1,6 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 4 # RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr7 -simplify-mir -verify-machineinstrs \ # RUN: -run-pass=early-ifcvt %s -o - | FileCheck %s +# RUN: llc -mtriple=powerpc64-ibm-aix -mcpu=pwr7 -simplify-mir -verify-each \ +# RUN: -passes=early-ifcvt %s -o - | FileCheck %s --- | source_filename = "" -- GitLab From c137b3ee357b6e7564f6717bcfb56e28044fc583 Mon Sep 17 00:00:00 2001 From: Antonio Frighetto Date: Wed, 16 Oct 2024 09:47:58 +0200 Subject: [PATCH 084/329] [X86] Introduce test for PR112098 (NFC) --- .../test/CodeGen/X86/tailcall-caller-nocsr.ll | 32 +++++++++++++++++++ 1 file changed, 32 insertions(+) create mode 100644 llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll diff --git a/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll b/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll new file mode 100644 index 000000000000..5606fbb27032 --- /dev/null +++ b/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll @@ -0,0 +1,32 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple=x86_64-linux-gnu -mattr=-sse,-avx | FileCheck %s + +@.str = private unnamed_addr constant [6 x i8] c"%d %d\00", align 1 + +define void @caller(i32 %0, i32 %1) #0 { +; CHECK-LABEL: caller: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %r11 +; CHECK-NEXT: pushq %r10 +; CHECK-NEXT: pushq %r9 +; CHECK-NEXT: pushq %r8 +; CHECK-NEXT: pushq %rdx +; CHECK-NEXT: pushq %rcx +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: movl %edi, %esi +; CHECK-NEXT: movl $.L.str, %edi +; CHECK-NEXT: popq %rax +; CHECK-NEXT: popq %rcx +; CHECK-NEXT: popq %rdx +; CHECK-NEXT: popq %r8 +; CHECK-NEXT: popq %r9 +; CHECK-NEXT: popq %r10 +; CHECK-NEXT: popq %r11 +; CHECK-NEXT: jmp printf@PLT # TAILCALL + %3 = tail call i32 @printf(ptr @.str, i32 %0, i32 %1) + ret void +} + +declare i32 @printf(ptr, ...) 
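Why the old behavior is unsound, in brief (an illustrative note, not part of
the fix itself): the FileCheck lines in the test below show the whole
problem. The caller is compiled under "no_caller_saved_registers", so it
pushes every register it may touch and pops them all in its epilogue. With
the old lowering the pops were followed by

  jmp printf@PLT # TAILCALL

so anything printf clobbered after that jump was never restored, silently
breaking the caller's preserve-everything contract. Refusing the tail call
(callq followed by retq, as the updated CHECK lines expect) keeps printf's
clobbers bracketed by the push/pop sequence.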
---
 llvm/lib/Target/X86/X86ISelLoweringCall.cpp    | 7 +++++++
 llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll | 4 +++-
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
index 8561658379f7..12cd92e2d0d7 100644
--- a/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
+++ b/llvm/lib/Target/X86/X86ISelLoweringCall.cpp
@@ -2856,6 +2856,13 @@ bool X86TargetLowering::IsEligibleForTailCallOptimization(
       return false;
   }
 
+  // The stack frame of the caller cannot be replaced by the tail-callee one's
+  // if the function is required to preserve all the registers. Conservatively
+  // prevent tail optimization even if hypothetically all the registers are used
+  // for passing formal parameters or returning values.
+  if (CallerF.hasFnAttribute("no_caller_saved_registers"))
+    return false;
+
   unsigned StackArgsSize = CCInfo.getStackSize();
 
   // If the callee takes no arguments then go on to check the results of the
diff --git a/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll b/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll
index 5606fbb27032..0385017a1ced 100644
--- a/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll
+++ b/llvm/test/CodeGen/X86/tailcall-caller-nocsr.ll
@@ -13,8 +13,10 @@ define void @caller(i32 %0, i32 %1) #0 {
 ; CHECK-NEXT:    pushq %rdx
 ; CHECK-NEXT:    pushq %rcx
 ; CHECK-NEXT:    pushq %rax
+; CHECK-NEXT:    movl %esi, %edx
 ; CHECK-NEXT:    movl %edi, %esi
 ; CHECK-NEXT:    movl $.L.str, %edi
+; CHECK-NEXT:    callq printf@PLT
 ; CHECK-NEXT:    popq %rax
 ; CHECK-NEXT:    popq %rcx
 ; CHECK-NEXT:    popq %rdx
@@ -22,7 +24,7 @@ define void @caller(i32 %0, i32 %1) #0 {
 ; CHECK-NEXT:    popq %r9
 ; CHECK-NEXT:    popq %r10
 ; CHECK-NEXT:    popq %r11
-; CHECK-NEXT:    jmp printf@PLT # TAILCALL
+; CHECK-NEXT:    retq
   %3 = tail call i32 @printf(ptr @.str, i32 %0, i32 %1)
   ret void
 }
-- 
GitLab


From 72a7b471de9ad6d9bf6540f02a10774c0b246e2e Mon Sep 17 00:00:00 2001
From: Christudasan Devadasan
Date: Wed, 16 Oct 2024 13:30:46 +0530
Subject: [PATCH 086/329] [AMDGPU][NewPM] Fill out addILPOpts. (#108514)

---
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp | 7 +++++++
 llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h   | 1 +
 2 files changed, 8 insertions(+)

diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
index 7cc65c3996fa..e4cc522194f2 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.cpp
@@ -1983,6 +1983,13 @@ void AMDGPUCodeGenPassBuilder::addPreISel(AddIRPass &addPass) const {
   addPass(RequireAnalysisPass<UniformityInfoAnalysis, Function>());
 }
 
+void AMDGPUCodeGenPassBuilder::addILPOpts(AddMachinePass &addPass) const {
+  if (EnableEarlyIfConversion)
+    addPass(EarlyIfConverterPass());
+
+  Base::addILPOpts(addPass);
+}
+
 void AMDGPUCodeGenPassBuilder::addAsmPrinter(AddMachinePass &addPass,
                                              CreateMCStreamer) const {
   // TODO: Add AsmPrinter.
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
index af8476bc21ec..d8a5111e5898 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
+++ b/llvm/lib/Target/AMDGPU/AMDGPUTargetMachine.h
@@ -172,6 +172,7 @@ public:
   void addIRPasses(AddIRPass &) const;
   void addCodeGenPrepare(AddIRPass &) const;
   void addPreISel(AddIRPass &addPass) const;
+  void addILPOpts(AddMachinePass &) const;
   void addAsmPrinter(AddMachinePass &, CreateMCStreamer) const;
   Error addInstSelector(AddMachinePass &) const;
   void addMachineSSAOptimization(AddMachinePass &) const;
-- 
GitLab


From 4ddea298e60c31d0995c06189a592895d2ad512b Mon Sep 17 00:00:00 2001
From: Hans
Date: Wed, 16 Oct 2024 10:06:43 +0200
Subject: [PATCH 087/329] [clang-cl]: Add /std:c++23preview and update
 _MSVC_LANG for C++23 (#112378)

As discussed in
https://discourse.llvm.org/t/clang-cl-adding-std-c-23preview/82553
---
 clang/docs/ReleaseNotes.rst                     | 2 ++
 clang/include/clang/Driver/Options.td           | 2 +-
 clang/lib/Basic/Targets/OSTargets.cpp           | 6 ++++--
 clang/lib/Driver/ToolChains/Clang.cpp           | 1 +
 clang/test/Driver/cl-options.c                  | 3 +++
 clang/test/Preprocessor/predefined-win-macros.c | 7 ++++++-
 6 files changed, 17 insertions(+), 4 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 817e3abef8d5..33eb9a2b5804 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -605,6 +605,8 @@ Android Support
 Windows Support
 ^^^^^^^^^^^^^^^
 
+- clang-cl now supports ``/std:c++23preview`` which enables C++23 features.
+
 - Clang no longer allows references inside a union when emulating MSVC 1900+ even if `fms-extensions` is enabled.
   Starting with VS2015, MSVC 1900, this Microsoft extension is no longer allowed and always results in an error.
   Clang now follows the MSVC behavior in this scenario.
diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td
index 2072ae45d554..379e75b197cf 100644
--- a/clang/include/clang/Driver/Options.td
+++ b/clang/include/clang/Driver/Options.td
@@ -8531,7 +8531,7 @@ def _SLASH_execution_charset : CLCompileJoined<"execution-charset:">,
   HelpText<"Set runtime encoding, supports only UTF-8">, Alias<fexec_charset_EQ>;
 def _SLASH_std : CLCompileJoined<"std:">,
-  HelpText<"Set language version (c++14,c++17,c++20,c++latest,c11,c17)">;
+  HelpText<"Set language version (c++14,c++17,c++20,c++23preview,c++latest,c11,c17)">;
 def _SLASH_U : CLJoinedOrSeparate<"U">, HelpText<"Undefine macro">,
   MetaVarName<"<macro>">, Alias<U>;
 def _SLASH_validate_charset : CLFlag<"validate-charset">,
diff --git a/clang/lib/Basic/Targets/OSTargets.cpp b/clang/lib/Basic/Targets/OSTargets.cpp
index b56e2c7ca9c4..88c054150ab2 100644
--- a/clang/lib/Basic/Targets/OSTargets.cpp
+++ b/clang/lib/Basic/Targets/OSTargets.cpp
@@ -214,9 +214,11 @@ static void addVisualCDefines(const LangOptions &Opts, MacroBuilder &Builder) {
       Builder.defineMacro("_HAS_CHAR16_T_LANGUAGE_SUPPORT", Twine(1));
 
     if (Opts.isCompatibleWithMSVC(LangOptions::MSVC2015)) {
-      if (Opts.CPlusPlus23)
+      if (Opts.CPlusPlus26) // TODO update to the proper value.
- Builder.defineMacro("_MSVC_LANG", "202004L"); + Builder.defineMacro("_MSVC_LANG", "202400L"); + else if (Opts.CPlusPlus23) + Builder.defineMacro("_MSVC_LANG", "202302L"); else if (Opts.CPlusPlus20) Builder.defineMacro("_MSVC_LANG", "202002L"); else if (Opts.CPlusPlus17) diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index c132fa35098a..3fc39296f442 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -7225,6 +7225,7 @@ void Clang::ConstructJob(Compilation &C, const JobAction &JA, .Case("c++17", "-std=c++17") .Case("c++20", "-std=c++20") // TODO add c++23 and c++26 when MSVC supports it. + .Case("c++23preview", "-std=c++23") .Case("c++latest", "-std=c++26") .Default(""); if (LanguageStandard.empty()) diff --git a/clang/test/Driver/cl-options.c b/clang/test/Driver/cl-options.c index 48d281bcd447..8191fda97788 100644 --- a/clang/test/Driver/cl-options.c +++ b/clang/test/Driver/cl-options.c @@ -605,6 +605,9 @@ // RUN: %clang_cl -fmsc-version=1900 -TP -std:c++20 -### -- %s 2>&1 | FileCheck -check-prefix=STDCXX20 %s // STDCXX20: -std=c++20 +// RUN: %clang_cl -fmsc-version=1900 -TP -std:c++23preview -### -- %s 2>&1 | FileCheck -check-prefix=STDCXX23PREVIEW %s +// STDCXX23PREVIEW: -std=c++23 + // RUN: %clang_cl -fmsc-version=1900 -TP -std:c++latest -### -- %s 2>&1 | FileCheck -check-prefix=STDCXXLATEST %s // STDCXXLATEST: -std=c++26 diff --git a/clang/test/Preprocessor/predefined-win-macros.c b/clang/test/Preprocessor/predefined-win-macros.c index 7d29e45c7d5a..8e539a2a1faf 100644 --- a/clang/test/Preprocessor/predefined-win-macros.c +++ b/clang/test/Preprocessor/predefined-win-macros.c @@ -56,7 +56,12 @@ // RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \ // RUN: -fms-compatibility-version=19.00 -std=c++23 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS-CPP2B // CHECK-MS-CPP2B: #define _MSC_VER 1900 -// CHECK-MS-CPP2B: #define _MSVC_LANG 202004L +// CHECK-MS-CPP2B: #define _MSVC_LANG 202302L + +// RUN: %clang_cc1 %s -x c++ -E -dM -triple i686-pc-win32 -fms-extensions -fms-compatibility \ +// RUN: -fms-compatibility-version=19.00 -std=c++26 -o - | FileCheck -match-full-lines %s --check-prefix=CHECK-MS-CPP2C +// CHECK-MS-CPP2C: #define _MSC_VER 1900 +// CHECK-MS-CPP2C: #define _MSVC_LANG 202400L // RUN: %clang_cc1 -triple i386-windows %s -E -dM -o - \ // RUN: | FileCheck -match-full-lines %s --check-prefix=CHECK-X86-WIN -- GitLab From 9df8d8d05c2650b51bd4233e1759206d163f3133 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Bal=C3=A1zs=20K=C3=A9ri?= Date: Wed, 16 Oct 2024 10:17:34 +0200 Subject: [PATCH 088/329] [clang][analyzer] Improve test and documentation in cstring NotNullTerminated checker (#112019) CStringChecker has a sub-checker alpha.unix.cstring.NotNullTerminated which checks for invalid objects passed to string functions. The checker and its name are not exact and more functions could be checked, this change only adds some tests and improves documentation. --- clang/docs/analyzer/checkers.rst | 17 +++++++++--- clang/test/Analysis/string.c | 46 +++++++++++++++++++++++++++----- clang/test/Analysis/string.cpp | 10 ++++++- 3 files changed, 63 insertions(+), 10 deletions(-) diff --git a/clang/docs/analyzer/checkers.rst b/clang/docs/analyzer/checkers.rst index 81264428c72e..58dbd686a6dc 100644 --- a/clang/docs/analyzer/checkers.rst +++ b/clang/docs/analyzer/checkers.rst @@ -3371,12 +3371,23 @@ Checks for overlap in two buffer arguments. 
Applies to: ``memcpy, mempcpy, wmem alpha.unix.cstring.NotNullTerminated (C) """""""""""""""""""""""""""""""""""""""" -Check for arguments which are not null-terminated strings; applies to: ``strlen, strnlen, strcpy, strncpy, strcat, strncat, wcslen, wcsnlen``. +Check for arguments which are not null-terminated strings; +applies to the ``strlen``, ``strcpy``, ``strcat``, ``strcmp`` family of functions. + +Only very fundamental cases are detected where the passed memory block is +absolutely different from a null-terminated string. This checker does not +find if a memory buffer is passed where the terminating zero character +is missing. .. code-block:: c - void test() { - int y = strlen((char *)&test); // warn + void test1() { + int l = strlen((char *)&test); // warn + } + + void test2() { + label: + int l = strlen((char *)&&label); // warn } .. _alpha-unix-cstring-OutOfBounds: diff --git a/clang/test/Analysis/string.c b/clang/test/Analysis/string.c index 79b4877eedbd..2e0a49d083b0 100644 --- a/clang/test/Analysis/string.c +++ b/clang/test/Analysis/string.c @@ -361,6 +361,10 @@ void strcpy_fn_const(char *x) { strcpy(x, (const char*)&strcpy_fn); // expected-warning{{Argument to string copy function is the address of the function 'strcpy_fn', which is not a null-terminated string}} } +void strcpy_fn_dst(const char *x) { + strcpy((char*)&strcpy_fn, x); // expected-warning{{Argument to string copy function is the address of the function 'strcpy_fn', which is not a null-terminated string}} +} + extern int globalInt; void strcpy_effects(char *x, char *y) { char a = x[0]; @@ -469,8 +473,22 @@ void strcat_null_src(char *x) { strcat(x, NULL); // expected-warning{{Null pointer passed as 2nd argument to string concatenation function}} } -void strcat_fn(char *x) { - strcat(x, (char*)&strcat_fn); // expected-warning{{Argument to string concatenation function is the address of the function 'strcat_fn', which is not a null-terminated string}} +void strcat_fn_dst(const char *x) { + strcat((char*)&strcat_fn_dst, x); // expected-warning{{Argument to string concatenation function is the address of the function 'strcat_fn_dst', which is not a null-terminated string}} +} + +void strcat_fn_src(char *x) { + strcat(x, (char*)&strcat_fn_src); // expected-warning{{Argument to string concatenation function is the address of the function 'strcat_fn_src', which is not a null-terminated string}} +} + +void strcat_label_dst(const char *x) { +label: + strcat((char*)&&label, x); // expected-warning{{Argument to string concatenation function is the address of the label 'label', which is not a null-terminated string}} +} + +void strcat_label_src(char *x) { +label: + strcat(x, (char*)&&label); // expected-warning{{Argument to string concatenation function is the address of the label 'label', which is not a null-terminated string}} } void strcat_effects(char *y) { @@ -568,8 +586,12 @@ void strncpy_null_src(char *x) { strncpy(x, NULL, 5); // expected-warning{{Null pointer passed as 2nd argument to string copy function}} } -void strncpy_fn(char *x) { - strncpy(x, (char*)&strcpy_fn, 5); // expected-warning{{Argument to string copy function is the address of the function 'strcpy_fn', which is not a null-terminated string}} +void strncpy_fn_src(char *x) { + strncpy(x, (char*)&strncpy_fn_src, 5); // expected-warning{{Argument to string copy function is the address of the function 'strncpy_fn_src', which is not a null-terminated string}} +} + +void strncpy_fn_dst(const char *x) { + strncpy((char*)&strncpy_fn_dst, x, 5); // 
 }
 
 void strncpy_effects(char *x, char *y) {
@@ -680,8 +702,12 @@ void strncat_null_src(char *x) {
   strncat(x, NULL, 4); // expected-warning{{Null pointer passed as 2nd argument to string concatenation function}}
 }
 
-void strncat_fn(char *x) {
-  strncat(x, (char*)&strncat_fn, 4); // expected-warning{{Argument to string concatenation function is the address of the function 'strncat_fn', which is not a null-terminated string}}
+void strncat_fn_src(char *x) {
+  strncat(x, (char*)&strncat_fn_src, 4); // expected-warning{{Argument to string concatenation function is the address of the function 'strncat_fn_src', which is not a null-terminated string}}
+}
+
+void strncat_fn_dst(const char *x) {
+  strncat((char*)&strncat_fn_dst, x, 4); // expected-warning{{Argument to string concatenation function is the address of the function 'strncat_fn_dst', which is not a null-terminated string}}
 }
 
 void strncat_effects(char *y) {
@@ -921,6 +947,14 @@ int strcmp_null_argument(char *a) {
   return strcmp(a, b); // expected-warning{{Null pointer passed as 2nd argument to string comparison function}}
 }
 
+void strcmp_fn_r(char *x) {
+  strcmp(x, (char*)&strcmp_null_argument); // expected-warning{{Argument to string comparison function is the address of the function 'strcmp_null_argument', which is not a null-terminated string}}
+}
+
+void strcmp_fn_l(char *x) {
+  strcmp((char*)&strcmp_null_argument, x); // expected-warning{{Argument to string comparison function is the address of the function 'strcmp_null_argument', which is not a null-terminated string}}
+}
+
 //===----------------------------------------------------------------------===
 // strncmp()
 //===----------------------------------------------------------------------===
diff --git a/clang/test/Analysis/string.cpp b/clang/test/Analysis/string.cpp
index 1be6c21466cc..c09422d19223 100644
--- a/clang/test/Analysis/string.cpp
+++ b/clang/test/Analysis/string.cpp
@@ -1,4 +1,4 @@
-// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix,debug.ExprInspection -verify %s
+// RUN: %clang_analyze_cc1 -analyzer-checker=core,unix,alpha.unix.cstring,debug.ExprInspection -verify %s
 
 // Test functions that are called "memcpy" but aren't the memcpy
 // we're looking for. Unfortunately, this test cannot be put into
 // as a normal C function for the test to make sense.
 typedef __typeof(sizeof(int)) size_t;
 void *memcpy(void *, const void *, size_t);
+size_t strlen(const char *s);
 
 int sprintf(char *str, const char *format, ...);
 int snprintf(char *str, size_t size, const char *format, ...);
@@ -45,3 +46,10 @@ template <typename... Args> void log(const char* fmt, const Args&... args) {
 void test_gh_74269_no_crash() {
   log("%d", 1);
 }
+
+struct TestNotNullTerm {
+  void test1() {
+    TestNotNullTerm * const &x = this;
+    strlen((char *)&x); // expected-warning{{Argument to string length function is not a null-terminated string}}
+  }
+};
-- 
GitLab


From 3bf2295ee0ebd1eafe66ca15dff44bdb31e6198a Mon Sep 17 00:00:00 2001
From: Yingwei Zheng
Date: Wed, 16 Oct 2024 16:24:44 +0800
Subject: [PATCH 089/329] [InstCombine] Drop `samesign` flag in
 `foldAndOrOfICmpsWithConstEq` (#112489)

In https://github.com/llvm/llvm-project/commit/5dbfca30c1a672cd0c5089df2b4fdd171436643a
we assume that RHS being poison implies LHS is also poison. That no longer
holds after introducing the samesign flag. This patch drops the `samesign`
flag on RHS if the original expression is a logical and/or.

Closes #112467.
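To make this concrete, here is the shape of the affected fold, taken from
the sge_and_max_logical_samesign test added below:

  %cmp = icmp sge i8 %x, %y
  %cmpeq = icmp samesign eq i8 %x, 127
  %r = select i1 %cmp, i1 %cmpeq, i1 false

The logical select only exposes %cmpeq's value when %cmp is true, but the
fold rewrites the pair into a bitwise form on the assumption that RHS being
poison implies LHS is poison too. A samesign flag can make RHS poison on
its own (here, whenever %x is negative), so the flag has to be dropped from
RHS whenever RHS survives the rewrite.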
---
 .../Transforms/InstCombine/InstCombineAndOrXor.cpp |  8 +++++++-
 .../Transforms/InstCombine/and-or-icmp-min-max.ll  | 11 +++++++++++
 .../Transforms/InstCombine/and-or-icmp-nullptr.ll  | 13 +++++++++++++
 3 files changed, 31 insertions(+), 1 deletion(-)

diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 64bee4ab974e..c8407e8ba5ab 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3369,8 +3369,14 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS,
   // We can convert this case to bitwise and, because both operands are used
   // on the LHS, and as such poison from both will propagate.
   if (Value *V = foldAndOrOfICmpsWithConstEq(RHS, LHS, IsAnd,
-                                             /*IsLogical*/ false, Builder, Q))
+                                             /*IsLogical=*/false, Builder, Q)) {
+    // If RHS is still used, we should drop samesign flag.
+    if (IsLogical && RHS->hasSameSign() && !RHS->use_empty()) {
+      RHS->setSameSign(false);
+      addToWorklist(RHS);
+    }
     return V;
+  }
 
   if (Value *V = foldIsPowerOf2OrZero(LHS, RHS, IsAnd, Builder, *this))
     return V;
diff --git a/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll b/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll
index 058847a75bde..cc55c4a39a26 100644
--- a/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll
+++ b/llvm/test/Transforms/InstCombine/and-or-icmp-min-max.ll
@@ -689,6 +689,17 @@ define i1 @sge_and_max_logical(i8 %x, i8 %y) {
   ret i1 %r
 }
 
+define i1 @sge_and_max_logical_samesign(i8 %x, i8 %y) {
+; CHECK-LABEL: @sge_and_max_logical_samesign(
+; CHECK-NEXT:    [[CMPEQ:%.*]] = icmp eq i8 [[X:%.*]], 127
+; CHECK-NEXT:    ret i1 [[CMPEQ]]
+;
+  %cmp = icmp sge i8 %x, %y
+  %cmpeq = icmp samesign eq i8 %x, 127
+  %r = select i1 %cmp, i1 %cmpeq, i1 false
+  ret i1 %r
+}
+
 define i1 @sge_and_max_commute(i8 %x, i8 %y) {
 ; CHECK-LABEL: @sge_and_max_commute(
 ; CHECK-NEXT:    [[CMPEQ:%.*]] = icmp eq i8 [[X:%.*]], 127
diff --git a/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll b/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll
index d533cc704853..8650b89c3b9e 100644
--- a/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll
+++ b/llvm/test/Transforms/InstCombine/and-or-icmp-nullptr.ll
@@ -592,6 +592,19 @@ define i1 @sgt_and_min_logical(ptr %x, ptr %y) {
   ret i1 %r
 }
 
+define i1 @sgt_and_min_logical_samesign(ptr %x, ptr %y) {
+; CHECK-LABEL: @sgt_and_min_logical_samesign(
+; CHECK-NEXT:    [[CMPEQ:%.*]] = icmp eq ptr [[X:%.*]], null
+; CHECK-NEXT:    [[TMP1:%.*]] = icmp slt ptr [[Y:%.*]], null
+; CHECK-NEXT:    [[R:%.*]] = and i1 [[CMPEQ]], [[TMP1]]
+; CHECK-NEXT:    ret i1 [[R]]
+;
+  %cmp = icmp sgt ptr %x, %y
+  %cmpeq = icmp samesign eq ptr %x, null
+  %r = select i1 %cmp, i1 %cmpeq, i1 false
+  ret i1 %r
+}
+
 define i1 @sle_or_not_min(ptr %x, ptr %y) {
 ; CHECK-LABEL: @sle_or_not_min(
 ; CHECK-NEXT:    [[CMPEQ:%.*]] = icmp ne ptr [[X:%.*]], null
-- 
GitLab


From f0d7cccaf5ad372c1039d0699e36771ac50fa5c4 Mon Sep 17 00:00:00 2001
From: Ilya Biryukov
Date: Wed, 16 Oct 2024 10:38:53 +0200
Subject: [PATCH 090/329] [Modules][NFC] Rewrite friend-definition-2.cpp with
 split-file (#112380)

Use split-file instead of the pragmas, which are less familiar to people.
This is a follow-up of a discussion from #111992.
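For readers unfamiliar with the mechanism, a minimal sketch of a split-file
test (hypothetical file and function names, not the test being rewritten):

  // RUN: split-file %s %t
  // RUN: %clang_cc1 -fsyntax-only -verify -I%t %t/use.cc

  //--- a.h
  inline int twice(int x) { return 2 * x; }

  //--- use.cc
  // expected-no-diagnostics
  #include "a.h"
  int y = twice(21);

Each "//--- name" section is written out as %t/name, so module maps,
headers, and the translation unit live in ordinary files rather than in
pragma-delimited module blocks.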
---
 clang/test/Modules/friend-definition-2.cpp | 59 +++++++++++++++-------
 1 file changed, 40 insertions(+), 19 deletions(-)

diff --git a/clang/test/Modules/friend-definition-2.cpp b/clang/test/Modules/friend-definition-2.cpp
index 41c2141f4013..d91ce14722b4 100644
--- a/clang/test/Modules/friend-definition-2.cpp
+++ b/clang/test/Modules/friend-definition-2.cpp
@@ -1,32 +1,53 @@
-// RUN: %clang_cc1 -std=c++14 -fmodules %s -verify
-// RUN: %clang_cc1 -std=c++14 -fmodules %s -verify -triple i686-windows
-// expected-no-diagnostics
-#pragma clang module build A
-module A {}
-#pragma clang module contents
-#pragma clang module begin A
+// RUN: split-file %s %t
+
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=A -emit-module %t/a.modulemap -o %t/a.pcm
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=B -emit-module %t/b.modulemap -o %t/b.pcm
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-map-file=%t/a.modulemap -fmodule-map-file=%t/b.modulemap \
+// RUN:   -fmodule-file=%t/a.pcm -fmodule-file=%t/b.pcm \
+// RUN:   %t/use.cc -verify
+
+// RUN: rm -f %t/*.pcm
+
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=A -emit-module %t/a.modulemap -o %t/a.pcm -triple i686-windows
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-name=B -emit-module %t/b.modulemap -o %t/b.pcm -triple i686-windows
+// RUN: %clang_cc1 -std=c++14 -x c++ -fmodules -fmodule-map-file=%t/a.modulemap -fmodule-map-file=%t/b.modulemap \
+// RUN:   -fmodule-file=%t/a.pcm -fmodule-file=%t/b.pcm \
+// RUN:   %t/use.cc -verify -triple i686-windows
+
+//--- a.modulemap
+module A {
+  header "a.h"
+}
+
+//--- a.h
+#ifndef A_H
+#define A_H
+template<typename T> struct ct { friend auto operator-(ct, ct) { struct X {}; return X(); } void x(); };
+#endif
+
+//--- b.modulemap
+module B {
+  header "b.h"
+}
+
+//--- b.h
+#ifndef B_H
+#define B_H
 template<typename T> struct ct { friend auto operator-(ct, ct) { struct X {}; return X(); } void x(); };
-#pragma clang module end
-#pragma clang module endbuild
-
-#pragma clang module build B
-module B {}
-#pragma clang module contents
-#pragma clang module begin B
-template<typename T> struct ct { friend auto operator-(ct, ct) { struct X{}; return X(); } void x(); };
 inline auto f() { return ct() - ct(); }
-#pragma clang module end
-#pragma clang module endbuild
+#endif
 
+//--- use.cc
 // expected-no-diagnostics
 // Force the definition of ct in module A to be the primary definition.
-#pragma clang module import A
+#include "a.h"
template<typename T> void ct<T>::x() {}
 
 // Attempt to cause the definition of operator- in the ct primary template in
 // module B to be the primary definition of that function. If that happens,
 // we'll be left with a class template ct that appears to not contain a
 // definition of the inline friend function.
-#pragma clang module import B
+#include "b.h"
 auto v = f();
 
 ct make();
-- 
GitLab


From 5059059c7b752904c7e81078395adcdb8cd1d63b Mon Sep 17 00:00:00 2001
From: Jonas Paulsson
Date: Wed, 16 Oct 2024 10:39:28 +0200
Subject: [PATCH 091/329] [SystemZ] Add missing newline character in
 verifyNarrowIntegerArgs_Call(). (#112499)

---
 llvm/lib/Target/SystemZ/SystemZISelLowering.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
index c7626434efac..83417e570dab 100644
--- a/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.cpp
@@ -9863,7 +9863,7 @@ verifyNarrowIntegerArgs_Call(const SmallVectorImpl<ISD::OutputArg> &Outs,
     if (CalleeFn != nullptr)
       printFunctionArgExts(CalleeFn, errs());
    else
-      errs() << "-";
+      errs() << "-\n";
    errs() << "Caller: ";
    printFunctionArgExts(F, errs());
    llvm_unreachable("");
-- 
GitLab


From 949177dabc86c99667cb490119e028ce0e7dc628 Mon Sep 17 00:00:00 2001
From: Daniel Grumberg
Date: Wed, 16 Oct 2024 09:45:43 +0100
Subject: [PATCH 092/329] [clang][ExtractAPI] Fix up casting from
 CXXClassRecord (#110983)

`RecordRecord::classofKind` and `TagRecord::classofKind` didn't correctly
capture `RK_CXXClass` and derived variants, e.g. `RK_ClassTemplate`. This
manifested as anonymous C++ tag types not being correctly detected when
they needed to be merged with another record.
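The mechanics, for context (a simplified sketch of LLVM's casting
convention, not the actual header):

  // isa<RecordRecord>(R) bottoms out in RecordRecord::classofKind(R->getKind()).
  // Matching only RK_Struct and RK_Union meant RK_CXXClass (and the class
  // template kinds) answered false, so an anonymous C++ class was never
  // recognized as a RecordRecord by the merging logic.

The switch statements added below enumerate the C++ class kinds explicitly
in both TagRecord::classofKind and RecordRecord::classofKind.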
---
 clang/include/clang/ExtractAPI/API.h             | 37 ++++++++++++++++-
 .../ExtractAPI/anonymous_record_no_typedef.c     | 44 ++++++++++++-------
 .../ExtractAPI/typedef_anonymous_record.c        | 27 +++++++-----
 3 files changed, 82 insertions(+), 26 deletions(-)

diff --git a/clang/include/clang/ExtractAPI/API.h b/clang/include/clang/ExtractAPI/API.h
index 4f34fcc575e8..c30e6fac66d6 100644
--- a/clang/include/clang/ExtractAPI/API.h
+++ b/clang/include/clang/ExtractAPI/API.h
@@ -26,6 +26,7 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/Support/Allocator.h"
 #include "llvm/Support/Casting.h"
+#include "llvm/Support/Compiler.h"
 #include "llvm/TargetParser/Triple.h"
 #include <map>
 #include <memory>
@@ -615,7 +616,24 @@ struct TagRecord : APIRecord, RecordContext {
     return classofKind(Record->getKind());
   }
   static bool classofKind(RecordKind K) {
-    return K == RK_Struct || K == RK_Union || K == RK_Enum;
+    switch (K) {
+    case RK_Enum:
+      LLVM_FALLTHROUGH;
+    case RK_Struct:
+      LLVM_FALLTHROUGH;
+    case RK_Union:
+      LLVM_FALLTHROUGH;
+    case RK_CXXClass:
+      LLVM_FALLTHROUGH;
+    case RK_ClassTemplate:
+      LLVM_FALLTHROUGH;
+    case RK_ClassTemplateSpecialization:
+      LLVM_FALLTHROUGH;
+    case RK_ClassTemplatePartialSpecialization:
+      return true;
+    default:
+      return false;
+    }
   }
 
   bool IsEmbeddedInVarDeclarator;
@@ -684,7 +702,22 @@ struct RecordRecord : TagRecord {
     return classofKind(Record->getKind());
   }
   static bool classofKind(RecordKind K) {
-    return K == RK_Struct || K == RK_Union;
+    switch (K) {
+    case RK_Struct:
+      LLVM_FALLTHROUGH;
+    case RK_Union:
+      LLVM_FALLTHROUGH;
+    case RK_CXXClass:
+      LLVM_FALLTHROUGH;
+    case RK_ClassTemplate:
+      LLVM_FALLTHROUGH;
+    case RK_ClassTemplateSpecialization:
+      LLVM_FALLTHROUGH;
+    case RK_ClassTemplatePartialSpecialization:
+      return true;
+    default:
+      return false;
+    }
   }
 
   bool isAnonymousWithNoTypedef() { return Name.empty(); }
diff --git a/clang/test/ExtractAPI/anonymous_record_no_typedef.c b/clang/test/ExtractAPI/anonymous_record_no_typedef.c
index 064c223ad56e..c0c76ef1f06b 100644
--- a/clang/test/ExtractAPI/anonymous_record_no_typedef.c
+++ b/clang/test/ExtractAPI/anonymous_record_no_typedef.c
@@ -1,11 +1,18 @@
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
 // RUN:   -triple arm64-apple-macosx -isystem %S -fretain-comments-from-system-headers \
-// RUN:   -x c-header %s -o %t/output.symbols.json -verify
+// RUN:   -x c-header %s -o %t/output-c.symbols.json -verify
+//
+// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
+// RUN:   -triple arm64-apple-macosx -isystem %S -fretain-comments-from-system-headers \
+// RUN:   -x c++-header %s -o %t/output-cxx.symbols.json -verify
 
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBAL
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix PREFIX
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix CONTENT
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix GLOBAL
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix PREFIX
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix CONTENT
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix GLOBAL
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix PREFIX
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix CONTENT
 
 /// A global variable with an anonymous struct type.
 struct { char *prefix; char *content; } global;
 // GLOBAL-LABEL: "!testLabel": "c:@global"
@@ -30,7 +37,7 @@ struct { char *prefix; char *content; } global;
 // GLOBAL:          "text": "A global variable with an anonymous struct type."
 // GLOBAL:        "kind": {
 // GLOBAL-NEXT:     "displayName": "Global Variable",
-// GLOBAL-NEXT:     "identifier": "c.var"
+// GLOBAL-NEXT:     "identifier": "c{{(\+\+)?}}.var"
 // GLOBAL:        "title": "global"
 // GLOBAL:        "pathComponents": [
 // GLOBAL-NEXT:     "global"
 // GLOBAL-NEXT:   ]
 
 /// A Vehicle
 struct Vehicle {
-  // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix TYPE
-  // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix BICYCLE
-  // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix CAR
+  // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix TYPE
+  // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix BICYCLE
+  // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix CAR
+  // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix TYPE
+  // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix BICYCLE
+  // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix CAR
   /// The type of vehicle.
   enum {
     Bicycle,
@@ -96,9 +106,12 @@ struct Vehicle {
 // CAR-NEXT:     "Car"
 // CAR-NEXT:   ]
 
-  // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix INFORMATION
-  // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix WHEELS
-  // RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix NAME
+  // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix INFORMATION
+  // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix WHEELS
+  // RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix NAME
+  // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix INFORMATION
+  // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix WHEELS
+  // RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix NAME
  /// The information about the vehicle.
  union {
    int wheels;
@@ -145,8 +158,10 @@ struct Vehicle {
 // NAME-NEXT:   ]
 };
 
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALCASE
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix GLOBALOTHERCASE
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix GLOBALCASE
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix GLOBALOTHERCASE
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix GLOBALCASE
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix GLOBALOTHERCASE
 enum {
   GlobalCase,
  GlobalOtherCase
@@ -163,7 +178,8 @@ enum {
 // GLOBALOTHERCASE-NEXT:    "GlobalOtherCase"
 // GLOBALOTHERCASE-NEXT: ]
 
-// RUN: FileCheck %s --input-file %t/output.symbols.json --check-prefix VEC
+// RUN: FileCheck %s --input-file %t/output-c.symbols.json --check-prefix VEC
+// RUN: FileCheck %s --input-file %t/output-cxx.symbols.json --check-prefix VEC
 union Vector {
  struct {
    float X;
diff --git a/clang/test/ExtractAPI/typedef_anonymous_record.c b/clang/test/ExtractAPI/typedef_anonymous_record.c
index 8e298f8d9ce8..c100e30803e4 100644
--- a/clang/test/ExtractAPI/typedef_anonymous_record.c
+++ b/clang/test/ExtractAPI/typedef_anonymous_record.c
@@ -1,8 +1,11 @@
 // RUN: rm -rf %t
 // RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
-// RUN:   --product-name=TypedefChain -triple arm64-apple-macosx -x c-header %s -o %t/typedefchain.symbols.json -verify
+// RUN:   --product-name=TypedefChain -triple arm64-apple-macosx -x c-header %s -o %t/typedefchain-c.symbols.json -verify
+// RUN: %clang_cc1 -extract-api --pretty-sgf --emit-sgf-symbol-labels-for-testing \
+// RUN:   --product-name=TypedefChain -triple arm64-apple-macosx -x c++-header %s -o %t/typedefchain-cxx.symbols.json -verify
 
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYSTRUCT
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYSTRUCT
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYSTRUCT
 typedef struct { } MyStruct;
 // MYSTRUCT-LABEL: "!testLabel": "c:@SA@MyStruct"
 // MYSTRUCT:      "accessLevel": "public",
@@ -34,7 +37,7 @@ typedef struct { } MyStruct;
 // MYSTRUCT-NEXT: ]
 // MYSTRUCT:      "kind": {
 // MYSTRUCT-NEXT:   "displayName": "Structure",
-// MYSTRUCT-NEXT:   "identifier": "c.struct"
+// MYSTRUCT-NEXT:   "identifier": "c{{(\+\+)?}}.struct"
 // MYSTRUCT:      "names": {
 // MYSTRUCT-NEXT:   "navigator": [
 // MYSTRUCT-NEXT:     {
@@ -54,7 +57,8 @@ typedef struct { } MyStruct;
 // MYSTRUCT-NEXT:   "MyStruct"
 // MYSTRUCT-NEXT: ]
 
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYSTRUCTSTRUCT
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYSTRUCTSTRUCT
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYSTRUCTSTRUCT
 typedef MyStruct MyStructStruct;
 // MYSTRUCTSTRUCT-LABEL: "!testLabel": "c:typedef_anonymous_record.c@T@MyStructStruct"
 // MYSTRUCTSTRUCT:      "accessLevel": "public",
@@ -87,10 +91,12 @@ typedef MyStruct MyStructStruct;
 // MYSTRUCTSTRUCT-NEXT: ],
 // MYSTRUCTSTRUCT:      "kind": {
 // MYSTRUCTSTRUCT-NEXT:   "displayName": "Type Alias",
-// MYSTRUCTSTRUCT-NEXT:   "identifier": "c.typealias"
+// MYSTRUCTSTRUCT-NEXT:   "identifier": "c{{(\+\+)?}}.typealias"
 
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYENUM
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix CASE
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYENUM
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix CASE
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYENUM
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix CASE
 typedef enum { Case } MyEnum;
 // MYENUM:      "source": "c:@EA@MyEnum@Case",
 // MYENUM-NEXT: "target": "c:@EA@MyEnum",
@@ -124,7 +130,7 @@ typedef enum { Case } MyEnum;
 // MYENUM-NEXT: ],
 // MYENUM:      "kind": {
 // MYENUM-NEXT:   "displayName": "Enumeration",
-// MYENUM-NEXT:   "identifier": "c.enum"
+// MYENUM-NEXT:   "identifier": "c{{(\+\+)?}}.enum"
 // MYENUM:      "names": {
 // MYENUM-NEXT:   "navigator": [
 // MYENUM-NEXT:     {
@@ -147,7 +153,8 @@ typedef enum { Case } MyEnum;
 // CASE-NEXT:    "Case"
 // CASE-NEXT:  ]
 
-// RUN: FileCheck %s --input-file %t/typedefchain.symbols.json --check-prefix MYENUMENUM
+// RUN: FileCheck %s --input-file %t/typedefchain-c.symbols.json --check-prefix MYENUMENUM
+// RUN: FileCheck %s --input-file %t/typedefchain-cxx.symbols.json --check-prefix MYENUMENUM
 typedef MyEnum MyEnumEnum;
 // MYENUMENUM-LABEL: "!testLabel": "c:typedef_anonymous_record.c@T@MyEnumEnum"
 // MYENUMENUM:      "declarationFragments": [
@@ -179,7 +186,7 @@ typedef MyEnum MyEnumEnum;
 // MYENUMENUM-NEXT: ],
 // MYENUMENUM:      "kind": {
 // MYENUMENUM-NEXT:   "displayName": "Type Alias",
-// MYENUMENUM-NEXT:   "identifier": "c.typealias"
+// MYENUMENUM-NEXT:   "identifier": "c{{(\+\+)?}}.typealias"
 // MYENUMENUM-NEXT: },
 // MYENUMENUM:        "title": "MyEnumEnum"
-- 
GitLab


From 154929169ab460b6b135103208e7fecd3cfa58f0 Mon Sep 17 00:00:00 2001
From: Timm Baeder
Date: Wed, 16 Oct 2024 10:47:12 +0200
Subject: [PATCH 093/329] [clang] Implement constexpr __builtin_bit_cast for
 complex types (#109981)

Fixes https://github.com/llvm/llvm-project/issues/94620
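A worked example of the layout this relies on (mirroring the new
test_complex cases below): a _Complex value is laid out as the real
component followed by the imaginary component, each of the element width.
For

  constexpr _Complex unsigned test_int_complex = { 0x0C05FEFE, 0xCAFEBABE };

the eight bytes hold 0x0C05FEFE (real) then 0xCAFEBABE (imaginary), so a
little-endian read of those bytes as an unsigned long long yields
0xCAFEBABE0C05FEFE (imaginary part in the high half), while big-endian
gives 0x0C05FEFECAFEBABE; that is exactly what the LITTLE_END conditional
in the test checks.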
---
 clang/lib/AST/ExprConstant.cpp                  | 43 +++++++++++++++++++
 .../SemaCXX/constexpr-builtin-bit-cast.cpp      | 16 +++++++
 2 files changed, 59 insertions(+)

diff --git a/clang/lib/AST/ExprConstant.cpp b/clang/lib/AST/ExprConstant.cpp
index 52a7f5778ce6..8544052d5e49 100644
--- a/clang/lib/AST/ExprConstant.cpp
+++ b/clang/lib/AST/ExprConstant.cpp
@@ -7237,6 +7237,7 @@ class APValueToBufferConverter {
 
     case APValue::ComplexInt:
     case APValue::ComplexFloat:
+      return visitComplex(Val, Ty, Offset);
 
    case APValue::FixedPoint:
      // FIXME: We should support these.
@@ -7323,6 +7324,31 @@ class APValueToBufferConverter {
     return true;
   }
 
+  bool visitComplex(const APValue &Val, QualType Ty, CharUnits Offset) {
+    const ComplexType *ComplexTy = Ty->castAs<ComplexType>();
+    QualType EltTy = ComplexTy->getElementType();
+    CharUnits EltSizeChars = Info.Ctx.getTypeSizeInChars(EltTy);
+    bool IsInt = Val.isComplexInt();
+
+    if (IsInt) {
+      if (!visitInt(Val.getComplexIntReal(), EltTy,
+                    Offset + (0 * EltSizeChars)))
+        return false;
+      if (!visitInt(Val.getComplexIntImag(), EltTy,
+                    Offset + (1 * EltSizeChars)))
+        return false;
+    } else {
+      if (!visitFloat(Val.getComplexFloatReal(), EltTy,
+                      Offset + (0 * EltSizeChars)))
+        return false;
+      if (!visitFloat(Val.getComplexFloatImag(), EltTy,
+                      Offset + (1 * EltSizeChars)))
+        return false;
+    }
+
+    return true;
+  }
+
   bool visitVector(const APValue &Val, QualType Ty, CharUnits Offset) {
     const VectorType *VTy = Ty->castAs<VectorType>();
     QualType EltTy = VTy->getElementType();
@@ -7595,6 +7621,23 @@ class BufferToAPValueConverter {
     return ArrayValue;
   }
 
+  std::optional<APValue> visit(const ComplexType *Ty, CharUnits Offset) {
+    QualType ElementType = Ty->getElementType();
+    CharUnits ElementWidth = Info.Ctx.getTypeSizeInChars(ElementType);
+    bool IsInt = ElementType->isIntegerType();
+
+    std::optional<APValue> Values[2];
+    for (unsigned I = 0; I != 2; ++I) {
+      Values[I] = visitType(Ty->getElementType(), Offset + I * ElementWidth);
+      if (!Values[I])
+        return std::nullopt;
+    }
+
+    if (IsInt)
+      return APValue(Values[0]->getInt(), Values[1]->getInt());
+    return APValue(Values[0]->getFloat(), Values[1]->getFloat());
+  }
+
   std::optional<APValue> visit(const VectorType *VTy, CharUnits Offset) {
     QualType EltTy = VTy->getElementType();
     unsigned NElts = VTy->getNumElements();
diff --git a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
index 7520b43a194a..5ddb77b35ff1 100644
--- a/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
+++ b/clang/test/SemaCXX/constexpr-builtin-bit-cast.cpp
@@ -511,3 +511,19 @@ constexpr bool9 bad_short_to_bool9 = __builtin_bit_cast(bool9, static_cast
+
+namespace test_complex {
+  constexpr _Complex unsigned test_int_complex = { 0x0C05FEFE, 0xCAFEBABE };
+  static_assert(round_trip<_Complex unsigned>(0xCAFEBABE0C05FEFEULL), "");
+  static_assert(bit_cast<unsigned long long>(test_int_complex) == (LITTLE_END
+                                                                       ? 0xCAFEBABE0C05FEFE
+                                                                       : 0x0C05FEFECAFEBABE), "");
+  static_assert(sizeof(double) == 2 * sizeof(float));
+  struct TwoFloats { float A; float B; };
+  constexpr _Complex float test_float_complex = {1.0f, 2.0f};
+  constexpr TwoFloats TF = __builtin_bit_cast(TwoFloats, test_float_complex);
+  static_assert(TF.A == 1.0f && TF.B == 2.0f);
+
+  constexpr double D = __builtin_bit_cast(double, test_float_complex);
+  constexpr int M = __builtin_bit_cast(int, test_int_complex); // expected-error {{__builtin_bit_cast source size does not equal destination size}}
+}
-- 
GitLab


From 21223020ed1bb431d9812e11e0c45dcba5d31ff4 Mon Sep 17 00:00:00 2001
From: Amr Hesham
Date: Wed, 16 Oct 2024 10:54:08 +0200
Subject: [PATCH 094/329] [LLVM][AArch64][NFC] Remove redundant copy parameter
 in method (#110300)

Remove redundant copy parameter in method

Fixes #94233
---
 llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
index 72f110cebbdc..85b9733e95c5 100644
--- a/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
+++ b/llvm/lib/Target/AArch64/AArch64MachineFunctionInfo.h
@@ -303,7 +303,7 @@ public:
   void setLocalStackSize(uint64_t Size) { LocalStackSize = Size; }
   uint64_t getLocalStackSize() const { return LocalStackSize; }
 
-  void setOutliningStyle(std::string Style) { OutliningStyle = Style; }
+  void setOutliningStyle(const std::string &Style) { OutliningStyle = Style; }
   std::optional<std::string> getOutliningStyle() const {
     return OutliningStyle;
   }
-- 
GitLab


From 4ba1800be6c9294e21e2b87b64600daac12730c1 Mon Sep 17 00:00:00 2001
From: Amr Hesham
Date: Wed, 16 Oct 2024 10:55:01 +0200
Subject: [PATCH 095/329] [LLVM][NFC] Reduce copying of parameter in lambda
 (#110299)

Reduce redundant copy parameter in lambda

Fixes #95642
---
 llvm/lib/Analysis/VectorUtils.cpp | 11 +++++------
 1 file changed, 5 insertions(+), 6 deletions(-)

diff --git a/llvm/lib/Analysis/VectorUtils.cpp b/llvm/lib/Analysis/VectorUtils.cpp
index dbffbb8a5f81..6b5251e0ad34 100644
--- a/llvm/lib/Analysis/VectorUtils.cpp
+++ b/llvm/lib/Analysis/VectorUtils.cpp
@@ -1414,7 +1414,7 @@ void InterleavedAccessInfo::analyzeInterleaving(
 
   auto InvalidateGroupIfMemberMayWrap = [&](InterleaveGroup<Instruction> *Group,
                                             int Index,
-                                            std::string FirstOrLast) -> bool {
+                                            const char *FirstOrLast) -> bool {
     Instruction *Member = Group->getMember(Index);
     assert(Member && "Group member does not exist");
    Value *MemberPtr = getLoadStorePointerOperand(Member);
@@ -1455,11 +1455,10 @@ void InterleavedAccessInfo::analyzeInterleaving(
     // So we check only group member 0 (which is always guaranteed to exist),
     // and group member Factor - 1; If the latter doesn't exist we rely on
     // peeling (if it is a non-reversed access -- see Case 3).
-    if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first")))
+    if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))
       continue;
     if (Group->getMember(Group->getFactor() - 1))
-      InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1,
-                                     std::string("last"));
+      InvalidateGroupIfMemberMayWrap(Group, Group->getFactor() - 1, "last");
     else {
       // Case 3: A non-reversed interleaved load group with gaps: We need
       // to execute at least one scalar epilogue iteration. This will ensure
@@ -1503,11 +1502,11 @@ void InterleavedAccessInfo::analyzeInterleaving(
     // and the last group member. Case 3 (scalar epilog) is not relevant for
    // stores with gaps, which are implemented with masked-store (rather than
    // speculative access, as in loads).
-    if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first")))
+    if (InvalidateGroupIfMemberMayWrap(Group, 0, "first"))
      continue;
    for (int Index = Group->getFactor() - 1; Index > 0; Index--)
      if (Group->getMember(Index)) {
-        InvalidateGroupIfMemberMayWrap(Group, Index, std::string("last"));
+        InvalidateGroupIfMemberMayWrap(Group, Index, "last");
        break;
      }
  }
-- 
GitLab


From 4c28d21f6af70ffee33660de35b263283dc32139 Mon Sep 17 00:00:00 2001
From: Benjamin Maxwell
Date: Wed, 16 Oct 2024 10:00:49 +0100
Subject: [PATCH 096/329] [AArch64] Avoid single-element vector fp converts in
 streaming[-compatible] functions (#112213)

The single-element vector variants of FCVTZS, FCVTZU, UCVTF, and SCVTF
are only supported in streaming[-compatible] functions with `+sme2p2`.

Reference:
- https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-convert-to-signed-integer--rounding-toward-zero--vector--
- https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-convert-to-floating-point--vector--
- https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-convert-to-floating-point--vector--

Codegen will be improved in follow up patches.
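The difference is visible in the first case of the new test below: for a
double -> i64 -> double round trip, non-streaming code keeps the value in a
SIMD register,

  fcvtzs d0, d0
  scvtf d0, d0

while the streaming-compatible lowering must bounce through a GPR until
the single-element forms can be used (with +sme2p2):

  fcvtzs x8, d0
  scvtf d0, x8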
Case 3 (scalar epilog) is not relevant for // stores with gaps, which are implemented with masked-store (rather than // speculative access, as in loads). - if (InvalidateGroupIfMemberMayWrap(Group, 0, std::string("first"))) + if (InvalidateGroupIfMemberMayWrap(Group, 0, "first")) continue; for (int Index = Group->getFactor() - 1; Index > 0; Index--) if (Group->getMember(Index)) { - InvalidateGroupIfMemberMayWrap(Group, Index, std::string("last")); + InvalidateGroupIfMemberMayWrap(Group, Index, "last"); break; } } -- GitLab From 4c28d21f6af70ffee33660de35b263283dc32139 Mon Sep 17 00:00:00 2001 From: Benjamin Maxwell Date: Wed, 16 Oct 2024 10:00:49 +0100 Subject: [PATCH 096/329] [AArch64] Avoid single-element vector fp converts in streaming[-compatible] functions (#112213) The single-element vector variants of FCVTZS, FCVTZU, UCVTF, and SCVTF are only supported in streaming[-compatible] functions with `+sme2p2`. Reference: - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/FCVTZS--vector--integer---Floating-point-convert-to-signed-integer--rounding-toward-zero--vector-- - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/UCVTF--vector--integer---Unsigned-integer-convert-to-floating-point--vector-- - https://developer.arm.com/documentation/ddi0602/2024-09/SIMD-FP-Instructions/SCVTF--vector--integer---Signed-integer-convert-to-floating-point--vector-- Codegen will be improved in follow up patches. --- llvm/lib/Target/AArch64/AArch64InstrInfo.td | 15 +- .../sve-streaming-mode-cvt-fp-int-fp.ll | 121 +++++ ...e-streaming-mode-fixed-length-int-to-fp.ll | 426 +++++++++--------- 3 files changed, 333 insertions(+), 229 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 325508b62a9f..32f2c7c71d17 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -6237,7 +6237,8 @@ def : Pat<(v2f64 (AArch64frsqrts (v2f64 FPR128:$Rn), (v2f64 FPR128:$Rm))), // Some float -> int -> float conversion patterns for which we want to keep the // int values in FP registers using the corresponding NEON instructions to // avoid more costly int <-> fp register transfers. -let Predicates = [HasNEONandIsStreamingSafe] in { +// TODO: Allow these in streaming[-compatible] functions with +sme2p2. +let Predicates = [HasNEON] in { def : Pat<(f64 (any_sint_to_fp (i64 (any_fp_to_sint f64:$Rn)))), (SCVTFv1i64 (i64 (FCVTZSv1i64 f64:$Rn)))>; def : Pat<(f32 (any_sint_to_fp (i32 (any_fp_to_sint f32:$Rn)))), @@ -6247,7 +6248,8 @@ def : Pat<(f64 (any_uint_to_fp (i64 (any_fp_to_uint f64:$Rn)))), def : Pat<(f32 (any_uint_to_fp (i32 (any_fp_to_uint f32:$Rn)))), (UCVTFv1i32 (i32 (FCVTZUv1i32 f32:$Rn)))>; -let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in { +// TODO: Allow these in streaming[-compatible] functions with +sme2p2. +let Predicates = [HasNEON, HasFullFP16] in { def : Pat<(f16 (any_sint_to_fp (i32 (any_fp_to_sint f16:$Rn)))), (SCVTFv1i16 (f16 (FCVTZSv1f16 f16:$Rn)))>; def : Pat<(f16 (any_uint_to_fp (i32 (any_fp_to_uint f16:$Rn)))), @@ -6270,9 +6272,10 @@ def : Pat<(f64 (uint_to_fp (i64 (vector_extract (v2i64 FPR128:$Rn), (i64 0))))), // fp16: integer extraction from vector must be at least 32-bits to be legal. // Actual extraction result is then an in-reg sign-extension of lower 16-bits. 
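The source-level shape these TableGen patterns match is a float -> int -> float round-trip. A minimal illustration (not taken from the patch); per the new tests below, ordinary AArch64 keeps the whole sequence in FP/SIMD registers (fcvtzs d0, d0 / scvtf d0, d0), while a streaming[-compatible] function without +sme2p2 must bounce the value through a GPR (fcvtzs x8, d0 / scvtf d0, x8):

// Truncate a double to its integral value by way of i64; the two casts are
// exactly the any_fp_to_sint / any_sint_to_fp pair the patterns above fold.
double truncate_to_integer(double x) {
  return static_cast<double>(static_cast<long long>(x));
}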
-let Predicates = [HasNEONandIsStreamingSafe, HasFullFP16] in { -def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract - (v8i16 FPR128:$Rn), (i64 0))), i16)))), +// TODO: Allow these in streaming[-compatible] functions with +sme2p2. +let Predicates = [HasNEON, HasFullFP16] in { +def : Pat<(f16 (sint_to_fp (i32 (sext_inreg (i32 (vector_extract + (v8i16 FPR128:$Rn), (i64 0))), i16)))), (SCVTFv1i16 (f16 (EXTRACT_SUBREG (v8i16 FPR128:$Rn), hsub)))>; // unsigned 32-bit extracted element is truncated to 16-bits using AND @@ -6367,7 +6370,7 @@ def : Pat <(f64 (uint_to_fp (i32 (LDURSi GPR64sp:$Rn, simm9:$offset), ssub))>; // 64-bits -> double are handled in target specific dag combine: // performIntToFpCombine. -} // let Predicates = [HasNEONandIsStreamingSafe] +} // let Predicates = [HasNEON] //===----------------------------------------------------------------------===// // Advanced SIMD three different-sized vector instructions. diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll new file mode 100644 index 000000000000..9aadf3133ba1 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-cvt-fp-int-fp.ll @@ -0,0 +1,121 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc -force-streaming-compatible < %s | FileCheck %s +; RUN: llc < %s | FileCheck %s --check-prefix=NON-STREAMING + +target triple = "aarch64-unknown-linux-gnu" + +define double @t1(double %x) { +; CHECK-LABEL: t1: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs x8, d0 +; CHECK-NEXT: scvtf d0, x8 +; CHECK-NEXT: ret +; +; NON-STREAMING-LABEL: t1: +; NON-STREAMING: // %bb.0: // %entry +; NON-STREAMING-NEXT: fcvtzs d0, d0 +; NON-STREAMING-NEXT: scvtf d0, d0 +; NON-STREAMING-NEXT: ret +entry: + %conv = fptosi double %x to i64 + %conv1 = sitofp i64 %conv to double + ret double %conv1 +} + +define float @t2(float %x) { +; CHECK-LABEL: t2: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzs w8, s0 +; CHECK-NEXT: scvtf s0, w8 +; CHECK-NEXT: ret +; +; NON-STREAMING-LABEL: t2: +; NON-STREAMING: // %bb.0: // %entry +; NON-STREAMING-NEXT: fcvtzs s0, s0 +; NON-STREAMING-NEXT: scvtf s0, s0 +; NON-STREAMING-NEXT: ret +entry: + %conv = fptosi float %x to i32 + %conv1 = sitofp i32 %conv to float + ret float %conv1 +} + +define half @t3(half %x) { +; CHECK-LABEL: t3: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcvtzs w8, s0 +; CHECK-NEXT: scvtf s0, w8 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +; +; NON-STREAMING-LABEL: t3: +; NON-STREAMING: // %bb.0: // %entry +; NON-STREAMING-NEXT: fcvt s0, h0 +; NON-STREAMING-NEXT: fcvtzs s0, s0 +; NON-STREAMING-NEXT: scvtf s0, s0 +; NON-STREAMING-NEXT: fcvt h0, s0 +; NON-STREAMING-NEXT: ret +entry: + %conv = fptosi half %x to i32 + %conv1 = sitofp i32 %conv to half + ret half %conv1 +} + +define double @t4(double %x) { +; CHECK-LABEL: t4: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu x8, d0 +; CHECK-NEXT: ucvtf d0, x8 +; CHECK-NEXT: ret +; +; NON-STREAMING-LABEL: t4: +; NON-STREAMING: // %bb.0: // %entry +; NON-STREAMING-NEXT: fcvtzu d0, d0 +; NON-STREAMING-NEXT: ucvtf d0, d0 +; NON-STREAMING-NEXT: ret +entry: + %conv = fptoui double %x to i64 + %conv1 = uitofp i64 %conv to double + ret double %conv1 +} + +define float @t5(float %x) { +; CHECK-LABEL: t5: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvtzu w8, s0 +; CHECK-NEXT: ucvtf s0, w8 +; CHECK-NEXT: ret +; +; NON-STREAMING-LABEL: t5: +; NON-STREAMING: // %bb.0: // %entry 
+; NON-STREAMING-NEXT: fcvtzu s0, s0 +; NON-STREAMING-NEXT: ucvtf s0, s0 +; NON-STREAMING-NEXT: ret +entry: + %conv = fptoui float %x to i32 + %conv1 = uitofp i32 %conv to float + ret float %conv1 +} + +define half @t6(half %x) { +; CHECK-LABEL: t6: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: fcvt s0, h0 +; CHECK-NEXT: fcvtzu w8, s0 +; CHECK-NEXT: ucvtf s0, w8 +; CHECK-NEXT: fcvt h0, s0 +; CHECK-NEXT: ret +; +; NON-STREAMING-LABEL: t6: +; NON-STREAMING: // %bb.0: // %entry +; NON-STREAMING-NEXT: fcvt s0, h0 +; NON-STREAMING-NEXT: fcvtzu s0, s0 +; NON-STREAMING-NEXT: ucvtf s0, s0 +; NON-STREAMING-NEXT: fcvt h0, s0 +; NON-STREAMING-NEXT: ret +entry: + %conv = fptoui half %x to i32 + %conv1 = uitofp i32 %conv to half + ret half %conv1 +} diff --git a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll index afd3bb7161c1..0c712a15d4de 100644 --- a/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll +++ b/llvm/test/CodeGen/AArch64/sve-streaming-mode-fixed-length-int-to-fp.ll @@ -21,20 +21,20 @@ define <4 x half> @ucvtf_v4i16_v4f16(<4 x i16> %op1) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #14] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #12] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #10] -; NONEON-NOSVE-NEXT: ldr h0, [sp] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #8] ; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] @@ -58,36 +58,36 @@ define void @ucvtf_v8i16_v8f16(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: ldr q0, [x0] ; NONEON-NOSVE-NEXT: str q0, [sp, #-32]! 
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #30] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #28] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #26] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #24] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #22] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #20] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #18] -; NONEON-NOSVE-NEXT: ldr h0, [sp] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #16] ; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] @@ -115,68 +115,68 @@ define void @ucvtf_v16i16_v16f16(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: ldp q1, q0, [x0] ; NONEON-NOSVE-NEXT: stp q1, q0, [sp, #-64]! 
; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #62] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #60] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #58] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #56] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #54] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #52] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #50] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #48] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #46] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #44] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #42] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #6] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #40] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #6] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #38] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #2] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #36] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #2] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp] ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: str h0, [sp, #34] -; NONEON-NOSVE-NEXT: ldr h0, [sp] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: fcvt h0, s0 
; NONEON-NOSVE-NEXT: str h0, [sp, #32] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] @@ -207,11 +207,11 @@ define <2 x float> @ucvtf_v2i16_v2f32(<2 x i16> %op1) { ; NONEON-NOSVE: // %bb.0: ; NONEON-NOSVE-NEXT: str d0, [sp, #-16]! ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #4] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 -; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #8] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #4] +; NONEON-NOSVE-NEXT: ldrh w9, [sp] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ucvtf s1, w9 +; NONEON-NOSVE-NEXT: stp s1, s0, [sp, #8] ; NONEON-NOSVE-NEXT: ldr d0, [sp, #8] ; NONEON-NOSVE-NEXT: add sp, sp, #16 ; NONEON-NOSVE-NEXT: ret @@ -234,15 +234,15 @@ define <4 x float> @ucvtf_v4i16_v4f32(<4 x i16> %op1) { ; NONEON-NOSVE-NEXT: sub sp, sp, #32 ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: str d0, [sp, #8] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #14] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #14] +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #10] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #24] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #10] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #16] ; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: add sp, sp, #32 @@ -271,25 +271,25 @@ define void @ucvtf_v8i16_v8f32(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #30] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #28] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #30] +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #28] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #26] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #56] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #26] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #24] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #22] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #48] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #22] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #20] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #20] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #18] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #40] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #18] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #16] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #16] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #32] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] @@ -328,47 +328,47 @@ define void @ucvtf_v16i16_v16f32(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] ; NONEON-NOSVE-NEXT: ldp d1, d0, 
[sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #46] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #46] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #44] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #44] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #42] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #88] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #42] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #40] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #38] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #80] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #38] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #36] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #36] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #34] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #72] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #34] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #32] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #32] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #62] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #62] +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #60] ; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #60] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #58] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #120] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #58] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #56] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #56] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #54] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #112] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #54] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #52] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #52] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #50] ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #104] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #50] -; NONEON-NOSVE-NEXT: ucvtf s1, s0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #48] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ucvtf s1, w8 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #48] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: stp s0, s1, [sp, #96] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] ; NONEON-NOSVE-NEXT: stp q2, q3, [x1] @@ -399,8 +399,8 @@ define <1 x double> @ucvtf_v1i16_v1f64(<1 x i16> %op1) { ; NONEON-NOSVE-NEXT: sub sp, sp, #16 ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 16 ; NONEON-NOSVE-NEXT: str d0, [sp, #8] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: str d0, [sp] ; NONEON-NOSVE-NEXT: ldr d0, [sp], #16 ; NONEON-NOSVE-NEXT: ret @@ -424,11 +424,11 @@ define <2 x double> @ucvtf_v2i16_v2f64(<2 x i16> %op1) { ; NONEON-NOSVE-NEXT: sub sp, sp, #32 ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; 
NONEON-NOSVE-NEXT: str d0, [sp, #8] -; NONEON-NOSVE-NEXT: ldr h0, [sp, #12] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr h0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 -; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] +; NONEON-NOSVE-NEXT: ldrh w8, [sp, #12] +; NONEON-NOSVE-NEXT: ldrh w9, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: stp d1, d0, [sp, #16] ; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: add sp, sp, #32 ; NONEON-NOSVE-NEXT: ret @@ -464,15 +464,13 @@ define void @ucvtf_v4i16_v4f64(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #16] ; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #16] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #48] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] @@ -529,27 +527,23 @@ define void @ucvtf_v8i16_v8f64(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #32] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] ; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #32] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #88] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #92] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #88] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #80] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #144] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #84] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #80] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #72] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #128] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #76] +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #64] ; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #128] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #72] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #68] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #64] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] ; NONEON-NOSVE-NEXT: stp q2, q3, [x1] @@ -649,49 +643,42 @@ define void @ucvtf_v16i16_v16f64(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: stp w8, w9, [sp, #104] ; NONEON-NOSVE-NEXT: str d1, [sp, #328] ; NONEON-NOSVE-NEXT: ldp d0, d1, [sp, #104] -; NONEON-NOSVE-NEXT: str d0, [sp, #168] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #164] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #160] ; NONEON-NOSVE-NEXT: stp d1, d2, [sp, #176] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #160] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: str d0, [sp, #168] +; 
NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #152] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #240] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #156] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #152] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #144] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #224] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #148] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #144] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #136] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #208] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #140] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #136] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #332] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #192] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #332] +; NONEON-NOSVE-NEXT: ucvtf d1, w8 +; NONEON-NOSVE-NEXT: ldr w8, [sp, #328] ; NONEON-NOSVE-NEXT: ldp q4, q3, [sp, #192] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #328] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #184] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #304] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #188] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #184] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #176] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #288] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #180] +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #168] ; NONEON-NOSVE-NEXT: ldp q7, q6, [sp, #288] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #176] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #272] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #172] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #168] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #256] ; NONEON-NOSVE-NEXT: ldp q1, q0, [sp, #224] ; NONEON-NOSVE-NEXT: ldp q2, q5, [sp, #256] @@ -1041,10 +1028,9 @@ define <2 x double> @ucvtf_v2i32_v2f64(<2 x i32> %op1) { ; NONEON-NOSVE-NEXT: sub sp, sp, #32 ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 32 ; NONEON-NOSVE-NEXT: str d0, [sp, #8] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #12] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #8] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #8] +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] ; NONEON-NOSVE-NEXT: ldr q0, [sp, #16] ; NONEON-NOSVE-NEXT: add sp, sp, #32 @@ -1073,15 +1059,13 @@ define void @ucvtf_v4i32_v4f64(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: .cfi_def_cfa_offset 64 ; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #16] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #28] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #24] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #24] +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; 
NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #16] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #20] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #16] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #32] ; NONEON-NOSVE-NEXT: stp q1, q0, [x1] @@ -1120,27 +1104,23 @@ define void @ucvtf_v8i32_v8f64(ptr %a, ptr %b) { ; NONEON-NOSVE-NEXT: ldp d1, d0, [sp] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #32] ; NONEON-NOSVE-NEXT: ldp d1, d0, [sp, #16] +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #40] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #48] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #44] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #40] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #32] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #80] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #36] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #32] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #56] ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #64] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #60] +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 +; NONEON-NOSVE-NEXT: ldp w8, w9, [sp, #48] ; NONEON-NOSVE-NEXT: ldp q3, q2, [sp, #64] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #56] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #112] -; NONEON-NOSVE-NEXT: ldr s0, [sp, #52] -; NONEON-NOSVE-NEXT: ucvtf d1, d0 -; NONEON-NOSVE-NEXT: ldr s0, [sp, #48] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ucvtf d1, w9 +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: stp d0, d1, [sp, #96] ; NONEON-NOSVE-NEXT: ldp q0, q1, [sp, #96] ; NONEON-NOSVE-NEXT: stp q2, q3, [x1] @@ -2984,8 +2964,8 @@ define half @ucvtf_i16_f16(ptr %0) { ; ; NONEON-NOSVE-LABEL: ucvtf_i16_f16: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: fcvt h0, s0 ; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 @@ -2996,14 +2976,14 @@ define half @ucvtf_i16_f16(ptr %0) { define float @ucvtf_i16_f32(ptr %0) { ; CHECK-LABEL: ucvtf_i16_f32: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr h0, [x0] -; CHECK-NEXT: ucvtf s0, s0 +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ucvtf s0, w8 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_i16_f32: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: ucvtf s0, s0 +; NONEON-NOSVE-NEXT: ldrh w8, [x0] +; NONEON-NOSVE-NEXT: ucvtf s0, w8 ; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = uitofp i16 %2 to float @@ -3013,14 +2993,14 @@ define float @ucvtf_i16_f32(ptr %0) { define double @ucvtf_i16_f64(ptr %0) { ; CHECK-LABEL: ucvtf_i16_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr h0, [x0] -; CHECK-NEXT: ucvtf d0, d0 +; CHECK-NEXT: ldrh w8, [x0] +; CHECK-NEXT: ucvtf d0, w8 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_i16_f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr h0, [x0] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ldrh w8, [x0] +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: ret %2 = load i16, ptr %0, align 64 %3 = uitofp i16 %2 to double @@ -3065,14 +3045,14 @@ define 
float @ucvtf_i32_f32(ptr %0) { define double @ucvtf_i32_f64(ptr %0) { ; CHECK-LABEL: ucvtf_i32_f64: ; CHECK: // %bb.0: -; CHECK-NEXT: ldr s0, [x0] -; CHECK-NEXT: ucvtf d0, d0 +; CHECK-NEXT: ldr w8, [x0] +; CHECK-NEXT: ucvtf d0, w8 ; CHECK-NEXT: ret ; ; NONEON-NOSVE-LABEL: ucvtf_i32_f64: ; NONEON-NOSVE: // %bb.0: -; NONEON-NOSVE-NEXT: ldr s0, [x0] -; NONEON-NOSVE-NEXT: ucvtf d0, d0 +; NONEON-NOSVE-NEXT: ldr w8, [x0] +; NONEON-NOSVE-NEXT: ucvtf d0, w8 ; NONEON-NOSVE-NEXT: ret %2 = load i32, ptr %0, align 64 %3 = uitofp i32 %2 to double -- GitLab From 7c5d5c08181f399858d918d6910c021af4ec36c0 Mon Sep 17 00:00:00 2001 From: Karl-Johan Karlsson Date: Wed, 16 Oct 2024 11:01:33 +0200 Subject: [PATCH 097/329] [Sema] Fix warning in SemaOpenACC.cpp (#112481) Fix gcc warning: clang/lib/Sema/SemaOpenACC.cpp:2208:5: warning: this statement may fall through [-Wimplicit-fallthrough=] --- clang/lib/Sema/SemaOpenACC.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang/lib/Sema/SemaOpenACC.cpp b/clang/lib/Sema/SemaOpenACC.cpp index 22aedbc70df8..d33b0d0c1c30 100644 --- a/clang/lib/Sema/SemaOpenACC.cpp +++ b/clang/lib/Sema/SemaOpenACC.cpp @@ -2216,7 +2216,7 @@ ExprResult SemaOpenACC::CheckGangExpr(OpenACCGangKind GK, Expr *E) { case OpenACCGangKind::Static: return CheckGangStaticExpr(*this, E); } - } + } break; default: llvm_unreachable("Non compute construct in active compute construct?"); } -- GitLab From 0eed3055511381436ee69d1caf64a4af47f8d65c Mon Sep 17 00:00:00 2001 From: Mehdi Amini Date: Wed, 16 Oct 2024 11:09:17 +0200 Subject: [PATCH 098/329] Revert "[MLIR][TableGen] Use const pointers for various `Init` objects" (#112506) Reverts llvm/llvm-project#112316 Bots are failing. --- mlir/include/mlir/TableGen/AttrOrTypeDef.h | 2 +- mlir/include/mlir/TableGen/Dialect.h | 2 +- mlir/include/mlir/TableGen/Operator.h | 15 ++++---- mlir/lib/TableGen/AttrOrTypeDef.cpp | 12 +++--- mlir/lib/TableGen/Attribute.cpp | 2 +- mlir/lib/TableGen/Dialect.cpp | 2 +- mlir/lib/TableGen/Interfaces.cpp | 6 +-- mlir/lib/TableGen/Operator.cpp | 21 +++++----- mlir/lib/TableGen/Pattern.cpp | 2 +- mlir/lib/TableGen/Type.cpp | 2 +- mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp | 16 ++++---- mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp | 38 +++++++++---------- mlir/tools/mlir-tblgen/DialectGen.cpp | 9 ++--- mlir/tools/mlir-tblgen/OmpOpGen.cpp | 19 ++++------ 14 files changed, 68 insertions(+), 80 deletions(-) diff --git a/mlir/include/mlir/TableGen/AttrOrTypeDef.h b/mlir/include/mlir/TableGen/AttrOrTypeDef.h index c3d730e42ef7..36744c85bc70 100644 --- a/mlir/include/mlir/TableGen/AttrOrTypeDef.h +++ b/mlir/include/mlir/TableGen/AttrOrTypeDef.h @@ -105,7 +105,7 @@ public: std::optional getDefaultValue() const; /// Return the underlying def of this parameter. - const llvm::Init *getDef() const; + llvm::Init *getDef() const; /// The parameter is pointer-comparable. bool operator==(const AttrOrTypeParameter &other) const { diff --git a/mlir/include/mlir/TableGen/Dialect.h b/mlir/include/mlir/TableGen/Dialect.h index ea8f40555e44..3530d240c976 100644 --- a/mlir/include/mlir/TableGen/Dialect.h +++ b/mlir/include/mlir/TableGen/Dialect.h @@ -92,7 +92,7 @@ public: /// dialect. 
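The one-line SemaOpenACC fix in the previous patch addresses the classic shape behind GCC's -Wimplicit-fallthrough: an inner switch that returns on every enumerator it names, followed directly by the next outer case label. A reduced standalone sketch (illustrative only; the real code switches over OpenACC gang and construct kinds):

int classify(int outer, int inner) {
  switch (outer) {
  case 0:
    switch (inner) { // returns for the values it names...
    case 0:
      return 1;
    case 1:
      return 2;
    }
    break; // ...but GCC cannot prove it covers every value, so without this
           // break it diagnoses a possible fall-through into `default`.
  default:
    return -1;
  }
  return 0;
}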
bool usePropertiesForAttributes() const; - const llvm::DagInit *getDiscardableAttributes() const; + llvm::DagInit *getDiscardableAttributes() const; const llvm::Record *getDef() const { return def; } diff --git a/mlir/include/mlir/TableGen/Operator.h b/mlir/include/mlir/TableGen/Operator.h index 9e570373d9cd..768291a3a726 100644 --- a/mlir/include/mlir/TableGen/Operator.h +++ b/mlir/include/mlir/TableGen/Operator.h @@ -119,15 +119,14 @@ public: /// A utility iterator over a list of variable decorators. struct VariableDecoratorIterator - : public llvm::mapped_iterator { + : public llvm::mapped_iterator { /// Initializes the iterator to the specified iterator. - VariableDecoratorIterator(const llvm::Init *const *it) - : llvm::mapped_iterator( - it, &unwrap) {} - static VariableDecorator unwrap(const llvm::Init *init); + VariableDecoratorIterator(llvm::Init *const *it) + : llvm::mapped_iterator(it, + &unwrap) {} + static VariableDecorator unwrap(llvm::Init *init); }; using var_decorator_iterator = VariableDecoratorIterator; using var_decorator_range = llvm::iterator_range; diff --git a/mlir/lib/TableGen/AttrOrTypeDef.cpp b/mlir/lib/TableGen/AttrOrTypeDef.cpp index e72ca155bcf7..9b9d9fd2317d 100644 --- a/mlir/lib/TableGen/AttrOrTypeDef.cpp +++ b/mlir/lib/TableGen/AttrOrTypeDef.cpp @@ -40,7 +40,7 @@ AttrOrTypeDef::AttrOrTypeDef(const llvm::Record *def) : def(def) { auto *builderList = dyn_cast_or_null(def->getValueInit("builders")); if (builderList && !builderList->empty()) { - for (const llvm::Init *init : builderList->getValues()) { + for (llvm::Init *init : builderList->getValues()) { AttrOrTypeBuilder builder(cast(init)->getDef(), def->getLoc()); @@ -58,8 +58,8 @@ AttrOrTypeDef::AttrOrTypeDef(const llvm::Record *def) : def(def) { if (auto *traitList = def->getValueAsListInit("traits")) { SmallPtrSet traitSet; traits.reserve(traitSet.size()); - llvm::unique_function processTraitList = - [&](const llvm::ListInit *traitList) { + llvm::unique_function processTraitList = + [&](llvm::ListInit *traitList) { for (auto *traitInit : *traitList) { if (!traitSet.insert(traitInit).second) continue; @@ -335,9 +335,7 @@ std::optional AttrOrTypeParameter::getDefaultValue() const { return result && !result->empty() ? 
result : std::nullopt; } -const llvm::Init *AttrOrTypeParameter::getDef() const { - return def->getArg(index); -} +llvm::Init *AttrOrTypeParameter::getDef() const { return def->getArg(index); } std::optional AttrOrTypeParameter::getConstraint() const { if (auto *param = dyn_cast(getDef())) @@ -351,7 +349,7 @@ std::optional AttrOrTypeParameter::getConstraint() const { //===----------------------------------------------------------------------===// bool AttributeSelfTypeParameter::classof(const AttrOrTypeParameter *param) { - const llvm::Init *paramDef = param->getDef(); + llvm::Init *paramDef = param->getDef(); if (auto *paramDefInit = dyn_cast(paramDef)) return paramDefInit->getDef()->isSubClassOf("AttributeSelfTypeParameter"); return false; diff --git a/mlir/lib/TableGen/Attribute.cpp b/mlir/lib/TableGen/Attribute.cpp index 887553bca661..de930cb40070 100644 --- a/mlir/lib/TableGen/Attribute.cpp +++ b/mlir/lib/TableGen/Attribute.cpp @@ -126,7 +126,7 @@ StringRef Attribute::getDerivedCodeBody() const { Dialect Attribute::getDialect() const { const llvm::RecordVal *record = def->getValue("dialect"); if (record && record->getValue()) { - if (const DefInit *init = dyn_cast(record->getValue())) + if (DefInit *init = dyn_cast(record->getValue())) return Dialect(init->getDef()); } return Dialect(nullptr); diff --git a/mlir/lib/TableGen/Dialect.cpp b/mlir/lib/TableGen/Dialect.cpp index ef39818e439b..081f6e56f9de 100644 --- a/mlir/lib/TableGen/Dialect.cpp +++ b/mlir/lib/TableGen/Dialect.cpp @@ -106,7 +106,7 @@ bool Dialect::usePropertiesForAttributes() const { return def->getValueAsBit("usePropertiesForAttributes"); } -const llvm::DagInit *Dialect::getDiscardableAttributes() const { +llvm::DagInit *Dialect::getDiscardableAttributes() const { return def->getValueAsDag("discardableAttrs"); } diff --git a/mlir/lib/TableGen/Interfaces.cpp b/mlir/lib/TableGen/Interfaces.cpp index 4a6709a43d0a..a209b003b0f3 100644 --- a/mlir/lib/TableGen/Interfaces.cpp +++ b/mlir/lib/TableGen/Interfaces.cpp @@ -22,7 +22,7 @@ using namespace mlir::tblgen; //===----------------------------------------------------------------------===// InterfaceMethod::InterfaceMethod(const llvm::Record *def) : def(def) { - const llvm::DagInit *args = def->getValueAsDag("arguments"); + llvm::DagInit *args = def->getValueAsDag("arguments"); for (unsigned i = 0, e = args->getNumArgs(); i != e; ++i) { arguments.push_back( {llvm::cast(args->getArg(i))->getValue(), @@ -78,7 +78,7 @@ Interface::Interface(const llvm::Record *def) : def(def) { // Initialize the interface methods. auto *listInit = dyn_cast(def->getValueInit("methods")); - for (const llvm::Init *init : listInit->getValues()) + for (llvm::Init *init : listInit->getValues()) methods.emplace_back(cast(init)->getDef()); // Initialize the interface base classes. 
@@ -98,7 +98,7 @@ Interface::Interface(const llvm::Record *def) : def(def) { baseInterfaces.push_back(std::make_unique(baseInterface)); basesAdded.insert(baseInterface.getName()); }; - for (const llvm::Init *init : basesInit->getValues()) + for (llvm::Init *init : basesInit->getValues()) addBaseInterfaceFn(Interface(cast(init)->getDef())); } diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp index 86670e9f8712..6a33ff5ecd67 100644 --- a/mlir/lib/TableGen/Operator.cpp +++ b/mlir/lib/TableGen/Operator.cpp @@ -161,7 +161,7 @@ std::string Operator::getQualCppClassName() const { StringRef Operator::getCppNamespace() const { return cppNamespace; } int Operator::getNumResults() const { - const DagInit *results = def.getValueAsDag("results"); + DagInit *results = def.getValueAsDag("results"); return results->getNumArgs(); } @@ -198,12 +198,12 @@ auto Operator::getResults() const -> const_value_range { } TypeConstraint Operator::getResultTypeConstraint(int index) const { - const DagInit *results = def.getValueAsDag("results"); + DagInit *results = def.getValueAsDag("results"); return TypeConstraint(cast(results->getArg(index))); } StringRef Operator::getResultName(int index) const { - const DagInit *results = def.getValueAsDag("results"); + DagInit *results = def.getValueAsDag("results"); return results->getArgNameStr(index); } @@ -241,7 +241,7 @@ Operator::arg_range Operator::getArgs() const { } StringRef Operator::getArgName(int index) const { - const DagInit *argumentValues = def.getValueAsDag("arguments"); + DagInit *argumentValues = def.getValueAsDag("arguments"); return argumentValues->getArgNameStr(index); } @@ -557,7 +557,7 @@ void Operator::populateOpStructure() { auto *opVarClass = recordKeeper.getClass("OpVariable"); numNativeAttributes = 0; - const DagInit *argumentValues = def.getValueAsDag("arguments"); + DagInit *argumentValues = def.getValueAsDag("arguments"); unsigned numArgs = argumentValues->getNumArgs(); // Mapping from name of to argument or result index. 
Arguments are indexed @@ -721,8 +721,8 @@ void Operator::populateOpStructure() { " to precede it in traits list"); }; - std::function insert; - insert = [&](const llvm::ListInit *traitList) { + std::function insert; + insert = [&](llvm::ListInit *traitList) { for (auto *traitInit : *traitList) { auto *def = cast(traitInit)->getDef(); if (def->isSubClassOf("TraitList")) { @@ -780,7 +780,7 @@ void Operator::populateOpStructure() { auto *builderList = dyn_cast_or_null(def.getValueInit("builders")); if (builderList && !builderList->empty()) { - for (const llvm::Init *init : builderList->getValues()) + for (llvm::Init *init : builderList->getValues()) builders.emplace_back(cast(init)->getDef(), def.getLoc()); } else if (skipDefaultBuilders()) { PrintFatalError( @@ -818,8 +818,7 @@ bool Operator::hasAssemblyFormat() const { } StringRef Operator::getAssemblyFormat() const { - return TypeSwitch( - def.getValueInit("assemblyFormat")) + return TypeSwitch(def.getValueInit("assemblyFormat")) .Case([&](auto *init) { return init->getValue(); }); } @@ -833,7 +832,7 @@ void Operator::print(llvm::raw_ostream &os) const { } } -auto Operator::VariableDecoratorIterator::unwrap(const llvm::Init *init) +auto Operator::VariableDecoratorIterator::unwrap(llvm::Init *init) -> VariableDecorator { return VariableDecorator(cast(init)->getDef()); } diff --git a/mlir/lib/TableGen/Pattern.cpp b/mlir/lib/TableGen/Pattern.cpp index bee20354387f..6437839ef208 100644 --- a/mlir/lib/TableGen/Pattern.cpp +++ b/mlir/lib/TableGen/Pattern.cpp @@ -700,7 +700,7 @@ int Pattern::getBenefit() const { // The initial benefit value is a heuristic with number of ops in the source // pattern. int initBenefit = getSourcePattern().getNumOps(); - const llvm::DagInit *delta = def.getValueAsDag("benefitDelta"); + llvm::DagInit *delta = def.getValueAsDag("benefitDelta"); if (delta->getNumArgs() != 1 || !isa(delta->getArg(0))) { PrintFatalError(&def, "The 'addBenefit' takes and only takes one integer value"); diff --git a/mlir/lib/TableGen/Type.cpp b/mlir/lib/TableGen/Type.cpp index c3b813ec598d..cda752297988 100644 --- a/mlir/lib/TableGen/Type.cpp +++ b/mlir/lib/TableGen/Type.cpp @@ -50,7 +50,7 @@ std::optional TypeConstraint::getBuilderCall() const { const llvm::RecordVal *builderCall = baseType->getValue("builderCall"); if (!builderCall || !builderCall->getValue()) return std::nullopt; - return TypeSwitch>( + return TypeSwitch>( builderCall->getValue()) .Case([&](auto *init) { StringRef value = init->getValue(); diff --git a/mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp b/mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp index 20ad4292a548..7119324dd125 100644 --- a/mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp +++ b/mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp @@ -30,8 +30,8 @@ enum DeprecatedAction { None, Warn, Error }; static DeprecatedAction actionOnDeprecatedValue; // Returns if there is a use of `deprecatedInit` in `field`. 
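The findUse() helper below is a memoized reachability query over TableGen's heavily shared Init graph. The same idiom over plain std containers, with an illustrative Node type standing in for Init:

#include <unordered_map>
#include <vector>

struct Node {
  std::vector<const Node *> operands;
};

// Returns true if `target` is reachable from `n`. Caching each node's answer
// keeps revisits O(1), so a walk over a shared DAG stays linear instead of
// exponential.
static bool reaches(const Node *n, const Node *target,
                    std::unordered_map<const Node *, bool> &memo) {
  if (n == target)
    return true;
  if (auto it = memo.find(n); it != memo.end())
    return it->second;
  memo[n] = false; // provisional entry; also guards against cycles
  bool found = false;
  for (const Node *op : n->operands)
    if ((found = reaches(op, target, memo)))
      break;
  return memo[n] = found;
}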
-static bool findUse(const Init *field, const Init *deprecatedInit, - llvm::DenseMap &known) { +static bool findUse(Init *field, Init *deprecatedInit, + llvm::DenseMap &known) { if (field == deprecatedInit) return true; @@ -64,13 +64,13 @@ static bool findUse(const Init *field, const Init *deprecatedInit, if (findUse(dagInit->getOperator(), deprecatedInit, known)) return memoize(true); - return memoize(llvm::any_of(dagInit->getArgs(), [&](const Init *arg) { + return memoize(llvm::any_of(dagInit->getArgs(), [&](Init *arg) { return findUse(arg, deprecatedInit, known); })); } - if (const ListInit *li = dyn_cast(field)) { - return memoize(llvm::any_of(li->getValues(), [&](const Init *jt) { + if (ListInit *li = dyn_cast(field)) { + return memoize(llvm::any_of(li->getValues(), [&](Init *jt) { return findUse(jt, deprecatedInit, known); })); } @@ -83,8 +83,8 @@ static bool findUse(const Init *field, const Init *deprecatedInit, } // Returns if there is a use of `deprecatedInit` in `record`. -static bool findUse(Record &record, const Init *deprecatedInit, - llvm::DenseMap &known) { +static bool findUse(Record &record, Init *deprecatedInit, + llvm::DenseMap &known) { return llvm::any_of(record.getValues(), [&](const RecordVal &val) { return findUse(val.getValue(), deprecatedInit, known); }); @@ -100,7 +100,7 @@ static void warnOfDeprecatedUses(const RecordKeeper &records) { if (!r || !r->getValue()) continue; - llvm::DenseMap hasUse; + llvm::DenseMap hasUse; if (auto *si = dyn_cast(r->getValue())) { for (auto &jt : records.getDefs()) { // Skip anonymous defs. diff --git a/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp b/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp index 6a3d5a25e28c..86ebaf2cf27d 100644 --- a/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp +++ b/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp @@ -46,9 +46,8 @@ public: private: /// Emits parse calls to construct given kind. void emitParseHelper(StringRef kind, StringRef returnType, StringRef builder, - ArrayRef args, - ArrayRef argNames, StringRef failure, - mlir::raw_indented_ostream &ios); + ArrayRef args, ArrayRef argNames, + StringRef failure, mlir::raw_indented_ostream &ios); /// Emits print instructions. void emitPrintHelper(const Record *memberRec, StringRef kind, @@ -136,12 +135,10 @@ void Generator::emitParse(StringRef kind, const Record &x) { R"(static {0} read{1}(MLIRContext* context, DialectBytecodeReader &reader) )"; mlir::raw_indented_ostream os(output); std::string returnType = getCType(&x); - os << formatv(head, - kind == "attribute" ? "::mlir::Attribute" : "::mlir::Type", - x.getName()); - const DagInit *members = x.getValueAsDag("members"); - SmallVector argNames = llvm::to_vector( - map_range(members->getArgNames(), [](const StringInit *init) { + os << formatv(head, kind == "attribute" ? 
"::mlir::Attribute" : "::mlir::Type", x.getName()); + DagInit *members = x.getValueAsDag("members"); + SmallVector argNames = + llvm::to_vector(map_range(members->getArgNames(), [](StringInit *init) { return init->getAsUnquotedString(); })); StringRef builder = x.getValueAsString("cBuilder").trim(); @@ -151,7 +148,7 @@ void Generator::emitParse(StringRef kind, const Record &x) { } void printParseConditional(mlir::raw_indented_ostream &ios, - ArrayRef args, + ArrayRef args, ArrayRef argNames) { ios << "if "; auto parenScope = ios.scope("(", ") {"); @@ -162,7 +159,7 @@ void printParseConditional(mlir::raw_indented_ostream &ios, }; auto parsedArgs = - llvm::to_vector(make_filter_range(args, [](const Init *const attr) { + llvm::to_vector(make_filter_range(args, [](Init *const attr) { const Record *def = cast(attr)->getDef(); if (def->isSubClassOf("Array")) return true; @@ -171,7 +168,7 @@ void printParseConditional(mlir::raw_indented_ostream &ios, interleave( zip(parsedArgs, argNames), - [&](std::tuple it) { + [&](std::tuple it) { const Record *attr = cast(std::get<0>(it))->getDef(); std::string parser; if (auto optParser = attr->getValueAsOptionalString("cParser")) { @@ -199,7 +196,7 @@ void printParseConditional(mlir::raw_indented_ostream &ios, } void Generator::emitParseHelper(StringRef kind, StringRef returnType, - StringRef builder, ArrayRef args, + StringRef builder, ArrayRef args, ArrayRef argNames, StringRef failure, mlir::raw_indented_ostream &ios) { @@ -213,7 +210,7 @@ void Generator::emitParseHelper(StringRef kind, StringRef returnType, // Print decls. std::string lastCType = ""; for (auto [arg, name] : zip(args, argNames)) { - const DefInit *first = dyn_cast(arg); + DefInit *first = dyn_cast(arg); if (!first) PrintFatalError("Unexpected type for " + name); const Record *def = first->getDef(); @@ -254,14 +251,13 @@ void Generator::emitParseHelper(StringRef kind, StringRef returnType, std::string returnType = getCType(def); ios << "auto " << listHelperName(name) << " = [&]() -> FailureOr<" << returnType << "> "; - SmallVector args; + SmallVector args; SmallVector argNames; if (def->isSubClassOf("CompositeBytecode")) { - const DagInit *members = def->getValueAsDag("members"); - args = llvm::to_vector(map_range( - members->getArgs(), [](Init *init) { return (const Init *)init; })); + DagInit *members = def->getValueAsDag("members"); + args = llvm::to_vector(members->getArgs()); argNames = llvm::to_vector( - map_range(members->getArgNames(), [](const StringInit *init) { + map_range(members->getArgNames(), [](StringInit *init) { return init->getAsUnquotedString(); })); } else { @@ -336,7 +332,7 @@ void Generator::emitPrint(StringRef kind, StringRef type, auto *members = rec->getValueAsDag("members"); for (auto [arg, name] : llvm::zip(members->getArgs(), members->getArgNames())) { - const DefInit *def = dyn_cast(arg); + DefInit *def = dyn_cast(arg); assert(def); const Record *memberRec = def->getDef(); emitPrintHelper(memberRec, kind, kind, name->getAsUnquotedString(), os); @@ -389,7 +385,7 @@ void Generator::emitPrintHelper(const Record *memberRec, StringRef kind, auto *members = memberRec->getValueAsDag("members"); for (auto [arg, argName] : zip(members->getArgs(), members->getArgNames())) { - const DefInit *def = dyn_cast(arg); + DefInit *def = dyn_cast(arg); assert(def); emitPrintHelper(def->getDef(), kind, parent, argName->getAsUnquotedString(), ios); diff --git a/mlir/tools/mlir-tblgen/DialectGen.cpp b/mlir/tools/mlir-tblgen/DialectGen.cpp index 414cad5e1dcc..55c3d9da2590 100644 --- 
a/mlir/tools/mlir-tblgen/DialectGen.cpp +++ b/mlir/tools/mlir-tblgen/DialectGen.cpp @@ -46,10 +46,10 @@ using DialectFilterIterator = } // namespace static void populateDiscardableAttributes( - Dialect &dialect, const llvm::DagInit *discardableAttrDag, + Dialect &dialect, llvm::DagInit *discardableAttrDag, SmallVector> &discardableAttributes) { for (int i : llvm::seq(0, discardableAttrDag->getNumArgs())) { - const llvm::Init *arg = discardableAttrDag->getArg(i); + llvm::Init *arg = discardableAttrDag->getArg(i); StringRef givenName = discardableAttrDag->getArgNameStr(i); if (givenName.empty()) @@ -271,8 +271,7 @@ static void emitDialectDecl(Dialect &dialect, raw_ostream &os) { if (dialect.hasOperationInterfaceFallback()) os << operationInterfaceFallbackDecl; - const llvm::DagInit *discardableAttrDag = - dialect.getDiscardableAttributes(); + llvm::DagInit *discardableAttrDag = dialect.getDiscardableAttributes(); SmallVector> discardableAttributes; populateDiscardableAttributes(dialect, discardableAttrDag, discardableAttributes); @@ -371,7 +370,7 @@ static void emitDialectDef(Dialect &dialect, const RecordKeeper &records, StringRef superClassName = dialect.isExtensible() ? "ExtensibleDialect" : "Dialect"; - const llvm::DagInit *discardableAttrDag = dialect.getDiscardableAttributes(); + llvm::DagInit *discardableAttrDag = dialect.getDiscardableAttributes(); SmallVector> discardableAttributes; populateDiscardableAttributes(dialect, discardableAttrDag, discardableAttributes); diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp index 8716667723a3..1c20a6a9bcf4 100644 --- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp +++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp @@ -102,13 +102,11 @@ static StringRef extractOmpClauseName(const Record *clause) { /// Check that the given argument, identified by its name and initialization /// value, is present in the \c arguments `dag`. -static bool verifyArgument(const DagInit *arguments, StringRef argName, - const Init *argInit) { +static bool verifyArgument(DagInit *arguments, StringRef argName, + Init *argInit) { auto range = zip_equal(arguments->getArgNames(), arguments->getArgs()); return llvm::any_of( - range, - [&](std::tuple - v) { + range, [&](std::tuple v) { return std::get<0>(v)->getAsUnquotedString() == argName && std::get<1>(v) == argInit; }); @@ -143,8 +141,8 @@ static void verifyClause(const Record *op, const Record *clause) { StringRef clauseClassName = extractOmpClauseName(clause); if (!clause->getValueAsBit("ignoreArgs")) { - const DagInit *opArguments = op->getValueAsDag("arguments"); - const DagInit *arguments = clause->getValueAsDag("arguments"); + DagInit *opArguments = op->getValueAsDag("arguments"); + DagInit *arguments = clause->getValueAsDag("arguments"); for (auto [name, arg] : zip(arguments->getArgNames(), arguments->getArgs())) { @@ -210,9 +208,8 @@ static void verifyClause(const Record *op, const Record *clause) { /// /// \return the name of the base type to represent elements of the argument /// type. 
-static StringRef translateArgumentType(ArrayRef<SMLoc> loc,
-                                       const StringInit *name, const Init *init,
-                                       int &nest, int &rank) {
+static StringRef translateArgumentType(ArrayRef<SMLoc> loc, StringInit *name,
+                                       Init *init, int &nest, int &rank) {
   const Record *def = cast<DefInit>(init)->getDef();

   llvm::StringSet<> superClasses;
@@ -285,7 +282,7 @@ static void genClauseOpsStruct(const Record *clause, raw_ostream &os) {
   StringRef clauseName = extractOmpClauseName(clause);
   os << "struct " << clauseName << "ClauseOps {\n";

-  const DagInit *arguments = clause->getValueAsDag("arguments");
+  DagInit *arguments = clause->getValueAsDag("arguments");
   for (auto [name, arg] :
        zip_equal(arguments->getArgNames(), arguments->getArgs())) {
     int nest = 0, rank = 1;
-- 
GitLab


From 1d40fefb08e9b11b72bf40274aa7839ae9f7fe07 Mon Sep 17 00:00:00 2001
From: Luke Lau
Date: Tue, 15 Oct 2024 23:36:10 +0100
Subject: [PATCH 099/329] [RISCV] Add zvfhmin/zvfbfmin cost model tests for
 libcall ops. NFC

---
 .../Analysis/CostModel/RISCV/fp-sqrt-pow.ll   | 207 +++--
 .../CostModel/RISCV/fp-trig-log-exp.ll        | 805 ++++++++++++------
 2 files changed, 655 insertions(+), 357 deletions(-)

diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
index 78acba832dbb..1768222b8a92 100644
--- a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
+++ b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll
@@ -1,17 +1,18 @@
 ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py
-; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFH
+; RUN: opt < %s -passes="print<cost-model>" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFHMIN

 define void @sqrt() {
 ; CHECK-LABEL: 'sqrt'
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x half> @llvm.sqrt.nxv2f16(<vscale x 2 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x half> @llvm.sqrt.nxv4f16(<vscale x 4 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x half> @llvm.sqrt.nxv8f16(<vscale x 8 x half> undef)
-; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x half> @llvm.sqrt.nxv16f16(<vscale x 16 x half> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call bfloat @llvm.sqrt.bf16(bfloat undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %2 = call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 2 x bfloat> @llvm.sqrt.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 4 x bfloat> @llvm.sqrt.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 8 x bfloat> @llvm.sqrt.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 16 x bfloat> @llvm.sqrt.nxv16bf16(<vscale x 16 x bfloat> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call float @llvm.sqrt.f32(float undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %11 = call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %12 = call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
@@ -33,15 +34,15 @@ define void @sqrt() {
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <vscale x 8 x double> @llvm.sqrt.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
-  call half @llvm.sqrt.f16(half undef)
-  call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef)
-  call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef)
-  call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef)
-  call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef)
-  call <vscale x 2 x half> @llvm.sqrt.nvx2f16(<vscale x 2 x half> undef)
-  call <vscale x 4 x half> @llvm.sqrt.nvx4f16(<vscale x 4 x half> undef)
-  call <vscale x 8 x half> @llvm.sqrt.nvx8f16(<vscale x 8 x half> undef)
-  call <vscale x 16 x half> @llvm.sqrt.nvx16f16(<vscale x 16 x half> undef)
+  call bfloat @llvm.sqrt.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.sqrt.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.sqrt.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.sqrt.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.sqrt.v16bf16(<16 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.sqrt.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.sqrt.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.sqrt.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.sqrt.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.sqrt.f32(float undef)
   call <2 x float> @llvm.sqrt.v2f32(<2 x float> undef)
   call <4 x float> @llvm.sqrt.v4f32(<4 x float> undef)
@@ -64,58 +65,74 @@ define void @sqrt() {
   ret void
 }

-declare half @llvm.sqrt.f16(half)
-declare <2 x half> @llvm.sqrt.v2f16(<2 x half>)
-declare <4 x half> @llvm.sqrt.v4f16(<4 x half>)
-declare <8 x half> @llvm.sqrt.v8f16(<8 x half>)
-declare <16 x half> @llvm.sqrt.v16f16(<16 x half>)
-declare <vscale x 2 x half> @llvm.sqrt.nvx2f16(<vscale x 2 x half>)
-declare <vscale x 4 x half> @llvm.sqrt.nvx4f16(<vscale x 4 x half>)
-declare <vscale x 8 x half> @llvm.sqrt.nvx8f16(<vscale x 8 x half>)
-declare <vscale x 16 x half> @llvm.sqrt.nvx16f16(<vscale x 16 x half>)
-declare float @llvm.sqrt.f32(float)
-declare <2 x float> @llvm.sqrt.v2f32(<2 x float>)
-declare <4 x float> @llvm.sqrt.v4f32(<4 x float>)
-declare <8 x float> @llvm.sqrt.v8f32(<8 x float>)
-declare <16 x float> @llvm.sqrt.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.sqrt.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.sqrt.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.sqrt.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.sqrt.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.sqrt.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.sqrt.f64(double)
-declare <2 x double> @llvm.sqrt.v2f64(<2 x double>)
-declare <4 x double> @llvm.sqrt.v4f64(<4 x double>)
-declare <8 x double> @llvm.sqrt.v8f64(<8 x double>)
-declare <16 x double> @llvm.sqrt.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.sqrt.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.sqrt.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.sqrt.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.sqrt.nvx8f64(<vscale x 8 x double>)
+define void @sqrt_f16() {
+; CHECK-LABEL: 'sqrt_f16'
+; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sqrt.f16(half undef)
+; CHECK-NEXT: Cost Model: Found an
estimated cost of 1 for instruction: %2 = call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %3 = call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %4 = call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %5 = call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.sqrt.nxv2f16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.sqrt.nxv4f16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.sqrt.nxv8f16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.sqrt.nxv16f16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call half @llvm.sqrt.f16(half undef) + call <2 x half> @llvm.sqrt.v2f16(<2 x half> undef) + call <4 x half> @llvm.sqrt.v4f16(<4 x half> undef) + call <8 x half> @llvm.sqrt.v8f16(<8 x half> undef) + call <16 x half> @llvm.sqrt.v16f16(<16 x half> undef) + call @llvm.sqrt.nvx2f16( undef) + call @llvm.sqrt.nvx4f16( undef) + call @llvm.sqrt.nvx8f16( undef) + call @llvm.sqrt.nvx16f16( undef) + ret void +} define void @pow() { ; CHECK-LABEL: 'pow' -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.pow.f32(float undef, float undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.pow.v8f32(<8 x float> undef, <8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.pow.v16f32(<16 x float> undef, <16 x float> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.pow.nxv1f32( undef, undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.pow.nxv2f32( undef, undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.pow.nxv4f32( undef, undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.pow.nxv8f32( undef, undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.pow.nxv16f32( undef, undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.pow.f64(double undef, double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.pow.v2f64(<2 x double> undef, <2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.pow.v4f64(<4 x double> undef, <4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.pow.v8f64(<8 x double> undef, <8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.pow.v16f64(<16 x double> undef, <16 x double> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call @llvm.pow.nxv1f64( undef, undef) -; 
CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call @llvm.pow.nxv2f64( undef, undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call @llvm.pow.nxv4f64( undef, undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call @llvm.pow.nxv8f64( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.pow.bf16(bfloat undef, bfloat undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.pow.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.pow.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.pow.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.pow.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.pow.nxv1bf16( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.pow.nxv2bf16( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.pow.nxv4bf16( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.pow.nxv8bf16( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call @llvm.pow.nxv16bf16( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.pow.f32(float undef, float undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.pow.v8f32(<8 x float> undef, <8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.pow.v16f32(<16 x float> undef, <16 x float> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call @llvm.pow.nxv1f32( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call @llvm.pow.nxv2f32( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call @llvm.pow.nxv4f32( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call @llvm.pow.nxv8f32( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call @llvm.pow.nxv16f32( undef, undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.pow.f64(double undef, double undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.pow.v2f64(<2 x double> undef, <2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.pow.v4f64(<4 x double> undef, <4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.pow.v8f64(<8 x double> undef, <8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for 
instruction: %25 = call <16 x double> @llvm.pow.v16f64(<16 x double> undef, <16 x double> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call @llvm.pow.nxv1f64( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call @llvm.pow.nxv2f64( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call @llvm.pow.nxv4f64( undef, undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call @llvm.pow.nxv8f64( undef, undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + call bfloat @llvm.pow.bf16(bfloat undef, bfloat undef) + call <2 x bfloat> @llvm.pow.v2bf16(<2 x bfloat> undef, <2 x bfloat> undef) + call <4 x bfloat> @llvm.pow.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) + call <8 x bfloat> @llvm.pow.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) + call <16 x bfloat> @llvm.pow.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) + call @llvm.pow.nvx1bf16( undef, undef) + call @llvm.pow.nvx2bf16( undef, undef) + call @llvm.pow.nvx4bf16( undef, undef) + call @llvm.pow.nvx8bf16( undef, undef) + call @llvm.pow.nvx16bf16( undef, undef) call float @llvm.pow.f32(float undef, float undef) call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef) call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef) @@ -138,22 +155,42 @@ define void @pow() { ret void } -declare float @llvm.pow.f32(float, float) -declare <2 x float> @llvm.pow.v2f32(<2 x float>, <2 x float>) -declare <4 x float> @llvm.pow.v4f32(<4 x float>, <4 x float>) -declare <8 x float> @llvm.pow.v8f32(<8 x float>, <8 x float>) -declare <16 x float> @llvm.pow.v16f32(<16 x float>, <16 x float>) -declare @llvm.pow.nvx1f32(, ) -declare @llvm.pow.nvx2f32(, ) -declare @llvm.pow.nvx4f32(, ) -declare @llvm.pow.nvx8f32(, ) -declare @llvm.pow.nvx16f32(, ) -declare double @llvm.pow.f64(double, double) -declare <2 x double> @llvm.pow.v2f64(<2 x double>, <2 x double>) -declare <4 x double> @llvm.pow.v4f64(<4 x double>, <4 x double>) -declare <8 x double> @llvm.pow.v8f64(<8 x double>, <8 x double>) -declare <16 x double> @llvm.pow.v16f64(<16 x double>, <16 x double>) -declare @llvm.pow.nvx1f64(, ) -declare @llvm.pow.nvx2f64(, ) -declare @llvm.pow.nvx4f64(, ) -declare @llvm.pow.nvx8f64(, ) +define void @pow_f16() { +; ZVFH-LABEL: 'pow_f16' +; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.pow.f16(half undef, half undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.pow.v2f16(<2 x half> undef, <2 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.pow.nxv1f16( undef, undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.pow.nxv2f16( undef, undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.pow.nxv4f16( undef, undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.pow.nxv8f16( undef, undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.pow.nxv16f16( undef, 
undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZVFHMIN-LABEL: 'pow_f16' +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.pow.f16(half undef, half undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.pow.v2f16(<2 x half> undef, <2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.pow.nxv1f16( undef, undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.pow.nxv2f16( undef, undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.pow.nxv4f16( undef, undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.pow.nxv8f16( undef, undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call @llvm.pow.nxv16f16( undef, undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call half @llvm.pow.f16(half undef, half undef) + call <2 x half> @llvm.pow.v2f16(<2 x half> undef, <2 x half> undef) + call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef) + call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef) + call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef) + call @llvm.pow.nvx1f16( undef, undef) + call @llvm.pow.nvx2f16( undef, undef) + call @llvm.pow.nvx4f16( undef, undef) + call @llvm.pow.nvx8f16( undef, undef) + call @llvm.pow.nvx16f16( undef, undef) + ret void +} diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll b/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll index af779116a627..d65fa43b8995 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll @@ -1,29 +1,50 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfh,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFH +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zvfhmin,+zvfbfmin | FileCheck %s --check-prefixes=CHECK,ZVFHMIN define void @sin() { ; CHECK-LABEL: 'sin' -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.sin.f32(float undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.sin.v2f32(<2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.sin.v4f32(<4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.sin.v8f32(<8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.sin.v16f32(<16 x float> 
undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.sin.nxv1f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.sin.nxv2f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.sin.nxv4f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.sin.nxv8f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.sin.nxv16f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.sin.f64(double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.sin.v2f64(<2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.sin.v4f64(<4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.sin.v8f64(<8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.sin.v16f64(<16 x double> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call @llvm.sin.nxv1f64( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call @llvm.sin.nxv2f64( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call @llvm.sin.nxv4f64( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call @llvm.sin.nxv8f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.sin.bf16(bfloat undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.sin.v2bf16(<2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.sin.v4bf16(<4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.sin.v8bf16(<8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.sin.v16bf16(<16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.sin.nxv1bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.sin.nxv2bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.sin.nxv4bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.sin.nxv8bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call @llvm.sin.nxv16bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.sin.f32(float undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.sin.v2f32(<2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.sin.v4f32(<4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.sin.v8f32(<8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.sin.v16f32(<16 x float> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call @llvm.sin.nxv1f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call @llvm.sin.nxv2f32( 
undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call @llvm.sin.nxv4f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call @llvm.sin.nxv8f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call @llvm.sin.nxv16f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.sin.f64(double undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.sin.v2f64(<2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.sin.v4f64(<4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.sin.v8f64(<8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.sin.v16f64(<16 x double> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call @llvm.sin.nxv1f64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call @llvm.sin.nxv2f64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call @llvm.sin.nxv4f64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call @llvm.sin.nxv8f64( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + call bfloat @llvm.sin.bf16(bfloat undef) + call <2 x bfloat> @llvm.sin.v2bf16(<2 x bfloat> undef) + call <4 x bfloat> @llvm.sin.v4bf16(<4 x bfloat> undef) + call <8 x bfloat> @llvm.sin.v8bf16(<8 x bfloat> undef) + call <16 x bfloat> @llvm.sin.v16bf16(<16 x bfloat> undef) + call @llvm.sin.nvx1bf16( undef) + call @llvm.sin.nvx2bf16( undef) + call @llvm.sin.nvx4bf16( undef) + call @llvm.sin.nvx8bf16( undef) + call @llvm.sin.nvx16bf16( undef) call float @llvm.sin.f32(float undef) call <2 x float> @llvm.sin.v2f32(<2 x float> undef) call <4 x float> @llvm.sin.v4f32(<4 x float> undef) @@ -46,29 +67,86 @@ define void @sin() { ret void } +define void @sin_f16() { +; ZVFH-LABEL: 'sin_f16' +; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.sin.f16(half undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.sin.v2f16(<2 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.sin.v4f16(<4 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.sin.v8f16(<8 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.sin.v16f16(<16 x half> undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.sin.nxv1f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.sin.nxv2f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.sin.nxv4f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.sin.nxv8f16( undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZVFHMIN-LABEL: 'sin_f16' +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.sin.f16(half undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.sin.v2f16(<2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> 
@llvm.sin.v4f16(<4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.sin.v8f16(<8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.sin.v16f16(<16 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.sin.nxv1f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.sin.nxv2f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.sin.nxv4f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.sin.nxv8f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call half @llvm.sin.f16(half undef) + call <2 x half> @llvm.sin.v2f16(<2 x half> undef) + call <4 x half> @llvm.sin.v4f16(<4 x half> undef) + call <8 x half> @llvm.sin.v8f16(<8 x half> undef) + call <16 x half> @llvm.sin.v16f16(<16 x half> undef) + call @llvm.sin.nvx1f16( undef) + call @llvm.sin.nvx2f16( undef) + call @llvm.sin.nvx4f16( undef) + call @llvm.sin.nvx8f16( undef) + ret void +} + define void @cos() { ; CHECK-LABEL: 'cos' -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.cos.f32(float undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.cos.v2f32(<2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.cos.v4f32(<4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.cos.v8f32(<8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.cos.v16f32(<16 x float> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.cos.nxv1f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.cos.nxv2f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.cos.nxv4f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.cos.nxv8f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.cos.nxv16f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.cos.f64(double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.cos.v2f64(<2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.cos.v4f64(<4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.cos.v8f64(<8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.cos.v16f64(<16 x double> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call @llvm.cos.nxv1f64( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call @llvm.cos.nxv2f64( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call @llvm.cos.nxv4f64( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call @llvm.cos.nxv8f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.cos.bf16(bfloat undef) +; CHECK-NEXT: 
Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.cos.v2bf16(<2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.cos.v4bf16(<4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.cos.v8bf16(<8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.cos.v16bf16(<16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.cos.nxv1bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.cos.nxv2bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.cos.nxv4bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.cos.nxv8bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call @llvm.cos.nxv16bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.cos.f32(float undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.cos.v2f32(<2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.cos.v4f32(<4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.cos.v8f32(<8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.cos.v16f32(<16 x float> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call @llvm.cos.nxv1f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call @llvm.cos.nxv2f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call @llvm.cos.nxv4f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call @llvm.cos.nxv8f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call @llvm.cos.nxv16f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.cos.f64(double undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.cos.v2f64(<2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.cos.v4f64(<4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.cos.v8f64(<8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.cos.v16f64(<16 x double> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call @llvm.cos.nxv1f64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call @llvm.cos.nxv2f64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call @llvm.cos.nxv4f64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call @llvm.cos.nxv8f64( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + call bfloat @llvm.cos.bf16(bfloat undef) + call <2 x bfloat> @llvm.cos.v2bf16(<2 x bfloat> undef) + call <4 x bfloat> @llvm.cos.v4bf16(<4 x bfloat> undef) + call <8 x bfloat> @llvm.cos.v8bf16(<8 x bfloat> undef) + call <16 x bfloat> 
@llvm.cos.v16bf16(<16 x bfloat> undef) + call @llvm.cos.nvx1bf16( undef) + call @llvm.cos.nvx2bf16( undef) + call @llvm.cos.nvx4bf16( undef) + call @llvm.cos.nvx8bf16( undef) + call @llvm.cos.nvx16bf16( undef) call float @llvm.cos.f32(float undef) call <2 x float> @llvm.cos.v2f32(<2 x float> undef) call <4 x float> @llvm.cos.v4f32(<4 x float> undef) @@ -91,29 +169,86 @@ define void @cos() { ret void } +define void @cos_f16() { +; ZVFH-LABEL: 'cos_f16' +; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.cos.f16(half undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.cos.v2f16(<2 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.cos.v4f16(<4 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.cos.v8f16(<8 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.cos.v16f16(<16 x half> undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.cos.nxv1f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.cos.nxv2f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.cos.nxv4f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.cos.nxv8f16( undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZVFHMIN-LABEL: 'cos_f16' +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.cos.f16(half undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.cos.v2f16(<2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.cos.v4f16(<4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.cos.v8f16(<8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.cos.v16f16(<16 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.cos.nxv1f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.cos.nxv2f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.cos.nxv4f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.cos.nxv8f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call half @llvm.cos.f16(half undef) + call <2 x half> @llvm.cos.v2f16(<2 x half> undef) + call <4 x half> @llvm.cos.v4f16(<4 x half> undef) + call <8 x half> @llvm.cos.v8f16(<8 x half> undef) + call <16 x half> @llvm.cos.v16f16(<16 x half> undef) + call @llvm.cos.nvx1f16( undef) + call @llvm.cos.nvx2f16( undef) + call @llvm.cos.nvx4f16( undef) + call @llvm.cos.nvx8f16( undef) + ret void +} + define void @exp() { ; CHECK-LABEL: 'exp' -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.exp.f32(float undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.exp.v4f32(<4 x float> 
undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.exp.nxv1f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.exp.nxv2f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.exp.nxv4f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.exp.nxv8f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.exp.nxv16f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.exp.f64(double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call @llvm.exp.nxv1f64( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call @llvm.exp.nxv2f64( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call @llvm.exp.nxv4f64( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call @llvm.exp.nxv8f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.exp.bf16(bfloat undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.exp.nxv1bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.exp.nxv2bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.exp.nxv4bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.exp.nxv8bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call @llvm.exp.nxv16bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.exp.f32(float undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.exp.v8f32(<8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost 
of 191 for instruction: %15 = call <16 x float> @llvm.exp.v16f32(<16 x float> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call @llvm.exp.nxv1f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call @llvm.exp.nxv2f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call @llvm.exp.nxv4f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call @llvm.exp.nxv8f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call @llvm.exp.nxv16f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.exp.f64(double undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.exp.v2f64(<2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.exp.v4f64(<4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.exp.v8f64(<8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.exp.v16f64(<16 x double> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call @llvm.exp.nxv1f64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call @llvm.exp.nxv2f64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call @llvm.exp.nxv4f64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call @llvm.exp.nxv8f64( undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + call bfloat @llvm.exp.bf16(bfloat undef) + call <2 x bfloat> @llvm.exp.v2bf16(<2 x bfloat> undef) + call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef) + call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef) + call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef) + call @llvm.exp.nvx1bf16( undef) + call @llvm.exp.nvx2bf16( undef) + call @llvm.exp.nvx4bf16( undef) + call @llvm.exp.nvx8bf16( undef) + call @llvm.exp.nvx16bf16( undef) call float @llvm.exp.f32(float undef) call <2 x float> @llvm.exp.v2f32(<2 x float> undef) call <4 x float> @llvm.exp.v4f32(<4 x float> undef) @@ -136,29 +271,86 @@ define void @exp() { ret void } +define void @exp_f16() { +; ZVFH-LABEL: 'exp_f16' +; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.exp.f16(half undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.exp.nxv1f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.exp.nxv2f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.exp.nxv4f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.exp.nxv8f16( undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZVFHMIN-LABEL: 'exp_f16' +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = 
call half @llvm.exp.f16(half undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.exp.v2f16(<2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.exp.nxv1f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.exp.nxv2f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.exp.nxv4f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.exp.nxv8f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call half @llvm.exp.f16(half undef) + call <2 x half> @llvm.exp.v2f16(<2 x half> undef) + call <4 x half> @llvm.exp.v4f16(<4 x half> undef) + call <8 x half> @llvm.exp.v8f16(<8 x half> undef) + call <16 x half> @llvm.exp.v16f16(<16 x half> undef) + call @llvm.exp.nvx1f16( undef) + call @llvm.exp.nvx2f16( undef) + call @llvm.exp.nvx4f16( undef) + call @llvm.exp.nvx8f16( undef) + ret void +} + define void @exp2() { ; CHECK-LABEL: 'exp2' -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.exp2.f32(float undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.exp2.nxv1f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.exp2.nxv2f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.exp2.nxv4f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.exp2.nxv8f32( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call @llvm.exp2.nxv16f32( undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.exp2.f64(double undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call @llvm.exp2.nxv1f64( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call @llvm.exp2.nxv2f64( undef) -; CHECK-NEXT: Cost Model: Invalid cost 
for instruction: %18 = call @llvm.exp2.nxv4f64( undef) -; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call @llvm.exp2.nxv8f64( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.exp2.bf16(bfloat undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.exp2.nxv1bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.exp2.nxv2bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.exp2.nxv4bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.exp2.nxv8bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call @llvm.exp2.nxv16bf16( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.exp2.f32(float undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.exp2.v8f32(<8 x float> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.exp2.v16f32(<16 x float> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %16 = call @llvm.exp2.nxv1f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %17 = call @llvm.exp2.nxv2f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %18 = call @llvm.exp2.nxv4f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %19 = call @llvm.exp2.nxv8f32( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %20 = call @llvm.exp2.nxv16f32( undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.exp2.f64(double undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.exp2.v2f64(<2 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.exp2.v4f64(<4 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.exp2.v8f64(<8 x double> undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.exp2.v16f64(<16 x double> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %26 = call @llvm.exp2.nxv1f64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %27 = call @llvm.exp2.nxv2f64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %28 = call @llvm.exp2.nxv4f64( undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %29 = call @llvm.exp2.nxv8f64( undef) ; 
CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; + call bfloat @llvm.exp2.bf16(bfloat undef) + call <2 x bfloat> @llvm.exp2.v2bf16(<2 x bfloat> undef) + call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef) + call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef) + call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef) + call @llvm.exp2.nvx1bf16( undef) + call @llvm.exp2.nvx2bf16( undef) + call @llvm.exp2.nvx4bf16( undef) + call @llvm.exp2.nvx8bf16( undef) + call @llvm.exp2.nvx16bf16( undef) call float @llvm.exp2.f32(float undef) call <2 x float> @llvm.exp2.v2f32(<2 x float> undef) call <4 x float> @llvm.exp2.v4f32(<4 x float> undef) @@ -181,29 +373,86 @@ define void @exp2() { ret void } +define void @exp2_f16() { +; ZVFH-LABEL: 'exp2_f16' +; ZVFH-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.exp2.f16(half undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %6 = call @llvm.exp2.nxv1f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %7 = call @llvm.exp2.nxv2f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %8 = call @llvm.exp2.nxv4f16( undef) +; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %9 = call @llvm.exp2.nxv8f16( undef) +; ZVFH-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; +; ZVFHMIN-LABEL: 'exp2_f16' +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.exp2.f16(half undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.exp2.v2f16(<2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call @llvm.exp2.nxv1f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call @llvm.exp2.nxv2f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call @llvm.exp2.nxv4f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call @llvm.exp2.nxv8f16( undef) +; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call half @llvm.exp2.f16(half undef) + call <2 x half> @llvm.exp2.v2f16(<2 x half> undef) + call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) + call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) + call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) + call @llvm.exp2.nvx1f16( undef) + call @llvm.exp2.nvx2f16( undef) + call @llvm.exp2.nvx4f16( undef) + call @llvm.exp2.nvx8f16( undef) + ret void +} + define void @log() { ; CHECK-LABEL: 'log' -; CHECK-NEXT: 
Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.log.f32(float undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.log.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.log.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.log.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.log.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.log.f64(double undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.log.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.log.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.log.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.log.bf16(bfloat undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.log.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.log.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.log.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.log.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.log.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.log.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.log.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.log.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.log.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.log.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.log.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.log.v16f64(<16 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.log.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.log.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.log.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.log.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.log.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.log.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.log.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.log.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.log.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.log.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.log.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.log.f32(float undef)
   call <2 x float> @llvm.log.v2f32(<2 x float> undef)
   call <4 x float> @llvm.log.v4f32(<4 x float> undef)
@@ -226,29 +475,86 @@ define void @log() {
   ret void
 }
 
+define void @log_f16() {
+; ZVFH-LABEL: 'log_f16'
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.log.f16(half undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'log_f16'
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.log.f16(half undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.log.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.log.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.log.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.log.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.log.f16(half undef)
+  call <2 x half> @llvm.log.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.log.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.log.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.log.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.log.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.log.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.log.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.log.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
+
 define void @log10() {
 ; CHECK-LABEL: 'log10'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.log10.f32(float undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.log10.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.log10.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.log10.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.log10.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.log10.f64(double undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.log10.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.log10.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.log10.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.log10.bf16(bfloat undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log10.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log10.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log10.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log10.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log10.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log10.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.log10.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.log10.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.log10.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.log10.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.log10.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.log10.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.log10.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.log10.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.log10.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.log10.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.log10.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.log10.v16f64(<16 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.log10.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.log10.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.log10.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.log10.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.log10.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.log10.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.log10.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.log10.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.log10.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.log10.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.log10.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.log10.f32(float undef)
   call <2 x float> @llvm.log10.v2f32(<2 x float> undef)
   call <4 x float> @llvm.log10.v4f32(<4 x float> undef)
@@ -271,29 +577,86 @@ define void @log10() {
   ret void
 }
 
+define void @log10_f16() {
+; ZVFH-LABEL: 'log10_f16'
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.log10.f16(half undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log10.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log10.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log10.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log10.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'log10_f16'
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.log10.f16(half undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.log10.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.log10.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.log10.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.log10.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.log10.f16(half undef)
+  call <2 x half> @llvm.log10.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.log10.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.log10.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.log10.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.log10.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.log10.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.log10.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.log10.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
+
 define void @log2() {
 ; CHECK-LABEL: 'log2'
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call float @llvm.log2.f32(float undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %4 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x float> @llvm.log2.nxv1f32(<vscale x 1 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x float> @llvm.log2.nxv2f32(<vscale x 2 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x float> @llvm.log2.nxv8f32(<vscale x 8 x float> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x float> @llvm.log2.nxv16f32(<vscale x 16 x float> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call double @llvm.log2.f64(double undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %13 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
-; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x double> @llvm.log2.nxv1f64(<vscale x 1 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x double> @llvm.log2.nxv4f64(<vscale x 4 x double> undef)
-; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x double> @llvm.log2.nxv8f64(<vscale x 8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call bfloat @llvm.log2.bf16(bfloat undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log2.nxv1bf16(<vscale x 1 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log2.nxv2bf16(<vscale x 2 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log2.nxv4bf16(<vscale x 4 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log2.nxv8bf16(<vscale x 8 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log2.nxv16bf16(<vscale x 16 x bfloat> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log2.f32(float undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %14 = call <8 x float> @llvm.log2.v8f32(<8 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %15 = call <16 x float> @llvm.log2.v16f32(<16 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %16 = call <vscale x 1 x float> @llvm.log2.nxv1f32(<vscale x 1 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %17 = call <vscale x 2 x float> @llvm.log2.nxv2f32(<vscale x 2 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %18 = call <vscale x 4 x float> @llvm.log2.nxv4f32(<vscale x 4 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %19 = call <vscale x 8 x float> @llvm.log2.nxv8f32(<vscale x 8 x float> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %20 = call <vscale x 16 x float> @llvm.log2.nxv16f32(<vscale x 16 x float> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %21 = call double @llvm.log2.f64(double undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %22 = call <2 x double> @llvm.log2.v2f64(<2 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %23 = call <4 x double> @llvm.log2.v4f64(<4 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 95 for instruction: %24 = call <8 x double> @llvm.log2.v8f64(<8 x double> undef)
+; CHECK-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %25 = call <16 x double> @llvm.log2.v16f64(<16 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %26 = call <vscale x 1 x double> @llvm.log2.nxv1f64(<vscale x 1 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %27 = call <vscale x 2 x double> @llvm.log2.nxv2f64(<vscale x 2 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %28 = call <vscale x 4 x double> @llvm.log2.nxv4f64(<vscale x 4 x double> undef)
+; CHECK-NEXT:  Cost Model: Invalid cost for instruction: %29 = call <vscale x 8 x double> @llvm.log2.nxv8f64(<vscale x 8 x double> undef)
 ; CHECK-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
 ;
+  call bfloat @llvm.log2.bf16(bfloat undef)
+  call <2 x bfloat> @llvm.log2.v2bf16(<2 x bfloat> undef)
+  call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef)
+  call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef)
+  call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef)
+  call <vscale x 1 x bfloat> @llvm.log2.nvx1bf16(<vscale x 1 x bfloat> undef)
+  call <vscale x 2 x bfloat> @llvm.log2.nvx2bf16(<vscale x 2 x bfloat> undef)
+  call <vscale x 4 x bfloat> @llvm.log2.nvx4bf16(<vscale x 4 x bfloat> undef)
+  call <vscale x 8 x bfloat> @llvm.log2.nvx8bf16(<vscale x 8 x bfloat> undef)
+  call <vscale x 16 x bfloat> @llvm.log2.nvx16bf16(<vscale x 16 x bfloat> undef)
   call float @llvm.log2.f32(float undef)
  call <2 x float> @llvm.log2.v2f32(<2 x float> undef)
  call <4 x float> @llvm.log2.v4f32(<4 x float> undef)
@@ -316,142 +679,40 @@ define void @log2() {
  ret void
 }
 
-declare float @llvm.sin.f32(float)
-declare <2 x float> @llvm.sin.v2f32(<2 x float>)
-declare <4 x float> @llvm.sin.v4f32(<4 x float>)
-declare <8 x float> @llvm.sin.v8f32(<8 x float>)
-declare <16 x float> @llvm.sin.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.sin.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.sin.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.sin.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.sin.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.sin.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.sin.f64(double)
-declare <2 x double> @llvm.sin.v2f64(<2 x double>)
-declare <4 x double> @llvm.sin.v4f64(<4 x double>)
-declare <8 x double> @llvm.sin.v8f64(<8 x double>)
-declare <16 x double> @llvm.sin.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.sin.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.sin.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.sin.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.sin.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.cos.f32(float)
-declare <2 x float> @llvm.cos.v2f32(<2 x float>)
-declare <4 x float> @llvm.cos.v4f32(<4 x float>)
-declare <8 x float> @llvm.cos.v8f32(<8 x float>)
-declare <16 x float> @llvm.cos.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.cos.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.cos.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.cos.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.cos.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.cos.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.cos.f64(double)
-declare <2 x double> @llvm.cos.v2f64(<2 x double>)
-declare <4 x double> @llvm.cos.v4f64(<4 x double>)
-declare <8 x double> @llvm.cos.v8f64(<8 x double>)
-declare <16 x double> @llvm.cos.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.cos.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.cos.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.cos.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.cos.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.exp.f32(float)
-declare <2 x float> @llvm.exp.v2f32(<2 x float>)
-declare <4 x float> @llvm.exp.v4f32(<4 x float>)
-declare <8 x float> @llvm.exp.v8f32(<8 x float>)
-declare <16 x float> @llvm.exp.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.exp.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.exp.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.exp.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.exp.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.exp.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.exp.f64(double)
-declare <2 x double> @llvm.exp.v2f64(<2 x double>)
-declare <4 x double> @llvm.exp.v4f64(<4 x double>)
-declare <8 x double> @llvm.exp.v8f64(<8 x double>)
-declare <16 x double> @llvm.exp.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.exp.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.exp.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.exp.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.exp.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.exp2.f32(float)
-declare <2 x float> @llvm.exp2.v2f32(<2 x float>)
-declare <4 x float> @llvm.exp2.v4f32(<4 x float>)
-declare <8 x float> @llvm.exp2.v8f32(<8 x float>)
-declare <16 x float> @llvm.exp2.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.exp2.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.exp2.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.exp2.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.exp2.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.exp2.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.exp2.f64(double)
-declare <2 x double> @llvm.exp2.v2f64(<2 x double>)
-declare <4 x double> @llvm.exp2.v4f64(<4 x double>)
-declare <8 x double> @llvm.exp2.v8f64(<8 x double>)
-declare <16 x double> @llvm.exp2.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.exp2.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.exp2.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.exp2.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.exp2.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.log.f32(float)
-declare <2 x float> @llvm.log.v2f32(<2 x float>)
-declare <4 x float> @llvm.log.v4f32(<4 x float>)
-declare <8 x float> @llvm.log.v8f32(<8 x float>)
-declare <16 x float> @llvm.log.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.log.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.log.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.log.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.log.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.log.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.log.f64(double)
-declare <2 x double> @llvm.log.v2f64(<2 x double>)
-declare <4 x double> @llvm.log.v4f64(<4 x double>)
-declare <8 x double> @llvm.log.v8f64(<8 x double>)
-declare <16 x double> @llvm.log.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.log.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.log.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.log.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.log.nvx8f64(<vscale x 8 x double>)
-
-declare float @llvm.log10.f32(float)
-declare <2 x float> @llvm.log10.v2f32(<2 x float>)
-declare <4 x float> @llvm.log10.v4f32(<4 x float>)
-declare <8 x float> @llvm.log10.v8f32(<8 x float>)
-declare <16 x float> @llvm.log10.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.log10.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.log10.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.log10.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.log10.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.log10.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.log10.f64(double)
-declare <2 x double> @llvm.log10.v2f64(<2 x double>)
-declare <4 x double> @llvm.log10.v4f64(<4 x double>)
-declare <8 x double> @llvm.log10.v8f64(<8 x double>)
-declare <16 x double> @llvm.log10.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.log10.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.log10.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.log10.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.log10.nvx8f64(<vscale x 8 x double>)
+define void @log2_f16() {
+; ZVFH-LABEL: 'log2_f16'
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %1 = call half @llvm.log2.f16(half undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 4 for instruction: %2 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 8 for instruction: %3 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 16 for instruction: %4 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 47 for instruction: %5 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFH-NEXT:  Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFH-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+; ZVFHMIN-LABEL: 'log2_f16'
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 10 for instruction: %1 = call half @llvm.log2.f16(half undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 22 for instruction: %2 = call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.log2.nxv1f16(<vscale x 1 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.log2.nxv2f16(<vscale x 2 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.log2.nxv4f16(<vscale x 4 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.log2.nxv8f16(<vscale x 8 x half> undef)
+; ZVFHMIN-NEXT:  Cost Model: Found an estimated cost of 0 for instruction: ret void
+;
+  call half @llvm.log2.f16(half undef)
+  call <2 x half> @llvm.log2.v2f16(<2 x half> undef)
+  call <4 x half> @llvm.log2.v4f16(<4 x half> undef)
+  call <8 x half> @llvm.log2.v8f16(<8 x half> undef)
+  call <16 x half> @llvm.log2.v16f16(<16 x half> undef)
+  call <vscale x 1 x half> @llvm.log2.nvx1f16(<vscale x 1 x half> undef)
+  call <vscale x 2 x half> @llvm.log2.nvx2f16(<vscale x 2 x half> undef)
+  call <vscale x 4 x half> @llvm.log2.nvx4f16(<vscale x 4 x half> undef)
+  call <vscale x 8 x half> @llvm.log2.nvx8f16(<vscale x 8 x half> undef)
+  ret void
+}
 
-declare float @llvm.log2.f32(float)
-declare <2 x float> @llvm.log2.v2f32(<2 x float>)
-declare <4 x float> @llvm.log2.v4f32(<4 x float>)
-declare <8 x float> @llvm.log2.v8f32(<8 x float>)
-declare <16 x float> @llvm.log2.v16f32(<16 x float>)
-declare <vscale x 1 x float> @llvm.log2.nvx1f32(<vscale x 1 x float>)
-declare <vscale x 2 x float> @llvm.log2.nvx2f32(<vscale x 2 x float>)
-declare <vscale x 4 x float> @llvm.log2.nvx4f32(<vscale x 4 x float>)
-declare <vscale x 8 x float> @llvm.log2.nvx8f32(<vscale x 8 x float>)
-declare <vscale x 16 x float> @llvm.log2.nvx16f32(<vscale x 16 x float>)
-declare double @llvm.log2.f64(double)
-declare <2 x double> @llvm.log2.v2f64(<2 x double>)
-declare <4 x double> @llvm.log2.v4f64(<4 x double>)
-declare <8 x double> @llvm.log2.v8f64(<8 x double>)
-declare <16 x double> @llvm.log2.v16f64(<16 x double>)
-declare <vscale x 1 x double> @llvm.log2.nvx1f64(<vscale x 1 x double>)
-declare <vscale x 2 x double> @llvm.log2.nvx2f64(<vscale x 2 x double>)
-declare <vscale x 4 x double> @llvm.log2.nvx4f64(<vscale x 4 x double>)
-declare <vscale x 8 x double> @llvm.log2.nvx8f64(<vscale x 8 x double>)
--
GitLab

From b5cc222d7429fe6f18c787f633d5262fac2e676f Mon Sep 17 00:00:00 2001
From: Akshat Oke
Date: Wed, 16 Oct 2024 14:52:25 +0530
Subject: [PATCH 100/329] [MIR] Fix vreg flag vector memory leak (#112479)

A fix-it patch for dbfca24 (#110228). There is no need for a container:
a plain `uint8_t` bitmask still allows 8 flags for a register. The
virtual register flags vector leaked memory because the vector's heap
storage was never freed: the `BumpPtrAllocator` handles the deallocation
of the `VRegInfo` itself, but never calls the `std::vector<uint8_t>
Flags` destructor.
---
 llvm/include/llvm/CodeGen/MIRParser/MIParser.h | 2 +-
 llvm/lib/CodeGen/MIRParser/MIRParser.cpp       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
index 4d93213de5e0..0f2898d3554d 100644
--- a/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
+++ b/llvm/include/llvm/CodeGen/MIRParser/MIParser.h
@@ -45,7 +45,7 @@ struct VRegInfo {
   } D;
   Register VReg;
   Register PreferredReg;
-  std::vector<uint8_t> Flags;
+  uint8_t Flags = 0;
 };
 
 using Name2RegClassMap = StringMap<const TargetRegisterClass *>;
diff --git a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
index 10d3cdcf0c1c..c0c61b3fdd16 100644
--- a/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
+++ b/llvm/lib/CodeGen/MIRParser/MIRParser.cpp
@@ -703,7 +703,7 @@ bool MIRParserImpl::parseRegisterInfo(PerFunctionMIParsingState &PFS,
       return error(FlagStringValue.SourceRange.Start,
                    Twine("use of undefined register flag '") +
                        FlagStringValue.Value + "'");
-    Info.Flags.push_back(FlagValue);
+    Info.Flags |= FlagValue;
   }
   RegInfo.noteNewVirtualRegister(Info.VReg);
 }
--
GitLab

From 15d85769f119061fbfcae6e9de43982b534ef724 Mon Sep 17 00:00:00 2001
From: Sergio Afonso
Date: Wed, 16 Oct 2024 10:27:50 +0100
Subject: [PATCH 101/329] [Flang][OpenMP] Support lowering of simd reductions
 (#112194)

This patch enables lowering to MLIR of the reduction clause of `simd`
constructs. Lowering from MLIR to LLVM IR remains unimplemented, so at that
stage an error will be emitted rather than the clause being silently ignored,
as is currently done. On composite `do simd` constructs, this lowering error
will remain untriggered, as the `omp.simd` operation in that case is
currently ignored. The MLIR representation, however, will now contain the
`reduction` information.
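
As an illustrative sketch only (the helper name and exact parameter types are
hypothetical, not part of this patch), the three lines that now recur in each
of the four affected lowerings could be factored into one helper:

  // Forward the reduction symbols and the corresponding MLIR values
  // collected from the reduction clause into the omp.simd entry block
  // arguments.
  static void setSimdReductionArgs(
      EntryBlockArgs &simdArgs,
      llvm::ArrayRef<const semantics::Symbol *> simdReductionSyms,
      const mlir::omp::SimdOperands &simdClauseOps) {
    simdArgs.reduction.syms = simdReductionSyms;
    simdArgs.reduction.vars = simdClauseOps.reductionVars;
  }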
---
 flang/lib/Lower/OpenMP/OpenMP.cpp             | 16 +++++++++----
 flang/test/Lower/OpenMP/simd.f90              | 24 +++++++++++++++++++
 mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp  |  8 ++++---
 .../OpenMP/OpenMPToLLVMIRTranslation.cpp      | 21 ++++++++++++----
 4 files changed, 57 insertions(+), 12 deletions(-)

diff --git a/flang/lib/Lower/OpenMP/OpenMP.cpp b/flang/lib/Lower/OpenMP/OpenMP.cpp
index 70d89f5e521a..cf469003b729 100644
--- a/flang/lib/Lower/OpenMP/OpenMP.cpp
+++ b/flang/lib/Lower/OpenMP/OpenMP.cpp
@@ -2070,7 +2070,9 @@ static void genStandaloneSimd(lower::AbstractConverter &converter,
                     loopNestClauseOps, iv);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private, reduction syms and vars.
+  // TODO: Add private syms and vars.
+  simdArgs.reduction.syms = simdReductionSyms;
+  simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
       genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
 
@@ -2228,7 +2230,9 @@ static void genCompositeDistributeParallelDoSimd(
   wsloopOp.setComposite(/*val=*/true);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private, reduction syms and vars.
+  // TODO: Add private syms and vars.
+  simdArgs.reduction.syms = simdReductionSyms;
+  simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
       genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
   simdOp.setComposite(/*val=*/true);
@@ -2285,7 +2289,9 @@ static void genCompositeDistributeSimd(lower::AbstractConverter &converter,
   distributeOp.setComposite(/*val=*/true);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private, reduction syms and vars.
+  // TODO: Add private syms and vars.
+  simdArgs.reduction.syms = simdReductionSyms;
+  simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
       genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
   simdOp.setComposite(/*val=*/true);
@@ -2342,7 +2348,9 @@ static void genCompositeDoSimd(lower::AbstractConverter &converter,
   wsloopOp.setComposite(/*val=*/true);
 
   EntryBlockArgs simdArgs;
-  // TODO: Add private, reduction syms and vars.
+  // TODO: Add private syms and vars.
+  simdArgs.reduction.syms = simdReductionSyms;
+  simdArgs.reduction.vars = simdClauseOps.reductionVars;
   auto simdOp =
       genWrapperOp<mlir::omp::SimdOp>(converter, loc, simdClauseOps, simdArgs);
   simdOp.setComposite(/*val=*/true);
diff --git a/flang/test/Lower/OpenMP/simd.f90 b/flang/test/Lower/OpenMP/simd.f90
index f574a1265e06..d92f06cebfdb 100644
--- a/flang/test/Lower/OpenMP/simd.f90
+++ b/flang/test/Lower/OpenMP/simd.f90
@@ -4,6 +4,8 @@
 ! RUN: %flang_fc1 -flang-experimental-hlfir -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
 ! RUN: bbc -hlfir -emit-hlfir -fopenmp -fopenmp-version=50 %s -o - | FileCheck %s
 
+!CHECK: omp.declare_reduction @[[REDUCER:.*]] : i32
+
 !CHECK-LABEL: func @_QPsimd()
 subroutine simd
   integer :: i
@@ -273,3 +275,25 @@ subroutine lastprivate_with_simd
     sum = i + 1
   end do
 end subroutine
+
+!CHECK-LABEL: func @_QPsimd_with_reduction_clause()
+subroutine simd_with_reduction_clause
+  integer :: i, x
+  x = 0
+  ! CHECK: %[[LB:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: %[[UB:.*]] = arith.constant 9 : i32
+  ! CHECK-NEXT: %[[STEP:.*]] = arith.constant 1 : i32
+  ! CHECK-NEXT: omp.simd reduction(@[[REDUCER]] %[[X:.*]]#0 -> %[[X_RED:.*]] : !fir.ref<i32>) {
+  ! CHECK-NEXT: omp.loop_nest (%[[I:.*]]) : i32 = (%[[LB]]) to (%[[UB]]) inclusive step (%[[STEP]]) {
+  !$omp simd reduction(+:x)
+  do i=1, 9
+    ! CHECK: %[[X_DECL:.*]]:2 = hlfir.declare %[[X_RED]] {uniq_name = "_QFsimd_with_reduction_clauseEx"} : (!fir.ref<i32>) -> (!fir.ref<i32>, !fir.ref<i32>)
+    ! CHECK: fir.store %[[I]] to %[[LOCAL:.*]]#1 : !fir.ref<i32>
+    ! CHECK: %[[X_LD:.*]] = fir.load %[[X_DECL]]#0 : !fir.ref<i32>
+    ! CHECK: %[[I_LD:.*]] = fir.load %[[LOCAL]]#0 : !fir.ref<i32>
+    ! CHECK: %[[SUM:.*]] = arith.addi %[[X_LD]], %[[I_LD]] : i32
+    ! CHECK: hlfir.assign %[[SUM]] to %[[X_DECL]]#0 : i32, !fir.ref<i32>
+    x = x+i
+  end do
+  !$OMP end simd
+end subroutine
diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
index c6c6edb8f999..3217542e1056 100644
--- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
+++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp
@@ -2012,14 +2012,16 @@ void SimdOp::build(OpBuilder &builder, OperationState &state,
                    const SimdOperands &clauses) {
   MLIRContext *ctx = builder.getContext();
   // TODO Store clauses in op: linearVars, linearStepVars, privateVars,
-  // privateSyms, reductionVars, reductionByref, reductionSyms.
+  // privateSyms.
   SimdOp::build(builder, state, clauses.alignedVars,
                 makeArrayAttr(ctx, clauses.alignments), clauses.ifExpr,
                 /*linear_vars=*/{}, /*linear_step_vars=*/{},
                 clauses.nontemporalVars, clauses.order, clauses.orderMod,
                 /*private_vars=*/{}, /*private_syms=*/nullptr,
-                /*reduction_vars=*/{}, /*reduction_byref=*/nullptr,
-                /*reduction_syms=*/nullptr, clauses.safelen, clauses.simdlen);
+                clauses.reductionVars,
+                makeDenseBoolArrayAttr(ctx, clauses.reductionByref),
+                makeArrayAttr(ctx, clauses.reductionSyms), clauses.safelen,
+                clauses.simdlen);
 }
 
 LogicalResult SimdOp::verify() {
diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
index 4a575f4e5770..cb7dd3cd874d 100644
--- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
+++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp
@@ -1785,6 +1785,20 @@ convertOrderKind(std::optional<omp::ClauseOrderKind> o) {
   llvm_unreachable("Unknown ClauseOrderKind kind");
 }
 
+static LogicalResult simdOpSupported(omp::SimdOp op) {
+  if (!op.getLinearVars().empty() || !op.getLinearStepVars().empty())
+    return op.emitError("linear clause not yet supported");
+
+  if (!op.getPrivateVars().empty() || op.getPrivateSyms())
+    return op.emitError("privatization clauses not yet supported");
+
+  if (!op.getReductionVars().empty() || op.getReductionByref() ||
+      op.getReductionSyms())
+    return op.emitError("reduction clause not yet supported");
+
+  return success();
+}
+
 /// Converts an OpenMP simd loop into LLVM IR using OpenMPIRBuilder.
 static LogicalResult
 convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
@@ -1792,11 +1806,8 @@ convertOmpSimd(Operation &opInst, llvm::IRBuilderBase &builder,
   auto simdOp = cast<omp::SimdOp>(opInst);
   auto loopOp = cast<omp::LoopNestOp>(simdOp.getWrappedLoop());
 
-  if (!simdOp.getLinearVars().empty() || !simdOp.getLinearStepVars().empty() ||
-      !simdOp.getPrivateVars().empty() || simdOp.getPrivateSyms() ||
-      !simdOp.getReductionVars().empty() || simdOp.getReductionByref() ||
-      simdOp.getReductionSyms())
-    return opInst.emitError("unhandled clauses for translation to LLVM IR");
+  if (failed(simdOpSupported(simdOp)))
+    return failure();
 
   llvm::OpenMPIRBuilder::LocationDescription ompLoc(builder);
--
GitLab

From 70334081f75d67900c6ffa193c60c4d6f4767354 Mon Sep 17 00:00:00 2001
From: Simon Camphausen
Date: Wed, 16 Oct 2024 11:49:49 +0200
Subject: [PATCH 102/329] [mlir][bufferization] Expose buffer alignment as a
 pass option in one-shot-bufferize (#112505)

---
 mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td | 2 ++
 mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp      | 1 +
 2 files changed, 3 insertions(+)

diff --git a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
index a683a905cd2d..cc5463ea968f 100644
--- a/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
+++ b/mlir/include/mlir/Dialect/Bufferization/Transforms/Passes.td
@@ -536,6 +536,8 @@ def OneShotBufferize : Pass<"one-shot-bufferize", "ModuleOp"> {
     Option<"unknownTypeConversion", "unknown-type-conversion", "std::string",
            /*default=*/"\"fully-dynamic-layout-map\"",
            "Controls layout maps for non-inferrable memref types.">,
+    Option<"bufferAlignment", "buffer-alignment", "uint64_t", /*default=*/"64",
+           "Sets the alignment of newly allocated buffers.">,
   ];
   let constructor = "mlir::bufferization::createOneShotBufferizePass()";
 
diff --git a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
index 875d8c40e92c..1d009b03754c 100644
--- a/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
+++ b/mlir/lib/Dialect/Bufferization/Transforms/Bufferize.cpp
@@ -224,6 +224,7 @@ struct OneShotBufferizePass
       };
     }
     opt.printConflicts = printConflicts;
+    opt.bufferAlignment = bufferAlignment;
    opt.testAnalysisOnly = testAnalysisOnly;
    opt.bufferizeFunctionBoundaries = bufferizeFunctionBoundaries;
    opt.checkParallelRegions = checkParallelRegions;
--
GitLab

From 467a9bde06e681cecc69afa18580aadf2ed9769b Mon Sep 17 00:00:00 2001
From: Youngsuk Kim
Date: Wed, 16 Oct 2024 06:14:38 -0400
Subject: [PATCH 103/329] [polly] Avoid llvm::Type::getPointerTo() (NFC)
 (#112368)

`llvm::Type::getPointerTo()` is to be deprecated & removed soon. Also,
avoid pointer casts that are essentially no-ops.

---
 polly/lib/CodeGen/BlockGenerators.cpp    | 6 ------
 polly/lib/CodeGen/IslNodeBuilder.cpp     | 2 --
 polly/lib/CodeGen/LoopGeneratorsGOMP.cpp | 2 +-
 3 files changed, 1 insertion(+), 9 deletions(-)

diff --git a/polly/lib/CodeGen/BlockGenerators.cpp b/polly/lib/CodeGen/BlockGenerators.cpp
index c58763603cfa..b76d8f4c18a5 100644
--- a/polly/lib/CodeGen/BlockGenerators.cpp
+++ b/polly/lib/CodeGen/BlockGenerators.cpp
@@ -786,12 +786,6 @@ void BlockGenerator::generateScalarStores(
            Builder.GetInsertBlock())) &&
        "Domination violation");
 
-    // The new Val might have a different type than the old Val due to
-    // ScalarEvolution looking through bitcasts.
-    Address = Builder.CreateBitOrPointerCast(
-        Address, Val->getType()->getPointerTo(
-                     Address->getType()->getPointerAddressSpace()));
-
     Builder.CreateStore(Val, Address);
   });
 }
diff --git a/polly/lib/CodeGen/IslNodeBuilder.cpp b/polly/lib/CodeGen/IslNodeBuilder.cpp
index 3f07f02038a1..d76f6251ea4c 100644
--- a/polly/lib/CodeGen/IslNodeBuilder.cpp
+++ b/polly/lib/CodeGen/IslNodeBuilder.cpp
@@ -1050,8 +1050,6 @@ Value *IslNodeBuilder::preloadUnconditionally(__isl_take isl_set *AccessRange,
   auto *Ptr = AddressValue;
   auto Name = Ptr->getName();
-  auto AS = Ptr->getType()->getPointerAddressSpace();
-  Ptr = Builder.CreatePointerCast(Ptr, Ty->getPointerTo(AS), Name + ".cast");
   PreloadVal = Builder.CreateLoad(Ty, Ptr, Name + ".load");
   if (LoadInst *PreloadInst = dyn_cast<LoadInst>(PreloadVal))
     PreloadInst->setAlignment(cast<LoadInst>(AccInst)->getAlign());
diff --git a/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp b/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
index cd440b28202e..b98416a92097 100644
--- a/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
+++ b/polly/lib/CodeGen/LoopGeneratorsGOMP.cpp
@@ -183,7 +183,7 @@ Value *ParallelLoopGeneratorGOMP::createCallGetWorkItem(Value *LBPtr,
   // If F is not available, declare it.
   if (!F) {
     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
-    Type *Params[] = {LongType->getPointerTo(), LongType->getPointerTo()};
+    Type *Params[] = {Builder.getPtrTy(0), Builder.getPtrTy(0)};
     FunctionType *Ty = FunctionType::get(Builder.getInt8Ty(), Params, false);
     F = Function::Create(Ty, Linkage, Name, M);
   }
--
GitLab

From 25b702f2637d5520cacf59e6a92e52956ccc7e8d Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 16 Oct 2024 11:13:16 +0100
Subject: [PATCH 104/329] [DAG] visitXOR - add missing comment for or/and
 constant setcc demorgan fold. NFC.

Noticed while triaging #112347, which uses this fold: we described the
or->and fold, but not the equivalent and->or fold, which is also handled.
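
For reference, the two folds are the i1 De Morgan identities. A minimal C++
illustration of the equivalences the comments now document (illustrative
only, not part of the patch):

  // !(x || y) == (!x) && (!y)   -> the (not (or x, y)) fold
  // !(x && y) == (!x) || (!y)   -> the (not (and x, y)) fold
  bool notOr(bool x, bool y) { return !(x || y); }
  bool notAnd(bool x, bool y) { return !(x && y); }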
---
 llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
index 608ee8558e99..0879165aac13 100644
--- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
+++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp
@@ -9594,6 +9594,7 @@ SDValue DAGCombiner::visitXOR(SDNode *N) {
   }
 
   // fold (not (or x, y)) -> (and (not x), (not y)) iff x or y are setcc
+  // fold (not (and x, y)) -> (or (not x), (not y)) iff x or y are setcc
   if (isOneConstant(N1) && VT == MVT::i1 && N0.hasOneUse() &&
       (N0Opcode == ISD::OR || N0Opcode == ISD::AND)) {
     SDValue N00 = N0.getOperand(0), N01 = N0.getOperand(1);
--
GitLab

From 9ee9e0e3b2323e7cca00a5223ace5e25e7ed1c1f Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 16 Oct 2024 11:15:29 +0100
Subject: [PATCH 105/329] [X86] Extend ANDNOT fold tests to cover all legal
 scalars and 256-bit vectors

Add tests to check what happens on i8/i16/i32 scalars (ANDN only has
i32/i64 variants)

---
 llvm/test/CodeGen/X86/pr108731.ll | 127 ++++++++++++++++++++++++++++--
 1 file changed, 121 insertions(+), 6 deletions(-)

diff --git a/llvm/test/CodeGen/X86/pr108731.ll b/llvm/test/CodeGen/X86/pr108731.ll
index 87dce0314d08..473b4f7f4da2 100644
--- a/llvm/test/CodeGen/X86/pr108731.ll
+++ b/llvm/test/CodeGen/X86/pr108731.ll
@@ -2,8 +2,8 @@
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v2 | FileCheck %s --check-prefixes=CHECK,NOBMI
 ; RUN: llc < %s -mtriple=x86_64-- -mcpu=x86-64-v3 | FileCheck %s --check-prefixes=CHECK,BMI
 
-define i64 @foo(i64 %w, i64 %x, i64 %y, i64 %z) {
-; NOBMI-LABEL: foo:
+define i64 @test_i64(i64 %w, i64 %x, i64 %y, i64 %z) {
+; NOBMI-LABEL: test_i64:
 ; NOBMI:       # %bb.0: # %Entry
 ; NOBMI-NEXT:    movq %rcx, %rax
 ; NOBMI-NEXT:    andq %rdx, %rsi
@@ -14,7 +14,7 @@ define i64 @foo(i64 %w, i64 %x, i64 %y, i64 %z) {
 ; NOBMI-NEXT:    andq %rsi, %rax
 ; NOBMI-NEXT:    retq
 ;
-; BMI-LABEL: foo:
+; BMI-LABEL: test_i64:
 ; BMI:       # %bb.0: # %Entry
 ; BMI-NEXT:    andq %rdx, %rsi
 ; BMI-NEXT:    andnq %rdi, %rsi, %rax
@@ -31,8 +31,91 @@ Entry:
   ret i64 %and3
 }
 
-define <16 x i8> @fooVec(<16 x i8> %w, <16 x i8> %x, <16 x i8> %y, <16 x i8> %z) {
-; NOBMI-LABEL: fooVec:
+define i32 @test_i32(i32 %w, i32 %x, i32 %y, i32 %z) {
+; NOBMI-LABEL: test_i32:
+; NOBMI:       # %bb.0: # %Entry
+; NOBMI-NEXT:    movl %ecx, %eax
+; NOBMI-NEXT:    andl %edx, %esi
+; NOBMI-NEXT:    notl %esi
+; NOBMI-NEXT:    andl %edi, %esi
+; NOBMI-NEXT:    notl %eax
+; NOBMI-NEXT:    orl %edx, %eax
+; NOBMI-NEXT:    andl %esi, %eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: test_i32:
+; BMI:       # %bb.0: # %Entry
+; BMI-NEXT:    andl %edx, %esi
+; BMI-NEXT:    andnl %edi, %esi, %eax
+; BMI-NEXT:    andnl %ecx, %edx, %ecx
+; BMI-NEXT:    andnl %eax, %ecx, %eax
+; BMI-NEXT:    retq
+Entry:
+  %and1 = and i32 %y, %x
+  %xor1 = xor i32 %and1, -1
+  %and2 = and i32 %xor1, %w
+  %.not = xor i32 %z, -1
+  %or1 = or i32 %.not, %y
+  %and3 = and i32 %and2, %or1
+  ret i32 %and3
+}
+
+define i16 @test_i16(i16 %w, i16 %x, i16 %y, i16 %z) {
+; NOBMI-LABEL: test_i16:
+; NOBMI:       # %bb.0: # %Entry
+; NOBMI-NEXT:    movl %ecx, %eax
+; NOBMI-NEXT:    andl %edx, %esi
+; NOBMI-NEXT:    notl %esi
+; NOBMI-NEXT:    andl %edi, %esi
+; NOBMI-NEXT:    notl %eax
+; NOBMI-NEXT:    orl %edx, %eax
+; NOBMI-NEXT:    andl %esi, %eax
+; NOBMI-NEXT:    # kill: def $ax killed $ax killed $eax
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: test_i16:
+; BMI:       # %bb.0: # %Entry
+; BMI-NEXT:    andl %edx, %esi
+; BMI-NEXT:    andnl %edi, %esi, %eax
+; BMI-NEXT:    notl %ecx
+; BMI-NEXT:    orl %edx, %ecx
+; BMI-NEXT:    andl %ecx, %eax
+; BMI-NEXT:    # kill: def $ax killed $ax killed $eax
+; BMI-NEXT:    retq
+Entry:
+  %and1 = and i16 %y, %x
+  %xor1 = xor i16 %and1, -1
+  %and2 = and i16 %xor1, %w
+  %.not = xor i16 %z, -1
+  %or1 = or i16 %.not, %y
+  %and3 = and i16 %and2, %or1
+  ret i16 %and3
+}
+
+define i8 @test_i8(i8 %w, i8 %x, i8 %y, i8 %z) {
+; CHECK-LABEL: test_i8:
+; CHECK:       # %bb.0: # %Entry
+; CHECK-NEXT:    movl %edx, %eax
+; CHECK-NEXT:    andl %edx, %esi
+; CHECK-NEXT:    notb %sil
+; CHECK-NEXT:    andb %dil, %sil
+; CHECK-NEXT:    notb %cl
+; CHECK-NEXT:    orb %cl, %al
+; CHECK-NEXT:    andb %sil, %al
+; CHECK-NEXT:    # kill: def $al killed $al killed $eax
+; CHECK-NEXT:    retq
+Entry:
+  %and1 = and i8 %y, %x
+  %xor1 = xor i8 %and1, -1
+  %and2 = and i8 %xor1, %w
+  %.not = xor i8 %z, -1
+  %or1 = or i8 %.not, %y
+  %and3 = and i8 %and2, %or1
+  ret i8 %and3
+}
+
+define <16 x i8> @test_v16i8(<16 x i8> %w, <16 x i8> %x, <16 x i8> %y, <16 x i8> %z) {
+; NOBMI-LABEL: test_v16i8:
 ; NOBMI:       # %bb.0: # %Entry
 ; NOBMI-NEXT:    andps %xmm2, %xmm1
 ; NOBMI-NEXT:    andnps %xmm0, %xmm1
@@ -41,7 +124,7 @@ define <16 x i8> @fooVec(<16 x i8> %w, <16 x i8> %x, <16 x i8> %y, <16 x i8> %z)
 ; NOBMI-NEXT:    movaps %xmm2, %xmm0
 ; NOBMI-NEXT:    retq
 ;
-; BMI-LABEL: fooVec:
+; BMI-LABEL: test_v16i8:
 ; BMI:       # %bb.0: # %Entry
 ; BMI-NEXT:    vandps %xmm1, %xmm2, %xmm1
 ; BMI-NEXT:    vandnps %xmm0, %xmm1, %xmm0
@@ -58,6 +141,38 @@ Entry:
   ret <16 x i8> %and3
 }
 
+define <32 x i8> @test_v32i8(<32 x i8> %w, <32 x i8> %x, <32 x i8> %y, <32 x i8> %z) {
+; NOBMI-LABEL: test_v32i8:
+; NOBMI:       # %bb.0: # %Entry
+; NOBMI-NEXT:    andps %xmm4, %xmm2
+; NOBMI-NEXT:    andps %xmm5, %xmm3
+; NOBMI-NEXT:    andnps %xmm1, %xmm3
+; NOBMI-NEXT:    andnps %xmm0, %xmm2
+; NOBMI-NEXT:    andnps %xmm6, %xmm4
+; NOBMI-NEXT:    andnps %xmm2, %xmm4
+; NOBMI-NEXT:    andnps %xmm7, %xmm5
+; NOBMI-NEXT:    andnps %xmm3, %xmm5
+; NOBMI-NEXT:    movaps %xmm4, %xmm0
+; NOBMI-NEXT:    movaps %xmm5, %xmm1
+; NOBMI-NEXT:    retq
+;
+; BMI-LABEL: test_v32i8:
+; BMI:       # %bb.0: # %Entry
+; BMI-NEXT:    vandps %ymm1, %ymm2, %ymm1
+; BMI-NEXT:    vandnps %ymm0, %ymm1, %ymm0
+; BMI-NEXT:    vandnps %ymm3, %ymm2, %ymm1
+; BMI-NEXT:    vandnps %ymm0, %ymm1, %ymm0
+; BMI-NEXT:    retq
+Entry:
+  %and1 = and <32 x i8> %y, %x
+  %xor1 = xor <32 x i8> %and1, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %and2 = and <32 x i8> %xor1, %w
+  %.not = xor <32 x i8> %z, <i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1, i8 -1>
+  %or1 = or <32 x i8> %.not, %y
+  %and3 = and <32 x i8> %and2, %or1
+  ret <32 x i8> %and3
+}
+
 ; PR112347 - don't fold if we'd be inverting a constant, as demorgan normalisation will invert it back again.
 define void @PR112347(ptr %p0, ptr %p1, ptr %p2) {
 ; CHECK-LABEL: PR112347:
--
GitLab

From d3d2d72549e03403317ce325c17ad7d516643e2b Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 16 Oct 2024 11:52:58 +0100
Subject: [PATCH 106/329] [DAG] SDPatternMatch - add missing BSWAP/CTPOP/CTTZ
 matchers

---
 llvm/include/llvm/CodeGen/SDPatternMatch.h   | 13 +++++++++++++
 .../CodeGen/SelectionDAGPatternMatchTest.cpp | 15 +++++++++++++++
 2 files changed, 28 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index 0af4f73b869c..f0e12f28c9e6 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -822,6 +822,11 @@ inline UnaryOpc_match<Opnd, true> m_ChainedUnaryOp(unsigned Opc,
   return UnaryOpc_match<Opnd, true>(Opc, Op);
 }
 
+template <typename Opnd>
+inline UnaryOpc_match<Opnd> m_BSwap(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::BSWAP, Op);
+}
+
 template <typename Opnd>
 inline UnaryOpc_match<Opnd> m_BitReverse(const Opnd &Op) {
   return UnaryOpc_match<Opnd>(ISD::BITREVERSE, Op);
@@ -892,10 +897,18 @@ template <typename Opnd> inline UnaryOpc_match<Opnd> m_FPToSI(const Opnd &Op) {
   return UnaryOpc_match<Opnd>(ISD::FP_TO_SINT, Op);
 }
 
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_Ctpop(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::CTPOP, Op);
+}
+
 template <typename Opnd> inline UnaryOpc_match<Opnd> m_Ctlz(const Opnd &Op) {
   return UnaryOpc_match<Opnd>(ISD::CTLZ, Op);
 }
 
+template <typename Opnd> inline UnaryOpc_match<Opnd> m_Cttz(const Opnd &Op) {
+  return UnaryOpc_match<Opnd>(ISD::CTTZ, Op);
+}
+
 // === Constants ===
 struct ConstantInt_match {
   APInt *BindVal;
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index 7400b6c1984f..a28e2b2b47df 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -302,7 +302,12 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
   SDValue FPToSI = DAG->getNode(ISD::FP_TO_SINT, DL, FloatVT, Op2);
   SDValue FPToUI = DAG->getNode(ISD::FP_TO_UINT, DL, FloatVT, Op2);
 
+  SDValue Brev = DAG->getNode(ISD::BITREVERSE, DL, Int32VT, Op0);
+  SDValue Bswap = DAG->getNode(ISD::BSWAP, DL, Int32VT, Op0);
+
+  SDValue Ctpop = DAG->getNode(ISD::CTPOP, DL, Int32VT, Op0);
   SDValue Ctlz = DAG->getNode(ISD::CTLZ, DL, Int32VT, Op0);
+  SDValue Cttz = DAG->getNode(ISD::CTTZ, DL, Int32VT, Op0);
 
   using namespace SDPatternMatch;
   EXPECT_TRUE(sd_match(ZExt, m_UnaryOp(ISD::ZERO_EXTEND, m_Value())));
@@ -328,7 +333,17 @@ TEST_F(SelectionDAGPatternMatchTest, matchUnaryOp) {
   EXPECT_FALSE(sd_match(FPToUI, m_FPToSI(m_Value())));
   EXPECT_FALSE(sd_match(FPToSI, m_FPToUI(m_Value())));
 
+  EXPECT_TRUE(sd_match(Brev, m_BitReverse(m_Value())));
+  EXPECT_TRUE(sd_match(Bswap, m_BSwap(m_Value())));
+  EXPECT_FALSE(sd_match(Brev, m_BSwap(m_Value())));
+  EXPECT_FALSE(sd_match(Bswap, m_BitReverse(m_Value())));
+
+  EXPECT_TRUE(sd_match(Ctpop, m_Ctpop(m_Value())));
   EXPECT_TRUE(sd_match(Ctlz, m_Ctlz(m_Value())));
+  EXPECT_TRUE(sd_match(Cttz, m_Cttz(m_Value())));
+  EXPECT_FALSE(sd_match(Ctpop, m_Ctlz(m_Value())));
+  EXPECT_FALSE(sd_match(Ctlz, m_Cttz(m_Value())));
+  EXPECT_FALSE(sd_match(Cttz, m_Ctlz(m_Value())));
 }
 
 TEST_F(SelectionDAGPatternMatchTest, matchConstants) {
--
GitLab

From 49fa91edf7704dc385d3a97ddb74b7348be10bc7 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Wed, 16 Oct 2024 11:57:18 +0100
Subject: [PATCH 107/329] [DAG] SDPatternMatch - add missing ROTL/ROTR matchers

---
 llvm/include/llvm/CodeGen/SDPatternMatch.h             | 10 ++++++++++
 .../unittests/CodeGen/SelectionDAGPatternMatchTest.cpp |  7 +++++++
 2 files changed, 17 insertions(+)

diff --git a/llvm/include/llvm/CodeGen/SDPatternMatch.h b/llvm/include/llvm/CodeGen/SDPatternMatch.h
index f0e12f28c9e6..b3e249b7ebd5 100644
--- a/llvm/include/llvm/CodeGen/SDPatternMatch.h
+++ b/llvm/include/llvm/CodeGen/SDPatternMatch.h
@@ -759,6 +759,16 @@ inline BinaryOpc_match<LHS, RHS> m_Srl(const LHS &L, const RHS &R) {
   return BinaryOpc_match<LHS, RHS>(ISD::SRL, L, R);
 }
 
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS> m_Rotl(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS>(ISD::ROTL, L, R);
+}
+
+template <typename LHS, typename RHS>
+inline BinaryOpc_match<LHS, RHS> m_Rotr(const LHS &L, const RHS &R) {
+  return BinaryOpc_match<LHS, RHS>(ISD::ROTR, L, R);
+}
+
 template <typename LHS, typename RHS>
 inline BinaryOpc_match<LHS, RHS> m_FAdd(const LHS &L, const RHS &R) {
   return BinaryOpc_match<LHS, RHS>(ISD::FADD, L, R);
diff --git a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
index a28e2b2b47df..dc40e5893b65 100644
--- a/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
+++ b/llvm/unittests/CodeGen/SelectionDAGPatternMatchTest.cpp
@@ -200,6 +200,8 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
   SDValue SMin = DAG->getNode(ISD::SMIN, DL, Int32VT, Op1, Op0);
   SDValue UMax = DAG->getNode(ISD::UMAX, DL, Int32VT, Op0, Op1);
   SDValue UMin = DAG->getNode(ISD::UMIN, DL, Int32VT, Op1, Op0);
+  SDValue Rotl = DAG->getNode(ISD::ROTL, DL, Int32VT, Op0, Op1);
+  SDValue Rotr = DAG->getNode(ISD::ROTR, DL, Int32VT, Op1, Op0);
 
   SDValue ICMP_GT = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETGT);
   SDValue ICMP_GE = DAG->getSetCC(DL, MVT::i1, Op0, Op1, ISD::SETGE);
@@ -246,6 +248,11 @@ TEST_F(SelectionDAGPatternMatchTest, matchBinaryOp) {
   EXPECT_FALSE(sd_match(DisOr, m_Add(m_Value(), m_Value())));
   EXPECT_TRUE(sd_match(DisOr, m_AddLike(m_Value(), m_Value())));
 
+  EXPECT_TRUE(sd_match(Rotl, m_Rotl(m_Value(), m_Value())));
+  EXPECT_TRUE(sd_match(Rotr, m_Rotr(m_Value(), m_Value())));
+  EXPECT_FALSE(sd_match(Rotl, m_Rotr(m_Value(), m_Value())));
+  EXPECT_FALSE(sd_match(Rotr, m_Rotl(m_Value(), m_Value())));
+
   EXPECT_TRUE(sd_match(SMax, m_c_BinOp(ISD::SMAX, m_Value(), m_Value())));
   EXPECT_TRUE(sd_match(SMax, m_SMax(m_Value(), m_Value())));
   EXPECT_TRUE(sd_match(SMax, m_SMaxLike(m_Value(), m_Value())));
--
GitLab

From 3ef630ac339f31686290f9460a40eb2a9c9f5bd0 Mon Sep 17 00:00:00 2001
From: =?UTF-8?q?Stefan=20Gr=C3=A4nitz?=
Date: Wed, 16 Oct 2024 13:07:02 +0200
Subject: [PATCH 108/329] [lldb] Support tests with nested make invocations on
 Windows 2/2 (#112360)

Following up from https://github.com/llvm/llvm-project/pull/112342, we
roll out the fix and quote nested `make` invocations in all API tests.
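
The underlying failure mode is plain word splitting: on Windows the make
binary commonly lives under a path containing spaces, so the unquoted
expansion breaks the recursive invocation. A minimal illustration (the path
is hypothetical):

  # With MAKE = C:/Program Files/GnuWin32/bin/make.exe
  lib:
  	$(MAKE) -f $(MAKEFILE_RULES)    # word-splits; tries to run "C:/Program"
  	"$(MAKE)" -f $(MAKEFILE_RULES)  # stays a single word and runs make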
--- .../commands/expression/top-level/Makefile | 2 +- .../commands/expression/weak_symbols/Makefile | 4 ++-- .../API/commands/target/create-deps/Makefile | 2 +- .../breakpoint/break_in_loaded_dylib/Makefile | 2 +- .../dlopen_other_executable/Makefile | 2 +- lldb/test/API/functionalities/exec/Makefile | 2 +- .../functionalities/jitloader_gdb/Makefile | 2 +- .../functionalities/limit-debug-info/Makefile | 4 ++-- .../load_after_attach/Makefile | 2 +- .../API/functionalities/load_lazy/Makefile | 6 +++--- .../API/functionalities/load_unload/Makefile | 10 +++++----- .../functionalities/load_using_paths/Makefile | 2 +- .../functionalities/scripted_process/Makefile | 2 +- .../stop-on-sharedlibrary-load/Makefile | 4 ++-- .../tail_call_frames/cross_dso/Makefile | 2 +- .../target-new-solib-notifications/Makefile | 20 +++++++++---------- .../API/lang/c/conflicting-symbol/Makefile | 2 +- .../API/lang/cpp/incomplete-types/Makefile | 2 +- .../lang/cpp/namespace_definitions/Makefile | 4 ++-- .../lang/objc/conflicting-definition/Makefile | 4 ++-- .../lang/objc/modules-hash-mismatch/Makefile | 2 +- .../API/macosx/delay-init-dependency/Makefile | 2 +- .../API/macosx/expedited-thread-pcs/Makefile | 2 +- lldb/test/API/macosx/indirect_symbol/Makefile | 4 ++-- .../API/macosx/lc-note/kern-ver-str/Makefile | 2 +- .../lc-note/multiple-binary-corefile/Makefile | 4 ++-- .../macCatalystAppMacOSFramework/Makefile | 2 +- lldb/test/API/macosx/skinny-corefile/Makefile | 4 ++-- .../API/tools/lldb-dap/breakpoint/Makefile | 2 +- .../tools/lldb-server/libraries-svr4/Makefile | 4 ++-- 30 files changed, 54 insertions(+), 54 deletions(-) diff --git a/lldb/test/API/commands/expression/top-level/Makefile b/lldb/test/API/commands/expression/top-level/Makefile index e5e9e78d4ead..51b27ddbb3c2 100644 --- a/lldb/test/API/commands/expression/top-level/Makefile +++ b/lldb/test/API/commands/expression/top-level/Makefile @@ -5,6 +5,6 @@ all: dummy include Makefile.rules dummy: dummy.cpp - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ CXX_SOURCES=dummy.cpp EXE=dummy diff --git a/lldb/test/API/commands/expression/weak_symbols/Makefile b/lldb/test/API/commands/expression/weak_symbols/Makefile index 6fd8133312ad..1636e9b30326 100644 --- a/lldb/test/API/commands/expression/weak_symbols/Makefile +++ b/lldb/test/API/commands/expression/weak_symbols/Makefile @@ -9,12 +9,12 @@ a.out: libdylib.dylib include Makefile.rules libdylib.dylib: dylib.c - $(MAKE) -C $(BUILDDIR) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -C $(BUILDDIR) -f $(MAKEFILE_RULES) \ C_SOURCES= DYLIB_C_SOURCES=dylib.c DYLIB_NAME=dylib \ CFLAGS_EXTRAS=-DHAS_THEM LD_EXTRAS=-dynamiclib hidden/libdylib.dylib: mkdir hidden - $(MAKE) -C $(BUILDDIR)/hidden -f $(MAKEFILE_RULES) \ + "$(MAKE)" -C $(BUILDDIR)/hidden -f $(MAKEFILE_RULES) \ C_SOURCES= DYLIB_C_SOURCES=dylib.c DYLIB_NAME=dylib \ LD_EXTRAS=-dynamiclib diff --git a/lldb/test/API/commands/target/create-deps/Makefile b/lldb/test/API/commands/target/create-deps/Makefile index 3e5b1049b5a1..866d550ee7d0 100644 --- a/lldb/test/API/commands/target/create-deps/Makefile +++ b/lldb/test/API/commands/target/create-deps/Makefile @@ -6,5 +6,5 @@ a.out: libload_a include Makefile.rules libload_a: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=load_a DYLIB_CXX_SOURCES=a.cpp diff --git a/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile b/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile index 0f3fb37bdadf..112210e7e2c6 100644 --- 
a/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile +++ b/lldb/test/API/functionalities/breakpoint/break_in_loaded_dylib/Makefile @@ -2,7 +2,7 @@ CXX_SOURCES := main.cpp USE_LIBDL := 1 lib_b: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=lib_b all: lib_b diff --git a/lldb/test/API/functionalities/dlopen_other_executable/Makefile b/lldb/test/API/functionalities/dlopen_other_executable/Makefile index 113b9fd7d3f1..51fc01bdde75 100644 --- a/lldb/test/API/functionalities/dlopen_other_executable/Makefile +++ b/lldb/test/API/functionalities/dlopen_other_executable/Makefile @@ -2,7 +2,7 @@ C_SOURCES := main.c USE_LIBDL := 1 other: - $(MAKE) -f $(MAKEFILE_RULES) C_SOURCES=other.c EXE=other + "$(MAKE)" -f $(MAKEFILE_RULES) C_SOURCES=other.c EXE=other all: other include Makefile.rules diff --git a/lldb/test/API/functionalities/exec/Makefile b/lldb/test/API/functionalities/exec/Makefile index 8b9148ea8a35..65d4680077d2 100644 --- a/lldb/test/API/functionalities/exec/Makefile +++ b/lldb/test/API/functionalities/exec/Makefile @@ -5,5 +5,5 @@ all: secondprog include Makefile.rules secondprog: secondprog.cpp - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ CXX_SOURCES=secondprog.cpp EXE=secondprog diff --git a/lldb/test/API/functionalities/jitloader_gdb/Makefile b/lldb/test/API/functionalities/jitloader_gdb/Makefile index 357b1f83684f..9998cc9cf833 100644 --- a/lldb/test/API/functionalities/jitloader_gdb/Makefile +++ b/lldb/test/API/functionalities/jitloader_gdb/Makefile @@ -5,5 +5,5 @@ all: a.out simple include Makefile.rules simple: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ C_SOURCES=simple.c EXE=simple diff --git a/lldb/test/API/functionalities/limit-debug-info/Makefile b/lldb/test/API/functionalities/limit-debug-info/Makefile index 874b3a15e0fe..fa867a7aeb7c 100644 --- a/lldb/test/API/functionalities/limit-debug-info/Makefile +++ b/lldb/test/API/functionalities/limit-debug-info/Makefile @@ -17,11 +17,11 @@ include Makefile.rules a.out: libone libtwo libone: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=one.cpp DYLIB_NAME=one \ CFLAGS_EXTRAS="$(ONE_CXXFLAGS)" libtwo: libone - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=two.cpp DYLIB_NAME=two \ CFLAGS_EXTRAS="$(TWO_CXXFLAGS)" LD_EXTRAS="-L. -lone" diff --git a/lldb/test/API/functionalities/load_after_attach/Makefile b/lldb/test/API/functionalities/load_after_attach/Makefile index 0f3fb37bdadf..112210e7e2c6 100644 --- a/lldb/test/API/functionalities/load_after_attach/Makefile +++ b/lldb/test/API/functionalities/load_after_attach/Makefile @@ -2,7 +2,7 @@ CXX_SOURCES := main.cpp USE_LIBDL := 1 lib_b: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=lib_b all: lib_b diff --git a/lldb/test/API/functionalities/load_lazy/Makefile b/lldb/test/API/functionalities/load_lazy/Makefile index 81bc7dcb4d05..8e1d06b1e39c 100644 --- a/lldb/test/API/functionalities/load_lazy/Makefile +++ b/lldb/test/API/functionalities/load_lazy/Makefile @@ -17,13 +17,13 @@ else endif t1: t2_0 - $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ + "$(MAKE)" VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_C_SOURCES=t1.c DYLIB_NAME=t1 LD_EXTRAS="-L. 
$(LINKFLAGS)" t2_0: - $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ + "$(MAKE)" VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_0.c DYLIB_NAME=t2_0 t2_1: - $(MAKE) VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ + "$(MAKE)" VPATH=$(SRCDIR) -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_C_SOURCES=t2_1.c DYLIB_NAME=t2_1 diff --git a/lldb/test/API/functionalities/load_unload/Makefile b/lldb/test/API/functionalities/load_unload/Makefile index e73ec7310876..dd7d16029427 100644 --- a/lldb/test/API/functionalities/load_unload/Makefile +++ b/lldb/test/API/functionalities/load_unload/Makefile @@ -7,25 +7,25 @@ a.out: lib_b lib_a lib_c lib_d hidden_lib_d include Makefile.rules lib_a: lib_b - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=loadunload_a \ LD_EXTRAS="-L. -lloadunload_b" lib_b: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=loadunload_b lib_c: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=loadunload_c lib_d: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=loadunload_d ifeq ($(OS),Darwin) install_name_tool -id @executable_path/libloadunload_d.dylib libloadunload_d.dylib endif hidden_lib_d: hidden - $(MAKE) VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \ + "$(MAKE)" VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=loadunload_d diff --git a/lldb/test/API/functionalities/load_using_paths/Makefile b/lldb/test/API/functionalities/load_using_paths/Makefile index 814a96013756..f973a389d585 100644 --- a/lldb/test/API/functionalities/load_using_paths/Makefile +++ b/lldb/test/API/functionalities/load_using_paths/Makefile @@ -6,6 +6,6 @@ all: hidden_lib a.out include Makefile.rules hidden_lib: - $(MAKE) VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \ + "$(MAKE)" VPATH=$(SRCDIR)/hidden -C hidden -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=loadunload diff --git a/lldb/test/API/functionalities/scripted_process/Makefile b/lldb/test/API/functionalities/scripted_process/Makefile index ba739451fc7e..d4f12fbb3c4e 100644 --- a/lldb/test/API/functionalities/scripted_process/Makefile +++ b/lldb/test/API/functionalities/scripted_process/Makefile @@ -9,7 +9,7 @@ CXXFLAGS_EXTRAS := -target $(TRIPLE) all: libbaz.dylib a.out libbaz.dylib: baz.cpp - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=baz DYLIB_CXX_SOURCES=baz.cpp include Makefile.rules diff --git a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile index 4abcab84eac2..e4b0e86c0c36 100644 --- a/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile +++ b/lldb/test/API/functionalities/stop-on-sharedlibrary-load/Makefile @@ -6,11 +6,11 @@ a.out: lib_a lib_b include Makefile.rules lib_a: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=load_a lib_b: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=load_b diff --git a/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile b/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile index 42c010be9a03..963ce2ac94d9 100644 --- 
a/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile +++ b/lldb/test/API/functionalities/tail_call_frames/cross_dso/Makefile @@ -10,4 +10,4 @@ a.out: lib_One lib_Two lib_One: lib_Two lib_%: - $(MAKE) VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk DSYMUTIL=$(DSYMUTIL) + "$(MAKE)" VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk DSYMUTIL=$(DSYMUTIL) diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile index 6c61d210eeb2..e3b48697fd78 100644 --- a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile +++ b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile @@ -1,23 +1,23 @@ CXX_SOURCES := main.cpp -LD_EXTRAS := -L. -l_d -l_c -l_a -l_b +LD_EXTRAS := -L. -l_d -l_c -l_a -l_b a.out: lib_b lib_a lib_c lib_d include Makefile.rules lib_a: lib_b - $(MAKE) -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=_a \ - LD_EXTRAS="-L. -l_b" + "$(MAKE)" -f $(MAKEFILE_RULES) \ + DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=_a \ + LD_EXTRAS="-L. -l_b" lib_b: - $(MAKE) -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=_b + "$(MAKE)" -f $(MAKEFILE_RULES) \ + DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=_b lib_c: - $(MAKE) -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=_c + "$(MAKE)" -f $(MAKEFILE_RULES) \ + DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=_c lib_d: - $(MAKE) -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=_d + "$(MAKE)" -f $(MAKEFILE_RULES) \ + DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=_d diff --git a/lldb/test/API/lang/c/conflicting-symbol/Makefile b/lldb/test/API/lang/c/conflicting-symbol/Makefile index 81594a1265da..1331c4e1ebfa 100644 --- a/lldb/test/API/lang/c/conflicting-symbol/Makefile +++ b/lldb/test/API/lang/c/conflicting-symbol/Makefile @@ -7,4 +7,4 @@ include Makefile.rules a.out: lib_One lib_Two lib_%: - $(MAKE) VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk + "$(MAKE)" VPATH=$(SRCDIR)/$* -I $(SRCDIR) -f $(SRCDIR)/$*.mk diff --git a/lldb/test/API/lang/cpp/incomplete-types/Makefile b/lldb/test/API/lang/cpp/incomplete-types/Makefile index f42ac2e81cc7..0cf3f6a31caa 100644 --- a/lldb/test/API/lang/cpp/incomplete-types/Makefile +++ b/lldb/test/API/lang/cpp/incomplete-types/Makefile @@ -16,7 +16,7 @@ main.o: CFLAGS_EXTRAS = -flimit-debug-info limit: a.o main.o mkdir -p build_limit - $(MAKE) -C $(BUILDDIR)/build_limit -f $(MAKEFILE_RULES) \ + "$(MAKE)" -C $(BUILDDIR)/build_limit -f $(MAKEFILE_RULES) \ EXE=../limit CXX_SOURCES="length.cpp ../a.o ../main.o" \ CFLAGS_EXTRAS=-flimit-debug-info NO_LIMIT_DEBUG_INFO_FLAGS="" diff --git a/lldb/test/API/lang/cpp/namespace_definitions/Makefile b/lldb/test/API/lang/cpp/namespace_definitions/Makefile index fc9165f67f42..b17d70fc9287 100644 --- a/lldb/test/API/lang/cpp/namespace_definitions/Makefile +++ b/lldb/test/API/lang/cpp/namespace_definitions/Makefile @@ -6,10 +6,10 @@ a.out: liba libb include Makefile.rules liba: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=a DYLIB_CXX_SOURCES=a.cpp libb: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=b DYLIB_CXX_SOURCES=b.cpp diff --git a/lldb/test/API/lang/objc/conflicting-definition/Makefile b/lldb/test/API/lang/objc/conflicting-definition/Makefile index 00a0769a086f..cba79c94d46b 100644 --- 
a/lldb/test/API/lang/objc/conflicting-definition/Makefile +++ b/lldb/test/API/lang/objc/conflicting-definition/Makefile @@ -9,14 +9,14 @@ include Makefile.rules libTest.dylib: Test/Test.m mkdir -p Test - $(MAKE) MAKE_DSYM=YES -f $(MAKEFILE_RULES) \ + "$(MAKE)" MAKE_DSYM=YES -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=Test DYLIB_OBJC_SOURCES=Test/Test.m \ LD_EXTRAS="-lobjc -framework Foundation" \ CFLAGS_EXTRAS=-I$(SRCDIR) libTestExt.dylib: TestExt/TestExt.m mkdir -p TestExt - $(MAKE) MAKE_DSYM=YES -f $(MAKEFILE_RULES) \ + "$(MAKE)" MAKE_DSYM=YES -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=TestExt DYLIB_OBJC_SOURCES=TestExt/TestExt.m \ LD_EXTRAS="-lobjc -framework Foundation -lTest -L." \ CFLAGS_EXTRAS=-I$(SRCDIR) diff --git a/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile b/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile index 59bf009f6867..57da670b69ab 100644 --- a/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile +++ b/lldb/test/API/lang/objc/modules-hash-mismatch/Makefile @@ -5,7 +5,7 @@ USE_PRIVATE_MODULE_CACHE = YES .PHONY: update-module all: $(EXE) - $(MAKE) -f $(SRCDIR)/Makefile update-module + "$(MAKE)" -f $(SRCDIR)/Makefile update-module include Makefile.rules diff --git a/lldb/test/API/macosx/delay-init-dependency/Makefile b/lldb/test/API/macosx/delay-init-dependency/Makefile index 246ea0f34e1a..7421c68b79ba 100644 --- a/lldb/test/API/macosx/delay-init-dependency/Makefile +++ b/lldb/test/API/macosx/delay-init-dependency/Makefile @@ -7,5 +7,5 @@ all: build-libfoo a.out include Makefile.rules build-libfoo: foo.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_C_SOURCES=foo.c DYLIB_NAME=foo DYLIB_ONLY=YES diff --git a/lldb/test/API/macosx/expedited-thread-pcs/Makefile b/lldb/test/API/macosx/expedited-thread-pcs/Makefile index 7799f06e7709..73a969831e67 100644 --- a/lldb/test/API/macosx/expedited-thread-pcs/Makefile +++ b/lldb/test/API/macosx/expedited-thread-pcs/Makefile @@ -6,6 +6,6 @@ all: build-libfoo a.out include Makefile.rules build-libfoo: foo.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_C_SOURCES=foo.c DYLIB_NAME=foo DYLIB_ONLY=YES diff --git a/lldb/test/API/macosx/indirect_symbol/Makefile b/lldb/test/API/macosx/indirect_symbol/Makefile index 9069302b39c4..dee3e184fe19 100644 --- a/lldb/test/API/macosx/indirect_symbol/Makefile +++ b/lldb/test/API/macosx/indirect_symbol/Makefile @@ -7,11 +7,11 @@ all: build-libindirect build-libreepxoprt a.out include Makefile.rules build-libindirect: indirect.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_C_SOURCES=indirect.c DYLIB_NAME=indirect DYLIB_ONLY=YES \ LD_EXTRAS="-Wl,-image_base,0x200000000" build-libreepxoprt: reexport.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_C_SOURCES=reexport.c DYLIB_NAME=reexport DYLIB_ONLY=YES \ LD_EXTRAS="-L. 
-lindirect -Wl,-alias_list,$(SRCDIR)/alias.list" diff --git a/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile b/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile index 05d9552a8020..01b4acfdcfd2 100644 --- a/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile +++ b/lldb/test/API/macosx/lc-note/kern-ver-str/Makefile @@ -5,7 +5,7 @@ C_SOURCES := main.c all: a.out create-empty-corefile create-empty-corefile: - $(MAKE) -f $(MAKEFILE_RULES) EXE=create-empty-corefile \ + "$(MAKE)" -f $(MAKEFILE_RULES) EXE=create-empty-corefile \ CXX=$(CC) CXX_SOURCES=create-empty-corefile.cpp include Makefile.rules diff --git a/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile b/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile index 8e561f17383f..229235cda999 100644 --- a/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile +++ b/lldb/test/API/macosx/lc-note/multiple-binary-corefile/Makefile @@ -10,11 +10,11 @@ create-empty-corefile: CXX_SOURCES=create-multibin-corefile.cpp libone.dylib: one.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=one DYLIB_C_SOURCES=one.c libtwo.dylib: two.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=two DYLIB_C_SOURCES=two.c include Makefile.rules diff --git a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile index c77a186724fd..0dc9e71c3276 100644 --- a/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile +++ b/lldb/test/API/macosx/macCatalystAppMacOSFramework/Makefile @@ -11,7 +11,7 @@ override CC=xcrun clang all: libfoo.dylib a.out libfoo.dylib: foo.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_NAME=foo DYLIB_C_SOURCES=foo.c include Makefile.rules diff --git a/lldb/test/API/macosx/skinny-corefile/Makefile b/lldb/test/API/macosx/skinny-corefile/Makefile index efe37f3d2b8b..fce43a36c33a 100644 --- a/lldb/test/API/macosx/skinny-corefile/Makefile +++ b/lldb/test/API/macosx/skinny-corefile/Makefile @@ -6,10 +6,10 @@ include Makefile.rules a.out: libto-be-removed libpresent libto-be-removed: libpresent - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_C_SOURCES=to-be-removed.c DYLIB_NAME=to-be-removed \ LD_EXTRAS="-L. 
-lpresent" libpresent: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_C_SOURCES=present.c DYLIB_NAME=present diff --git a/lldb/test/API/tools/lldb-dap/breakpoint/Makefile b/lldb/test/API/tools/lldb-dap/breakpoint/Makefile index 30a640018493..7634f513e852 100644 --- a/lldb/test/API/tools/lldb-dap/breakpoint/Makefile +++ b/lldb/test/API/tools/lldb-dap/breakpoint/Makefile @@ -15,5 +15,5 @@ main-copy.cpp: main.cpp # The following shared library will be used to test breakpoints under dynamic loading libother: other-copy.c - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_ONLY=YES DYLIB_C_SOURCES=other-copy.c DYLIB_NAME=other diff --git a/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile b/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile index 5b5c1dcef783..f13b1ac15928 100644 --- a/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile +++ b/lldb/test/API/tools/lldb-server/libraries-svr4/Makefile @@ -9,11 +9,11 @@ a.out: svr4lib_a svr4lib_b_quote include Makefile.rules svr4lib_a: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_NAME=svr4lib_a DYLIB_CXX_SOURCES=svr4lib_a.cpp \ DYLIB_ONLY=YES svr4lib_b_quote: - $(MAKE) -f $(MAKEFILE_RULES) \ + "$(MAKE)" -f $(MAKEFILE_RULES) \ DYLIB_NAME=svr4lib_b\\\" DYLIB_CXX_SOURCES=svr4lib_b_quote.cpp \ DYLIB_ONLY=YES -- GitLab From 09361953116770b646decf5820a9455ada2ba4fc Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Wed, 16 Oct 2024 19:13:52 +0800 Subject: [PATCH 109/329] [InstCombine] Drop `samesign` in InstCombine (#112480) Closes https://github.com/llvm/llvm-project/issues/112476. --- .../InstCombine/InstCombineCompares.cpp | 4 ++-- .../Transforms/InstCombine/InstCombineSelect.cpp | 1 + .../Transforms/InstCombine/icmp-and-shift.ll | 13 +++++++++++++ .../Transforms/InstCombine/icmp-equality-test.ll | 16 ++++++++++++++++ llvm/test/Transforms/InstCombine/icmp.ll | 15 +++++++++++++++ 5 files changed, 47 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 7129499e0f8f..18a6fdcec172 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -1738,7 +1738,7 @@ Instruction *InstCombinerImpl::foldICmpAndShift(ICmpInst &Cmp, // Compute X & (C2 << Y). 
Value *NewAnd = Builder.CreateAnd(Shift->getOperand(0), NewShift); - return replaceOperand(Cmp, 0, NewAnd); + return new ICmpInst(Cmp.getPredicate(), NewAnd, Cmp.getOperand(1)); } return nullptr; @@ -1844,7 +1844,7 @@ Instruction *InstCombinerImpl::foldICmpAndConstConst(ICmpInst &Cmp, /*HasNUW=*/true), One, Or->getName()); Value *NewAnd = Builder.CreateAnd(A, NewOr, And->getName()); - return replaceOperand(Cmp, 0, NewAnd); + return new ICmpInst(Cmp.getPredicate(), NewAnd, Cmp.getOperand(1)); } } } diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 8be2eeed84ad..623694663aa1 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1448,6 +1448,7 @@ Instruction *InstCombinerImpl::foldSelectEqualityTest(SelectInst &Sel) { m_c_SpecificICmp(ICmpInst::ICMP_EQ, m_Specific(X), m_Specific(Y)))) return nullptr; + cast(XeqY)->setSameSign(false); return replaceInstUsesWith(Sel, XeqY); } diff --git a/llvm/test/Transforms/InstCombine/icmp-and-shift.ll b/llvm/test/Transforms/InstCombine/icmp-and-shift.ll index 684ece21b116..d092363309fe 100644 --- a/llvm/test/Transforms/InstCombine/icmp-and-shift.ll +++ b/llvm/test/Transforms/InstCombine/icmp-and-shift.ll @@ -619,6 +619,19 @@ define i1 @test_shr_and_1_ne_0(i32 %a, i32 %b) { ret i1 %cmp } +define i1 @test_shr_and_1_ne_0_samesign(i32 %a, i32 %b) { +; CHECK-LABEL: @test_shr_and_1_ne_0_samesign( +; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 1, [[B:%.*]] +; CHECK-NEXT: [[TMP2:%.*]] = and i32 [[A:%.*]], [[TMP1]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[TMP2]], 0 +; CHECK-NEXT: ret i1 [[CMP]] +; + %shr = lshr i32 %a, %b + %and = and i32 %shr, 1 + %cmp = icmp samesign ne i32 %and, 0 + ret i1 %cmp +} + define i1 @test_const_shr_and_1_ne_0(i32 %b) { ; CHECK-LABEL: @test_const_shr_and_1_ne_0( ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw i32 1, [[B:%.*]] diff --git a/llvm/test/Transforms/InstCombine/icmp-equality-test.ll b/llvm/test/Transforms/InstCombine/icmp-equality-test.ll index c2740ca7fe8a..b9d8f2d54def 100644 --- a/llvm/test/Transforms/InstCombine/icmp-equality-test.ll +++ b/llvm/test/Transforms/InstCombine/icmp-equality-test.ll @@ -33,6 +33,22 @@ entry: ret i1 %equal } +define i1 @icmp_equality_test_constant_samesign(i42 %X, i42 %Y) { +; CHECK-LABEL: @icmp_equality_test_constant_samesign( +; CHECK-NEXT: entry: +; CHECK-NEXT: [[XEQY:%.*]] = icmp eq i42 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: ret i1 [[XEQY]] +; +entry: + %XeqC = icmp eq i42 %X, -42 + %YeqC = icmp eq i42 %Y, -42 + %XeqY = icmp samesign eq i42 %X, %Y + %not.YeqC = xor i1 %YeqC, true + %and = select i1 %not.YeqC, i1 %XeqY, i1 false + %equal = select i1 %XeqC, i1 %YeqC, i1 %and + ret i1 %equal +} + define i1 @icmp_equality_test_swift_optional_pointers(i64 %X, i64 %Y) { ; CHECK-LABEL: @icmp_equality_test_swift_optional_pointers( ; CHECK-NEXT: entry: diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll index 5e80134b153b..7cafb4885ff0 100644 --- a/llvm/test/Transforms/InstCombine/icmp.ll +++ b/llvm/test/Transforms/InstCombine/icmp.ll @@ -3203,6 +3203,21 @@ define i1 @icmp_and_or_lshr(i32 %x, i32 %y) { ret i1 %ret } +define i1 @icmp_and_or_lshr_samesign(i32 %x, i32 %y) { +; CHECK-LABEL: @icmp_and_or_lshr_samesign( +; CHECK-NEXT: [[SHF1:%.*]] = shl nuw i32 1, [[Y:%.*]] +; CHECK-NEXT: [[OR2:%.*]] = or i32 [[SHF1]], 1 +; CHECK-NEXT: [[AND3:%.*]] = and i32 [[X:%.*]], [[OR2]] +; CHECK-NEXT: [[RET:%.*]] = icmp ne i32 [[AND3]], 
0 +; CHECK-NEXT: ret i1 [[RET]] +; + %shf = lshr i32 %x, %y + %or = or i32 %shf, %x + %and = and i32 %or, 1 + %ret = icmp samesign ne i32 %and, 0 + ret i1 %ret +} + define <2 x i1> @icmp_and_or_lshr_vec(<2 x i32> %x, <2 x i32> %y) { ; CHECK-LABEL: @icmp_and_or_lshr_vec( ; CHECK-NEXT: [[SHF:%.*]] = lshr <2 x i32> [[X:%.*]], [[Y:%.*]] -- GitLab From 682fa797b7358733df9e439241a9ef2906003adf Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 16 Oct 2024 12:44:09 +0100 Subject: [PATCH 110/329] InstCombine/Select: remove redundant code (NFC) (#112388) InstCombinerImpl::foldSelectInstWithICmp has some inlined code for select-icmp-xor simplification, but this simplification is already done by other code, via another path: (X & Y) == 0 ? X : X ^ Y -> ((X & Y) == 0 ? 0 : Y) ^ X -> (X & Y) ^ X -> X & ~Y Cover the cases that it claims to simplify, and demonstrate that stripping it doesn't cause test changes. --- .../InstCombine/InstCombineSelect.cpp | 50 ----- .../Transforms/InstCombine/select-icmp-xor.ll | 190 ++++++++++++++++++ 2 files changed, 190 insertions(+), 50 deletions(-) create mode 100644 llvm/test/Transforms/InstCombine/select-icmp-xor.ll diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index 623694663aa1..ed44f0596f32 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -1954,56 +1954,6 @@ Instruction *InstCombinerImpl::foldSelectInstWithICmp(SelectInst &SI, return &SI; } - // FIXME: This code is nearly duplicated in InstSimplify. Using/refactoring - // decomposeBitTestICmp() might help. - if (TrueVal->getType()->isIntOrIntVectorTy()) { - unsigned BitWidth = - DL.getTypeSizeInBits(TrueVal->getType()->getScalarType()); - APInt MinSignedValue = APInt::getSignedMinValue(BitWidth); - Value *X; - const APInt *Y, *C; - bool TrueWhenUnset; - bool IsBitTest = false; - if (ICmpInst::isEquality(Pred) && - match(CmpLHS, m_And(m_Value(X), m_Power2(Y))) && - match(CmpRHS, m_Zero())) { - IsBitTest = true; - TrueWhenUnset = Pred == ICmpInst::ICMP_EQ; - } else if (Pred == ICmpInst::ICMP_SLT && match(CmpRHS, m_Zero())) { - X = CmpLHS; - Y = &MinSignedValue; - IsBitTest = true; - TrueWhenUnset = false; - } else if (Pred == ICmpInst::ICMP_SGT && match(CmpRHS, m_AllOnes())) { - X = CmpLHS; - Y = &MinSignedValue; - IsBitTest = true; - TrueWhenUnset = true; - } - if (IsBitTest) { - Value *V = nullptr; - // (X & Y) == 0 ? X : X ^ Y --> X & ~Y - if (TrueWhenUnset && TrueVal == X && - match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) - V = Builder.CreateAnd(X, ~(*Y)); - // (X & Y) != 0 ? X ^ Y : X --> X & ~Y - else if (!TrueWhenUnset && FalseVal == X && - match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) - V = Builder.CreateAnd(X, ~(*Y)); - // (X & Y) == 0 ? X ^ Y : X --> X | Y - else if (TrueWhenUnset && FalseVal == X && - match(TrueVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) - V = Builder.CreateOr(X, *Y); - // (X & Y) != 0 ? 
X : X ^ Y --> X | Y - else if (!TrueWhenUnset && TrueVal == X && - match(FalseVal, m_Xor(m_Specific(X), m_APInt(C))) && *Y == *C) - V = Builder.CreateOr(X, *Y); - - if (V) - return replaceInstUsesWith(SI, V); - } - } - if (Instruction *V = foldSelectICmpAndAnd(SI.getType(), ICI, TrueVal, FalseVal, Builder)) return V; diff --git a/llvm/test/Transforms/InstCombine/select-icmp-xor.ll b/llvm/test/Transforms/InstCombine/select-icmp-xor.ll new file mode 100644 index 000000000000..c8ce114a683e --- /dev/null +++ b/llvm/test/Transforms/InstCombine/select-icmp-xor.ll @@ -0,0 +1,190 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=instcombine -S %s | FileCheck %s + +define i8 @select_icmp_eq_pow2(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_eq_pow2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[SEL:%.*]] = and i8 [[X]], -5 +; CHECK-NEXT: ret i8 [[SEL]] +; + %and = and i8 %x, 4 + %icmp = icmp eq i8 %and, 0 + %xor = xor i8 %x, 4 + %sel = select i1 %icmp, i8 %x, i8 %xor + ret i8 %sel +} + +define i8 @select_icmp_eq_pow2_flipped(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_eq_pow2_flipped( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[SEL:%.*]] = or i8 [[X]], 4 +; CHECK-NEXT: ret i8 [[SEL]] +; + %and = and i8 %x, 4 + %icmp = icmp eq i8 %and, 0 + %xor = xor i8 %x, 4 + %sel = select i1 %icmp, i8 %xor, i8 %x + ret i8 %sel +} + +define i8 @select_icmp_eq_not_pow2(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_eq_not_pow2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], 5 +; CHECK-NEXT: [[ICMP:%.*]] = icmp eq i8 [[AND]], 0 +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], 5 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]] +; CHECK-NEXT: ret i8 [[SEL]] +; + %and = and i8 %x, 5 + %icmp = icmp eq i8 %and, 0 + %xor = xor i8 %x, 5 + %sel = select i1 %icmp, i8 %x, i8 %xor + ret i8 %sel +} + +define i8 @select_icmp_ne_pow2(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_ne_pow2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[SEL:%.*]] = and i8 [[X]], -5 +; CHECK-NEXT: ret i8 [[SEL]] +; + %and = and i8 %x, 4 + %icmp = icmp ne i8 %and, 0 + %xor = xor i8 %x, 4 + %sel = select i1 %icmp, i8 %xor, i8 %x + ret i8 %sel +} + +define i8 @select_icmp_ne_pow2_flipped(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_ne_pow2_flipped( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[SEL:%.*]] = or i8 [[X]], 4 +; CHECK-NEXT: ret i8 [[SEL]] +; + %and = and i8 %x, 4 + %icmp = icmp ne i8 %and, 0 + %xor = xor i8 %x, 4 + %sel = select i1 %icmp, i8 %x, i8 %xor + ret i8 %sel +} + +define i8 @select_icmp_ne_not_pow2(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_ne_not_pow2( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[AND:%.*]] = and i8 [[X]], 5 +; CHECK-NEXT: [[ICMP_NOT:%.*]] = icmp eq i8 [[AND]], 0 +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], 5 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ICMP_NOT]], i8 [[X]], i8 [[XOR]] +; CHECK-NEXT: ret i8 [[SEL]] +; + %and = and i8 %x, 5 + %icmp = icmp ne i8 %and, 0 + %xor = xor i8 %x, 5 + %sel = select i1 %icmp, i8 %xor, i8 %x + ret i8 %sel +} + +define i8 @select_icmp_slt_zero_smin(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_slt_zero_smin( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[SEL:%.*]] = or i8 [[X]], -128 +; CHECK-NEXT: ret i8 [[SEL]] +; + %icmp = icmp slt i8 %x, 0 + %xor = xor i8 %x, -128 + %sel = select i1 %icmp, i8 %x, i8 %xor + ret i8 %sel +} + +define i8 @select_icmp_slt_zero_smin_flipped(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_slt_zero_smin_flipped( +; 
CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[SEL:%.*]] = and i8 [[X]], 127 +; CHECK-NEXT: ret i8 [[SEL]] +; + %icmp = icmp slt i8 %x, 0 + %xor = xor i8 %x, -128 + %sel = select i1 %icmp, i8 %xor, i8 %x + ret i8 %sel +} + +define i8 @select_icmp_slt_not_zero(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_slt_not_zero( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ICMP:%.*]] = icmp slt i8 [[X]], 1 +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], -128 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]] +; CHECK-NEXT: ret i8 [[SEL]] +; + %icmp = icmp slt i8 %x, 1 + %xor = xor i8 %x, -128 + %sel = select i1 %icmp, i8 %x, i8 %xor + ret i8 %sel +} + +define i8 @select_icmp_slt_not_smin(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_slt_not_smin( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ICMP:%.*]] = icmp slt i8 [[X]], 0 +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], -127 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]] +; CHECK-NEXT: ret i8 [[SEL]] +; + %icmp = icmp slt i8 %x, 0 + %xor = xor i8 %x, -127 + %sel = select i1 %icmp, i8 %x, i8 %xor + ret i8 %sel +} + +define i8 @select_icmp_sgt_allones_smin(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_sgt_allones_smin( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[SEL:%.*]] = and i8 [[X]], 127 +; CHECK-NEXT: ret i8 [[SEL]] +; + %icmp = icmp sgt i8 %x, 255 + %xor = xor i8 %x, -128 + %sel = select i1 %icmp, i8 %x, i8 %xor + ret i8 %sel +} + +define i8 @select_icmp_sgt_allones_smin_flipped(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_sgt_allones_smin_flipped( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[SEL:%.*]] = or i8 [[X]], -128 +; CHECK-NEXT: ret i8 [[SEL]] +; + %icmp = icmp sgt i8 %x, 255 + %xor = xor i8 %x, -128 + %sel = select i1 %icmp, i8 %xor, i8 %x + ret i8 %sel +} + +define i8 @select_icmp_sgt_not_allones(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_sgt_not_allones( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[ICMP:%.*]] = icmp sgt i8 [[X]], -2 +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], -128 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ICMP]], i8 [[X]], i8 [[XOR]] +; CHECK-NEXT: ret i8 [[SEL]] +; + %icmp = icmp sgt i8 %x, 254 + %xor = xor i8 %x, -128 + %sel = select i1 %icmp, i8 %x, i8 %xor + ret i8 %sel +} + +define i8 @select_icmp_sgt_not_smin(i8 %x) { +; CHECK-LABEL: define i8 @select_icmp_sgt_not_smin( +; CHECK-SAME: i8 [[X:%.*]]) { +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[X]], -127 +; CHECK-NEXT: [[ICMP1:%.*]] = icmp slt i8 [[X]], 0 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[ICMP1]], i8 [[XOR]], i8 [[X]] +; CHECK-NEXT: ret i8 [[SEL]] +; + %icmp = icmp sgt i8 %x, 255 + %xor = xor i8 %x, -127 + %sel = select i1 %icmp, i8 %x, i8 %xor + ret i8 %sel +} -- GitLab From f3e804b9fd561c0da970536643e2a5cd6c3d4215 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Don=C3=A1t=20Nagy?= Date: Wed, 16 Oct 2024 13:48:38 +0200 Subject: [PATCH 111/329] [analyzer][clang-tidy][NFC] Clean up eagerly-assume handling (#112209) This commit is a collection of several very minor code quality improvements. The main goal is removing the misleading "Bin" substring from the names of several methods and variables (like `evalEagerlyAssumedBinOpBifurcation`) that are also applied for the unary logical not operator. In addition to this, I clarified the doc-comment of the method `evalEagerlyAssumedBinOpBifurcation` and refactored the body of this method to fix the capitalization of variable names and replace an obsolete use of `std::tie` with a structured binding. 
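For example, the body previously declared the two result states up front and
filled them with `std::tie`:

    ProgramStateRef StateTrue, StateFalse;
    std::tie(StateTrue, StateFalse) = state->assume(*SEV);

whereas after this change it uses a structured binding:

    auto [StateTrue, StateFalse] = State->assume(*SEV);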
Finally, the data member `eagerlyAssumeBinOpBifurcation` of the class `AnalyzerOptions` was completely removed (including a line in clang-tidy that sets it to true), because it was never read by any code. Note that the eagerly-assume mode of the analyzer is controlled by a different boolean member of `AnalyzerOptions` which is called `ShouldEagerlyAssume` and is defined via the macro magic from `AnalyzerOptions.def`. --- clang-tools-extra/clang-tidy/ClangTidy.cpp | 1 - .../StaticAnalyzer/Core/AnalyzerOptions.def | 13 +++--- .../StaticAnalyzer/Core/AnalyzerOptions.h | 8 ++-- .../Core/PathSensitive/ExprEngine.h | 11 +++-- .../Core/BugReporterVisitors.cpp | 2 +- clang/lib/StaticAnalyzer/Core/ExprEngine.cpp | 41 ++++++++----------- 6 files changed, 33 insertions(+), 43 deletions(-) diff --git a/clang-tools-extra/clang-tidy/ClangTidy.cpp b/clang-tools-extra/clang-tidy/ClangTidy.cpp index 62f9d19b2a36..c4cac7d27b77 100644 --- a/clang-tools-extra/clang-tidy/ClangTidy.cpp +++ b/clang-tools-extra/clang-tidy/ClangTidy.cpp @@ -458,7 +458,6 @@ ClangTidyASTConsumerFactory::createASTConsumer( if (!AnalyzerOptions.CheckersAndPackages.empty()) { setStaticAnalyzerCheckerOpts(Context.getOptions(), AnalyzerOptions); AnalyzerOptions.AnalysisDiagOpt = PD_NONE; - AnalyzerOptions.eagerlyAssumeBinOpBifurcation = true; std::unique_ptr AnalysisConsumer = ento::CreateAnalysisConsumer(Compiler); AnalysisConsumer->AddDiagnosticConsumer( diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def index 737bc8e86cfb..ad2dbffe8832 100644 --- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def +++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.def @@ -299,13 +299,12 @@ ANALYZER_OPTION( ANALYZER_OPTION( bool, ShouldEagerlyAssume, "eagerly-assume", - "Whether we should eagerly assume evaluations of conditionals, thus, " - "bifurcating the path. This indicates how the engine should handle " - "expressions such as: 'x = (y != 0)'. When this is true then the " - "subexpression 'y != 0' will be eagerly assumed to be true or false, thus " - "evaluating it to the integers 0 or 1 respectively. The upside is that " - "this can increase analysis precision until we have a better way to lazily " - "evaluate such logic. The downside is that it eagerly bifurcates paths.", + "If this is enabled (the default behavior), when the analyzer encounters " + "a comparison operator or logical negation, it immediately splits the " + "state to separate the case when the expression is true and the case when " + "it's false. 
The upside is that this can increase analysis precision until " + "we have a better way to lazily evaluate such logic; the downside is that " + "it eagerly bifurcates paths.", true) ANALYZER_OPTION( diff --git a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h index 3a3c1a13d67d..2f4cd277cccd 100644 --- a/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h +++ b/clang/include/clang/StaticAnalyzer/Core/AnalyzerOptions.h @@ -229,8 +229,6 @@ public: unsigned AnalyzerDisplayProgress : 1; unsigned AnalyzerNoteAnalysisEntryPoints : 1; - unsigned eagerlyAssumeBinOpBifurcation : 1; - unsigned TrimGraph : 1; unsigned visualizeExplodedGraphWithGraphViz : 1; unsigned UnoptimizedCFG : 1; @@ -293,9 +291,9 @@ public: ShowConfigOptionsList(false), ShouldEmitErrorsOnInvalidConfigValue(false), AnalyzeAll(false), AnalyzerDisplayProgress(false), AnalyzerNoteAnalysisEntryPoints(false), - eagerlyAssumeBinOpBifurcation(false), TrimGraph(false), - visualizeExplodedGraphWithGraphViz(false), UnoptimizedCFG(false), - PrintStats(false), NoRetryExhausted(false), AnalyzerWerror(false) {} + TrimGraph(false), visualizeExplodedGraphWithGraphViz(false), + UnoptimizedCFG(false), PrintStats(false), NoRetryExhausted(false), + AnalyzerWerror(false) {} /// Interprets an option's string value as a boolean. The "true" string is /// interpreted as true and the "false" string is interpreted as false. diff --git a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h index 04eacd1df7ff..8c7493e27fca 100644 --- a/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h +++ b/clang/include/clang/StaticAnalyzer/Core/PathSensitive/ExprEngine.h @@ -583,14 +583,13 @@ public: ExplodedNode *Pred, ExplodedNodeSet &Dst); - /// evalEagerlyAssumeBinOpBifurcation - Given the nodes in 'Src', eagerly assume symbolic - /// expressions of the form 'x != 0' and generate new nodes (stored in Dst) - /// with those assumptions. - void evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src, - const Expr *Ex); + /// evalEagerlyAssumeBifurcation - Given the nodes in 'Src', eagerly assume + /// concrete boolean values for 'Ex', storing the resulting nodes in 'Dst'. + void evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, ExplodedNodeSet &Src, + const Expr *Ex); static std::pair - geteagerlyAssumeBinOpBifurcationTags(); + getEagerlyAssumeBifurcationTags(); ProgramStateRef handleLValueBitCast(ProgramStateRef state, const Expr *Ex, const LocationContext *LCtx, QualType T, diff --git a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp index 68c8a8dc6825..c4479db14b79 100644 --- a/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp +++ b/clang/lib/StaticAnalyzer/Core/BugReporterVisitors.cpp @@ -2695,7 +2695,7 @@ ConditionBRVisitor::VisitNodeImpl(const ExplodedNode *N, PathSensitiveBugReport &BR) { ProgramPoint ProgPoint = N->getLocation(); const std::pair &Tags = - ExprEngine::geteagerlyAssumeBinOpBifurcationTags(); + ExprEngine::getEagerlyAssumeBifurcationTags(); // If an assumption was made on a branch, it should be caught // here by looking at the state transition. 
diff --git a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp index 43ab646d398b..0e400dfadb8c 100644 --- a/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp +++ b/clang/lib/StaticAnalyzer/Core/ExprEngine.cpp @@ -2129,7 +2129,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, (B->isRelationalOp() || B->isEqualityOp())) { ExplodedNodeSet Tmp; VisitBinaryOperator(cast(S), Pred, Tmp); - evalEagerlyAssumeBinOpBifurcation(Dst, Tmp, cast(S)); + evalEagerlyAssumeBifurcation(Dst, Tmp, cast(S)); } else VisitBinaryOperator(cast(S), Pred, Dst); @@ -2402,7 +2402,7 @@ void ExprEngine::Visit(const Stmt *S, ExplodedNode *Pred, if (AMgr.options.ShouldEagerlyAssume && (U->getOpcode() == UO_LNot)) { ExplodedNodeSet Tmp; VisitUnaryOperator(U, Pred, Tmp); - evalEagerlyAssumeBinOpBifurcation(Dst, Tmp, U); + evalEagerlyAssumeBifurcation(Dst, Tmp, U); } else VisitUnaryOperator(U, Pred, Dst); @@ -3742,23 +3742,20 @@ void ExprEngine::evalLocation(ExplodedNodeSet &Dst, BldrTop.addNodes(Tmp); } -std::pair -ExprEngine::geteagerlyAssumeBinOpBifurcationTags() { - static SimpleProgramPointTag - eagerlyAssumeBinOpBifurcationTrue(TagProviderName, - "Eagerly Assume True"), - eagerlyAssumeBinOpBifurcationFalse(TagProviderName, - "Eagerly Assume False"); - return std::make_pair(&eagerlyAssumeBinOpBifurcationTrue, - &eagerlyAssumeBinOpBifurcationFalse); +std::pair +ExprEngine::getEagerlyAssumeBifurcationTags() { + static SimpleProgramPointTag TrueTag(TagProviderName, "Eagerly Assume True"), + FalseTag(TagProviderName, "Eagerly Assume False"); + + return std::make_pair(&TrueTag, &FalseTag); } -void ExprEngine::evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst, - ExplodedNodeSet &Src, - const Expr *Ex) { +void ExprEngine::evalEagerlyAssumeBifurcation(ExplodedNodeSet &Dst, + ExplodedNodeSet &Src, + const Expr *Ex) { StmtNodeBuilder Bldr(Src, Dst, *currBldrCtx); - for (const auto Pred : Src) { + for (ExplodedNode *Pred : Src) { // Test if the previous node was as the same expression. This can happen // when the expression fails to evaluate to anything meaningful and // (as an optimization) we don't generate a node. @@ -3767,28 +3764,26 @@ void ExprEngine::evalEagerlyAssumeBinOpBifurcation(ExplodedNodeSet &Dst, continue; } - ProgramStateRef state = Pred->getState(); - SVal V = state->getSVal(Ex, Pred->getLocationContext()); + ProgramStateRef State = Pred->getState(); + SVal V = State->getSVal(Ex, Pred->getLocationContext()); std::optional SEV = V.getAs(); if (SEV && SEV->isExpression()) { - const std::pair &tags = - geteagerlyAssumeBinOpBifurcationTags(); + const auto &[TrueTag, FalseTag] = getEagerlyAssumeBifurcationTags(); - ProgramStateRef StateTrue, StateFalse; - std::tie(StateTrue, StateFalse) = state->assume(*SEV); + auto [StateTrue, StateFalse] = State->assume(*SEV); // First assume that the condition is true. if (StateTrue) { SVal Val = svalBuilder.makeIntVal(1U, Ex->getType()); StateTrue = StateTrue->BindExpr(Ex, Pred->getLocationContext(), Val); - Bldr.generateNode(Ex, Pred, StateTrue, tags.first); + Bldr.generateNode(Ex, Pred, StateTrue, TrueTag); } // Next, assume that the condition is false. 
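    // Both StateTrue and StateFalse can be non-null at the same time; a node
    // is then generated for each assumption and the path bifurcates here.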
if (StateFalse) { SVal Val = svalBuilder.makeIntVal(0U, Ex->getType()); StateFalse = StateFalse->BindExpr(Ex, Pred->getLocationContext(), Val); - Bldr.generateNode(Ex, Pred, StateFalse, tags.second); + Bldr.generateNode(Ex, Pred, StateFalse, FalseTag); } } } -- GitLab From f113a66c29b17e4937ff5d0ab67dc087fa6ee27e Mon Sep 17 00:00:00 2001 From: Karl-Johan Karlsson Date: Wed, 16 Oct 2024 13:49:34 +0200 Subject: [PATCH 112/329] [ARM] Fix warnings in ARMAsmParser.cpp and ARMDisassembler.cpp (#112507) Fix gcc warnings like: ARMAsmParser.cpp:7168:46: warning: enumeral and non-enumeral type in conditional expression [-Wextra] --- .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 31 +++++++++++-------- .../ARM/Disassembler/ARMDisassembler.cpp | 5 +-- 2 files changed, 21 insertions(+), 15 deletions(-) diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 75fb90477f88..b908e4f367e1 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -2532,14 +2532,14 @@ public: void addCondCodeOperands(MCInst &Inst, unsigned N) const { assert(N == 2 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createImm(unsigned(getCondCode()))); - unsigned RegNum = getCondCode() == ARMCC::AL ? 0: ARM::CPSR; + unsigned RegNum = getCondCode() == ARMCC::AL ? ARM::NoRegister : ARM::CPSR; Inst.addOperand(MCOperand::createReg(RegNum)); } void addVPTPredNOperands(MCInst &Inst, unsigned N) const { assert(N == 3 && "Invalid number of operands!"); Inst.addOperand(MCOperand::createImm(unsigned(getVPTPred()))); - unsigned RegNum = getVPTPred() == ARMVCC::None ? 0: ARM::P0; + unsigned RegNum = getVPTPred() == ARMVCC::None ? ARM::NoRegister : ARM::P0; Inst.addOperand(MCOperand::createReg(RegNum)); Inst.addOperand(MCOperand::createReg(0)); } @@ -7164,8 +7164,8 @@ bool ARMAsmParser::parseInstruction(ParseInstructionInfo &Info, StringRef Name, // Add the carry setting operand, if necessary. if (CanAcceptCarrySet && CarrySetting) { SMLoc Loc = SMLoc::getFromPointer(NameLoc.getPointer() + Mnemonic.size()); - Operands.push_back( - ARMOperand::CreateCCOut(CarrySetting ? ARM::CPSR : 0, Loc, *this)); + Operands.push_back(ARMOperand::CreateCCOut( + CarrySetting ? ARM::CPSR : ARM::NoRegister, Loc, *this)); } // Add the predication code operand, if necessary. @@ -10372,7 +10372,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, case ARM::t2ASRri: if (isARMLowRegister(Inst.getOperand(0).getReg()) && isARMLowRegister(Inst.getOperand(1).getReg()) && - Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + Inst.getOperand(5).getReg() == + (inITBlock() ? ARM::NoRegister : ARM::CPSR) && !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { @@ -10422,14 +10423,14 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, TmpInst.addOperand(Inst.getOperand(0)); // Rd if (isNarrow) TmpInst.addOperand(MCOperand::createReg( - Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0)); + Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : ARM::NoRegister)); TmpInst.addOperand(Inst.getOperand(1)); // Rn TmpInst.addOperand(Inst.getOperand(2)); // Rm TmpInst.addOperand(Inst.getOperand(4)); // CondCode TmpInst.addOperand(Inst.getOperand(5)); if (!isNarrow) TmpInst.addOperand(MCOperand::createReg( - Inst.getOpcode() == ARM::t2MOVSsr ? ARM::CPSR : 0)); + Inst.getOpcode() == ARM::t2MOVSsr ? 
ARM::CPSR : ARM::NoRegister)); Inst = TmpInst; return true; } @@ -10475,7 +10476,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, TmpInst.addOperand(Inst.getOperand(0)); // Rd if (isNarrow && !isMov) TmpInst.addOperand(MCOperand::createReg( - Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0)); + Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : ARM::NoRegister)); TmpInst.addOperand(Inst.getOperand(1)); // Rn if (newOpc != ARM::t2RRX && !isMov) TmpInst.addOperand(MCOperand::createImm(Amount)); @@ -10483,7 +10484,7 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, TmpInst.addOperand(Inst.getOperand(4)); if (!isNarrow) TmpInst.addOperand(MCOperand::createReg( - Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : 0)); + Inst.getOpcode() == ARM::t2MOVSsi ? ARM::CPSR : ARM::NoRegister)); Inst = TmpInst; return true; } @@ -10684,7 +10685,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, !isARMLowRegister(Inst.getOperand(0).getReg()) || (Inst.getOperand(2).isImm() && (unsigned)Inst.getOperand(2).getImm() > 255) || - Inst.getOperand(5).getReg() != (inITBlock() ? 0 : ARM::CPSR) || + Inst.getOperand(5).getReg() != + (inITBlock() ? ARM::NoRegister : ARM::CPSR) || HasWideQualifier) break; MCInst TmpInst; @@ -10852,7 +10854,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if (isARMLowRegister(Inst.getOperand(0).getReg()) && (Inst.getOperand(1).isImm() && (unsigned)Inst.getOperand(1).getImm() <= 255) && - Inst.getOperand(4).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + Inst.getOperand(4).getReg() == + (inITBlock() ? ARM::NoRegister : ARM::CPSR) && !HasWideQualifier) { // The operands aren't in the same order for tMOVi8... MCInst TmpInst; @@ -10993,7 +10996,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, if ((isARMLowRegister(Inst.getOperand(1).getReg()) && isARMLowRegister(Inst.getOperand(2).getReg())) && Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() && - Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + Inst.getOperand(5).getReg() == + (inITBlock() ? ARM::NoRegister : ARM::CPSR) && !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { @@ -11029,7 +11033,8 @@ bool ARMAsmParser::processInstruction(MCInst &Inst, isARMLowRegister(Inst.getOperand(2).getReg())) && (Inst.getOperand(0).getReg() == Inst.getOperand(1).getReg() || Inst.getOperand(0).getReg() == Inst.getOperand(2).getReg()) && - Inst.getOperand(5).getReg() == (inITBlock() ? 0 : ARM::CPSR) && + Inst.getOperand(5).getReg() == + (inITBlock() ? ARM::NoRegister : ARM::CPSR) && !HasWideQualifier) { unsigned NewOpc; switch (Inst.getOpcode()) { diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index 93b74905fc59..fa5dd10cfdaa 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -894,12 +894,13 @@ void ARMDisassembler::AddThumb1SBit(MCInst &MI, bool InITBlock) const { MCID.operands()[i].RegClass == ARM::CCRRegClassID) { if (i > 0 && MCID.operands()[i - 1].isPredicate()) continue; - MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR)); + MI.insert(I, + MCOperand::createReg(InITBlock ? ARM::NoRegister : ARM::CPSR)); return; } } - MI.insert(I, MCOperand::createReg(InITBlock ? 0 : ARM::CPSR)); + MI.insert(I, MCOperand::createReg(InITBlock ? 
ARM::NoRegister : ARM::CPSR)); } bool ARMDisassembler::isVectorPredicable(const MCInst &MI) const { -- GitLab From caa7301bc8081bfaf8fc9f3644d558d336038c43 Mon Sep 17 00:00:00 2001 From: Sven van Haastregt Date: Wed, 16 Oct 2024 13:58:12 +0200 Subject: [PATCH 113/329] [OpenCL] Restore addrspacecast for pipe builtins (#112514) Commit 84ee629bc515 ("clang: Remove some pointer bitcasts (#112324)", 2024-10-15) triggered some "Call parameter type does not match function signature!" errors when using the OpenCL pipe builtin functions under the spir triple, due to a missing addrspacecast. This would have been caught by the pipe_builtin.cl test if that had used the `spir-unknown-unknown` triple, so extend the test to use that triple too. --- clang/lib/CodeGen/CGBuiltin.cpp | 3 ++- clang/test/CodeGenOpenCL/pipe_builtin.cl | 21 +++++++++++++++++++++ 2 files changed, 23 insertions(+), 1 deletion(-) diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp index 12f99d9f1178..f6d7db2c204c 100644 --- a/clang/lib/CodeGen/CGBuiltin.cpp +++ b/clang/lib/CodeGen/CGBuiltin.cpp @@ -5657,13 +5657,14 @@ RValue CodeGenFunction::EmitBuiltinExpr(const GlobalDecl GD, unsigned BuiltinID, *Arg3 = EmitScalarExpr(E->getArg(3)); llvm::FunctionType *FTy = llvm::FunctionType::get( Int32Ty, llvm::ArrayRef(ArgTys), false); + Value *ACast = Builder.CreateAddrSpaceCast(Arg3, I8PTy); // We know the third argument is an integer type, but we may need to cast // it to i32. if (Arg2->getType() != Int32Ty) Arg2 = Builder.CreateZExtOrTrunc(Arg2, Int32Ty); return RValue::get( EmitRuntimeCall(CGM.CreateRuntimeFunction(FTy, Name), - {Arg0, Arg1, Arg2, Arg3, PacketSize, PacketAlign})); + {Arg0, Arg1, Arg2, ACast, PacketSize, PacketAlign})); } } // OpenCL v2.0 s6.13.16 ,s9.17.3.5 - Built-in pipe reserve read and write diff --git a/clang/test/CodeGenOpenCL/pipe_builtin.cl b/clang/test/CodeGenOpenCL/pipe_builtin.cl index c59f63bab6a4..ec9d7cb04506 100644 --- a/clang/test/CodeGenOpenCL/pipe_builtin.cl +++ b/clang/test/CodeGenOpenCL/pipe_builtin.cl @@ -1,3 +1,4 @@ +// RUN: %clang_cc1 -triple spir-unknown-unknown -emit-llvm -cl-ext=+cl_khr_subgroups -O0 -cl-std=clc++ -o - %s | FileCheck --check-prefix=CHECK-SPIR %s // RUN: %clang_cc1 -triple %itanium_abi_triple -emit-llvm -cl-ext=+cl_khr_subgroups -O0 -cl-std=clc++ -o - %s | FileCheck %s // FIXME: Add MS ABI manglings of OpenCL things and remove %itanium_abi_triple // above to support OpenCL in the MS C++ ABI. 
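// Note on the CHECK-SPIR lines in the hunks below: for the spir triple the
// pipe builtins take the packet pointer in the generic address space
// (addrspace(4) on SPIR); the CGBuiltin.cpp change above restores the
// addrspacecast of the packet argument to that space.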
@@ -5,65 +6,85 @@ #pragma OPENCL EXTENSION cl_khr_subgroups : enable void test1(read_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func i32 @__read_pipe_2(target("spirv.Pipe", 0) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__read_pipe_2(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) read_pipe(p, ptr); + // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__reserve_read_pipe(target("spirv.Pipe", 0) %{{.*}}, i32 {{.*}}, i32 4, i32 4) // CHECK: call ptr @__reserve_read_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4) reserve_id_t rid = reserve_read_pipe(p, 2); + // CHECK-SPIR: call spir_func i32 @__read_pipe_4(target("spirv.Pipe", 0) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__read_pipe_4(ptr %{{.*}}, ptr %{{.*}}, i32 {{.*}}, ptr %{{.*}}, i32 4, i32 4) read_pipe(p, rid, 2, ptr); + // CHECK-SPIR: call spir_func void @__commit_read_pipe(target("spirv.Pipe", 0) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4) // CHECK: call void @__commit_read_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) commit_read_pipe(p, rid); } void test2(write_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func i32 @__write_pipe_2(target("spirv.Pipe", 1) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__write_pipe_2(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) write_pipe(p, ptr); + // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__reserve_write_pipe(target("spirv.Pipe", 1) %{{.*}}, i32 {{.*}}, i32 4, i32 4) // CHECK: call ptr @__reserve_write_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4) reserve_id_t rid = reserve_write_pipe(p, 2); + // CHECK-SPIR: call spir_func i32 @__write_pipe_4(target("spirv.Pipe", 1) %{{.*}}, ptr addrspace(4) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__write_pipe_4(ptr %{{.*}}, ptr %{{.*}}, i32 {{.*}}, ptr %{{.*}}, i32 4, i32 4) write_pipe(p, rid, 2, ptr); + // CHECK-SPIR: call spir_func void @__commit_write_pipe(target("spirv.Pipe", 1) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4) // CHECK: call void @__commit_write_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) commit_write_pipe(p, rid); } void test3(read_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__work_group_reserve_read_pipe(target("spirv.Pipe", 0) %{{.*}}, i32 {{.*}}, i32 4, i32 4) // CHECK: call ptr @__work_group_reserve_read_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4) reserve_id_t rid = work_group_reserve_read_pipe(p, 2); + // CHECK-SPIR: call spir_func void @__work_group_commit_read_pipe(target("spirv.Pipe", 0) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4) // CHECK: call void @__work_group_commit_read_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) work_group_commit_read_pipe(p, rid); } void test4(write_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__work_group_reserve_write_pipe(target("spirv.Pipe", 1) %{{.*}}, i32 {{.*}}, i32 4, i32 4) // CHECK: call ptr @__work_group_reserve_write_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4) reserve_id_t rid = work_group_reserve_write_pipe(p, 2); + // CHECK-SPIR: call spir_func void @__work_group_commit_write_pipe(target("spirv.Pipe", 1) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4) // CHECK: call void @__work_group_commit_write_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) work_group_commit_write_pipe(p, rid); } void test5(read_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func target("spirv.ReserveId") 
@__sub_group_reserve_read_pipe(target("spirv.Pipe", 0) %{{.*}}, i32 {{.*}}, i32 4, i32 4) // CHECK: call ptr @__sub_group_reserve_read_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4) reserve_id_t rid = sub_group_reserve_read_pipe(p, 2); + // CHECK-SPIR: call spir_func void @__sub_group_commit_read_pipe(target("spirv.Pipe", 0) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4) // CHECK: call void @__sub_group_commit_read_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) sub_group_commit_read_pipe(p, rid); } void test6(write_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func target("spirv.ReserveId") @__sub_group_reserve_write_pipe(target("spirv.Pipe", 1) %{{.*}}, i32 {{.*}}, i32 4, i32 4) // CHECK: call ptr @__sub_group_reserve_write_pipe(ptr %{{.*}}, i32 {{.*}}, i32 4, i32 4) reserve_id_t rid = sub_group_reserve_write_pipe(p, 2); + // CHECK-SPIR: call spir_func void @__sub_group_commit_write_pipe(target("spirv.Pipe", 1) %{{.*}}, target("spirv.ReserveId") %{{.*}}, i32 4, i32 4) // CHECK: call void @__sub_group_commit_write_pipe(ptr %{{.*}}, ptr %{{.*}}, i32 4, i32 4) sub_group_commit_write_pipe(p, rid); } void test7(read_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func i32 @__get_pipe_num_packets_ro(target("spirv.Pipe", 0) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__get_pipe_num_packets_ro(ptr %{{.*}}, i32 4, i32 4) *ptr = get_pipe_num_packets(p); + // CHECK-SPIR: call spir_func i32 @__get_pipe_max_packets_ro(target("spirv.Pipe", 0) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__get_pipe_max_packets_ro(ptr %{{.*}}, i32 4, i32 4) *ptr = get_pipe_max_packets(p); } void test8(write_only pipe int p, global int *ptr) { + // CHECK-SPIR: call spir_func i32 @__get_pipe_num_packets_wo(target("spirv.Pipe", 1) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__get_pipe_num_packets_wo(ptr %{{.*}}, i32 4, i32 4) *ptr = get_pipe_num_packets(p); + // CHECK-SPIR: call spir_func i32 @__get_pipe_max_packets_wo(target("spirv.Pipe", 1) %{{.*}}, i32 4, i32 4) // CHECK: call i32 @__get_pipe_max_packets_wo(ptr %{{.*}}, i32 4, i32 4) *ptr = get_pipe_max_packets(p); } -- GitLab From f5f00764abeb7023719d64774e263936b3f31ab7 Mon Sep 17 00:00:00 2001 From: Lewis Crawford Date: Wed, 16 Oct 2024 13:23:46 +0100 Subject: [PATCH 114/329] [DAGCombiner] Fix check for extending loads (#112182) Fix a check for extending loads in DAGCombiner, where if the result type has more bits than the loaded type it should count as an extending load. All backends apart from AArch64 ignore this ExtTy argument to shouldReduceLoadWidth, so this change currently only impacts AArch64. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 2 +- .../AArch64/aarch64-scalarize-vec-load-ext.ll | 35 +++++++++++++++++++ 2 files changed, 36 insertions(+), 1 deletion(-) create mode 100644 llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 0879165aac13..ca91d35573c3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -22568,7 +22568,7 @@ SDValue DAGCombiner::scalarizeExtractedVectorLoad(SDNode *EVE, EVT InVecVT, return SDValue(); ISD::LoadExtType ExtTy = - ResultVT.bitsGT(VecEltVT) ? ISD::NON_EXTLOAD : ISD::EXTLOAD; + ResultVT.bitsGT(VecEltVT) ? 
ISD::EXTLOAD : ISD::NON_EXTLOAD; if (!TLI.isOperationLegalOrCustom(ISD::LOAD, VecEltVT) || !TLI.shouldReduceLoadWidth(OriginalLoad, ExtTy, VecEltVT)) return SDValue(); diff --git a/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll b/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll new file mode 100644 index 000000000000..30ce0cb09fc0 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/aarch64-scalarize-vec-load-ext.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=aarch64-unknown-linux-gnu < %s | FileCheck %s + +; FIXME: Currently, we avoid narrowing this v4i32 load, in the +; hopes of being able to fold the shift, despite it requiring stack +; storage + loads. Ideally, we should narrow here and load the i32 +; directly from the variable offset e.g: +; +; add x8, x0, x1, lsl #4 +; and x9, x2, #0x3 +; ldr w0, [x8, x9, lsl #2] +; +; The AArch64TargetLowering::shouldReduceLoadWidth heuristic should +; probably be updated to choose load-narrowing instead of folding the +; lsl in larger vector cases. +; +define i32 @narrow_load_v4_i32_single_ele_variable_idx(ptr %ptr, i64 %off, i32 %ele) { +; CHECK-LABEL: narrow_load_v4_i32_single_ele_variable_idx: +; CHECK: // %bb.0: // %entry +; CHECK-NEXT: sub sp, sp, #16 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: ldr q0, [x0, x1, lsl #4] +; CHECK-NEXT: mov x8, sp +; CHECK-NEXT: // kill: def $w2 killed $w2 def $x2 +; CHECK-NEXT: bfi x8, x2, #2, #2 +; CHECK-NEXT: str q0, [sp] +; CHECK-NEXT: ldr w0, [x8] +; CHECK-NEXT: add sp, sp, #16 +; CHECK-NEXT: ret +entry: + %idx = getelementptr inbounds <4 x i32>, ptr %ptr, i64 %off + %x = load <4 x i32>, ptr %idx, align 8 + %res = extractelement <4 x i32> %x, i32 %ele + ret i32 %res +} -- GitLab From 3e31e30a844fe388e71b1dfafe836c31dd0dd4a4 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 16 Oct 2024 13:26:54 +0100 Subject: [PATCH 115/329] [X86] Add some basic test coverage for #112425 --- llvm/test/CodeGen/X86/andnot-patterns.ll | 652 +++++++++++++++++++++++ 1 file changed, 652 insertions(+) create mode 100644 llvm/test/CodeGen/X86/andnot-patterns.ll diff --git a/llvm/test/CodeGen/X86/andnot-patterns.ll b/llvm/test/CodeGen/X86/andnot-patterns.ll new file mode 100644 index 000000000000..0ff4e3b47ae4 --- /dev/null +++ b/llvm/test/CodeGen/X86/andnot-patterns.ll @@ -0,0 +1,652 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc < %s -mtriple=i686-- -mattr=+bmi | FileCheck %s --check-prefixes=X86 +; RUN: llc < %s -mtriple=x86_64-- -mattr=+bmi | FileCheck %s --check-prefixes=X64 + +; TODO - PR112425 - attempt to reconstruct andnot patterns through bitwise-agnostic operations + +declare void @use_i64(i64) + +; +; Fold (and X, (rotl (not Y), Z))) -> (and X, (not (rotl Y, Z))) +; + +define i64 @andnot_rotl_i64(i64 %a0, i64 %a1) nounwind { +; X86-LABEL: andnot_rotl_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: movl %edi, %edx +; X86-NEXT: notl %edx +; X86-NEXT: testb $32, %cl +; X86-NEXT: jne .LBB0_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: movl %edx, %eax +; X86-NEXT: jmp .LBB0_3 +; X86-NEXT: .LBB0_1: +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: .LBB0_3: +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: shldl %cl, %eax, 
%edx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: andl %edi, %eax +; X86-NEXT: andl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: andnot_rotl_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: notq %rax +; X64-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-NEXT: rolq %cl, %rax +; X64-NEXT: andq %rdi, %rax +; X64-NEXT: retq + %not = xor i64 %a0, -1 + %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a1) + %and = and i64 %rot, %a0 + ret i64 %and +} + +define i32 @andnot_rotl_i32(i32 %a0, i32 %a1) nounwind { +; X86-LABEL: andnot_rotl_i32: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: notl %eax +; X86-NEXT: roll %cl, %eax +; X86-NEXT: andl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: andnot_rotl_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: notl %eax +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: roll %cl, %eax +; X64-NEXT: andl %edi, %eax +; X64-NEXT: retq + %not = xor i32 %a0, -1 + %rot = tail call i32 @llvm.fshl.i32(i32 %not, i32 %not, i32 %a1) + %and = and i32 %rot, %a0 + ret i32 %and +} + +define i16 @andnot_rotl_i16(i16 %a0, i16 %a1) nounwind { +; X86-LABEL: andnot_rotl_i16: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: notl %eax +; X86-NEXT: rolw %cl, %ax +; X86-NEXT: andl %edx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: andnot_rotl_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: notl %eax +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: rolw %cl, %ax +; X64-NEXT: andl %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %not = xor i16 %a0, -1 + %rot = tail call i16 @llvm.fshl.i16(i16 %not, i16 %not, i16 %a1) + %and = and i16 %rot, %a0 + ret i16 %and +} + +define i8 @andnot_rotl_i8(i8 %a0, i8 %a1) nounwind { +; X86-LABEL: andnot_rotl_i8: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: notb %al +; X86-NEXT: rolb %cl, %al +; X86-NEXT: andb %dl, %al +; X86-NEXT: retl +; +; X64-LABEL: andnot_rotl_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: notb %al +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: rolb %cl, %al +; X64-NEXT: andb %dil, %al +; X64-NEXT: retq + %not = xor i8 %a0, -1 + %rot = tail call i8 @llvm.fshl.i8(i8 %not, i8 %not, i8 %a1) + %and = and i8 %rot, %a0 + ret i8 %and +} + +define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1) nounwind { +; X86-LABEL: andnot_rotl_i64_multiuse: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl %edi, %eax +; X86-NEXT: notl %eax +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: testb $32, %cl +; X86-NEXT: jne .LBB4_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: jmp .LBB4_3 +; X86-NEXT: .LBB4_1: +; X86-NEXT: movl %eax, %edx +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: .LBB4_3: +; X86-NEXT: movl %eax, %ebx +; 
X86-NEXT: shldl %cl, %edx, %ebx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: andl %edx, %esi +; X86-NEXT: andl %ebx, %edi +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edx +; X86-NEXT: calll use_i64@PLT +; X86-NEXT: addl $8, %esp +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: andnot_rotl_i64_multiuse: +; X64: # %bb.0: +; X64-NEXT: pushq %rbx +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: movq %rdi, %rbx +; X64-NEXT: notq %rdi +; X64-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-NEXT: rolq %cl, %rdi +; X64-NEXT: andq %rdi, %rbx +; X64-NEXT: callq use_i64@PLT +; X64-NEXT: movq %rbx, %rax +; X64-NEXT: popq %rbx +; X64-NEXT: retq + %not = xor i64 %a0, -1 + %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a1) + %and = and i64 %rot, %a0 + call void @use_i64(i64 %rot) + ret i64 %and +} + +; +; Fold (and X, (rotr (not Y), Z))) -> (and X, (not (rotr Y, Z))) +; + +define i64 @andnot_rotr_i64(i64 %a0, i64 %a1) nounwind { +; X86-LABEL: andnot_rotr_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %ebx +; X86-NEXT: pushl %edi +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: movl %esi, %ebx +; X86-NEXT: notl %ebx +; X86-NEXT: movl %edi, %edx +; X86-NEXT: notl %edx +; X86-NEXT: testb $32, %cl +; X86-NEXT: jne .LBB5_1 +; X86-NEXT: # %bb.2: +; X86-NEXT: movl %edx, %eax +; X86-NEXT: jmp .LBB5_3 +; X86-NEXT: .LBB5_1: +; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: .LBB5_3: +; X86-NEXT: movl %ebx, %edx +; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: # kill: def $cl killed $cl killed $ecx +; X86-NEXT: shldl %cl, %ebx, %eax +; X86-NEXT: andl %edi, %eax +; X86-NEXT: andl %esi, %edx +; X86-NEXT: popl %esi +; X86-NEXT: popl %edi +; X86-NEXT: popl %ebx +; X86-NEXT: retl +; +; X64-LABEL: andnot_rotr_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: notq %rax +; X64-NEXT: # kill: def $cl killed $cl killed $rcx +; X64-NEXT: rolq %cl, %rax +; X64-NEXT: andq %rdi, %rax +; X64-NEXT: retq + %not = xor i64 %a0, -1 + %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a1) + %and = and i64 %rot, %a0 + ret i64 %and +} + +define i32 @andnot_rotr_i32(i32 %a0, i32 %a1) nounwind { +; X86-LABEL: andnot_rotr_i32: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: notl %eax +; X86-NEXT: roll %cl, %eax +; X86-NEXT: andl %edx, %eax +; X86-NEXT: retl +; +; X64-LABEL: andnot_rotr_i32: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: notl %eax +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: roll %cl, %eax +; X64-NEXT: andl %edi, %eax +; X64-NEXT: retq + %not = xor i32 %a0, -1 + %rot = tail call i32 @llvm.fshl.i32(i32 %not, i32 %not, i32 %a1) + %and = and i32 %rot, %a0 + ret i32 %and +} + +define i16 @andnot_rotr_i16(i16 %a0, i16 %a1) nounwind { +; X86-LABEL: andnot_rotr_i16: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: notl %eax +; X86-NEXT: rolw %cl, %ax +; X86-NEXT: andl %edx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: andnot_rotr_i16: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: 
movl %edi, %eax +; X64-NEXT: notl %eax +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: rolw %cl, %ax +; X64-NEXT: andl %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %not = xor i16 %a0, -1 + %rot = tail call i16 @llvm.fshl.i16(i16 %not, i16 %not, i16 %a1) + %and = and i16 %rot, %a0 + ret i16 %and +} + +define i8 @andnot_rotr_i8(i8 %a0, i8 %a1) nounwind { +; X86-LABEL: andnot_rotr_i8: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx +; X86-NEXT: movl %edx, %eax +; X86-NEXT: notb %al +; X86-NEXT: rolb %cl, %al +; X86-NEXT: andb %dl, %al +; X86-NEXT: retl +; +; X64-LABEL: andnot_rotr_i8: +; X64: # %bb.0: +; X64-NEXT: movl %esi, %ecx +; X64-NEXT: movl %edi, %eax +; X64-NEXT: notb %al +; X64-NEXT: # kill: def $cl killed $cl killed $ecx +; X64-NEXT: rolb %cl, %al +; X64-NEXT: andb %dil, %al +; X64-NEXT: retq + %not = xor i8 %a0, -1 + %rot = tail call i8 @llvm.fshl.i8(i8 %not, i8 %not, i8 %a1) + %and = and i8 %rot, %a0 + ret i8 %and +} + +; +; Fold (and X, (bswap (not Y)))) -> (and X, (not (bswap Y))) +; + +define i64 @andnot_bswap_i64(i64 %a0) nounwind { +; X86-LABEL: andnot_bswap_i64: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: notl %edx +; X86-NEXT: bswapl %edx +; X86-NEXT: andl %eax, %edx +; X86-NEXT: notl %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: andnot_bswap_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: notq %rax +; X64-NEXT: bswapq %rax +; X64-NEXT: andq %rdi, %rax +; X64-NEXT: retq + %not = xor i64 %a0, -1 + %bswap = tail call i64 @llvm.bswap.i64(i64 %not) + %and = and i64 %bswap, %a0 + ret i64 %and +} + +define i32 @andnot_bswap_i32(i32 %a0) nounwind { +; X86-LABEL: andnot_bswap_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: notl %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: andnot_bswap_i32: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: notl %eax +; X64-NEXT: bswapl %eax +; X64-NEXT: andl %edi, %eax +; X64-NEXT: retq + %not = xor i32 %a0, -1 + %bswap = tail call i32 @llvm.bswap.i32(i32 %not) + %and = and i32 %bswap, %a0 + ret i32 %and +} + +define i16 @andnot_bswap_i16(i16 %a0) nounwind { +; X86-LABEL: andnot_bswap_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: notl %eax +; X86-NEXT: rolw $8, %ax +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: andnot_bswap_i16: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: notl %eax +; X64-NEXT: rolw $8, %ax +; X64-NEXT: andl %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %not = xor i16 %a0, -1 + %bswap = tail call i16 @llvm.bswap.i16(i16 %not) + %and = and i16 %bswap, %a0 + ret i16 %and +} + +; +; Fold (and X, (bitreverse (not Y)))) -> (and X, (not (bitreverse Y))) +; + +define i64 @andnot_bitreverse_i64(i64 %a0) nounwind { +; X86-LABEL: andnot_bitreverse_i64: +; X86: # %bb.0: +; X86-NEXT: pushl %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: movl %ecx, %edx +; X86-NEXT: notl %edx +; X86-NEXT: bswapl %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %esi +; X86-NEXT: shrl $4, %edx +; 
X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NEXT: orl %esi, %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NEXT: shrl $2, %edx +; X86-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NEXT: leal (%edx,%esi,4), %edx +; X86-NEXT: movl %edx, %esi +; X86-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NEXT: shrl %edx +; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NEXT: leal (%edx,%esi,2), %edx +; X86-NEXT: andl %eax, %edx +; X86-NEXT: notl %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %esi +; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; X86-NEXT: orl %esi, %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NEXT: leal (%eax,%esi,4), %eax +; X86-NEXT: movl %eax, %esi +; X86-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: leal (%eax,%esi,2), %eax +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: popl %esi +; X86-NEXT: retl +; +; X64-LABEL: andnot_bitreverse_i64: +; X64: # %bb.0: +; X64-NEXT: movq %rdi, %rax +; X64-NEXT: notq %rax +; X64-NEXT: bswapq %rax +; X64-NEXT: movq %rax, %rcx +; X64-NEXT: shrq $4, %rcx +; X64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F +; X64-NEXT: andq %rdx, %rcx +; X64-NEXT: andq %rdx, %rax +; X64-NEXT: shlq $4, %rax +; X64-NEXT: orq %rcx, %rax +; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: shrq $2, %rax +; X64-NEXT: andq %rcx, %rax +; X64-NEXT: leaq (%rax,%rdx,4), %rax +; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 +; X64-NEXT: movq %rax, %rdx +; X64-NEXT: andq %rcx, %rdx +; X64-NEXT: shrq %rax +; X64-NEXT: andq %rcx, %rax +; X64-NEXT: leaq (%rax,%rdx,2), %rax +; X64-NEXT: andq %rdi, %rax +; X64-NEXT: retq + %not = xor i64 %a0, -1 + %bitrev = tail call i64 @llvm.bitreverse.i64(i64 %not) + %and = and i64 %bitrev, %a0 + ret i64 %and +} + +define i32 @andnot_bitreverse_i32(i32 %a0) nounwind { +; X86-LABEL: andnot_bitreverse_i32: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: notl %eax +; X86-NEXT: bswapl %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %edx +; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X86-NEXT: leal (%eax,%edx,4), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NEXT: shrl %eax +; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X86-NEXT: leal (%eax,%edx,2), %eax +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: retl +; +; X64-LABEL: andnot_bitreverse_i32: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: notl %eax +; X64-NEXT: bswapl %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X64-NEXT: shll $4, %ecx +; X64-NEXT: shrl $4, %eax +; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F +; X64-NEXT: orl %ecx, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $858993459, 
%ecx # imm = 0x33333333 +; X64-NEXT: shrl $2, %eax +; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 +; X64-NEXT: leal (%rax,%rcx,4), %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X64-NEXT: shrl %eax +; X64-NEXT: andl $1431655765, %eax # imm = 0x55555555 +; X64-NEXT: leal (%rax,%rcx,2), %eax +; X64-NEXT: andl %edi, %eax +; X64-NEXT: retq + %not = xor i32 %a0, -1 + %bitrev = tail call i32 @llvm.bitreverse.i32(i32 %not) + %and = and i32 %bitrev, %a0 + ret i32 %and +} + +define i16 @andnot_bitreverse_i16(i16 %a0) nounwind { +; X86-LABEL: andnot_bitreverse_i16: +; X86: # %bb.0: +; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: notl %eax +; X86-NEXT: rolw $8, %ax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $3855, %edx # imm = 0xF0F +; X86-NEXT: shll $4, %edx +; X86-NEXT: shrl $4, %eax +; X86-NEXT: andl $3855, %eax # imm = 0xF0F +; X86-NEXT: orl %edx, %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $13107, %edx # imm = 0x3333 +; X86-NEXT: shrl $2, %eax +; X86-NEXT: andl $13107, %eax # imm = 0x3333 +; X86-NEXT: leal (%eax,%edx,4), %eax +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andl $21845, %edx # imm = 0x5555 +; X86-NEXT: shrl %eax +; X86-NEXT: andl $21845, %eax # imm = 0x5555 +; X86-NEXT: leal (%eax,%edx,2), %eax +; X86-NEXT: andl %ecx, %eax +; X86-NEXT: # kill: def $ax killed $ax killed $eax +; X86-NEXT: retl +; +; X64-LABEL: andnot_bitreverse_i16: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: notl %eax +; X64-NEXT: rolw $8, %ax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $3855, %ecx # imm = 0xF0F +; X64-NEXT: shll $4, %ecx +; X64-NEXT: shrl $4, %eax +; X64-NEXT: andl $3855, %eax # imm = 0xF0F +; X64-NEXT: orl %ecx, %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $13107, %ecx # imm = 0x3333 +; X64-NEXT: shrl $2, %eax +; X64-NEXT: andl $13107, %eax # imm = 0x3333 +; X64-NEXT: leal (%rax,%rcx,4), %eax +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andl $21845, %ecx # imm = 0x5555 +; X64-NEXT: shrl %eax +; X64-NEXT: andl $21845, %eax # imm = 0x5555 +; X64-NEXT: leal (%rax,%rcx,2), %eax +; X64-NEXT: andl %edi, %eax +; X64-NEXT: # kill: def $ax killed $ax killed $eax +; X64-NEXT: retq + %not = xor i16 %a0, -1 + %bitrev = tail call i16 @llvm.bitreverse.i16(i16 %not) + %and = and i16 %bitrev, %a0 + ret i16 %and +} + +define i8 @andnot_bitreverse_i8(i8 %a0) nounwind { +; X86-LABEL: andnot_bitreverse_i8: +; X86: # %bb.0: +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl %ecx, %eax +; X86-NEXT: notb %al +; X86-NEXT: rolb $4, %al +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andb $51, %dl +; X86-NEXT: shlb $2, %dl +; X86-NEXT: shrb $2, %al +; X86-NEXT: andb $51, %al +; X86-NEXT: orb %dl, %al +; X86-NEXT: movl %eax, %edx +; X86-NEXT: andb $85, %dl +; X86-NEXT: addb %dl, %dl +; X86-NEXT: shrb %al +; X86-NEXT: andb $85, %al +; X86-NEXT: orb %dl, %al +; X86-NEXT: andb %cl, %al +; X86-NEXT: retl +; +; X64-LABEL: andnot_bitreverse_i8: +; X64: # %bb.0: +; X64-NEXT: movl %edi, %eax +; X64-NEXT: notb %al +; X64-NEXT: rolb $4, %al +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andb $51, %cl +; X64-NEXT: shlb $2, %cl +; X64-NEXT: shrb $2, %al +; X64-NEXT: andb $51, %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: movl %eax, %ecx +; X64-NEXT: andb $85, %cl +; X64-NEXT: addb %cl, %cl +; X64-NEXT: shrb %al +; X64-NEXT: andb $85, %al +; X64-NEXT: orb %cl, %al +; X64-NEXT: andb %dil, %al +; X64-NEXT: retq + %not = xor i8 %a0, -1 + %bitrev = tail call i8 @llvm.bitreverse.i8(i8 %not) + %and = and i8 %bitrev, %a0 + ret i8 
%and +} -- GitLab From 11ed7f2d3cbe9a22be2edb67881efd76fb8bba17 Mon Sep 17 00:00:00 2001 From: Sander de Smalen Date: Wed, 16 Oct 2024 09:55:37 +0100 Subject: [PATCH 116/329] [AArch64] NFC: Regenerate aarch64-sve-asm.ll It should use update_mir_test_checks.py instead of update_llc_test_checks.py. --- llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll | 129 ++++++++++++------- 1 file changed, 83 insertions(+), 46 deletions(-) diff --git a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll index 4ca2fb881579..068e194779c1 100644 --- a/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll +++ b/llvm/test/CodeGen/AArch64/aarch64-sve-asm.ll @@ -1,84 +1,121 @@ -; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 -; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve -stop-after=finalize-isel | FileCheck %s --check-prefix=CHECK +; NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple aarch64-none-linux-gnu -mattr=+sve2p1 -stop-after=finalize-isel | FileCheck %s --check-prefix=CHECK target datalayout = "e-m:e-i64:64-i128:128-n32:64-S128" target triple = "aarch64-none-linux-gnu" -; Function Attrs: nounwind readnone -; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1 -; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0 -; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]] -; CHECK: [[ARG4:%[0-9]+]]:zpr_3b = COPY [[ARG1]] -; CHECK: INLINEASM {{.*}} [[ARG4]] define @test_svadd_i8( %Zn, %Zm) { + ; CHECK-LABEL: name: test_svadd_i8 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $z0, $z1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:zpr_3b = COPY [[COPY]] + ; CHECK-NEXT: INLINEASM &"add $0.b, $1.b, $2.b", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5373961 /* reguse:ZPR_3b */, [[COPY3]] + ; CHECK-NEXT: $z0 = COPY %2 + ; CHECK-NEXT: RET_ReallyLR implicit $z0 %1 = tail call asm "add $0.b, $1.b, $2.b", "=w,w,y"( %Zn, %Zm) ret %1 } -; Function Attrs: nounwind readnone -; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1 -; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0 -; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]] -; CHECK: [[ARG4:%[0-9]+]]:zpr_4b = COPY [[ARG1]] -; CHECK: INLINEASM {{.*}} [[ARG4]] define @test_svsub_i64( %Zn, %Zm) { + ; CHECK-LABEL: name: test_svsub_i64 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $z0, $z1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:zpr_4b = COPY [[COPY]] + ; CHECK-NEXT: INLINEASM &"sub $0.d, $1.d, $2.d", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5242889 /* reguse:ZPR_4b */, [[COPY3]] + ; CHECK-NEXT: $z0 = COPY %2 + ; CHECK-NEXT: RET_ReallyLR implicit $z0 %1 = tail call asm "sub $0.d, $1.d, $2.d", "=w,w,x"( %Zn, %Zm) ret %1 } -; Function Attrs: nounwind readnone -; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1 -; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0 -; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]] -; CHECK: [[ARG4:%[0-9]+]]:zpr_3b = COPY [[ARG1]] -; CHECK: INLINEASM {{.*}} [[ARG4]] define @test_svfmul_f16( %Zn, %Zm) { + ; CHECK-LABEL: name: test_svfmul_f16 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $z0, $z1 + ; CHECK-NEXT: {{ $}} + 
; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:zpr_3b = COPY [[COPY]] + ; CHECK-NEXT: INLINEASM &"fmul $0.h, $1.h, $2.h", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5373961 /* reguse:ZPR_3b */, [[COPY3]] + ; CHECK-NEXT: $z0 = COPY %2 + ; CHECK-NEXT: RET_ReallyLR implicit $z0 %1 = tail call asm "fmul $0.h, $1.h, $2.h", "=w,w,y"( %Zn, %Zm) ret %1 } -; Function Attrs: nounwind readnone -; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1 -; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0 -; CHECK: [[ARG3:%[0-9]+]]:zpr = COPY [[ARG2]] -; CHECK: [[ARG4:%[0-9]+]]:zpr_4b = COPY [[ARG1]] -; CHECK: INLINEASM {{.*}} [[ARG4]] define @test_svfmul_f( %Zn, %Zm) { + ; CHECK-LABEL: name: test_svfmul_f + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $z0, $z1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:zpr = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:zpr_4b = COPY [[COPY]] + ; CHECK-NEXT: INLINEASM &"fmul $0.s, $1.s, $2.s", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 5046281 /* reguse:ZPR */, [[COPY2]], 5242889 /* reguse:ZPR_4b */, [[COPY3]] + ; CHECK-NEXT: $z0 = COPY %2 + ; CHECK-NEXT: RET_ReallyLR implicit $z0 %1 = tail call asm "fmul $0.s, $1.s, $2.s", "=w,w,x"( %Zn, %Zm) ret %1 } -; Function Attrs: nounwind readnone -; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1 -; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0 -; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY $p0 -; CHECK: [[ARG4:%[0-9]+]]:ppr_3b = COPY [[ARG3]] -; CHECK: INLINEASM {{.*}} [[ARG4]] define @test_svfadd_f16( %Pg, %Zn, %Zm) { + ; CHECK-LABEL: name: test_svfadd_f16 + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $p0, $z0, $z1 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z1 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY $p0 + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ppr_3b = COPY [[COPY2]] + ; CHECK-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY5:%[0-9]+]]:zpr = COPY [[COPY]] + ; CHECK-NEXT: INLINEASM &"fadd $0.h, $1/m, $2.h, $3.h", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %3, 589833 /* reguse:PPR_3b */, [[COPY3]], 5046281 /* reguse:ZPR */, [[COPY4]], 5046281 /* reguse:ZPR */, [[COPY5]] + ; CHECK-NEXT: $z0 = COPY %3 + ; CHECK-NEXT: RET_ReallyLR implicit $z0 %1 = tail call asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Upl,w,w"( %Pg, %Zn, %Zm) ret %1 } -; Function Attrs: nounwind readnone -; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z0 -; CHECK: [[ARG2:%[0-9]+]]:ppr = COPY $p0 -; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY [[ARG2]] -; CHECK: [[ARG4:%[0-9]+]]:zpr = COPY [[ARG1]] -; CHECK: INLINEASM {{.*}} [[ARG3]] define @test_incp( %Pg, %Zn) { + ; CHECK-LABEL: name: test_incp + ; CHECK: bb.0 (%ir-block.0): + ; CHECK-NEXT: liveins: $p0, $z0 + ; CHECK-NEXT: {{ $}} + ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z0 + ; CHECK-NEXT: [[COPY1:%[0-9]+]]:ppr = COPY $p0 + ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY [[COPY1]] + ; CHECK-NEXT: [[COPY3:%[0-9]+]]:zpr = COPY [[COPY]] + ; CHECK-NEXT: INLINEASM &"incp $0.s, $1", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %2, 393225 /* reguse:PPR */, [[COPY2]], 2147483657 /* reguse tiedto:$0 */, [[COPY3]](tied-def 3) + ; CHECK-NEXT: $z0 = COPY %2 + ; CHECK-NEXT: RET_ReallyLR implicit $z0 %1 = tail call asm "incp $0.s, $1", "=w,@3Upa,0"( %Pg, %Zn) ret %1 } -; 
Function Attrs: nounwind readnone
-; CHECK: [[ARG1:%[0-9]+]]:zpr = COPY $z1
-; CHECK: [[ARG2:%[0-9]+]]:zpr = COPY $z0
-; CHECK: [[ARG3:%[0-9]+]]:ppr = COPY $p0
-; CHECK: [[ARG4:%[0-9]+]]:ppr_p8to15 = COPY [[ARG3]]
-; CHECK: INLINEASM {{.*}} [[ARG4]]
 define <vscale x 8 x half> @test_svfadd_f16_Uph_constraint(<vscale x 8 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm) {
+  ; CHECK-LABEL: name: test_svfadd_f16_Uph_constraint
+  ; CHECK: bb.0 (%ir-block.0):
+  ; CHECK-NEXT: liveins: $p0, $z0, $z1
+  ; CHECK-NEXT: {{ $}}
+  ; CHECK-NEXT: [[COPY:%[0-9]+]]:zpr = COPY $z1
+  ; CHECK-NEXT: [[COPY1:%[0-9]+]]:zpr = COPY $z0
+  ; CHECK-NEXT: [[COPY2:%[0-9]+]]:ppr = COPY $p0
+  ; CHECK-NEXT: [[COPY3:%[0-9]+]]:ppr_p8to15 = COPY [[COPY2]]
+  ; CHECK-NEXT: [[COPY4:%[0-9]+]]:zpr = COPY [[COPY1]]
+  ; CHECK-NEXT: [[COPY5:%[0-9]+]]:zpr = COPY [[COPY]]
+  ; CHECK-NEXT: INLINEASM &"fadd $0.h, $1/m, $2.h, $3.h", 0 /* attdialect */, 5046282 /* regdef:ZPR */, def %3, 655369 /* reguse:PPR_p8to15 */, [[COPY3]], 5046281 /* reguse:ZPR */, [[COPY4]], 5046281 /* reguse:ZPR */, [[COPY5]]
+  ; CHECK-NEXT: $z0 = COPY %3
+  ; CHECK-NEXT: RET_ReallyLR implicit $z0
  %1 = tail call <vscale x 8 x half> asm "fadd $0.h, $1/m, $2.h, $3.h", "=w,@3Uph,w,w"(<vscale x 8 x i1> %Pg, <vscale x 8 x half> %Zn, <vscale x 8 x half> %Zm)
  ret <vscale x 8 x half> %1
}
-;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line:
-; CHECK: {{.*}}
-- 
GitLab


From 9ad4f05ef7491cff73e7f574fed717531974fc6b Mon Sep 17 00:00:00 2001
From: Benjamin Kramer
Date: Wed, 16 Oct 2024 14:40:51 +0200
Subject: [PATCH 117/329] [bazel][lldb] Port 5f2cf99e146ce99d4e148038d9bdd012331b4821

---
 .../lldb/source/Plugins/BUILD.bazel | 20 +++++++++++++++++++
 1 file changed, 20 insertions(+)

diff --git a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
index 38493411adde..7057f5d5c5c1 100644
--- a/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
+++ b/utils/bazel/llvm-project-overlay/lldb/source/Plugins/BUILD.bazel
@@ -1221,12 +1221,31 @@ cc_library(
     ],
 )
 
+gentbl_cc_library(
+    name = "DynamicLoaderMacOSXDYLDProperties",
+    strip_include_prefix = "DynamicLoader/MacOSX-DYLD",
+    tbl_outs = [
+        (
+            ["-gen-lldb-property-defs"],
+            "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.inc",
+        ),
+        (
+            ["-gen-lldb-property-enum-defs"],
+            "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinPropertiesEnum.inc",
+        ),
+    ],
+    tblgen = "//lldb:lldb-tblgen",
+    td_file = "DynamicLoader/MacOSX-DYLD/DynamicLoaderDarwinProperties.td",
+    deps = ["//lldb:CoreTdFiles"],
+)
+
 cc_library(
     name = "PluginDynamicLoaderMacOSXDYLD",
     srcs = glob(["DynamicLoader/MacOSX-DYLD/*.cpp"]),
     hdrs = glob(["DynamicLoader/MacOSX-DYLD/*.h"]),
     include_prefix = "Plugins",
     deps = [
+        ":DynamicLoaderMacOSXDYLDProperties",
         ":PluginObjCRuntime",
         ":PluginTypeSystemClang",
         ":PluginTypeSystemClangHeaders",
@@ -1239,6 +1258,7 @@ cc_library(
         "//lldb:Target",
         "//lldb:TargetHeaders",
         "//lldb:Utility",
+        "//llvm:Support",
         "//llvm:TargetParser",
     ],
 )
-- 
GitLab


From 157f10ddf2d851125a85a71e530dc9d50cb032a2 Mon Sep 17 00:00:00 2001
From: Yuta Saito
Date: Wed, 16 Oct 2024 21:42:54 +0900
Subject: [PATCH 118/329] Revert "[llvm-cov][WebAssembly] Read
 `__llvm_prf_names` from data segments" (#112520)

This reverts commit efc9dd4118a7ada7d8c898582f16db64827f7ce0 in order
to fix a Windows test failure:
https://github.com/llvm/llvm-project/pull/111332#issuecomment-2416462512

---
 .../Coverage/CoverageMappingReader.cpp | 64 +++---
 .../llvm-cov/Inputs/binary-formats.v6.wasm32 | Bin 87781 -> 0 bytes
 .../Inputs/binary-formats.wasm.proftext | 4 --
llvm/test/tools/llvm-cov/binary-formats.c | 7 -
 4 files changed, 9 insertions(+), 66 deletions(-)
 delete mode 100755 llvm/test/tools/llvm-cov/Inputs/binary-formats.v6.wasm32
 delete mode 100644 llvm/test/tools/llvm-cov/Inputs/binary-formats.wasm.proftext

diff --git a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
index 461fc43d32f8..8881bffe41c5 100644
--- a/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
+++ b/llvm/lib/ProfileData/Coverage/CoverageMappingReader.cpp
@@ -18,14 +18,12 @@
 #include "llvm/ADT/SmallVector.h"
 #include "llvm/ADT/Statistic.h"
 #include "llvm/ADT/StringRef.h"
-#include "llvm/BinaryFormat/Wasm.h"
 #include "llvm/Object/Archive.h"
 #include "llvm/Object/Binary.h"
 #include "llvm/Object/COFF.h"
 #include "llvm/Object/Error.h"
 #include "llvm/Object/MachOUniversal.h"
 #include "llvm/Object/ObjectFile.h"
-#include "llvm/Object/Wasm.h"
 #include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/Casting.h"
 #include "llvm/Support/Compression.h"
@@ -1079,53 +1077,6 @@ lookupSections(ObjectFile &OF, InstrProfSectKind IPSK) {
   return Sections;
 }
 
-/// Find a section that matches \p Name and is allocatable at runtime.
-///
-/// Returns the contents of the section and its start offset in the object file.
-static Expected<std::pair<StringRef, uint64_t>>
-lookupAllocatableSection(ObjectFile &OF, InstrProfSectKind IPSK) {
-  // On Wasm, allocatable sections can live only in data segments.
-  if (auto *WOF = dyn_cast<WasmObjectFile>(&OF)) {
-    std::vector<const WasmSegment *> Segments;
-    auto ObjFormat = OF.getTripleObjectFormat();
-    auto Name =
-        getInstrProfSectionName(IPSK, ObjFormat, /*AddSegmentInfo=*/false);
-    for (const auto &DebugName : WOF->debugNames()) {
-      if (DebugName.Type != wasm::NameType::DATA_SEGMENT ||
-          DebugName.Name != Name)
-        continue;
-      if (DebugName.Index >= WOF->dataSegments().size())
-        return make_error<CoverageMapError>(coveragemap_error::malformed);
-      auto &Segment = WOF->dataSegments()[DebugName.Index];
-      Segments.push_back(&Segment);
-    }
-    if (Segments.empty())
-      return make_error<CoverageMapError>(coveragemap_error::no_data_found);
-    if (Segments.size() != 1)
-      return make_error<CoverageMapError>(coveragemap_error::malformed);
-
-    const auto &Segment = *Segments.front();
-    auto &Data = Segment.Data;
-    StringRef Content(reinterpret_cast<const char *>(Data.Content.data()),
-                      Data.Content.size());
-    return std::make_pair(Content, Segment.SectionOffset);
-  }
-
-  // On other object file types, delegate to lookupSections to find the section.
-  auto Sections = lookupSections(OF, IPSK);
-  if (!Sections)
-    return Sections.takeError();
-  if (Sections->size() != 1)
-    return make_error<CoverageMapError>(
-        coveragemap_error::malformed,
-        "the size of coverage mapping section is not one");
-  auto &Section = Sections->front();
-  auto ContentsOrErr = Section.getContents();
-  if (!ContentsOrErr)
-    return ContentsOrErr.takeError();
-  return std::make_pair(*ContentsOrErr, Section.getAddress());
-}
-
 static Expected<std::unique_ptr<BinaryCoverageReader>>
 loadBinaryFormat(std::unique_ptr<Binary> Bin, StringRef Arch,
                  StringRef CompilationDir = "",
@@ -1156,20 +1107,23 @@ loadBinaryFormat(std::unique_ptr<Binary> Bin, StringRef Arch,
 
   // Look for the sections that we are interested in.
   auto ProfileNames = std::make_unique<InstrProfSymtab>();
+  std::vector<SectionRef> NamesSectionRefs;
   // If IPSK_name is not found, fallback to search for IPK_covname, which is
   // used when binary correlation is enabled.
-  auto NamesSection = lookupAllocatableSection(*OF, IPSK_name);
+  auto NamesSection = lookupSections(*OF, IPSK_name);
   if (auto E = NamesSection.takeError()) {
     consumeError(std::move(E));
-    NamesSection = lookupAllocatableSection(*OF, IPSK_covname);
+    NamesSection = lookupSections(*OF, IPSK_covname);
     if (auto E = NamesSection.takeError())
       return std::move(E);
   }
+  NamesSectionRefs = *NamesSection;
 
-  uint64_t NamesAddress;
-  StringRef NamesContent;
-  std::tie(NamesContent, NamesAddress) = *NamesSection;
-  if (Error E = ProfileNames->create(NamesContent, NamesAddress))
+  if (NamesSectionRefs.size() != 1)
+    return make_error<CoverageMapError>(
+        coveragemap_error::malformed,
+        "the size of coverage mapping section is not one");
+  if (Error E = ProfileNames->create(NamesSectionRefs.back()))
     return std::move(E);
 
   auto CoverageSection = lookupSections(*OF, IPSK_covmap);
diff --git a/llvm/test/tools/llvm-cov/Inputs/binary-formats.v6.wasm32 b/llvm/test/tools/llvm-cov/Inputs/binary-formats.v6.wasm32
deleted file mode 100755
index 5a606d5a2f69fd150690bd17ebb9d922cc17c383..0000000000000000000000000000000000000000
GIT binary patch
literal 0
HcmV?d00001

literal 87781
[87781 bytes of base85-encoded GIT binary patch data for the deleted wasm binary elided]
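As background for the CoverageMappingReader hunk above (an illustrative aside, not part of this patch): the lookupSections helper that the revert returns to is, at its core, a by-name scan over an object file's sections. Below is a minimal, self-contained sketch of that pattern using LLVM's public object-file API; the helper name findSectionByName is hypothetical, and error handling is deliberately simplified.

// Illustrative sketch only -- not this patch's code. Scans an ObjectFile
// for a section with the given name, the kind of lookup lookupSections()
// performs internally (the real helper also handles segment-qualified
// Mach-O names and the __llvm_prf_names / __llvm_covnames fallback).
#include "llvm/Object/ObjectFile.h"
#include "llvm/Support/Error.h"

using namespace llvm;
using namespace llvm::object;

static Expected<SectionRef> findSectionByName(ObjectFile &OF, StringRef Name) {
  for (SectionRef Sec : OF.sections()) {
    Expected<StringRef> SecName = Sec.getName();
    if (!SecName)
      return SecName.takeError(); // propagate malformed-object errors
    if (*SecName == Name)
      return Sec;                 // first match wins in this sketch
  }
  return createStringError(inconvertibleErrorCode(),
                           "no section named '%s'", Name.str().c_str());
}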
zO;@YYfrS~og?gjYh6o4L(gEKRU7K;KH|C}4z$glsSWcbxCj_2-676LTwfw z8b719QfCyWokzV`12P_>i<9)FI$T?(CSL+!DlQ^!H<>lMW5^A;qFlg=;C5({`O z)E&?{L!KEVC+r~0KND0l>BFwJOoDW@?^3II!Njurw)ZJa1(OR-xCEOY<#()9WILcLQr>1hLYeWnht1pWUlXHEtGHvFC3Kg{h2 z|K`9SWVYwN82nH+%-%cj6XDb0>FlB0p5W61&jrQop6n;X-_8yP&kqD)HS-2=k|AIm&EFp;?uoXxy3`$yUT8vNtzQ@NLd4`r_o+>&X8 zi`jDS!?|O@ABP_ep2>c5@cD2k_x-uQp1UJ>Tlo6$-N7dZek8ml=)V#D$?K2)+Zk*f<$ZQ^CeuATx4(2}{CKZSMg)jmwp-d2D`IqCL=s(E6 zA^xetr(Vw={!<-|Yg9Kq|J_8TKcJvC=Xl<+w;RDPgbKBr-rUmwm+D{tsSS5^V}HfE zLHxnP`D8%cy>K{x_^1EMwjbOx`Qg>1l`pMnH$_BJApY#(e6c__M^0J=3pJxZE5Yl= zDiwK*6>QAxawBQCm%h&(dh*h#OYi$fVHkvAZhvrla9>X);z;#jlEtqOo7o=Z4s{ln z3g=P(t*Zx;6wizb{`SF~QV_w;9c$jM+8j*z8nrCs*pV%dgb{+7+U(@bC?mRm^TRG6!D0hEL$AgY4^Wym;D2?gmCMTdmA2 zG~2a-L$BTeWoN6(^a`TEZ~SU1;3-yS7p5dQbOKfbu=hY!lOk0@&h)%7N`oUZ7 zxWS5YTnjf+m;k6k;M3WW^Ec*GG3QwyQ za;*lX#f94jE6}CY8P7>mOiX$8fMX%Iq2g1m6+(s9_MK~9p0AOSttiNvoLuv&Mn8J7 zW7nD&u8!#0-nnDVJ5{nbyKY+Z#??xK)v2{ms$%kT_u7}P?n;BXi}p=xhoGA7)t+Ps zM(T>wR+W0Sy|=g^Y~XncBgd8(mvEuif{u)$YA@7hO1G>5U2hu8izUDS4Qz5}Y z3VC|r*4-Nv79FkC*F+nRmb(W0WsY*eX)VXaftrBFZ%iBw9+VZs+e5ZBQ zy1QfVnj%U4q=P%RHE?v>_q)lkGmZoY_984Z%S<@bj*1wk%ol#Fd4@14UZ`Ja94you z_0Caq*x#y*X6w*aCBU9_2TO%4|yl4GO zO$#W)r}sxUX!H~i-9Jga6ZKYmEj-7@dmB^}g5mh2A&cbY@=`hg6hZ66?j*6+``EKu z`++UVTc~gfnSJoqHN4u*L03?~bX{2y=qJY$;yiIiFq}0kJw&FLkc>|=k);Y~9cU5IT!IZZ~K8(ds&@cYxlsEn z6VyohJTu)n0b3t$to+q&OVh1(EpAldh2-^L8@GqaEv2PM|N3^koo=DWn#bG>e`SQ0 z&zw5P1lTukV*;X~S$hVkIaD<(*_py4I+)u-&9ITNTq3Htq_*&}MDBJrX=`t#>YxT09Hyw;Ho+TkqDqFp^feI8tr; z*{ga{?u^=D5dPWRn8)qu^O}L59|KLxOD9h|Rk$G1CIhG5;$*$;Z;gUym3#*3YD4Oz zr=ZB%?W8#&4ohCAeJ^dUw@)=2-^4uOvHFF2=h9HUZ5OS4&mxI-!Fg}K`muVevsGy6 zL-lf@+Ux-gyS_3}smxrtQn|3uJU_jFD7P@G_0Hw3XFa05u>qLz?E|OnQ<*m#kGN+1 zx7c&iP)6|MSDp@c*^?7^oTh9+fH^M>r#?6`U3Wu=O zy87;Lj5(hr0#pTBQ_k<&}i8TJraKOTv?+phklCZxO=6w*o(yUkicDTiP(`hjJ z?hwLTz-r-X8%DH{^ZUZ0f8G;Vet(GHXoZu9Y55;WTTX(+9}LHR;l5DtheCWuYoq%Z zA#wrV4~MWs`V#V3%%#`r=smrC*eKtdLf;>v{YY;oYXh;r6K?Ky0!pht+WSC3)sOXd zWUAqPA)ciHwqbzuY*^@hExqyn?xQ5wc`igC)QH70jpoedYDGnfTpLaC{ND|62-g%a zNl8omy>P%H@V_7CRP!H%+j}bLiErrx;eb{AU^rko+#gTix2~6(OXfZ*s#T@-hr-t+ zE;Qkl*R1{}VHn6=HnFNzKO7Qgq#*XjW6!UDGku;(tIoxbgt_+l*8E39G;ms#Uh&p& zhHdH8PlSX_>ASApPu{-}%J-x2$HL)CWocQLKN)VS=zXQNSiv*?sc;ZR;r^$?VP?l| zV((`{Qa{r+r&-a2=Ff_-FGg8uY)`U^`9g?%(Ko z)-JDnnhxyhH^b3Nr8&D$K{3t$TUFMLW0&7j1>HiPe+N|>uFHQP4qcvUb()Lk{{tf< z=wCelAL+ftl7-Lk3w7K7DI8w$0RDEEch&IWTebfj4#B-rU!M)LL}s5u6*K=*akJWR zKdK^GOz}J6$V$@Ze^K|&S@^GE4kcdyJj%DWICHV}1$VKy^tFV#<1>E%C;HgfF zGXGs_)LpW==YP1zi%Y+67uDsZKd=iB{y)PZiZx~ym;O+LZ+Dg&GfP)r_D`3t{*k+) zwDsE=JveYyI4EH{8`muvm{N{s+KpsSY+w^Yb{B*mPxox8Yw9)ERQnzr{ zau?bC8y5wAYf9#B-?-$OMlH87a2){pTeYHt*D1QDS>&fZbVIPJ;uMRn}}k|VflDW$C6H z7Y1K@&%wh7?+%{W;-k!2a+6q&_P`za^UDkK2b^%+F}kR^Knp?d*s{26=se)O+4db< V(uIr%G*R2WW4KLte}K5~{{w2iz+3 Date: Wed, 16 Oct 2024 13:43:36 +0100 Subject: [PATCH 119/329] [lldb][test] Explicitly add target triple to no_unique_address Shell tests (#112523) Follow up to https://github.com/llvm/llvm-project/pull/111902. Makes sure all the `no_unique_address` tests are in the same place and we don't rely on the host target triple (which means we don't need to account for `[[msvc::no_unique_address]]` on Windows). Now that we don't compile with the host compiler, this patch also adds `-c` to the compilation command since we don't actually need the linked binary in the test anyway (and on Darwin linking through Clang requires the `xcrun` prefix to set up the SDK paths, etc.). We already do this in `no_unique_address-with-bitfields.cpp` anyway. 
---
 .../SymbolFile/DWARF/{ => x86}/no_unique_address-alignment.cpp | 2 +-
 .../DWARF/{ => x86}/no_unique_address-base-alignment.cpp       | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)
 rename lldb/test/Shell/SymbolFile/DWARF/{ => x86}/no_unique_address-alignment.cpp (89%)
 rename lldb/test/Shell/SymbolFile/DWARF/{ => x86}/no_unique_address-base-alignment.cpp (91%)

diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-alignment.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-alignment.cpp
similarity index 89%
rename from lldb/test/Shell/SymbolFile/DWARF/no_unique_address-alignment.cpp
rename to lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-alignment.cpp
index 1488199a3ad2..e198bf0cafea 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-alignment.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-alignment.cpp
@@ -1,6 +1,6 @@
 // XFAIL: *
 
-// RUN: %clangxx_host -gdwarf -o %t %s
+// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s
 // RUN: %lldb %t \
 // RUN:   -o "expr alignof(OverlappingFields)" \
 // RUN:   -o "expr sizeof(OverlappingFields)" \
diff --git a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-base-alignment.cpp b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-base-alignment.cpp
similarity index 91%
rename from lldb/test/Shell/SymbolFile/DWARF/no_unique_address-base-alignment.cpp
rename to lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-base-alignment.cpp
index 15d8de0e3ee9..c4bcfc473277 100644
--- a/lldb/test/Shell/SymbolFile/DWARF/no_unique_address-base-alignment.cpp
+++ b/lldb/test/Shell/SymbolFile/DWARF/x86/no_unique_address-base-alignment.cpp
@@ -1,6 +1,6 @@
 // XFAIL: *
 
-// RUN: %clangxx_host -gdwarf -o %t %s
+// RUN: %clang --target=x86_64-apple-macosx -c -gdwarf -o %t %s
 // RUN: %lldb %t \
 // RUN:   -o "expr alignof(OverlappingDerived)" \
 // RUN:   -o "expr sizeof(OverlappingDerived)" \
-- 
GitLab


From 1dfb104eac73863b06751bea225ffa6ef589577f Mon Sep 17 00:00:00 2001
From: Sirui Mu
Date: Wed, 16 Oct 2024 12:51:50 +0800
Subject: [PATCH 120/329] [mlir][LLVMIR] Add operand bundle support for
 llvm.intr.assume (#112143)

This patch adds operand bundle support for `llvm.intr.assume`.

This patch actually contains two parts:

- `llvm.intr.assume` now accepts operand-bundle-related attributes and
  operands. `llvm.intr.assume` does not constrain which operand bundles
  it takes, but obviously only a small set of operand bundles is
  meaningful. I plan to add some of those (e.g. `aligned` and
  `separate_storage` are what interest me, but other people may be
  interested in other operand bundles as well) in future patches.
- The definitions of `llvm.call`, `llvm.invoke`, and
  `llvm.call_intrinsic` actually define `op_bundle_tags` as an
  operation property. It turns out this approach would introduce
  unnecessary burden if applied equally to the intrinsic operations,
  because properties are not available through `Operation *`, yet we
  have to operate on `Operation *` during the import/export of
  intrinsics. This PR therefore changes it from a property to an array
  attribute.

This patch relands commit d8fadad07c952c4aea967aefb0900e4e43ad0555.
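To make the new surface concrete, here is a small sketch (modeled on the
round-trip and translation tests added below; the "align" bundle is just
one example of a tag a producer might attach, not something the op
requires):

  llvm.func @assume_aligned(%cond : i1, %p : !llvm.ptr) {
    // Attach an `align` operand bundle carrying the pointer and its
    // alignment to the assume.
    %align = llvm.mlir.constant(8 : i32) : i32
    llvm.intr.assume %cond ["align"(%p, %align : !llvm.ptr, i32)] : i1
    llvm.return
  }

An empty bundle list (`llvm.intr.assume %cond [] : i1`) also parses and
prints the same as having no bundle list at all.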
--- .../Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td | 1 + .../mlir/Dialect/LLVMIR/LLVMDialect.td | 2 + .../mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td | 44 +++++++-- .../include/mlir/Dialect/LLVMIR/LLVMOpBase.td | 25 +++-- mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td | 18 +--- mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td | 2 +- .../include/mlir/Target/LLVMIR/ModuleImport.h | 2 + mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp | 96 ++++++++++++------- .../LLVMIR/LLVMIRToLLVMTranslation.cpp | 6 ++ .../LLVMIR/LLVMToLLVMIRTranslation.cpp | 16 +++- .../Dialect/NVVM/LLVMIRToNVVMTranslation.cpp | 6 ++ mlir/lib/Target/LLVMIR/ModuleImport.cpp | 32 ++++++- mlir/lib/Target/LLVMIR/ModuleTranslation.cpp | 37 ++++++- .../expand-then-convert-to-llvm.mlir | 2 +- .../MemRefToLLVM/memref-to-llvm.mlir | 4 +- mlir/test/Dialect/LLVMIR/inlining.mlir | 4 +- mlir/test/Dialect/LLVMIR/roundtrip.mlir | 27 ++++++ mlir/test/Target/LLVMIR/Import/intrinsic.ll | 12 ++- .../test/Target/LLVMIR/llvmir-intrinsics.mlir | 15 +++ mlir/test/Target/LLVMIR/llvmir-invalid.mlir | 2 +- 20 files changed, 276 insertions(+), 77 deletions(-) diff --git a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td index 0e38325f9891..e81db32bcaad 100644 --- a/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/ArmSME/IR/ArmSMEIntrinsicOps.td @@ -71,6 +71,7 @@ class ArmSME_IntrOp immArgPositions=*/immArgPositions, /*list immArgAttrNames=*/immArgAttrNames>; diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td index 27a2b418aadb..ea82f7f7b8e1 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMDialect.td @@ -59,6 +59,8 @@ def LLVM_Dialect : Dialect { static StringRef getStructRetAttrName() { return "llvm.sret"; } static StringRef getWriteOnlyAttrName() { return "llvm.writeonly"; } static StringRef getZExtAttrName() { return "llvm.zeroext"; } + static StringRef getOpBundleSizesAttrName() { return "op_bundle_sizes"; } + static StringRef getOpBundleTagsAttrName() { return "op_bundle_tags"; } // TODO Restrict the usage of this to parameter attributes once there is an // alternative way of modeling memory effects on FunctionOpInterface. 
/// Name of the attribute that will cause the creation of a readnone memory diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td index ab40c8ec4b65..845c88b1be77 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMIntrinsicOps.td @@ -120,7 +120,8 @@ def LLVM_Log2Op : LLVM_UnaryIntrOpF<"log2">; def LLVM_LogOp : LLVM_UnaryIntrOpF<"log">; def LLVM_Prefetch : LLVM_ZeroResultIntrOp<"prefetch", [0], /*traits=*/[], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0, - /*immArgPositions=*/[1, 2, 3], /*immArgAttrNames=*/["rw", "hint", "cache"] + /*requiresOpBundles=*/0, /*immArgPositions=*/[1, 2, 3], + /*immArgAttrNames=*/["rw", "hint", "cache"] > { let arguments = (ins LLVM_AnyPointer:$addr, I32Attr:$rw, I32Attr:$hint, I32Attr:$cache); } @@ -176,7 +177,8 @@ class LLVM_MemcpyIntrOpBase : DeclareOpInterfaceMethods, DeclareOpInterfaceMethods], /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1, - /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> { + /*requiresOpBundles=*/0, /*immArgPositions=*/[3], + /*immArgAttrNames=*/["isVolatile"]> { dag args = (ins Arg:$dst, Arg:$src, AnySignlessInteger:$len, I1Attr:$isVolatile); @@ -206,7 +208,8 @@ def LLVM_MemcpyInlineOp : DeclareOpInterfaceMethods, DeclareOpInterfaceMethods], /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1, - /*immArgPositions=*/[2, 3], /*immArgAttrNames=*/["len", "isVolatile"]> { + /*requiresOpBundles=*/0, /*immArgPositions=*/[2, 3], + /*immArgAttrNames=*/["len", "isVolatile"]> { dag args = (ins Arg:$dst, Arg:$src, APIntAttr:$len, I1Attr:$isVolatile); @@ -232,7 +235,8 @@ def LLVM_MemsetOp : LLVM_ZeroResultIntrOp<"memset", [0, 2], DeclareOpInterfaceMethods, DeclareOpInterfaceMethods], /*requiresAccessGroup=*/1, /*requiresAliasAnalysis=*/1, - /*immArgPositions=*/[3], /*immArgAttrNames=*/["isVolatile"]> { + /*requiresOpBundles=*/0, /*immArgPositions=*/[3], + /*immArgAttrNames=*/["isVolatile"]> { dag args = (ins Arg:$dst, I8:$val, AnySignlessInteger:$len, I1Attr:$isVolatile); // Append the alias attributes defined by LLVM_IntrOpBase. 
@@ -286,7 +290,8 @@ def LLVM_NoAliasScopeDeclOp class LLVM_LifetimeBaseOp : LLVM_ZeroResultIntrOp], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0, - /*immArgPositions=*/[0], /*immArgAttrNames=*/["size"]> { + /*requiresOpBundles=*/0, /*immArgPositions=*/[0], + /*immArgAttrNames=*/["size"]> { let arguments = (ins I64Attr:$size, LLVM_AnyPointer:$ptr); let assemblyFormat = "$size `,` $ptr attr-dict `:` qualified(type($ptr))"; } @@ -306,7 +311,8 @@ def LLVM_InvariantStartOp : LLVM_OneResultIntrOp<"invariant.start", [], [1], def LLVM_InvariantEndOp : LLVM_ZeroResultIntrOp<"invariant.end", [2], [DeclareOpInterfaceMethods], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0, - /*immArgPositions=*/[1], /*immArgAttrNames=*/["size"]> { + /*requiresOpBundles=*/0, /*immArgPositions=*/[1], + /*immArgAttrNames=*/["size"]> { let arguments = (ins LLVM_DefaultPointer:$start, I64Attr:$size, LLVM_AnyPointer:$ptr); @@ -368,7 +374,7 @@ class LLVM_ConstrainedIntr mlirOperands; SmallVector mlirAttrs; if (failed(moduleImport.convertIntrinsicArguments( - llvmOperands.take_front( }] # numArgs # [{), + llvmOperands.take_front( }] # numArgs # [{), {}, false, {}, {}, mlirOperands, mlirAttrs))) { return failure(); } @@ -429,7 +435,26 @@ def LLVM_USHLSat : LLVM_BinarySameArgsIntrOpI<"ushl.sat">; // def LLVM_AssumeOp - : LLVM_ZeroResultIntrOp<"assume", []>, Arguments<(ins I1:$cond)>; + : LLVM_ZeroResultIntrOp<"assume", /*overloadedOperands=*/[], /*traits=*/[], + /*requiresAccessGroup=*/0, + /*requiresAliasAnalysis=*/0, + /*requiresOpBundles=*/1> { + dag args = (ins I1:$cond); + let arguments = !con(args, opBundleArgs); + + let assemblyFormat = [{ + $cond + ( custom($op_bundle_operands, type($op_bundle_operands), + $op_bundle_tags)^ )? + `:` type($cond) attr-dict + }]; + + let builders = [ + OpBuilder<(ins "Value":$cond)> + ]; + + let hasVerifier = 1; +} def LLVM_SSACopyOp : LLVM_OneResultIntrOp<"ssa.copy", [], [0], [Pure, SameOperandsAndResultType]> { @@ -992,7 +1017,8 @@ def LLVM_DebugTrap : LLVM_ZeroResultIntrOp<"debugtrap">; def LLVM_UBSanTrap : LLVM_ZeroResultIntrOp<"ubsantrap", /*overloadedOperands=*/[], /*traits=*/[], /*requiresAccessGroup=*/0, /*requiresAliasAnalysis=*/0, - /*immArgPositions=*/[0], /*immArgAttrNames=*/["failureKind"]> { + /*requiresOpBundles=*/0, /*immArgPositions=*/[0], + /*immArgAttrNames=*/["failureKind"]> { let arguments = (ins I8Attr:$failureKind); } diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td index c3d352d8d0dd..a38dafa4d9cf 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOpBase.td @@ -291,7 +291,7 @@ class LLVM_IntrOpBase overloadedResults, list overloadedOperands, list traits, int numResults, bit requiresAccessGroup = 0, bit requiresAliasAnalysis = 0, - bit requiresFastmath = 0, + bit requiresFastmath = 0, bit requiresOpBundles = 0, list immArgPositions = [], list immArgAttrNames = []> : LLVM_OpBase:$noalias_scopes, OptionalAttr:$tbaa), (ins ))); + dag opBundleArgs = !if(!gt(requiresOpBundles, 0), + (ins VariadicOfVariadic:$op_bundle_operands, + DenseI32ArrayAttr:$op_bundle_sizes, + OptionalAttr:$op_bundle_tags), + (ins )); string llvmEnumName = enumName; string overloadedResultsCpp = "{" # !interleave(overloadedResults, ", ") # "}"; string overloadedOperandsCpp = "{" # !interleave(overloadedOperands, ", ") # "}"; @@ -336,6 +342,8 @@ class LLVM_IntrOpBase mlirAttrs; if (failed(moduleImport.convertIntrinsicArguments( llvmOperands, + llvmOpBundles, + }] # 
!if(!gt(requiresOpBundles, 0), "true", "false") # [{, }] # immArgPositionsCpp # [{, }] # immArgAttrNamesCpp # [{, mlirOperands, @@ -381,12 +389,14 @@ class LLVM_IntrOp overloadedResults, list overloadedOperands, list traits, int numResults, bit requiresAccessGroup = 0, bit requiresAliasAnalysis = 0, bit requiresFastmath = 0, + bit requiresOpBundles = 0, list immArgPositions = [], list immArgAttrNames = []> : LLVM_IntrOpBase; + requiresFastmath, requiresOpBundles, immArgPositions, + immArgAttrNames>; // Base class for LLVM intrinsic operations returning no results. Places the // intrinsic into the LLVM dialect and prefixes its name with "intr.". @@ -406,11 +416,13 @@ class LLVM_ZeroResultIntrOp overloadedOperands = [], list traits = [], bit requiresAccessGroup = 0, bit requiresAliasAnalysis = 0, + bit requiresOpBundles = 0, list immArgPositions = [], list immArgAttrNames = []> : LLVM_IntrOp; + /*requiresFastMath=*/0, requiresOpBundles, immArgPositions, + immArgAttrNames>; // Base class for LLVM intrinsic operations returning one result. Places the // intrinsic into the LLVM dialect and prefixes its name with "intr.". This is @@ -422,11 +434,12 @@ class LLVM_OneResultIntrOp overloadedResults = [], list overloadedOperands = [], list traits = [], bit requiresFastmath = 0, - list immArgPositions = [], - list immArgAttrNames = []> + list immArgPositions = [], + list immArgAttrNames = []> : LLVM_IntrOp; + requiresFastmath, /*requiresOpBundles=*/0, immArgPositions, + immArgAttrNames>; def LLVM_OneResultOpBuilder : OpBuilder<(ins "Type":$resultType, "ValueRange":$operands, diff --git a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td index bbca7bc7286a..d5def510a904 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/LLVMOps.td @@ -559,11 +559,7 @@ def LLVM_InvokeOp : LLVM_Op<"invoke", [ VariadicOfVariadic:$op_bundle_operands, DenseI32ArrayAttr:$op_bundle_sizes, - DefaultValuedProperty< - ArrayProperty, - "ArrayRef{}", - "SmallVector{}" - >:$op_bundle_tags); + OptionalAttr:$op_bundle_tags); let results = (outs Optional:$result); let successors = (successor AnySuccessor:$normalDest, AnySuccessor:$unwindDest); @@ -678,11 +674,7 @@ def LLVM_CallOp : LLVM_MemAccessOpBase<"call", VariadicOfVariadic:$op_bundle_operands, DenseI32ArrayAttr:$op_bundle_sizes, - DefaultValuedProperty< - ArrayProperty, - "ArrayRef{}", - "SmallVector{}" - >:$op_bundle_tags); + OptionalAttr:$op_bundle_tags); // Append the aliasing related attributes defined in LLVM_MemAccessOpBase. 
let arguments = !con(args, aliasAttrs); let results = (outs Optional:$result); @@ -1930,11 +1922,7 @@ def LLVM_CallIntrinsicOp VariadicOfVariadic:$op_bundle_operands, DenseI32ArrayAttr:$op_bundle_sizes, - DefaultValuedProperty< - ArrayProperty, - "ArrayRef{}", - "SmallVector{}" - >:$op_bundle_tags); + OptionalAttr:$op_bundle_tags); let results = (outs Optional:$results); let llvmBuilder = [{ return convertCallLLVMIntrinsicOp(op, builder, moduleTranslation); diff --git a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td index c40ae4b1016b..3695708439d9 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/ROCDLOps.td @@ -98,7 +98,7 @@ class ROCDL_IntrOp overloadedResults, LLVM_IntrOpBase; + requiresAliasAnalysis, 0, 0, immArgPositions, immArgAttrNames>; //===----------------------------------------------------------------------===// // ROCDL special register op definitions diff --git a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h index 9f300bcafea5..bbb7af58d273 100644 --- a/mlir/include/mlir/Target/LLVMIR/ModuleImport.h +++ b/mlir/include/mlir/Target/LLVMIR/ModuleImport.h @@ -243,6 +243,8 @@ public: /// corresponding MLIR attribute names. LogicalResult convertIntrinsicArguments(ArrayRef values, + ArrayRef opBundles, + bool requiresOpBundles, ArrayRef immArgPositions, ArrayRef immArgAttrNames, SmallVectorImpl &valuesOut, diff --git a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp index 12ed8cc88ae7..cc73878a64ff 100644 --- a/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp +++ b/mlir/lib/Dialect/LLVMIR/IR/LLVMDialect.cpp @@ -241,13 +241,18 @@ static void printOneOpBundle(OpAsmPrinter &p, OperandRange operands, static void printOpBundles(OpAsmPrinter &p, Operation *op, OperandRangeRange opBundleOperands, TypeRangeRange opBundleOperandTypes, - ArrayRef opBundleTags) { + std::optional opBundleTags) { + if (opBundleOperands.empty()) + return; + assert(opBundleTags && "expect operand bundle tags"); + p << "["; llvm::interleaveComma( - llvm::zip(opBundleOperands, opBundleOperandTypes, opBundleTags), p, + llvm::zip(opBundleOperands, opBundleOperandTypes, *opBundleTags), p, [&p](auto bundle) { + auto bundleTag = cast(std::get<2>(bundle)).getValue(); printOneOpBundle(p, std::get<0>(bundle), std::get<1>(bundle), - std::get<2>(bundle)); + bundleTag); }); p << "]"; } @@ -256,7 +261,7 @@ static ParseResult parseOneOpBundle( OpAsmParser &p, SmallVector> &opBundleOperands, SmallVector> &opBundleOperandTypes, - SmallVector &opBundleTags) { + SmallVector &opBundleTags) { SMLoc currentParserLoc = p.getCurrentLocation(); SmallVector operands; SmallVector types; @@ -276,7 +281,7 @@ static ParseResult parseOneOpBundle( opBundleOperands.push_back(std::move(operands)); opBundleOperandTypes.push_back(std::move(types)); - opBundleTags.push_back(std::move(tag)); + opBundleTags.push_back(StringAttr::get(p.getContext(), tag)); return success(); } @@ -285,16 +290,17 @@ static std::optional parseOpBundles( OpAsmParser &p, SmallVector> &opBundleOperands, SmallVector> &opBundleOperandTypes, - SmallVector &opBundleTags) { + ArrayAttr &opBundleTags) { if (p.parseOptionalLSquare()) return std::nullopt; if (succeeded(p.parseOptionalRSquare())) return success(); + SmallVector opBundleTagAttrs; auto bundleParser = [&] { return parseOneOpBundle(p, opBundleOperands, opBundleOperandTypes, - opBundleTags); + opBundleTagAttrs); }; if 
(p.parseCommaSeparatedList(bundleParser)) return failure(); @@ -302,6 +308,8 @@ static std::optional parseOpBundles( if (p.parseRSquare()) return failure(); + opBundleTags = ArrayAttr::get(p.getContext(), opBundleTagAttrs); + return success(); } @@ -1039,7 +1047,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, TypeRange results, /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt, + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } @@ -1066,7 +1074,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt, + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } @@ -1079,7 +1087,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr, /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt, + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } @@ -1092,7 +1100,7 @@ void CallOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func, /*fastmathFlags=*/nullptr, /*branch_weights=*/nullptr, /*CConv=*/nullptr, /*TailCallKind=*/nullptr, /*memory_effects=*/nullptr, /*convergent=*/nullptr, /*no_unwind=*/nullptr, /*will_return=*/nullptr, - /*op_bundle_operands=*/{}, /*op_bundle_tags=*/std::nullopt, + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}, /*access_groups=*/nullptr, /*alias_scopes=*/nullptr, /*noalias_scopes=*/nullptr, /*tbaa=*/nullptr); } @@ -1192,12 +1200,20 @@ LogicalResult verifyCallOpVarCalleeType(OpTy callOp) { template static LogicalResult verifyOperandBundles(OpType &op) { OperandRangeRange opBundleOperands = op.getOpBundleOperands(); - ArrayRef opBundleTags = op.getOpBundleTags(); + std::optional opBundleTags = op.getOpBundleTags(); - if (opBundleTags.size() != opBundleOperands.size()) + auto isStringAttr = [](Attribute tagAttr) { + return isa(tagAttr); + }; + if (opBundleTags && !llvm::all_of(*opBundleTags, isStringAttr)) + return op.emitError("operand bundle tag must be a StringAttr"); + + size_t numOpBundles = opBundleOperands.size(); + size_t numOpBundleTags = opBundleTags ? 
opBundleTags->size() : 0; + if (numOpBundles != numOpBundleTags) return op.emitError("expected ") - << opBundleOperands.size() - << " operand bundle tags, but actually got " << opBundleTags.size(); + << numOpBundles << " operand bundle tags, but actually got " + << numOpBundleTags; return success(); } @@ -1329,7 +1345,8 @@ void CallOp::print(OpAsmPrinter &p) { {getCalleeAttrName(), getTailCallKindAttrName(), getVarCalleeTypeAttrName(), getCConvAttrName(), getOperandSegmentSizesAttrName(), - getOpBundleSizesAttrName()}); + getOpBundleSizesAttrName(), + getOpBundleTagsAttrName()}); p << " : "; if (!isDirect) @@ -1437,7 +1454,7 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) { SmallVector operands; SmallVector> opBundleOperands; SmallVector> opBundleOperandTypes; - SmallVector opBundleTags; + ArrayAttr opBundleTags; // Default to C Calling Convention if no keyword is provided. result.addAttribute( @@ -1483,9 +1500,9 @@ ParseResult CallOp::parse(OpAsmParser &parser, OperationState &result) { parser, opBundleOperands, opBundleOperandTypes, opBundleTags); result && failed(*result)) return failure(); - if (!opBundleTags.empty()) - result.getOrAddProperties().op_bundle_tags = - std::move(opBundleTags); + if (opBundleTags && !opBundleTags.empty()) + result.addAttribute(CallOp::getOpBundleTagsAttrName(result.name).getValue(), + opBundleTags); if (parser.parseOptionalAttrDict(result.attributes)) return failure(); @@ -1525,8 +1542,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, LLVMFuncOp func, auto calleeType = func.getFunctionType(); build(builder, state, getCallOpResultTypes(calleeType), getCallOpVarCalleeType(calleeType), SymbolRefAttr::get(func), ops, - normalOps, unwindOps, nullptr, nullptr, {}, std::nullopt, normal, - unwind); + normalOps, unwindOps, nullptr, nullptr, {}, {}, normal, unwind); } void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys, @@ -1535,7 +1551,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, TypeRange tys, ValueRange unwindOps) { build(builder, state, tys, /*var_callee_type=*/nullptr, callee, ops, normalOps, unwindOps, nullptr, - nullptr, {}, std::nullopt, normal, unwind); + nullptr, {}, {}, normal, unwind); } void InvokeOp::build(OpBuilder &builder, OperationState &state, @@ -1544,7 +1560,7 @@ void InvokeOp::build(OpBuilder &builder, OperationState &state, Block *unwind, ValueRange unwindOps) { build(builder, state, getCallOpResultTypes(calleeType), getCallOpVarCalleeType(calleeType), callee, ops, normalOps, unwindOps, - nullptr, nullptr, {}, std::nullopt, normal, unwind); + nullptr, nullptr, {}, {}, normal, unwind); } SuccessorOperands InvokeOp::getSuccessorOperands(unsigned index) { @@ -1634,7 +1650,8 @@ void InvokeOp::print(OpAsmPrinter &p) { p.printOptionalAttrDict((*this)->getAttrs(), {getCalleeAttrName(), getOperandSegmentSizeAttr(), getCConvAttrName(), getVarCalleeTypeAttrName(), - getOpBundleSizesAttrName()}); + getOpBundleSizesAttrName(), + getOpBundleTagsAttrName()}); p << " : "; if (!isDirect) @@ -1657,7 +1674,7 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) { TypeAttr varCalleeType; SmallVector> opBundleOperands; SmallVector> opBundleOperandTypes; - SmallVector opBundleTags; + ArrayAttr opBundleTags; Block *normalDest, *unwindDest; SmallVector normalOperands, unwindOperands; Builder &builder = parser.getBuilder(); @@ -1703,9 +1720,10 @@ ParseResult InvokeOp::parse(OpAsmParser &parser, OperationState &result) { parser, opBundleOperands, 
opBundleOperandTypes, opBundleTags); result && failed(*result)) return failure(); - if (!opBundleTags.empty()) - result.getOrAddProperties().op_bundle_tags = - std::move(opBundleTags); + if (opBundleTags && !opBundleTags.empty()) + result.addAttribute( + InvokeOp::getOpBundleTagsAttrName(result.name).getValue(), + opBundleTags); if (parser.parseOptionalAttrDict(result.attributes)) return failure(); @@ -3333,7 +3351,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, mlir::StringAttr intrin, mlir::ValueRange args) { build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args, FastmathFlagsAttr{}, - /*op_bundle_operands=*/{}); + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}); } void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, @@ -3341,14 +3359,14 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, mlir::LLVM::FastmathFlagsAttr fastMathFlags) { build(builder, state, /*resultTypes=*/TypeRange{}, intrin, args, fastMathFlags, - /*op_bundle_operands=*/{}); + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}); } void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, mlir::Type resultType, mlir::StringAttr intrin, mlir::ValueRange args) { build(builder, state, {resultType}, intrin, args, FastmathFlagsAttr{}, - /*op_bundle_operands=*/{}); + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}); } void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, @@ -3356,7 +3374,7 @@ void CallIntrinsicOp::build(OpBuilder &builder, OperationState &state, mlir::StringAttr intrin, mlir::ValueRange args, mlir::LLVM::FastmathFlagsAttr fastMathFlags) { build(builder, state, resultTypes, intrin, args, fastMathFlags, - /*op_bundle_operands=*/{}); + /*op_bundle_operands=*/{}, /*op_bundle_tags=*/{}); } //===----------------------------------------------------------------------===// @@ -3413,6 +3431,18 @@ void InlineAsmOp::getEffects( } } +//===----------------------------------------------------------------------===// +// AssumeOp (intrinsic) +//===----------------------------------------------------------------------===// + +void LLVM::AssumeOp::build(OpBuilder &builder, OperationState &state, + mlir::Value cond) { + return build(builder, state, cond, /*op_bundle_operands=*/{}, + /*op_bundle_tags=*/{}); +} + +LogicalResult LLVM::AssumeOp::verify() { return verifyOperandBundles(*this); } + //===----------------------------------------------------------------------===// // masked_gather (intrinsic) //===----------------------------------------------------------------------===// diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp index d034e576dfc5..4fd043c7c93e 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMIRToLLVMTranslation.cpp @@ -68,6 +68,12 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder, if (isConvertibleIntrinsic(intrinsicID)) { SmallVector args(inst->args()); ArrayRef llvmOperands(args); + + SmallVector llvmOpBundles; + llvmOpBundles.reserve(inst->getNumOperandBundles()); + for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i) + llvmOpBundles.push_back(inst->getOperandBundleAt(i)); + #include "mlir/Dialect/LLVMIR/LLVMIntrinsicFromLLVMIRConversions.inc" } diff --git a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp index 
a8595d14ccf2..2084e527773c 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/LLVMIR/LLVMToLLVMIRTranslation.cpp @@ -114,17 +114,27 @@ convertOperandBundle(OperandRange bundleOperands, StringRef bundleTag, } static SmallVector -convertOperandBundles(OperandRangeRange bundleOperands, - ArrayRef bundleTags, +convertOperandBundles(OperandRangeRange bundleOperands, ArrayAttr bundleTags, LLVM::ModuleTranslation &moduleTranslation) { SmallVector bundles; bundles.reserve(bundleOperands.size()); - for (auto [operands, tag] : llvm::zip_equal(bundleOperands, bundleTags)) + for (auto [operands, tagAttr] : llvm::zip_equal(bundleOperands, bundleTags)) { + StringRef tag = cast(tagAttr).getValue(); bundles.push_back(convertOperandBundle(operands, tag, moduleTranslation)); + } return bundles; } +static SmallVector +convertOperandBundles(OperandRangeRange bundleOperands, + std::optional bundleTags, + LLVM::ModuleTranslation &moduleTranslation) { + if (!bundleTags) + return {}; + return convertOperandBundles(bundleOperands, *bundleTags, moduleTranslation); +} + /// Builder for LLVM_CallIntrinsicOp static LogicalResult convertCallLLVMIntrinsicOp(CallIntrinsicOp op, llvm::IRBuilderBase &builder, diff --git a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp index bc830a77f3c5..2c0b665ad0d8 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/NVVM/LLVMIRToNVVMTranslation.cpp @@ -50,6 +50,12 @@ static LogicalResult convertIntrinsicImpl(OpBuilder &odsBuilder, if (isConvertibleIntrinsic(intrinsicID)) { SmallVector args(inst->args()); ArrayRef llvmOperands(args); + + SmallVector llvmOpBundles; + llvmOpBundles.reserve(inst->getNumOperandBundles()); + for (unsigned i = 0; i < inst->getNumOperandBundles(); ++i) + llvmOpBundles.push_back(inst->getOperandBundleAt(i)); + #include "mlir/Dialect/LLVMIR/NVVMFromLLVMIRConversions.inc" } diff --git a/mlir/lib/Target/LLVMIR/ModuleImport.cpp b/mlir/lib/Target/LLVMIR/ModuleImport.cpp index bd861f3a69e5..6e97b2a50af8 100644 --- a/mlir/lib/Target/LLVMIR/ModuleImport.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleImport.cpp @@ -1311,7 +1311,8 @@ ModuleImport::convertValues(ArrayRef values) { } LogicalResult ModuleImport::convertIntrinsicArguments( - ArrayRef values, ArrayRef immArgPositions, + ArrayRef values, ArrayRef opBundles, + bool requiresOpBundles, ArrayRef immArgPositions, ArrayRef immArgAttrNames, SmallVectorImpl &valuesOut, SmallVectorImpl &attrsOut) { assert(immArgPositions.size() == immArgAttrNames.size() && @@ -1341,6 +1342,35 @@ LogicalResult ModuleImport::convertIntrinsicArguments( valuesOut.push_back(*mlirValue); } + SmallVector opBundleSizes; + SmallVector opBundleTagAttrs; + if (requiresOpBundles) { + opBundleSizes.reserve(opBundles.size()); + opBundleTagAttrs.reserve(opBundles.size()); + + for (const llvm::OperandBundleUse &bundle : opBundles) { + opBundleSizes.push_back(bundle.Inputs.size()); + opBundleTagAttrs.push_back(StringAttr::get(context, bundle.getTagName())); + + for (const llvm::Use &opBundleOperand : bundle.Inputs) { + auto operandMlirValue = convertValue(opBundleOperand.get()); + if (failed(operandMlirValue)) + return failure(); + valuesOut.push_back(*operandMlirValue); + } + } + + auto opBundleSizesAttr = DenseI32ArrayAttr::get(context, opBundleSizes); + auto opBundleSizesAttrNameAttr = + StringAttr::get(context, LLVMDialect::getOpBundleSizesAttrName()); 
+ attrsOut.push_back({opBundleSizesAttrNameAttr, opBundleSizesAttr}); + + auto opBundleTagsAttr = ArrayAttr::get(context, opBundleTagAttrs); + auto opBundleTagsAttrNameAttr = + StringAttr::get(context, LLVMDialect::getOpBundleTagsAttrName()); + attrsOut.push_back({opBundleTagsAttrNameAttr, opBundleTagsAttr}); + } + return success(); } diff --git a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp index 6e005f9ec5df..ceb8ba3b3381 100644 --- a/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/ModuleTranslation.cpp @@ -55,6 +55,7 @@ #include "llvm/Transforms/Utils/BasicBlockUtils.h" #include "llvm/Transforms/Utils/Cloning.h" #include "llvm/Transforms/Utils/ModuleUtils.h" +#include #include #define DEBUG_TYPE "llvm-dialect-to-llvm-ir" @@ -854,8 +855,40 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall( "LLVM `immArgPositions` and MLIR `immArgAttrNames` should have equal " "length"); + SmallVector opBundles; + size_t numOpBundleOperands = 0; + auto opBundleSizesAttr = cast_if_present( + intrOp->getAttr(LLVMDialect::getOpBundleSizesAttrName())); + auto opBundleTagsAttr = cast_if_present( + intrOp->getAttr(LLVMDialect::getOpBundleTagsAttrName())); + + if (opBundleSizesAttr && opBundleTagsAttr) { + ArrayRef opBundleSizes = opBundleSizesAttr.asArrayRef(); + assert(opBundleSizes.size() == opBundleTagsAttr.size() && + "operand bundles and tags do not match"); + + numOpBundleOperands = + std::accumulate(opBundleSizes.begin(), opBundleSizes.end(), size_t(0)); + assert(numOpBundleOperands <= intrOp->getNumOperands() && + "operand bundle operands is more than the number of operands"); + + ValueRange operands = intrOp->getOperands().take_back(numOpBundleOperands); + size_t nextOperandIdx = 0; + opBundles.reserve(opBundleSizesAttr.size()); + + for (auto [opBundleTagAttr, bundleSize] : + llvm::zip(opBundleTagsAttr, opBundleSizes)) { + auto bundleTag = cast(opBundleTagAttr).str(); + auto bundleOperands = moduleTranslation.lookupValues( + operands.slice(nextOperandIdx, bundleSize)); + opBundles.emplace_back(std::move(bundleTag), std::move(bundleOperands)); + nextOperandIdx += bundleSize; + } + } + // Map operands and attributes to LLVM values. 
- auto operands = moduleTranslation.lookupValues(intrOp->getOperands()); + auto opOperands = intrOp->getOperands().drop_back(numOpBundleOperands); + auto operands = moduleTranslation.lookupValues(opOperands); SmallVector args(immArgPositions.size() + operands.size()); for (auto [immArgPos, immArgName] : llvm::zip(immArgPositions, immArgAttrNames)) { @@ -890,7 +923,7 @@ llvm::CallInst *mlir::LLVM::detail::createIntrinsicCall( llvm::Function *llvmIntr = llvm::Intrinsic::getOrInsertDeclaration( module, intrinsic, overloadedTypes); - return builder.CreateCall(llvmIntr, args); + return builder.CreateCall(llvmIntr, args, opBundles); } /// Given a single MLIR operation, create the corresponding LLVM IR operation diff --git a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir index b86103422b07..55b1bc9c545a 100644 --- a/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/expand-then-convert-to-llvm.mlir @@ -684,7 +684,7 @@ func.func @collapse_static_shape_with_non_identity_layout(%arg: memref<1x1x8x8xf // CHECK: %[[INT_TO_PTR:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64 // CHECK: %[[AND:.*]] = llvm.and %[[INT_TO_PTR]], {{.*}} : i64 // CHECK: %[[CMP:.*]] = llvm.icmp "eq" %[[AND]], {{.*}} : i64 -// CHECK: "llvm.intr.assume"(%[[CMP]]) : (i1) -> () +// CHECK: llvm.intr.assume %[[CMP]] : i1 // CHECK: %[[LD_ADDR:.*]] = llvm.getelementptr %[[BUFF_ADDR]][%{{.*}}] : (!llvm.ptr, i64) -> !llvm.ptr, f32 // CHECK: %[[VAL:.*]] = llvm.load %[[LD_ADDR]] : !llvm.ptr -> f32 // CHECK: return %[[VAL]] : f32 diff --git a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir index 9dc22abf143b..48dc9079333d 100644 --- a/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir +++ b/mlir/test/Conversion/MemRefToLLVM/memref-to-llvm.mlir @@ -160,7 +160,7 @@ func.func @assume_alignment(%0 : memref<4x4xf16>) { // CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[PTR]] : !llvm.ptr to i64 // CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64 // CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64 - // CHECK-NEXT: "llvm.intr.assume"(%[[CONDITION]]) : (i1) -> () + // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1 memref.assume_alignment %0, 16 : memref<4x4xf16> return } @@ -177,7 +177,7 @@ func.func @assume_alignment_w_offset(%0 : memref<4x4xf16, strided<[?, ?], offset // CHECK-NEXT: %[[INT:.*]] = llvm.ptrtoint %[[BUFF_ADDR]] : !llvm.ptr to i64 // CHECK-NEXT: %[[MASKED_PTR:.*]] = llvm.and %[[INT]], %[[MASK:.*]] : i64 // CHECK-NEXT: %[[CONDITION:.*]] = llvm.icmp "eq" %[[MASKED_PTR]], %[[ZERO]] : i64 - // CHECK-NEXT: "llvm.intr.assume"(%[[CONDITION]]) : (i1) -> () + // CHECK-NEXT: llvm.intr.assume %[[CONDITION]] : i1 memref.assume_alignment %0, 16 : memref<4x4xf16, strided<[?, ?], offset: ?>> return } diff --git a/mlir/test/Dialect/LLVMIR/inlining.mlir b/mlir/test/Dialect/LLVMIR/inlining.mlir index f9551e311df5..0b7ca3f2bb04 100644 --- a/mlir/test/Dialect/LLVMIR/inlining.mlir +++ b/mlir/test/Dialect/LLVMIR/inlining.mlir @@ -18,7 +18,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 { "llvm.intr.memset"(%ptr, %byte, %0) <{isVolatile = true}> : (!llvm.ptr, i8, i32) -> () "llvm.intr.memmove"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> () "llvm.intr.memcpy"(%ptr, %ptr, %0) <{isVolatile = true}> : (!llvm.ptr, !llvm.ptr, i32) -> () - 
"llvm.intr.assume"(%true) : (i1) -> () + llvm.intr.assume %true : i1 llvm.fence release %2 = llvm.atomicrmw add %ptr, %0 monotonic : !llvm.ptr, i32 %3 = llvm.cmpxchg %ptr, %0, %1 acq_rel monotonic : !llvm.ptr, i32 @@ -44,7 +44,7 @@ func.func @inner_func_inlinable(%ptr : !llvm.ptr) -> i32 { // CHECK: "llvm.intr.memset"(%[[PTR]] // CHECK: "llvm.intr.memmove"(%[[PTR]], %[[PTR]] // CHECK: "llvm.intr.memcpy"(%[[PTR]], %[[PTR]] -// CHECK: "llvm.intr.assume" +// CHECK: llvm.intr.assume // CHECK: llvm.fence release // CHECK: llvm.atomicrmw add %[[PTR]], %[[CST]] monotonic // CHECK: llvm.cmpxchg %[[PTR]], %[[CST]], %[[RES]] acq_rel monotonic diff --git a/mlir/test/Dialect/LLVMIR/roundtrip.mlir b/mlir/test/Dialect/LLVMIR/roundtrip.mlir index 3062cdc38c0a..b8ce7db795a1 100644 --- a/mlir/test/Dialect/LLVMIR/roundtrip.mlir +++ b/mlir/test/Dialect/LLVMIR/roundtrip.mlir @@ -836,3 +836,30 @@ llvm.func @test_call_intrin_with_opbundle(%arg0 : !llvm.ptr) { llvm.call_intrinsic "llvm.assume"(%0) ["align"(%arg0, %1 : !llvm.ptr, i32)] : (i1) -> () llvm.return } + +// CHECK-LABEL: @test_assume_intr_no_opbundle +llvm.func @test_assume_intr_no_opbundle(%arg0 : !llvm.ptr) { + %0 = llvm.mlir.constant(1 : i1) : i1 + // CHECK: llvm.intr.assume %0 : i1 + llvm.intr.assume %0 : i1 + llvm.return +} + +// CHECK-LABEL: @test_assume_intr_empty_opbundle +llvm.func @test_assume_intr_empty_opbundle(%arg0 : !llvm.ptr) { + %0 = llvm.mlir.constant(1 : i1) : i1 + // CHECK: llvm.intr.assume %0 : i1 + llvm.intr.assume %0 [] : i1 + llvm.return +} + +// CHECK-LABEL: @test_assume_intr_with_opbundles +llvm.func @test_assume_intr_with_opbundles(%arg0 : !llvm.ptr) { + %0 = llvm.mlir.constant(1 : i1) : i1 + %1 = llvm.mlir.constant(2 : i32) : i32 + %2 = llvm.mlir.constant(3 : i32) : i32 + %3 = llvm.mlir.constant(4 : i32) : i32 + // CHECK: llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1 + llvm.intr.assume %0 ["tag1"(%1, %2 : i32, i32), "tag2"(%3 : i32)] : i1 + llvm.return +} diff --git a/mlir/test/Target/LLVMIR/Import/intrinsic.ll b/mlir/test/Target/LLVMIR/Import/intrinsic.ll index 28a1bd21c82a..606b11175f57 100644 --- a/mlir/test/Target/LLVMIR/Import/intrinsic.ll +++ b/mlir/test/Target/LLVMIR/Import/intrinsic.ll @@ -630,11 +630,21 @@ define void @va_intrinsics_test(ptr %0, ptr %1, ...) 
{ ; CHECK-LABEL: @assume ; CHECK-SAME: %[[TRUE:[a-zA-Z0-9]+]] define void @assume(i1 %true) { - ; CHECK: "llvm.intr.assume"(%[[TRUE]]) : (i1) -> () + ; CHECK: llvm.intr.assume %[[TRUE]] : i1 call void @llvm.assume(i1 %true) ret void } +; CHECK-LABEL: @assume_with_opbundles +; CHECK-SAME: %[[TRUE:[a-zA-Z0-9]+]] +; CHECK-SAME: %[[PTR:[a-zA-Z0-9]+]] +define void @assume_with_opbundles(i1 %true, ptr %p) { + ; CHECK: %[[ALIGN:.+]] = llvm.mlir.constant(8 : i32) : i32 + ; CHECK: llvm.intr.assume %[[TRUE]] ["align"(%[[PTR]], %[[ALIGN]] : !llvm.ptr, i32)] : i1 + call void @llvm.assume(i1 %true) ["align"(ptr %p, i32 8)] + ret void +} + ; CHECK-LABEL: @is_constant ; CHECK-SAME: %[[VAL:[a-zA-Z0-9]+]] define void @is_constant(i32 %0) { diff --git a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir index 0634a7ba907f..cb712eb4e126 100644 --- a/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-intrinsics.mlir @@ -363,6 +363,21 @@ llvm.func @umin_test(%arg0: i32, %arg1: i32, %arg2: vector<8xi32>, %arg3: vector llvm.return } +// CHECK-LABEL: @assume_without_opbundles +llvm.func @assume_without_opbundles(%cond: i1) { + // CHECK: call void @llvm.assume(i1 %{{.+}}) + llvm.intr.assume %cond : i1 + llvm.return +} + +// CHECK-LABEL: @assume_with_opbundles +llvm.func @assume_with_opbundles(%cond: i1, %p: !llvm.ptr) { + %0 = llvm.mlir.constant(8 : i32) : i32 + // CHECK: call void @llvm.assume(i1 %{{.+}}) [ "align"(ptr %{{.+}}, i32 8) ] + llvm.intr.assume %cond ["align"(%p, %0 : !llvm.ptr, i32)] : i1 + llvm.return +} + // CHECK-LABEL: @vector_reductions llvm.func @vector_reductions(%arg0: f32, %arg1: vector<8xf32>, %arg2: vector<8xi32>) { // CHECK: call i32 @llvm.vector.reduce.add.v8i32 diff --git a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir index af0981440a17..15658ea60681 100644 --- a/mlir/test/Target/LLVMIR/llvmir-invalid.mlir +++ b/mlir/test/Target/LLVMIR/llvmir-invalid.mlir @@ -188,7 +188,7 @@ llvm.func @sadd_overflow_intr_wrong_type(%arg0 : i32, %arg1 : f32) -> !llvm.stru llvm.func @assume_intr_wrong_type(%cond : i16) { // expected-error @below{{op operand #0 must be 1-bit signless integer, but got 'i16'}} - "llvm.intr.assume"(%cond) : (i16) -> () + llvm.intr.assume %cond : i16 llvm.return } -- GitLab From df0551298868b164197a4e54e9444120dc96ff53 Mon Sep 17 00:00:00 2001 From: VladiKrapp-Arm Date: Wed, 16 Oct 2024 13:50:19 +0100 Subject: [PATCH 121/329] [ARM] Add test for thumb2-reduce-size NFC (#112333) Check that t2MUL is reduced to tMUL --- llvm/test/CodeGen/Thumb2/avoidmuls.mir | 67 ++++++++++++++++++++++++++ 1 file changed, 67 insertions(+) create mode 100644 llvm/test/CodeGen/Thumb2/avoidmuls.mir diff --git a/llvm/test/CodeGen/Thumb2/avoidmuls.mir b/llvm/test/CodeGen/Thumb2/avoidmuls.mir new file mode 100644 index 000000000000..8d5567482d5c --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/avoidmuls.mir @@ -0,0 +1,67 @@ +# RUN: llc -run-pass=thumb2-reduce-size %s -o - | FileCheck %s + +--- | + target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" + target triple = "thumbv8m.main-arm-none-eabi" + + ; Function Attrs: norecurse nounwind readnone + define i32 @test(i32 %x, i32 %y) local_unnamed_addr #0 { + entry: + %cmp6 = icmp sgt i32 %y, 0 + br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup + + for.body.preheader: ; preds = %entry + br label %for.body + + for.cond.cleanup: ; preds = %for.body, %entry + %sum.0.lcssa = phi i32 [ 1, %entry ], [ %mul, 
%for.body ] + ret i32 %sum.0.lcssa + + for.body: ; preds = %for.body, %for.body.preheader + %lsr.iv1 = phi i32 [ %lsr.iv.next2, %for.body ], [ %x, %for.body.preheader ] + %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %y, %for.body.preheader ] + %sum.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ] + %mul = mul nsw i32 %lsr.iv1, %sum.07 + %lsr.iv.next = add i32 %lsr.iv, -1 + %lsr.iv.next2 = add i32 %lsr.iv1, 1 + %exitcond = icmp eq i32 %lsr.iv.next, 0 + br i1 %exitcond, label %for.cond.cleanup, label %for.body + } + + attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m33" "target-features"="-d32,+dsp,+fp-armv8,-fp64,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" } + +... +--- +name: test +tracksRegLiveness: true +liveins: + - { reg: '$r0', virtual-reg: '' } + - { reg: '$r1', virtual-reg: '' } +body: | + bb.0.entry: + successors: %bb.1.for.body, %bb.2.for.cond.cleanup + liveins: $r0, $r1 + + $r2 = tMOVr $r0, 14, _ + $r0 = t2MOVi 1, 14, _, _ + t2CMPri $r1, 1, 14, _, implicit-def $cpsr + t2Bcc %bb.2.for.cond.cleanup, 11, killed $cpsr + + bb.1.for.body: + successors: %bb.2.for.cond.cleanup, %bb.1.for.body + liveins: $r0, $r1, $r2 + + $r0 = t2MUL $r2, killed $r0, 14, _ + $r2 = t2ADDri killed $r2, 1, 14, _, _ + $r1 = t2SUBri killed $r1, 1, 14, _, def $cpsr + t2Bcc %bb.1.for.body, 1, killed $cpsr + + bb.2.for.cond.cleanup: + liveins: $r0 + + tBX_RET 14, _, implicit $r0 + +... 
+# CHECK-LABEL: test +# CHECK: tMUL +# CHECK-NOT: t2MUL -- GitLab From bac436bd56ebce290279aa15a40e7c99db3c3591 Mon Sep 17 00:00:00 2001 From: Nico Weber Date: Wed, 16 Oct 2024 09:23:35 -0400 Subject: [PATCH 122/329] [gn] port 5f2cf99e146c (lldb darwin loader tblgen) --- .../Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn b/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn index 03e82576d673..f9249c208d99 100644 --- a/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn +++ b/llvm/utils/gn/secondary/lldb/source/Plugins/DynamicLoader/MacOSX-DYLD/BUILD.gn @@ -1,3 +1,14 @@ +import("//lldb/utils/TableGen/lldb_tablegen.gni") + +lldb_tablegen("DynamicLoaderDarwinProperties") { + args = [ "-gen-lldb-property-defs" ] +} + +lldb_tablegen("DynamicLoaderDarwinPropertiesEnum") { + args = [ "-gen-lldb-property-enum-defs" ] + td_file = "DynamicLoaderDarwinProperties.td" +} + static_library("MacOSX-DYLD") { output_name = "lldbPluginDynamicLoaderMacOSXDYLD" configs += [ @@ -5,6 +16,8 @@ static_library("MacOSX-DYLD") { "//llvm/utils/gn/build:lldb_code", ] deps = [ + ":DynamicLoaderDarwinProperties", + ":DynamicLoaderDarwinPropertiesEnum", "//lldb/source/Breakpoint", "//lldb/source/Core", "//lldb/source/Expression", @@ -21,6 +34,7 @@ static_library("MacOSX-DYLD") { include_dirs = [ "//lldb/source" ] sources = [ "DynamicLoaderDarwin.cpp", + "DynamicLoaderDarwinProperties.cpp", "DynamicLoaderMacOS.cpp", "DynamicLoaderMacOSXDYLD.cpp", ] -- GitLab From fa5d3f6eeb7ca1c58e387ca5513d0255e4874e96 Mon Sep 17 00:00:00 2001 From: LLVM GN Syncbot Date: Wed, 16 Oct 2024 13:24:03 +0000 Subject: [PATCH 123/329] [gn build] Port 2e8ad49e7cff --- llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn | 2 ++ 1 file changed, 2 insertions(+) diff --git a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn index 5146c9a0f61d..85dfd7738c17 100644 --- a/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn +++ b/llvm/utils/gn/secondary/llvm/lib/Transforms/Vectorize/BUILD.gn @@ -16,7 +16,9 @@ static_library("Vectorize") { "SandboxVectorizer/DependencyGraph.cpp", "SandboxVectorizer/Interval.cpp", "SandboxVectorizer/Passes/BottomUpVec.cpp", + "SandboxVectorizer/Passes/RegionsFromMetadata.cpp", "SandboxVectorizer/SandboxVectorizer.cpp", + "SandboxVectorizer/SandboxVectorizerPassBuilder.cpp", "SandboxVectorizer/SeedCollector.cpp", "VPlan.cpp", "VPlanAnalysis.cpp", -- GitLab From 8ce0fb86d3acc066a16637ea5c5691da984707a7 Mon Sep 17 00:00:00 2001 From: "A. Jiang" Date: Wed, 16 Oct 2024 21:32:02 +0800 Subject: [PATCH 124/329] [libc++][NFC] Reduce use of `__add_lvalue_reference_t` (#112497) Currently, the occurrences of `__add_lvalue_reference_t` in `__split_buffer` and `__assoc_state` are probably meaningless. * In `__split_buffer`, the `__alloc_ref` and `__alloc_const_ref` member typedefs are no longer used. * In `__assoc_state`, we should simply use `_Rp&`, which must be well-formed since it's already required that `sizeof(_Rp)` is well-formed. This PR removes the meaningless usages. The remaining occurrences in `shared_ptr`, `unique_ptr`, and several type traits are meaningful. 
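As a quick sketch of the reasoning (illustrative only, not an excerpt
from the diff): wherever `sizeof(_Rp)` is required to be well-formed,
`_Rp` is a complete object type, so `_Rp&` is always a valid type and
`__add_lvalue_reference_t` adds nothing:

  // Hypothetical reduction mirroring the <future> change:
  template <class _Rp>
  class __assoc_state {
  public:
    _Rp move();
    _Rp& copy(); // previously __add_lvalue_reference_t<_Rp>; equivalent
                 // here because _Rp must be a complete object type
  };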
--- libcxx/include/__split_buffer | 3 --- libcxx/include/future | 4 ++-- 2 files changed, 2 insertions(+), 5 deletions(-) diff --git a/libcxx/include/__split_buffer b/libcxx/include/__split_buffer index dfe552fbb458..c4817601039f 100644 --- a/libcxx/include/__split_buffer +++ b/libcxx/include/__split_buffer @@ -80,9 +80,6 @@ public: pointer __end_; _LIBCPP_COMPRESSED_PAIR(pointer, __end_cap_, allocator_type, __alloc_); - using __alloc_ref = __add_lvalue_reference_t; - using __alloc_const_ref = __add_lvalue_reference_t; - __split_buffer(const __split_buffer&) = delete; __split_buffer& operator=(const __split_buffer&) = delete; diff --git a/libcxx/include/future b/libcxx/include/future index dfa373d6593c..f16f4234c489 100644 --- a/libcxx/include/future +++ b/libcxx/include/future @@ -594,7 +594,7 @@ public: _LIBCPP_HIDE_FROM_ABI void set_value_at_thread_exit(_Arg&& __arg); _LIBCPP_HIDE_FROM_ABI _Rp move(); - _LIBCPP_HIDE_FROM_ABI __add_lvalue_reference_t<_Rp> copy(); + _LIBCPP_HIDE_FROM_ABI _Rp& copy(); }; template @@ -636,7 +636,7 @@ _Rp __assoc_state<_Rp>::move() { } template -__add_lvalue_reference_t<_Rp> __assoc_state<_Rp>::copy() { +_Rp& __assoc_state<_Rp>::copy() { unique_lock __lk(this->__mut_); this->__sub_wait(__lk); if (this->__exception_ != nullptr) -- GitLab From e839d2a60ac3149f09b3cda0816cf5074075733c Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 16 Oct 2024 14:27:38 +0100 Subject: [PATCH 125/329] [X86] andnot-patterns.ll - tweak #112425 test patterns to use separate source values for ANDNOT operands --- llvm/test/CodeGen/X86/andnot-patterns.ll | 488 +++++++++++------------ 1 file changed, 231 insertions(+), 257 deletions(-) diff --git a/llvm/test/CodeGen/X86/andnot-patterns.ll b/llvm/test/CodeGen/X86/andnot-patterns.ll index 0ff4e3b47ae4..46ebe6ba7656 100644 --- a/llvm/test/CodeGen/X86/andnot-patterns.ll +++ b/llvm/test/CodeGen/X86/andnot-patterns.ll @@ -10,18 +10,14 @@ declare void @use_i64(i64) ; Fold (and X, (rotl (not Y), Z))) -> (and X, (not (rotl Y, Z))) ; -define i64 @andnot_rotl_i64(i64 %a0, i64 %a1) nounwind { +define i64 @andnot_rotl_i64(i64 %a0, i64 %a1, i64 %a2) nounwind { ; X86-LABEL: andnot_rotl_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: movl %edi, %edx +; X86-NEXT: notl %esi ; X86-NEXT: notl %edx ; X86-NEXT: testb $32, %cl ; X86-NEXT: jne .LBB0_1 @@ -29,116 +25,112 @@ define i64 @andnot_rotl_i64(i64 %a0, i64 %a1) nounwind { ; X86-NEXT: movl %edx, %eax ; X86-NEXT: jmp .LBB0_3 ; X86-NEXT: .LBB0_1: -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edx, %esi ; X86-NEXT: .LBB0_3: -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %esi, %edx ; X86-NEXT: shldl %cl, %eax, %edx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: andl %edi, %eax -; X86-NEXT: andl %esi, %edx +; X86-NEXT: shldl %cl, %esi, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64-LABEL: andnot_rotl_i64: ; X64: # %bb.0: -; X64-NEXT: movq %rsi, %rcx -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: notq %rax ; X64-NEXT: # kill: def $cl killed $cl 
killed $rcx ; X64-NEXT: rolq %cl, %rax ; X64-NEXT: andq %rdi, %rax ; X64-NEXT: retq - %not = xor i64 %a0, -1 - %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a1) + %not = xor i64 %a1, -1 + %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a2) %and = and i64 %rot, %a0 ret i64 %and } -define i32 @andnot_rotl_i32(i32 %a0, i32 %a1) nounwind { +define i32 @andnot_rotl_i32(i32 %a0, i32 %a1, i32 %a2) nounwind { ; X86-LABEL: andnot_rotl_i32: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: notl %eax ; X86-NEXT: roll %cl, %eax -; X86-NEXT: andl %edx, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: andnot_rotl_i32: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: movl %esi, %eax ; X64-NEXT: notl %eax ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: roll %cl, %eax ; X64-NEXT: andl %edi, %eax ; X64-NEXT: retq - %not = xor i32 %a0, -1 - %rot = tail call i32 @llvm.fshl.i32(i32 %not, i32 %not, i32 %a1) + %not = xor i32 %a1, -1 + %rot = tail call i32 @llvm.fshl.i32(i32 %not, i32 %not, i32 %a2) %and = and i32 %rot, %a0 ret i32 %and } -define i16 @andnot_rotl_i16(i16 %a0, i16 %a1) nounwind { +define i16 @andnot_rotl_i16(i16 %a0, i16 %a1, i16 %a2) nounwind { ; X86-LABEL: andnot_rotl_i16: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: notl %eax ; X86-NEXT: rolw %cl, %ax -; X86-NEXT: andl %edx, %eax +; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: andnot_rotl_i16: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: movl %esi, %eax ; X64-NEXT: notl %eax ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: rolw %cl, %ax ; X64-NEXT: andl %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq - %not = xor i16 %a0, -1 - %rot = tail call i16 @llvm.fshl.i16(i16 %not, i16 %not, i16 %a1) + %not = xor i16 %a1, -1 + %rot = tail call i16 @llvm.fshl.i16(i16 %not, i16 %not, i16 %a2) %and = and i16 %rot, %a0 ret i16 %and } -define i8 @andnot_rotl_i8(i8 %a0, i8 %a1) nounwind { +define i8 @andnot_rotl_i8(i8 %a0, i8 %a1, i8 %a2) nounwind { ; X86-LABEL: andnot_rotl_i8: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: notb %al ; X86-NEXT: rolb %cl, %al -; X86-NEXT: andb %dl, %al +; X86-NEXT: andb {{[0-9]+}}(%esp), %al ; X86-NEXT: retl ; ; X64-LABEL: andnot_rotl_i8: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: movl %esi, %eax ; X64-NEXT: notb %al ; X64-NEXT: # kill: def $cl killed $cl killed $ecx ; X64-NEXT: rolb %cl, %al ; X64-NEXT: andb %dil, %al +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq - %not = xor i8 %a0, -1 - %rot = tail call i8 @llvm.fshl.i8(i8 %not, i8 %not, i8 %a1) + %not = xor i8 %a1, -1 + %rot = tail call i8 @llvm.fshl.i8(i8 %not, i8 %not, i8 %a2) %and = and i8 %rot, %a0 ret i8 %and } -define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1) nounwind { +define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1, i64 %a2) nounwind { ; 
X86-LABEL: andnot_rotl_i64_multiuse: ; X86: # %bb.0: ; X86-NEXT: pushl %ebx @@ -146,28 +138,28 @@ define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1) nounwind { ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi -; X86-NEXT: movl %edi, %eax -; X86-NEXT: notl %eax -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: notl %ebx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: notl %edx +; X86-NEXT: notl %esi ; X86-NEXT: testb $32, %cl ; X86-NEXT: jne .LBB4_1 ; X86-NEXT: # %bb.2: -; X86-NEXT: movl %ebx, %edx +; X86-NEXT: movl %esi, %eax ; X86-NEXT: jmp .LBB4_3 ; X86-NEXT: .LBB4_1: -; X86-NEXT: movl %eax, %edx -; X86-NEXT: movl %ebx, %eax +; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl %esi, %edx ; X86-NEXT: .LBB4_3: -; X86-NEXT: movl %eax, %ebx -; X86-NEXT: shldl %cl, %edx, %ebx +; X86-NEXT: movl %edx, %ebx +; X86-NEXT: shldl %cl, %eax, %ebx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %eax, %edx -; X86-NEXT: andl %edx, %esi +; X86-NEXT: shldl %cl, %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: andl %eax, %esi +; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: andl %ebx, %edi ; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edx +; X86-NEXT: pushl %eax ; X86-NEXT: calll use_i64@PLT ; X86-NEXT: addl $8, %esp ; X86-NEXT: movl %esi, %eax @@ -180,18 +172,19 @@ define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1) nounwind { ; X64-LABEL: andnot_rotl_i64_multiuse: ; X64: # %bb.0: ; X64-NEXT: pushq %rbx -; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: movq %rdx, %rcx ; X64-NEXT: movq %rdi, %rbx -; X64-NEXT: notq %rdi +; X64-NEXT: notq %rsi ; X64-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-NEXT: rolq %cl, %rdi -; X64-NEXT: andq %rdi, %rbx +; X64-NEXT: rolq %cl, %rsi +; X64-NEXT: andq %rsi, %rbx +; X64-NEXT: movq %rsi, %rdi ; X64-NEXT: callq use_i64@PLT ; X64-NEXT: movq %rbx, %rax ; X64-NEXT: popq %rbx ; X64-NEXT: retq - %not = xor i64 %a0, -1 - %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a1) + %not = xor i64 %a1, -1 + %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a2) %and = and i64 %rot, %a0 call void @use_i64(i64 %rot) ret i64 %and @@ -201,130 +194,122 @@ define i64 @andnot_rotl_i64_multiuse(i64 %a0, i64 %a1) nounwind { ; Fold (and X, (rotr (not Y), Z))) -> (and X, (not (rotr Y, Z))) ; -define i64 @andnot_rotr_i64(i64 %a0, i64 %a1) nounwind { +define i64 @andnot_rotr_i64(i64 %a0, i64 %a1, i64 %a2) nounwind { ; X86-LABEL: andnot_rotr_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %ebx -; X86-NEXT: pushl %edi ; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edi ; X86-NEXT: movl {{[0-9]+}}(%esp), %esi -; X86-NEXT: movl %esi, %ebx -; X86-NEXT: notl %ebx -; X86-NEXT: movl %edi, %edx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx +; X86-NEXT: notl %esi ; X86-NEXT: notl %edx ; X86-NEXT: testb $32, %cl -; X86-NEXT: jne .LBB5_1 +; X86-NEXT: je .LBB5_1 ; X86-NEXT: # %bb.2: ; X86-NEXT: movl %edx, %eax ; X86-NEXT: jmp .LBB5_3 ; X86-NEXT: .LBB5_1: -; X86-NEXT: movl %ebx, %eax -; X86-NEXT: movl %edx, %ebx +; X86-NEXT: movl %esi, %eax +; X86-NEXT: movl %edx, %esi ; X86-NEXT: .LBB5_3: -; X86-NEXT: movl %ebx, %edx -; X86-NEXT: shldl %cl, %eax, %edx +; X86-NEXT: movl %esi, %edx +; X86-NEXT: shrdl %cl, %eax, %edx ; X86-NEXT: # kill: def $cl killed $cl killed $ecx -; X86-NEXT: shldl %cl, %ebx, %eax -; X86-NEXT: andl %edi, %eax -; X86-NEXT: andl %esi, %edx +; X86-NEXT: shrdl %cl, %esi, %eax +; X86-NEXT: andl 
{{[0-9]+}}(%esp), %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-NEXT: popl %esi -; X86-NEXT: popl %edi -; X86-NEXT: popl %ebx ; X86-NEXT: retl ; ; X64-LABEL: andnot_rotr_i64: ; X64: # %bb.0: -; X64-NEXT: movq %rsi, %rcx -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rdx, %rcx +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: notq %rax ; X64-NEXT: # kill: def $cl killed $cl killed $rcx -; X64-NEXT: rolq %cl, %rax +; X64-NEXT: rorq %cl, %rax ; X64-NEXT: andq %rdi, %rax ; X64-NEXT: retq - %not = xor i64 %a0, -1 - %rot = tail call i64 @llvm.fshl.i64(i64 %not, i64 %not, i64 %a1) + %not = xor i64 %a1, -1 + %rot = tail call i64 @llvm.fshr.i64(i64 %not, i64 %not, i64 %a2) %and = and i64 %rot, %a0 ret i64 %and } -define i32 @andnot_rotr_i32(i32 %a0, i32 %a1) nounwind { +define i32 @andnot_rotr_i32(i32 %a0, i32 %a1, i32 %a2) nounwind { ; X86-LABEL: andnot_rotr_i32: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: notl %eax -; X86-NEXT: roll %cl, %eax -; X86-NEXT: andl %edx, %eax +; X86-NEXT: rorl %cl, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: andnot_rotr_i32: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: movl %esi, %eax ; X64-NEXT: notl %eax ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: roll %cl, %eax +; X64-NEXT: rorl %cl, %eax ; X64-NEXT: andl %edi, %eax ; X64-NEXT: retq - %not = xor i32 %a0, -1 - %rot = tail call i32 @llvm.fshl.i32(i32 %not, i32 %not, i32 %a1) + %not = xor i32 %a1, -1 + %rot = tail call i32 @llvm.fshr.i32(i32 %not, i32 %not, i32 %a2) %and = and i32 %rot, %a0 ret i32 %and } -define i16 @andnot_rotr_i16(i16 %a0, i16 %a1) nounwind { +define i16 @andnot_rotr_i16(i16 %a0, i16 %a1, i16 %a2) nounwind { ; X86-LABEL: andnot_rotr_i16: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: notl %eax -; X86-NEXT: rolw %cl, %ax -; X86-NEXT: andl %edx, %eax +; X86-NEXT: rorw %cl, %ax +; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: andnot_rotr_i16: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: movl %esi, %eax ; X64-NEXT: notl %eax ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: rolw %cl, %ax +; X64-NEXT: rorw %cl, %ax ; X64-NEXT: andl %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq - %not = xor i16 %a0, -1 - %rot = tail call i16 @llvm.fshl.i16(i16 %not, i16 %not, i16 %a1) + %not = xor i16 %a1, -1 + %rot = tail call i16 @llvm.fshr.i16(i16 %not, i16 %not, i16 %a2) %and = and i16 %rot, %a0 ret i16 %and } -define i8 @andnot_rotr_i8(i8 %a0, i8 %a1) nounwind { +define i8 @andnot_rotr_i8(i8 %a0, i8 %a1, i8 %a2) nounwind { ; X86-LABEL: andnot_rotr_i8: ; X86: # %bb.0: ; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %edx -; X86-NEXT: movl %edx, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: notb %al -; X86-NEXT: rolb %cl, %al -; X86-NEXT: andb %dl, %al +; X86-NEXT: rorb %cl, %al +; X86-NEXT: andb {{[0-9]+}}(%esp), %al ; X86-NEXT: retl ; ; X64-LABEL: andnot_rotr_i8: ; X64: # %bb.0: -; X64-NEXT: movl %esi, %ecx -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %edx, %ecx +; X64-NEXT: movl %esi, %eax ; 
X64-NEXT: notb %al ; X64-NEXT: # kill: def $cl killed $cl killed $ecx -; X64-NEXT: rolb %cl, %al +; X64-NEXT: rorb %cl, %al ; X64-NEXT: andb %dil, %al +; X64-NEXT: # kill: def $al killed $al killed $eax ; X64-NEXT: retq - %not = xor i8 %a0, -1 - %rot = tail call i8 @llvm.fshl.i8(i8 %not, i8 %not, i8 %a1) + %not = xor i8 %a1, -1 + %rot = tail call i8 @llvm.fshr.i8(i8 %not, i8 %not, i8 %a2) %and = and i8 %rot, %a0 ret i8 %and } @@ -333,76 +318,73 @@ define i8 @andnot_rotr_i8(i8 %a0, i8 %a1) nounwind { ; Fold (and X, (bswap (not Y)))) -> (and X, (not (bswap Y))) ; -define i64 @andnot_bswap_i64(i64 %a0) nounwind { +define i64 @andnot_bswap_i64(i64 %a0, i64 %a1) nounwind { ; X86-LABEL: andnot_bswap_i64: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx +; X86-NEXT: movl {{[0-9]+}}(%esp), %edx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax -; X86-NEXT: movl %ecx, %edx +; X86-NEXT: notl %eax ; X86-NEXT: notl %edx ; X86-NEXT: bswapl %edx -; X86-NEXT: andl %eax, %edx -; X86-NEXT: notl %eax ; X86-NEXT: bswapl %eax -; X86-NEXT: andl %ecx, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-NEXT: retl ; ; X64-LABEL: andnot_bswap_i64: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax +; X64-NEXT: movq %rsi, %rax ; X64-NEXT: notq %rax ; X64-NEXT: bswapq %rax ; X64-NEXT: andq %rdi, %rax ; X64-NEXT: retq - %not = xor i64 %a0, -1 + %not = xor i64 %a1, -1 %bswap = tail call i64 @llvm.bswap.i64(i64 %not) %and = and i64 %bswap, %a0 ret i64 %and } -define i32 @andnot_bswap_i32(i32 %a0) nounwind { +define i32 @andnot_bswap_i32(i32 %a0, i32 %a1) nounwind { ; X86-LABEL: andnot_bswap_i32: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: notl %eax ; X86-NEXT: bswapl %eax -; X86-NEXT: andl %ecx, %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: andnot_bswap_i32: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %eax ; X64-NEXT: notl %eax ; X64-NEXT: bswapl %eax ; X64-NEXT: andl %edi, %eax ; X64-NEXT: retq - %not = xor i32 %a0, -1 + %not = xor i32 %a1, -1 %bswap = tail call i32 @llvm.bswap.i32(i32 %not) %and = and i32 %bswap, %a0 ret i32 %and } -define i16 @andnot_bswap_i16(i16 %a0) nounwind { +define i16 @andnot_bswap_i16(i16 %a0, i16 %a1) nounwind { ; X86-LABEL: andnot_bswap_i16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: notl %eax ; X86-NEXT: rolw $8, %ax -; X86-NEXT: andl %ecx, %eax +; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: andnot_bswap_i16: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax +; X64-NEXT: movl %esi, %eax ; X64-NEXT: notl %eax ; X64-NEXT: rolw $8, %ax ; X64-NEXT: andl %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq - %not = xor i16 %a0, -1 + %not = xor i16 %a1, -1 %bswap = tail call i16 @llvm.bswap.i16(i16 %not) %and = and i16 %bswap, %a0 ret i16 %and @@ -412,72 +394,68 @@ define i16 @andnot_bswap_i16(i16 %a0) nounwind { ; Fold (and X, (bitreverse (not Y)))) -> (and X, (not (bitreverse Y))) ; -define i64 @andnot_bitreverse_i64(i64 %a0) nounwind { +define i64 @andnot_bitreverse_i64(i64 %a0, i64 %a1) nounwind { ; X86-LABEL: andnot_bitreverse_i64: ; X86: # %bb.0: -; X86-NEXT: pushl %esi ; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx ; X86-NEXT: movl {{[0-9]+}}(%esp), %eax +; X86-NEXT: notl %eax +; X86-NEXT: notl %ecx +; X86-NEXT: bswapl 
%ecx ; X86-NEXT: movl %ecx, %edx -; X86-NEXT: notl %edx -; X86-NEXT: bswapl %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F -; X86-NEXT: shll $4, %esi -; X86-NEXT: shrl $4, %edx ; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; X86-NEXT: orl %esi, %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $858993459, %esi # imm = 0x33333333 -; X86-NEXT: shrl $2, %edx +; X86-NEXT: shll $4, %edx +; X86-NEXT: shrl $4, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: orl %edx, %ecx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: andl $858993459, %edx # imm = 0x33333333 -; X86-NEXT: leal (%edx,%esi,4), %edx -; X86-NEXT: movl %edx, %esi -; X86-NEXT: andl $1431655765, %esi # imm = 0x55555555 -; X86-NEXT: shrl %edx +; X86-NEXT: shrl $2, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 +; X86-NEXT: leal (%ecx,%edx,4), %ecx +; X86-NEXT: movl %ecx, %edx ; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555 -; X86-NEXT: leal (%edx,%esi,2), %edx -; X86-NEXT: andl %eax, %edx -; X86-NEXT: notl %eax +; X86-NEXT: shrl %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 +; X86-NEXT: leal (%ecx,%edx,2), %edx ; X86-NEXT: bswapl %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F -; X86-NEXT: shll $4, %esi +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx ; X86-NEXT: shrl $4, %eax ; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; X86-NEXT: orl %esi, %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 ; X86-NEXT: shrl $2, %eax ; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X86-NEXT: leal (%eax,%esi,4), %eax -; X86-NEXT: movl %eax, %esi -; X86-NEXT: andl $1431655765, %esi # imm = 0x55555555 +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: leal (%eax,%esi,2), %eax -; X86-NEXT: andl %ecx, %eax -; X86-NEXT: popl %esi +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %edx ; X86-NEXT: retl ; ; X64-LABEL: andnot_bitreverse_i64: ; X64: # %bb.0: -; X64-NEXT: movq %rdi, %rax -; X64-NEXT: notq %rax -; X64-NEXT: bswapq %rax -; X64-NEXT: movq %rax, %rcx -; X64-NEXT: shrq $4, %rcx -; X64-NEXT: movabsq $1085102592571150095, %rdx # imm = 0xF0F0F0F0F0F0F0F -; X64-NEXT: andq %rdx, %rcx -; X64-NEXT: andq %rdx, %rax -; X64-NEXT: shlq $4, %rax -; X64-NEXT: orq %rcx, %rax -; X64-NEXT: movabsq $3689348814741910323, %rcx # imm = 0x3333333333333333 -; X64-NEXT: movq %rax, %rdx -; X64-NEXT: andq %rcx, %rdx -; X64-NEXT: shrq $2, %rax +; X64-NEXT: notq %rsi +; X64-NEXT: bswapq %rsi +; X64-NEXT: movq %rsi, %rax +; X64-NEXT: shrq $4, %rax +; X64-NEXT: movabsq $1085102592571150095, %rcx # imm = 0xF0F0F0F0F0F0F0F ; X64-NEXT: andq %rcx, %rax -; X64-NEXT: leaq (%rax,%rdx,4), %rax +; X64-NEXT: andq %rcx, %rsi +; X64-NEXT: shlq $4, %rsi +; X64-NEXT: orq %rax, %rsi +; X64-NEXT: movabsq $3689348814741910323, %rax # imm = 0x3333333333333333 +; X64-NEXT: movq %rsi, %rcx +; X64-NEXT: andq %rax, %rcx +; X64-NEXT: shrq $2, %rsi +; X64-NEXT: andq %rax, %rsi +; X64-NEXT: leaq (%rsi,%rcx,4), %rax ; X64-NEXT: movabsq $6148914691236517205, %rcx # imm = 0x5555555555555555 ; X64-NEXT: movq %rax, %rdx ; X64-NEXT: andq 
%rcx, %rdx @@ -486,54 +464,53 @@ define i64 @andnot_bitreverse_i64(i64 %a0) nounwind { ; X64-NEXT: leaq (%rax,%rdx,2), %rax ; X64-NEXT: andq %rdi, %rax ; X64-NEXT: retq - %not = xor i64 %a0, -1 + %not = xor i64 %a1, -1 %bitrev = tail call i64 @llvm.bitreverse.i64(i64 %not) %and = and i64 %bitrev, %a0 ret i64 %and } -define i32 @andnot_bitreverse_i32(i32 %a0) nounwind { +define i32 @andnot_bitreverse_i32(i32 %a0, i32 %a1) nounwind { ; X86-LABEL: andnot_bitreverse_i32: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: notl %eax ; X86-NEXT: bswapl %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: andl $252645135, %edx # imm = 0xF0F0F0F -; X86-NEXT: shll $4, %edx +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F +; X86-NEXT: shll $4, %ecx ; X86-NEXT: shrl $4, %eax ; X86-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: andl $858993459, %edx # imm = 0x33333333 +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $858993459, %ecx # imm = 0x33333333 ; X86-NEXT: shrl $2, %eax ; X86-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X86-NEXT: leal (%eax,%edx,4), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: andl $1431655765, %edx # imm = 0x55555555 +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $1431655765, %eax # imm = 0x55555555 -; X86-NEXT: leal (%eax,%edx,2), %eax -; X86-NEXT: andl %ecx, %eax +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: andl {{[0-9]+}}(%esp), %eax ; X86-NEXT: retl ; ; X64-LABEL: andnot_bitreverse_i32: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: notl %eax -; X64-NEXT: bswapl %eax -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $252645135, %ecx # imm = 0xF0F0F0F -; X64-NEXT: shll $4, %ecx -; X64-NEXT: shrl $4, %eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: notl %esi +; X64-NEXT: bswapl %esi +; X64-NEXT: movl %esi, %eax ; X64-NEXT: andl $252645135, %eax # imm = 0xF0F0F0F -; X64-NEXT: orl %ecx, %eax -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $858993459, %ecx # imm = 0x33333333 -; X64-NEXT: shrl $2, %eax +; X64-NEXT: shll $4, %eax +; X64-NEXT: shrl $4, %esi +; X64-NEXT: andl $252645135, %esi # imm = 0xF0F0F0F +; X64-NEXT: orl %eax, %esi +; X64-NEXT: movl %esi, %eax ; X64-NEXT: andl $858993459, %eax # imm = 0x33333333 -; X64-NEXT: leal (%rax,%rcx,4), %eax +; X64-NEXT: shrl $2, %esi +; X64-NEXT: andl $858993459, %esi # imm = 0x33333333 +; X64-NEXT: leal (%rsi,%rax,4), %eax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $1431655765, %ecx # imm = 0x55555555 ; X64-NEXT: shrl %eax @@ -541,55 +518,54 @@ define i32 @andnot_bitreverse_i32(i32 %a0) nounwind { ; X64-NEXT: leal (%rax,%rcx,2), %eax ; X64-NEXT: andl %edi, %eax ; X64-NEXT: retq - %not = xor i32 %a0, -1 + %not = xor i32 %a1, -1 %bitrev = tail call i32 @llvm.bitreverse.i32(i32 %not) %and = and i32 %bitrev, %a0 ret i32 %and } -define i16 @andnot_bitreverse_i16(i16 %a0) nounwind { +define i16 @andnot_bitreverse_i16(i16 %a0, i16 %a1) nounwind { ; X86-LABEL: andnot_bitreverse_i16: ; X86: # %bb.0: -; X86-NEXT: movl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movl {{[0-9]+}}(%esp), %eax ; X86-NEXT: notl %eax ; X86-NEXT: rolw $8, %ax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: andl $3855, %edx # imm = 0xF0F -; X86-NEXT: shll $4, %edx +; X86-NEXT: movl %eax, %ecx +; 
X86-NEXT: andl $3855, %ecx # imm = 0xF0F +; X86-NEXT: shll $4, %ecx ; X86-NEXT: shrl $4, %eax ; X86-NEXT: andl $3855, %eax # imm = 0xF0F -; X86-NEXT: orl %edx, %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: andl $13107, %edx # imm = 0x3333 +; X86-NEXT: orl %ecx, %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $13107, %ecx # imm = 0x3333 ; X86-NEXT: shrl $2, %eax ; X86-NEXT: andl $13107, %eax # imm = 0x3333 -; X86-NEXT: leal (%eax,%edx,4), %eax -; X86-NEXT: movl %eax, %edx -; X86-NEXT: andl $21845, %edx # imm = 0x5555 +; X86-NEXT: leal (%eax,%ecx,4), %eax +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andl $21845, %ecx # imm = 0x5555 ; X86-NEXT: shrl %eax ; X86-NEXT: andl $21845, %eax # imm = 0x5555 -; X86-NEXT: leal (%eax,%edx,2), %eax -; X86-NEXT: andl %ecx, %eax +; X86-NEXT: leal (%eax,%ecx,2), %eax +; X86-NEXT: andw {{[0-9]+}}(%esp), %ax ; X86-NEXT: # kill: def $ax killed $ax killed $eax ; X86-NEXT: retl ; ; X64-LABEL: andnot_bitreverse_i16: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: notl %eax -; X64-NEXT: rolw $8, %ax -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $3855, %ecx # imm = 0xF0F -; X64-NEXT: shll $4, %ecx -; X64-NEXT: shrl $4, %eax +; X64-NEXT: # kill: def $esi killed $esi def $rsi +; X64-NEXT: notl %esi +; X64-NEXT: rolw $8, %si +; X64-NEXT: movl %esi, %eax ; X64-NEXT: andl $3855, %eax # imm = 0xF0F -; X64-NEXT: orl %ecx, %eax -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andl $13107, %ecx # imm = 0x3333 -; X64-NEXT: shrl $2, %eax +; X64-NEXT: shll $4, %eax +; X64-NEXT: shrl $4, %esi +; X64-NEXT: andl $3855, %esi # imm = 0xF0F +; X64-NEXT: orl %eax, %esi +; X64-NEXT: movl %esi, %eax ; X64-NEXT: andl $13107, %eax # imm = 0x3333 -; X64-NEXT: leal (%rax,%rcx,4), %eax +; X64-NEXT: shrl $2, %esi +; X64-NEXT: andl $13107, %esi # imm = 0x3333 +; X64-NEXT: leal (%rsi,%rax,4), %eax ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andl $21845, %ecx # imm = 0x5555 ; X64-NEXT: shrl %eax @@ -598,45 +574,43 @@ define i16 @andnot_bitreverse_i16(i16 %a0) nounwind { ; X64-NEXT: andl %edi, %eax ; X64-NEXT: # kill: def $ax killed $ax killed $eax ; X64-NEXT: retq - %not = xor i16 %a0, -1 + %not = xor i16 %a1, -1 %bitrev = tail call i16 @llvm.bitreverse.i16(i16 %not) %and = and i16 %bitrev, %a0 ret i16 %and } -define i8 @andnot_bitreverse_i8(i8 %a0) nounwind { +define i8 @andnot_bitreverse_i8(i8 %a0, i8 %a1) nounwind { ; X86-LABEL: andnot_bitreverse_i8: ; X86: # %bb.0: -; X86-NEXT: movzbl {{[0-9]+}}(%esp), %ecx -; X86-NEXT: movl %ecx, %eax +; X86-NEXT: movzbl {{[0-9]+}}(%esp), %eax ; X86-NEXT: notb %al ; X86-NEXT: rolb $4, %al -; X86-NEXT: movl %eax, %edx -; X86-NEXT: andb $51, %dl -; X86-NEXT: shlb $2, %dl +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andb $51, %cl +; X86-NEXT: shlb $2, %cl ; X86-NEXT: shrb $2, %al ; X86-NEXT: andb $51, %al -; X86-NEXT: orb %dl, %al -; X86-NEXT: movl %eax, %edx -; X86-NEXT: andb $85, %dl -; X86-NEXT: addb %dl, %dl +; X86-NEXT: orb %cl, %al +; X86-NEXT: movl %eax, %ecx +; X86-NEXT: andb $85, %cl +; X86-NEXT: addb %cl, %cl ; X86-NEXT: shrb %al ; X86-NEXT: andb $85, %al -; X86-NEXT: orb %dl, %al -; X86-NEXT: andb %cl, %al +; X86-NEXT: orb %cl, %al +; X86-NEXT: andb {{[0-9]+}}(%esp), %al ; X86-NEXT: retl ; ; X64-LABEL: andnot_bitreverse_i8: ; X64: # %bb.0: -; X64-NEXT: movl %edi, %eax -; X64-NEXT: notb %al -; X64-NEXT: rolb $4, %al -; X64-NEXT: movl %eax, %ecx -; X64-NEXT: andb $51, %cl -; X64-NEXT: shlb $2, %cl -; X64-NEXT: shrb $2, %al +; X64-NEXT: notb %sil +; X64-NEXT: rolb $4, %sil +; X64-NEXT: movl %esi, %eax ; X64-NEXT: andb $51, %al -; X64-NEXT: orb %cl, %al 
+; X64-NEXT: shlb $2, %al +; X64-NEXT: shrb $2, %sil +; X64-NEXT: andb $51, %sil +; X64-NEXT: orb %sil, %al ; X64-NEXT: movl %eax, %ecx ; X64-NEXT: andb $85, %cl ; X64-NEXT: addb %cl, %cl @@ -645,7 +619,7 @@ define i8 @andnot_bitreverse_i8(i8 %a0) nounwind { ; X64-NEXT: orb %cl, %al ; X64-NEXT: andb %dil, %al ; X64-NEXT: retq - %not = xor i8 %a0, -1 + %not = xor i8 %a1, -1 %bitrev = tail call i8 @llvm.bitreverse.i8(i8 %not) %and = and i8 %bitrev, %a0 ret i8 %and -- GitLab From cbe03646c620d08da69eac241450f32f4025c635 Mon Sep 17 00:00:00 2001 From: "A. Jiang" Date: Wed, 16 Oct 2024 21:34:31 +0800 Subject: [PATCH 126/329] [libc++][ranges] LWG3692: `zip_view::iterator`'s `operator<=>` is overconstrained and changes of `zip_view` in P2165R4 (#112077) The changes are nearly pure simplifications, so I think it's OK to do them together in the same PR. Actual test coverages were already added in commit ad41d1e26b12 (https://reviews.llvm.org/D141216). Thanks to Casey Carter! Fixes #104975 Towards #105200 --- libcxx/docs/Status/Cxx23Issues.csv | 2 +- libcxx/docs/Status/Cxx23Papers.csv | 2 +- libcxx/include/__ranges/zip_view.h | 55 +++---------------- .../range.adaptors/range.zip/cpo.pass.cpp | 4 -- .../range.zip/ctor.default.pass.cpp | 6 +- .../range.zip/iterator/compare.pass.cpp | 16 +----- .../range.zip/iterator/deref.pass.cpp | 8 --- .../iterator/member_types.compile.pass.cpp | 14 +---- .../range.zip/iterator/subscript.pass.cpp | 8 --- 9 files changed, 13 insertions(+), 102 deletions(-) diff --git a/libcxx/docs/Status/Cxx23Issues.csv b/libcxx/docs/Status/Cxx23Issues.csv index 63e4176ecba1..cfa721230e5f 100644 --- a/libcxx/docs/Status/Cxx23Issues.csv +++ b/libcxx/docs/Status/Cxx23Issues.csv @@ -168,7 +168,7 @@ "`LWG3672 `__","``common_iterator::operator->()`` should return by value","2022-07 (Virtual)","|Complete|","19.0","" "`LWG3683 `__","``operator==`` for ``polymorphic_allocator`` cannot deduce template argument in common cases","2022-07 (Virtual)","|Complete|","20.0","" "`LWG3687 `__","``expected`` move constructor should move","2022-07 (Virtual)","|Complete|","16.0","" -"`LWG3692 `__","``zip_view::iterator``'s ``operator<=>`` is overconstrained","2022-07 (Virtual)","","","" +"`LWG3692 `__","``zip_view::iterator``'s ``operator<=>`` is overconstrained","2022-07 (Virtual)","|Complete|","20.0","" "`LWG3701 `__","Make ``formatter, charT>`` requirement explicit","2022-07 (Virtual)","|Complete|","15.0","" "`LWG3702 `__","Should ``zip_transform_view::iterator`` remove ``operator<``","2022-07 (Virtual)","","","" "`LWG3703 `__","Missing requirements for ``expected`` requires ``is_void``","2022-07 (Virtual)","|Complete|","16.0","" diff --git a/libcxx/docs/Status/Cxx23Papers.csv b/libcxx/docs/Status/Cxx23Papers.csv index da7b58818771..c64f1c4171fc 100644 --- a/libcxx/docs/Status/Cxx23Papers.csv +++ b/libcxx/docs/Status/Cxx23Papers.csv @@ -60,7 +60,7 @@ "`P1642R11 `__","Freestanding ``[utilities]``, ``[ranges]``, and ``[iterators]``","2022-07 (Virtual)","","","" "`P1899R3 `__","``stride_view``","2022-07 (Virtual)","","","" "`P2093R14 `__","Formatted output","2022-07 (Virtual)","|Complete|","18.0","" -"`P2165R4 `__","Compatibility between ``tuple``, ``pair`` and ``tuple-like`` objects","2022-07 (Virtual)","","","" +"`P2165R4 `__","Compatibility between ``tuple``, ``pair`` and ``tuple-like`` objects","2022-07 (Virtual)","|Partial|","","Only the part for ``zip_view`` is implemented." 
"`P2278R4 `__","``cbegin`` should always return a constant iterator","2022-07 (Virtual)","","","" "`P2286R8 `__","Formatting Ranges","2022-07 (Virtual)","|Complete|","16.0","" "`P2291R3 `__","Add Constexpr Modifiers to Functions ``to_chars`` and ``from_chars`` for Integral Types in ```` Header","2022-07 (Virtual)","|Complete|","16.0","" diff --git a/libcxx/include/__ranges/zip_view.h b/libcxx/include/__ranges/zip_view.h index fe3c87a9306f..835e23cb23af 100644 --- a/libcxx/include/__ranges/zip_view.h +++ b/libcxx/include/__ranges/zip_view.h @@ -36,7 +36,6 @@ #include <__utility/forward.h> #include <__utility/integer_sequence.h> #include <__utility/move.h> -#include <__utility/pair.h> #include #if !defined(_LIBCPP_HAS_NO_PRAGMA_SYSTEM_HEADER) @@ -58,22 +57,11 @@ concept __zip_is_common = (!(bidirectional_range<_Ranges> && ...) && (common_range<_Ranges> && ...)) || ((random_access_range<_Ranges> && ...) && (sized_range<_Ranges> && ...)); -template -auto __tuple_or_pair_test() -> pair<_Tp, _Up>; - -template - requires(sizeof...(_Types) != 2) -auto __tuple_or_pair_test() -> tuple<_Types...>; - -template -using __tuple_or_pair = decltype(__tuple_or_pair_test<_Types...>()); - template _LIBCPP_HIDE_FROM_ABI constexpr auto __tuple_transform(_Fun&& __f, _Tuple&& __tuple) { return std::apply( [&](_Types&&... __elements) { - return __tuple_or_pair...>( - std::invoke(__f, std::forward<_Types>(__elements))...); + return tuple...>(std::invoke(__f, std::forward<_Types>(__elements))...); }, std::forward<_Tuple>(__tuple)); } @@ -88,7 +76,7 @@ _LIBCPP_HIDE_FROM_ABI constexpr void __tuple_for_each(_Fun&& __f, _Tuple&& __tup } template -_LIBCPP_HIDE_FROM_ABI constexpr __tuple_or_pair< +_LIBCPP_HIDE_FROM_ABI constexpr tuple< invoke_result_t<_Fun&, typename tuple_element<_Indices, remove_cvref_t<_Tuple1>>::type, typename tuple_element<_Indices, remove_cvref_t<_Tuple2>>::type>...> @@ -250,10 +238,9 @@ template requires(view<_Views> && ...) 
&& (sizeof...(_Views) > 0) template class zip_view<_Views...>::__iterator : public __zip_view_iterator_category_base<_Const, _Views...> { - __tuple_or_pair>...> __current_; + tuple>...> __current_; - _LIBCPP_HIDE_FROM_ABI constexpr explicit __iterator( - __tuple_or_pair>...> __current) + _LIBCPP_HIDE_FROM_ABI constexpr explicit __iterator(tuple>...> __current) : __current_(std::move(__current)) {} template @@ -266,7 +253,7 @@ class zip_view<_Views...>::__iterator : public __zip_view_iterator_category_base public: using iterator_concept = decltype(__get_zip_view_iterator_tag<_Const, _Views...>()); - using value_type = __tuple_or_pair>...>; + using value_type = tuple>...>; using difference_type = common_type_t>...>; _LIBCPP_HIDE_FROM_ABI __iterator() = default; @@ -340,33 +327,8 @@ public: } } - _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator<(const __iterator& __x, const __iterator& __y) - requires __zip_all_random_access<_Const, _Views...> - { - return __x.__current_ < __y.__current_; - } - - _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator>(const __iterator& __x, const __iterator& __y) - requires __zip_all_random_access<_Const, _Views...> - { - return __y < __x; - } - - _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator<=(const __iterator& __x, const __iterator& __y) - requires __zip_all_random_access<_Const, _Views...> - { - return !(__y < __x); - } - - _LIBCPP_HIDE_FROM_ABI friend constexpr bool operator>=(const __iterator& __x, const __iterator& __y) - requires __zip_all_random_access<_Const, _Views...> - { - return !(__x < __y); - } - _LIBCPP_HIDE_FROM_ABI friend constexpr auto operator<=>(const __iterator& __x, const __iterator& __y) - requires __zip_all_random_access<_Const, _Views...> && - (three_way_comparable>> && ...) + requires __zip_all_random_access<_Const, _Views...> { return __x.__current_ <=> __y.__current_; } @@ -427,10 +389,9 @@ template requires(view<_Views> && ...) 
&& (sizeof...(_Views) > 0) template class zip_view<_Views...>::__sentinel { - __tuple_or_pair>...> __end_; + tuple>...> __end_; - _LIBCPP_HIDE_FROM_ABI constexpr explicit __sentinel( - __tuple_or_pair>...> __end) + _LIBCPP_HIDE_FROM_ABI constexpr explicit __sentinel(tuple>...> __end) : __end_(__end) {} friend class zip_view<_Views...>; diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp index ea5953cefa0f..bdfd58ff8bbe 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/cpo.pass.cpp @@ -63,11 +63,7 @@ constexpr bool test() { std::ranges::zip_view>> decltype(auto) v2 = std::views::zip(v); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v, std::tuple>>); -#else static_assert(std::is_same_v, std::tuple>>); -#endif } return true; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp index f53289621eab..fdfcc02a8fb1 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/ctor.default.pass.cpp @@ -49,12 +49,8 @@ constexpr bool test() { using View = std::ranges::zip_view; View v = View(); // the default constructor is not explicit assert(v.size() == 3); - auto it = v.begin(); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - using Value = std::pair; -#else + auto it = v.begin(); using Value = std::tuple; -#endif assert(*it++ == Value(buff[0], buff[0])); assert(*it++ == Value(buff[1], buff[1])); assert(*it == Value(buff[2], buff[2])); diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp index ed1cb0ccebd2..8ab734680009 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/compare.pass.cpp @@ -10,17 +10,8 @@ // friend constexpr bool operator==(const iterator& x, const iterator& y) // requires (equality_comparable>> && ...); -// friend constexpr bool operator<(const iterator& x, const iterator& y) -// requires all-random-access; -// friend constexpr bool operator>(const iterator& x, const iterator& y) -// requires all-random-access; -// friend constexpr bool operator<=(const iterator& x, const iterator& y) -// requires all-random-access; -// friend constexpr bool operator>=(const iterator& x, const iterator& y) -// requires all-random-access; // friend constexpr auto operator<=>(const iterator& x, const iterator& y) -// requires all-random-access && -// (three_way_comparable>> && ...); +// requires all-random-access; #include #include @@ -165,12 +156,7 @@ constexpr bool test() { using Subrange = std::ranges::subrange; static_assert(!std::three_way_comparable); using R = std::ranges::zip_view; -#ifdef _LIBCPP_VERSION - // libc++ hasn't implemented LWG-3692 "zip_view::iterator's operator<=> is overconstrained" - static_assert(!std::three_way_comparable>); -#else static_assert(std::three_way_comparable>); -#endif int a[] = {1, 2, 3, 4}; int b[] = {5, 6, 7, 8, 9}; diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp index 569d04097219..fb58aa28fbdf 100644 --- 
a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/deref.pass.cpp @@ -42,11 +42,7 @@ constexpr bool test() { auto [x, y] = *it; assert(&x == &(a[0])); assert(&y == &(b[0])); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>); -#else static_assert(std::is_same_v>); -#endif x = 5; y = 0.1; @@ -70,11 +66,7 @@ constexpr bool test() { auto it = v.begin(); assert(&(std::get<0>(*it)) == &(a[0])); assert(&(std::get<1>(*it)) == &(a[0])); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>); -#else static_assert(std::is_same_v>); -#endif } return true; } diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp index c19f6c2b1652..2f2f0fc4f4e3 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/member_types.compile.pass.cpp @@ -65,7 +65,7 @@ struct ConstVeryDifferentRange { void test() { int buffer[] = {1, 2, 3, 4}; { - // 2 views should have pair value_type + // 2 views should have 2-tuple value_type // random_access_iterator_tag std::ranges::zip_view v(buffer, buffer); using Iter = decltype(v.begin()); @@ -73,11 +73,7 @@ void test() { static_assert(std::is_same_v); static_assert(std::is_same_v); static_assert(std::is_same_v); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>); -#else static_assert(std::is_same_v>); -#endif static_assert(HasIterCategory); } @@ -124,11 +120,7 @@ void test() { static_assert(std::is_same_v); static_assert(std::is_same_v); static_assert(std::is_same_v); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>>); -#else static_assert(std::is_same_v>>); -#endif static_assert(HasIterCategory); } @@ -169,11 +161,7 @@ void test() { // value_type of multiple views with different value_type std::ranges::zip_view v{foos, bars}; using Iter = decltype(v.begin()); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>); -#else static_assert(std::is_same_v>); -#endif } { diff --git a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp index 1538d763205d..ba3abfa2a436 100644 --- a/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp +++ b/libcxx/test/std/ranges/range.adaptors/range.zip/iterator/subscript.pass.cpp @@ -27,11 +27,7 @@ constexpr bool test() { assert(it[2] == *(it + 2)); assert(it[4] == *(it + 4)); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>); -#else static_assert(std::is_same_v>); -#endif } { @@ -42,11 +38,7 @@ constexpr bool test() { assert(it[2] == *(it + 2)); assert(it[4] == *(it + 4)); -#ifdef _LIBCPP_VERSION // libc++ doesn't implement P2165R4 yet - static_assert(std::is_same_v>); -#else static_assert(std::is_same_v>); -#endif } { -- GitLab From 658ff0b84c9dd5f33f4d769ba7378cc2c64315a1 Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Wed, 16 Oct 2024 09:37:27 -0400 Subject: [PATCH 127/329] [RISCV][VLOPT] Add support for integer widening multiply instructions (#112204) This adds support for these instructions and also tests getOperandInfo for these 
instructions as well. --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 7 +- llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll | 126 +++++++++++++++++++ 2 files changed, 132 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index 53373b7a0f15..6053899987db 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -563,7 +563,12 @@ static bool isSupportedInstr(const MachineInstr &MI) { case RISCV::VREM_VV: case RISCV::VREM_VX: // Vector Widening Integer Multiply Instructions - // FIXME: Add support + case RISCV::VWMUL_VV: + case RISCV::VWMUL_VX: + case RISCV::VWMULSU_VV: + case RISCV::VWMULSU_VX: + case RISCV::VWMULU_VV: + case RISCV::VWMULU_VX: // Vector Single-Width Integer Multiply-Add Instructions // FIXME: Add support // Vector Widening Integer Multiply-Add Instructions diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll index a360ae1998f7..11f603b56b6e 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt-instrs.ll @@ -1122,6 +1122,132 @@ define @vrem_vx( %a, i32 %b, iXLen %vl) { ret %2 } +define @vwmul_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwmul_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; NOVLOPT-NEXT: vwmul.vv v12, v8, v9 +; NOVLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; NOVLOPT-NEXT: vwmul.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmul_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; VLOPT-NEXT: vwmul.vv v12, v8, v9 +; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; VLOPT-NEXT: vwmul.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwmul.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vwmul.nxv4i64.nxv4i32.nxv4i32( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwmul_vx( %a, i16 %b, i32 %c, iXLen %vl) { +; NOVLOPT-LABEL: vwmul_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a3, zero, e16, m1, ta, ma +; NOVLOPT-NEXT: vwmul.vx v12, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a2, e32, m2, ta, ma +; NOVLOPT-NEXT: vwmul.vx v8, v12, a1 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmul_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a2, e16, m1, ta, ma +; VLOPT-NEXT: vwmul.vx v12, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e32, m2, ta, ma +; VLOPT-NEXT: vwmul.vx v8, v12, a1 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwmul.nxv4i32.nxv4i16.i16( poison, %a, i16 %b, iXLen -1) + %2 = call @llvm.riscv.vwmul.nxv4i64.nxv4i64.i32( poison, %1, i32 %c, iXLen %vl) + ret %2 +} + +define @vwmulsu_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwmulsu_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwmulsu.vv v12, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmulsu_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vwmulsu.vv v12, v8, v10 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwmulsu.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwmulsu_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vwmulsu_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwmulsu.vx v12, v8, a0 +; 
NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmulsu_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vwmulsu.vx v12, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwmulsu.nxv4i64.nxv4i32.i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwmulu_vv( %a, %b, iXLen %vl) { +; NOVLOPT-LABEL: vwmulu_vv: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a1, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwmulu.vv v12, v8, v10 +; NOVLOPT-NEXT: vsetvli zero, a0, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmulu_vv: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a0, e32, m2, ta, ma +; VLOPT-NEXT: vwmulu.vv v12, v8, v10 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwmulu.nxv4i64.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + +define @vwmulu_vx( %a, i32 %b, iXLen %vl) { +; NOVLOPT-LABEL: vwmulu_vx: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a2, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vwmulu.vx v12, v8, a0 +; NOVLOPT-NEXT: vsetvli zero, a1, e64, m4, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v12, v12 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vwmulu_vx: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetvli zero, a1, e32, m2, ta, ma +; VLOPT-NEXT: vwmulu.vx v12, v8, a0 +; VLOPT-NEXT: vsetvli zero, zero, e64, m4, ta, ma +; VLOPT-NEXT: vadd.vv v8, v12, v12 +; VLOPT-NEXT: ret + %1 = call @llvm.riscv.vwmulu.nxv4i64.nxv4i32.i32( poison, %a, i32 %b, iXLen -1) + %2 = call @llvm.riscv.vadd.nxv4i64.nxv4i64( poison, %1, %1, iXLen %vl) + ret %2 +} + define @vwmacc_vx( %a, i16 %b, iXLen %vl) { ; NOVLOPT-LABEL: vwmacc_vx: ; NOVLOPT: # %bb.0: -- GitLab From 3a56b03ef33a7462f8e21ed295e59b7d851f85fa Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Oct 2024 06:40:10 -0700 Subject: [PATCH 128/329] [IR] Avoid repeated hash lookups (NFC) (#112469) --- llvm/lib/IR/LegacyPassManager.cpp | 8 +++----- 1 file changed, 3 insertions(+), 5 deletions(-) diff --git a/llvm/lib/IR/LegacyPassManager.cpp b/llvm/lib/IR/LegacyPassManager.cpp index 96e2f1d7908b..ce6f6c733f4b 100644 --- a/llvm/lib/IR/LegacyPassManager.cpp +++ b/llvm/lib/IR/LegacyPassManager.cpp @@ -104,15 +104,13 @@ void PMDataManager::emitInstrCountChangedRemark( [&FunctionToInstrCount](Function &MaybeChangedFn) { // Update the total module count. unsigned FnSize = MaybeChangedFn.getInstructionCount(); - auto It = FunctionToInstrCount.find(MaybeChangedFn.getName()); // If we created a new function, then we need to add it to the map and // say that it changed from 0 instructions to FnSize. - if (It == FunctionToInstrCount.end()) { - FunctionToInstrCount[MaybeChangedFn.getName()] = - std::pair(0, FnSize); + auto [It, Inserted] = FunctionToInstrCount.try_emplace( + MaybeChangedFn.getName(), 0, FnSize); + if (Inserted) return; - } // Insert the new function size into the second member of the pair. This // tells us whether or not this function changed in size. 
It->second.second = FnSize; -- GitLab From 0a20ab908ca7cc82a4c206d39d0eaf86a46e1ff0 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Oct 2024 06:40:48 -0700 Subject: [PATCH 129/329] [mlir] Avoid repeated hash lookups (NFC) (#112472) --- mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp b/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp index 40c83487fd47..27e89d69e214 100644 --- a/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp +++ b/mlir/lib/Dialect/MLProgram/Transforms/PipelineGlobalOps.cpp @@ -148,8 +148,9 @@ void MLProgramPipelineGlobals::processBlock( if (auto store = mlir::dyn_cast(op)) { auto ref = store.getGlobal(); symbolStore.insert(ref); - if (previousStores.contains(ref)) { - toDelete.push_back(previousStores.find(ref)->getSecond()); + auto it = previousStores.find(ref); + if (it != previousStores.end()) { + toDelete.push_back(it->getSecond()); } previousLoads[ref] = store.getValue(); -- GitLab From 9128077c88f0112b4a5b1f64922247793250001b Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Oct 2024 06:41:19 -0700 Subject: [PATCH 130/329] [Scalar] Avoid repeated hash lookups (NFC) (#112486) --- llvm/lib/Transforms/Scalar/Float2Int.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Transforms/Scalar/Float2Int.cpp b/llvm/lib/Transforms/Scalar/Float2Int.cpp index 98ecbe400543..9d23c8994300 100644 --- a/llvm/lib/Transforms/Scalar/Float2Int.cpp +++ b/llvm/lib/Transforms/Scalar/Float2Int.cpp @@ -398,9 +398,9 @@ bool Float2IntPass::validateAndTransform(const DataLayout &DL) { } Value *Float2IntPass::convert(Instruction *I, Type *ToTy) { - if (ConvertedInsts.contains(I)) + if (auto It = ConvertedInsts.find(I); It != ConvertedInsts.end()) // Already converted this instruction. - return ConvertedInsts[I]; + return It->second; SmallVector NewOperands; for (Value *V : I->operands()) { -- GitLab From f5d3c87ede965d3cb4dd58aeed0a0b94e674b997 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Wed, 16 Oct 2024 06:41:40 -0700 Subject: [PATCH 131/329] [IPO] Simplify code with StringMap::operator[] (NFC) (#112490) --- llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h index 076d91adfd1d..4e757b299618 100644 --- a/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h +++ b/llvm/include/llvm/Transforms/IPO/SampleProfileMatcher.h @@ -201,9 +201,7 @@ private: void UpdateWithSalvagedProfiles(); LocToLocMap &getIRToProfileLocationMap(const Function &F) { - auto Ret = FuncMappings.try_emplace( - FunctionSamples::getCanonicalFnName(F.getName()), LocToLocMap()); - return Ret.first->second; + return FuncMappings[FunctionSamples::getCanonicalFnName(F.getName())]; } void distributeIRToProfileLocationMap(); void distributeIRToProfileLocationMap(FunctionSamples &FS); -- GitLab From a3010c77910c706be4c51ce4a95d51211e335a1f Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 16 Oct 2024 14:43:28 +0100 Subject: [PATCH 132/329] [GlobalISel] Add boolean predicated legalization action methods. (#111287) Under AArch64 it is common and will become more common to have operation legalization rules dependent on a feature of the architecture. With the current legalization rules this either means adding a custom predicate based on the feature, as in `legalIf([=](const LegalityQuery &Query) { return HasFP16 && ...; })`, or splitting the legalization rules into pieces that place rules optionally into them based on the features available. This patch proposes an alternative where the existing routines like legalFor(..) are provided a boolean predicate, which if false skips adding the rule. It makes the rules cleaner and will hopefully allow them to scale better as we add more features. I have changed the SVE predicates for loads/stores to just be always available; scalable vectors without SVE have never been supported, but a condition could also be added there.
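As a condensed before/after sketch of the rule style this enables (simplified from the AArch64 changes below, and assuming the usual LLT constants and subtarget flags are in scope; not a complete rule set):

// Before: the feature check lives in a custom legality predicate.
getActionDefinitionsBuilder({G_FADD, G_FSUB})
    .legalFor({s32, s64, v2s32, v4s32, v2s64})
    .legalIf([=](const LegalityQuery &Query) {
      const auto &Ty = Query.Types[0];
      return (Ty == v8s16 || Ty == v4s16) && HasFP16;
    });

// After: the feature bit is passed straight to the action method; a false
// predicate simply skips adding the rule, and the builder still returns
// *this, so chains keep composing.
getActionDefinitionsBuilder({G_FADD, G_FSUB})
    .legalFor({s32, s64, v2s32, v4s32, v2s64})
    .legalFor(HasFP16, {s16, v4s16, v8s16});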
--- .../llvm/CodeGen/GlobalISel/LegalizerInfo.h | 31 ++- .../AArch64/GISel/AArch64LegalizerInfo.cpp | 181 ++++++------------ .../GlobalISel/legalizer-info-validation.mir | 72 +++---- 3 files changed, 119 insertions(+), 165 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 82e713f30ea3..4e5a6cf92b76 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -599,11 +599,22 @@ public: LegalizeRuleSet &legalFor(std::initializer_list<LLT> Types) { return actionFor(LegalizeAction::Legal, Types); } + LegalizeRuleSet &legalFor(bool Pred, std::initializer_list<LLT> Types) { + if (!Pred) + return *this; + return actionFor(LegalizeAction::Legal, Types); + } /// The instruction is legal when type indexes 0 and 1 is any type pair in the /// given list. LegalizeRuleSet &legalFor(std::initializer_list<std::pair<LLT, LLT>> Types) { return actionFor(LegalizeAction::Legal, Types); } + LegalizeRuleSet &legalFor(bool Pred, + std::initializer_list<std::pair<LLT, LLT>> Types) { + if (!Pred) + return *this; + return actionFor(LegalizeAction::Legal, Types); + } /// The instruction is legal when type index 0 is any type in the given list /// and imm index 0 is anything. LegalizeRuleSet &legalForTypeWithAnyImm(std::initializer_list<LLT> Types) { @@ -846,12 +857,23 @@ public: LegalizeRuleSet &customFor(std::initializer_list<LLT> Types) { return actionFor(LegalizeAction::Custom, Types); } + LegalizeRuleSet &customFor(bool Pred, std::initializer_list<LLT> Types) { + if (!Pred) + return *this; + return actionFor(LegalizeAction::Custom, Types); + } - /// The instruction is custom when type indexes 0 and 1 is any type pair in the - /// given list. + /// The instruction is custom when type indexes 0 and 1 is any type pair in + /// the given list. LegalizeRuleSet &customFor(std::initializer_list<std::pair<LLT, LLT>> Types) { return actionFor(LegalizeAction::Custom, Types); } + LegalizeRuleSet &customFor(bool Pred, + std::initializer_list<std::pair<LLT, LLT>> Types) { + if (!Pred) + return *this; + return actionFor(LegalizeAction::Custom, Types); + } LegalizeRuleSet &customForCartesianProduct(std::initializer_list<LLT> Types) { return actionForCartesianProduct(LegalizeAction::Custom, Types); @@ -990,6 +1012,11 @@ public: scalarNarrowerThan(TypeIdx, Ty.getSizeInBits()), changeTo(typeIdx(TypeIdx), Ty)); } + LegalizeRuleSet &minScalar(bool Pred, unsigned TypeIdx, const LLT Ty) { + if (!Pred) + return *this; + return minScalar(TypeIdx, Ty); + } /// Ensure the scalar is at least as wide as Ty if condition is met.
LegalizeRuleSet &minScalarIf(LegalityPredicate Predicate, unsigned TypeIdx, diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index a69894839361..773f5c0923e9 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -215,19 +215,10 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .legalFor({s64, v8s16, v16s8, v4s32}) .lower(); - auto &MinMaxActions = getActionDefinitionsBuilder( - {G_SMIN, G_SMAX, G_UMIN, G_UMAX}); - if (HasCSSC) - MinMaxActions - .legalFor({s32, s64, v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) - // Making clamping conditional on CSSC extension as without legal types we - // lower to CMP which can fold one of the two sxtb's we'd otherwise need - // if we detect a type smaller than 32-bit. - .minScalar(0, s32); - else - MinMaxActions - .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}); - MinMaxActions + getActionDefinitionsBuilder({G_SMIN, G_SMAX, G_UMIN, G_UMAX}) + .legalFor({v8s8, v16s8, v4s16, v8s16, v2s32, v4s32}) + .legalFor(HasCSSC, {s32, s64}) + .minScalar(HasCSSC, 0, s32) .clampNumElements(0, v8s8, v16s8) .clampNumElements(0, v4s16, v8s16) .clampNumElements(0, v2s32, v4s32) @@ -247,11 +238,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {G_FADD, G_FSUB, G_FMUL, G_FDIV, G_FMA, G_FSQRT, G_FMAXNUM, G_FMINNUM, G_FMAXIMUM, G_FMINIMUM, G_FCEIL, G_FFLOOR, G_FRINT, G_FNEARBYINT, G_INTRINSIC_TRUNC, G_INTRINSIC_ROUND, G_INTRINSIC_ROUNDEVEN}) - .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64}) - .legalIf([=](const LegalityQuery &Query) { - const auto &Ty = Query.Types[0]; - return (Ty == v8s16 || Ty == v4s16) && HasFP16; - }) + .legalFor({s32, s64, v2s32, v4s32, v2s64}) + .legalFor(HasFP16, {s16, v4s16, v8s16}) .libcallFor({s128}) .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .minScalarOrElt(0, MinFPScalar) @@ -261,11 +249,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .moreElementsToNextPow2(0); getActionDefinitionsBuilder({G_FABS, G_FNEG}) - .legalFor({MinFPScalar, s32, s64, v2s32, v4s32, v2s64}) - .legalIf([=](const LegalityQuery &Query) { - const auto &Ty = Query.Types[0]; - return (Ty == v8s16 || Ty == v4s16) && HasFP16; - }) + .legalFor({s32, s64, v2s32, v4s32, v2s64}) + .legalFor(HasFP16, {s16, v4s16, v8s16}) .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .lowerIf(scalarOrEltWiderThan(0, 64)) .clampNumElements(0, v4s16, v8s16) @@ -350,31 +335,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) return ValTy.isPointerVector() && ValTy.getAddressSpace() == 0; }; - auto &LoadActions = getActionDefinitionsBuilder(G_LOAD); - auto &StoreActions = getActionDefinitionsBuilder(G_STORE); - - if (ST.hasSVE()) { - LoadActions.legalForTypesWithMemDesc({ - // 128 bit base sizes - {nxv16s8, p0, nxv16s8, 8}, - {nxv8s16, p0, nxv8s16, 8}, - {nxv4s32, p0, nxv4s32, 8}, - {nxv2s64, p0, nxv2s64, 8}, - }); - - // TODO: Add nxv2p0. Consider bitcastIf. 
- // See #92130 - // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461 - StoreActions.legalForTypesWithMemDesc({ - // 128 bit base sizes - {nxv16s8, p0, nxv16s8, 8}, - {nxv8s16, p0, nxv8s16, 8}, - {nxv4s32, p0, nxv4s32, 8}, - {nxv2s64, p0, nxv2s64, 8}, - }); - } - - LoadActions + getActionDefinitionsBuilder(G_LOAD) .customIf([=](const LegalityQuery &Query) { return HasRCPC3 && Query.Types[0] == s128 && Query.MMODescrs[0].Ordering == AtomicOrdering::Acquire; @@ -399,6 +360,13 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // These extends are also legal .legalForTypesWithMemDesc( {{s32, p0, s8, 8}, {s32, p0, s16, 8}, {s64, p0, s32, 8}}) + .legalForTypesWithMemDesc({ + // SVE vscale x 128 bit base sizes + {nxv16s8, p0, nxv16s8, 8}, + {nxv8s16, p0, nxv8s16, 8}, + {nxv4s32, p0, nxv4s32, 8}, + {nxv2s64, p0, nxv2s64, 8}, + }) .widenScalarToNextPow2(0, /* MinSize = */ 8) .clampMaxNumElements(0, s8, 16) .clampMaxNumElements(0, s16, 8) @@ -425,7 +393,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .scalarizeIf(typeInSet(0, {v2s16, v2s8}), 0) .scalarizeIf(scalarOrEltWiderThan(0, 64), 0); - StoreActions + getActionDefinitionsBuilder(G_STORE) .customIf([=](const LegalityQuery &Query) { return HasRCPC3 && Query.Types[0] == s128 && Query.MMODescrs[0].Ordering == AtomicOrdering::Release; @@ -445,6 +413,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {p0, p0, s64, 8}, {s128, p0, s128, 8}, {v16s8, p0, s128, 8}, {v8s8, p0, s64, 8}, {v4s16, p0, s64, 8}, {v8s16, p0, s128, 8}, {v2s32, p0, s64, 8}, {v4s32, p0, s128, 8}, {v2s64, p0, s128, 8}}) + .legalForTypesWithMemDesc({ + // SVE vscale x 128 bit base sizes + // TODO: Add nxv2p0. Consider bitcastIf. + // See #92130 + // https://github.com/llvm/llvm-project/pull/92130#discussion_r1616888461 + {nxv16s8, p0, nxv16s8, 8}, + {nxv8s16, p0, nxv8s16, 8}, + {nxv4s32, p0, nxv4s32, 8}, + {nxv2s64, p0, nxv2s64, 8}, + }) .clampScalar(0, s8, s64) .lowerIf([=](const LegalityQuery &Query) { return Query.Types[0].isScalar() && @@ -532,12 +510,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarToNextPow2(0) .clampScalar(0, s8, s64); getActionDefinitionsBuilder(G_FCONSTANT) - .legalIf([=](const LegalityQuery &Query) { - const auto &Ty = Query.Types[0]; - if (HasFP16 && Ty == s16) - return true; - return Ty == s32 || Ty == s64 || Ty == s128; - }) + .legalFor({s32, s64, s128}) + .legalFor(HasFP16, {s16}) .clampScalar(0, MinFPScalar, s128); // FIXME: fix moreElementsToNextPow2 @@ -569,16 +543,12 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .customIf(isVector(0)); getActionDefinitionsBuilder(G_FCMP) - .legalFor({{s32, MinFPScalar}, - {s32, s32}, + .legalFor({{s32, s32}, {s32, s64}, {v4s32, v4s32}, {v2s32, v2s32}, {v2s64, v2s64}}) - .legalIf([=](const LegalityQuery &Query) { - const auto &Ty = Query.Types[1]; - return (Ty == v8s16 || Ty == v4s16) && Ty == Query.Types[0] && HasFP16; - }) + .legalFor(HasFP16, {{s32, s16}, {v4s16, v4s16}, {v8s16, v8s16}}) .widenScalarOrEltToNextPow2(1) .clampScalar(0, s32, s32) .minScalarOrElt(1, MinFPScalar) @@ -693,13 +663,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {v2s64, v2s64}, {v4s32, v4s32}, {v2s32, v2s32}}) - .legalIf([=](const LegalityQuery &Query) { - return HasFP16 && - (Query.Types[1] == s16 || Query.Types[1] == v4s16 || - Query.Types[1] == v8s16) && - (Query.Types[0] == s32 || Query.Types[0] == s64 || - Query.Types[0] == v4s16 || 
Query.Types[0] == v8s16); - }) + .legalFor(HasFP16, + {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}}) .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) // The range of a fp16 value fits into an i17, so we can lower the width @@ -741,13 +706,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {v2s64, v2s64}, {v4s32, v4s32}, {v2s32, v2s32}}) - .legalIf([=](const LegalityQuery &Query) { - return HasFP16 && - (Query.Types[1] == s16 || Query.Types[1] == v4s16 || - Query.Types[1] == v8s16) && - (Query.Types[0] == s32 || Query.Types[0] == s64 || - Query.Types[0] == v4s16 || Query.Types[0] == v8s16); - }) + .legalFor(HasFP16, + {{s32, s16}, {s64, s16}, {v4s16, v4s16}, {v8s16, v8s16}}) // Handle types larger than i64 by scalarizing/lowering. .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) @@ -790,13 +750,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) {v2s64, v2s64}, {v4s32, v4s32}, {v2s32, v2s32}}) - .legalIf([=](const LegalityQuery &Query) { - return HasFP16 && - (Query.Types[0] == s16 || Query.Types[0] == v4s16 || - Query.Types[0] == v8s16) && - (Query.Types[1] == s32 || Query.Types[1] == s64 || - Query.Types[1] == v4s16 || Query.Types[1] == v8s16); - }) + .legalFor(HasFP16, + {{s16, s32}, {s16, s64}, {v4s16, v4s16}, {v8s16, v8s16}}) .scalarizeIf(scalarOrEltWiderThan(1, 64), 1) .scalarizeIf(scalarOrEltWiderThan(0, 64), 0) .moreElementsToNextPow2(1) @@ -1050,12 +1005,8 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .widenScalarToNextPow2(1, /*Min=*/32) .clampScalar(1, s32, s64) .scalarSameSizeAs(0, 1) - .legalIf([=](const LegalityQuery &Query) { - return (HasCSSC && typeInSet(0, {s32, s64})(Query)); - }) - .customIf([=](const LegalityQuery &Query) { - return (!HasCSSC && typeInSet(0, {s32, s64})(Query)); - }); + .legalFor(HasCSSC, {s32, s64}) + .customFor(!HasCSSC, {s32, s64}); getActionDefinitionsBuilder(G_SHUFFLE_VECTOR) .legalIf([=](const LegalityQuery &Query) { @@ -1143,11 +1094,9 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) } // FIXME: Legal vector types are only legal with NEON. - auto &ABSActions = getActionDefinitionsBuilder(G_ABS); - if (HasCSSC) - ABSActions - .legalFor({s32, s64}); - ABSActions.legalFor(PackedVectorAllTypeList) + getActionDefinitionsBuilder(G_ABS) + .legalFor(HasCSSC, {s32, s64}) + .legalFor(PackedVectorAllTypeList) .customIf([=](const LegalityQuery &Q) { // TODO: Fix suboptimal codegen for 128+ bit types. LLT SrcTy = Q.Types[0]; @@ -1171,10 +1120,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) // later. 
getActionDefinitionsBuilder(G_VECREDUCE_FADD) .legalFor({{s32, v2s32}, {s32, v4s32}, {s64, v2s64}}) - .legalIf([=](const LegalityQuery &Query) { - const auto &Ty = Query.Types[1]; - return (Ty == v4s16 || Ty == v8s16) && HasFP16; - }) + .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}}) .minScalarOrElt(0, MinFPScalar) .clampMaxNumElements(1, s64, 2) .clampMaxNumElements(1, s32, 4) @@ -1215,10 +1161,7 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) getActionDefinitionsBuilder({G_VECREDUCE_FMIN, G_VECREDUCE_FMAX, G_VECREDUCE_FMINIMUM, G_VECREDUCE_FMAXIMUM}) .legalFor({{s32, v4s32}, {s32, v2s32}, {s64, v2s64}}) - .legalIf([=](const LegalityQuery &Query) { - const auto &Ty = Query.Types[1]; - return Query.Types[0] == s16 && (Ty == v8s16 || Ty == v4s16) && HasFP16; - }) + .legalFor(HasFP16, {{s16, v4s16}, {s16, v8s16}}) .minScalarOrElt(0, MinFPScalar) .clampMaxNumElements(1, s64, 2) .clampMaxNumElements(1, s32, 4) @@ -1295,32 +1238,16 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .customFor({{s32, s32}, {s64, s64}}); auto always = [=](const LegalityQuery &Q) { return true; }; - auto &CTPOPActions = getActionDefinitionsBuilder(G_CTPOP); - if (HasCSSC) - CTPOPActions - .legalFor({{s32, s32}, - {s64, s64}, - {v8s8, v8s8}, - {v16s8, v16s8}}) - .customFor({{s128, s128}, - {v2s64, v2s64}, - {v2s32, v2s32}, - {v4s32, v4s32}, - {v4s16, v4s16}, - {v8s16, v8s16}}); - else - CTPOPActions - .legalFor({{v8s8, v8s8}, - {v16s8, v16s8}}) - .customFor({{s32, s32}, - {s64, s64}, - {s128, s128}, - {v2s64, v2s64}, - {v2s32, v2s32}, - {v4s32, v4s32}, - {v4s16, v4s16}, - {v8s16, v8s16}}); - CTPOPActions + getActionDefinitionsBuilder(G_CTPOP) + .legalFor(HasCSSC, {{s32, s32}, {s64, s64}}) + .legalFor({{v8s8, v8s8}, {v16s8, v16s8}}) + .customFor(!HasCSSC, {{s32, s32}, {s64, s64}}) + .customFor({{s128, s128}, + {v2s64, v2s64}, + {v2s32, v2s32}, + {v4s32, v4s32}, + {v4s16, v4s16}, + {v8s16, v8s16}}) .clampScalar(0, s32, s128) .widenScalarToNextPow2(0) .minScalarEltSameAsIf(always, 1, 0) diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index a21b786a2bae..073c3cafa062 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -152,12 +152,12 @@ # # DEBUG-NEXT: G_INTRINSIC_TRUNC (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_INTRINSIC_ROUND (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_INTRINSIC_LRINT (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 2, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK @@ -167,8 +167,8 @@ # DEBUG-NEXT: .. 
the first uncovered imm index: 0, OK # DEBUG-NEXT: G_INTRINSIC_ROUNDEVEN (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_READCYCLECOUNTER (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined @@ -310,8 +310,8 @@ # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FCONSTANT (opcode {{[0-9]+}}): 1 type index, 0 imm indices -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_VASTART (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK @@ -459,27 +459,27 @@ # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FADD (opcode {{[0-9]+}}): 1 type index, 0 imm indices -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FSUB (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FMUL (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FMA (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FMAD (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_FDIV (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FREM (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK @@ -565,12 +565,12 @@ # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FMINNUM (opcode {{[0-9]+}}): 1 type index # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FMAXNUM (opcode {{[0-9]+}}): 1 type index # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FMINNUM_IEEE (opcode {{[0-9]+}}): 1 type index # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined @@ -579,12 +579,12 @@ # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_FMINIMUM (opcode {{[0-9]+}}): 1 type index # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FMAXIMUM (opcode {{[0-9]+}}): 1 type index # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_GET_FPENV (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected @@ -692,8 +692,8 @@ # DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FCEIL (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 1, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FCOS (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. the first uncovered type index: 1, OK # DEBUG-NEXT: .. the first uncovered imm index: 0, OK @@ -734,20 +734,20 @@ # DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_FSQRT (opcode {{[0-9]+}}): 1 type index, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. 
imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FFLOOR (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FRINT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_FNEARBYINT (opcode {{[0-9]+}}): 1 type index, 0 imm indices
# DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}}
-# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected
-# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected
+# DEBUG-NEXT: .. the first uncovered type index: 1, OK
+# DEBUG-NEXT: .. the first uncovered imm index: 0, OK
# DEBUG-NEXT: G_ADDRSPACE_CAST (opcode {{[0-9]+}}): 2 type indices, 0 imm indices
# DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined
# DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined
--
GitLab


From 621fcf892bcd3c2d81e25c6ee39ca32db3c6b05a Mon Sep 17 00:00:00 2001
From: Tom Eccles
Date: Wed, 16 Oct 2024 14:43:57 +0100
Subject: [PATCH 133/329] [mlir][OpenMP] rewrite conversion of privatisation
 for omp.parallel (#111844)

The existing conversion inlined private alloc regions and firstprivate
copy regions in mlir, then undid the modification of the mlir module
before completing the conversion. To make this work, LLVM IR had to be
generated using the wrong mapping for privatised values and then later
fixed inside of OpenMPIRBuilder. This approach violated an assumption in
OpenMPIRBuilder that private variables would be values, not constants.
Flang sometimes generates code where private variables are promoted to
globals, the address of which is treated as a constant in LLVM IR. This
prevented the incorrect values for the private variable from being
replaced by OpenMPIRBuilder, ultimately resulting in programs producing
incorrect results.

This patch rewrites delayed privatisation for omp.parallel to work more
similarly to reductions: translating directly into LLVMIR with correct
mappings for private variables.

RFC: https://discourse.llvm.org/t/rfc-openmp-fix-issue-in-mlir-to-llvmir-translation-for-delayed-privatisation/81225

Tested against the gfortran testsuite and our internal test suite.
Linaro's post-commit bots will check against the fujitsu test suite.

I decided to add the new tests as flang integration tests rather than in
mlir/test/Target/LLVMIR:
- The regression test is for an issue filed against flang. I wanted to
  keep the reproducer similar to the code in the ticket.
- I found the "worst case" CFG test difficult to reason about in the
  abstract; it helped me to think about what was going on in terms of a
  Fortran program.
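As a minimal illustration of the constant-promotion failure mode described
above (the helper below is hypothetical and not part of this patch): when
flang promotes a privatised variable to a module-level global, the value
looked up during translation is an LLVM Constant, and constant uses are
uniqued module-wide, so they cannot simply be rewritten after outlining
the way ordinary instruction results can.

    #include "llvm/IR/Constants.h"

    // Hypothetical check, not from this patch: detects the case that broke
    // the old fix-up-afterwards scheme, e.g. an llvm::GlobalVariable or a
    // constant GEP folded from one.
    static bool isConstantBackedPrivateVar(llvm::Value *V) {
      return llvm::isa<llvm::Constant>(V);
    }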
Fixes #106297 --- .../parallel-private-reduction-worstcase.f90 | 262 +++++++++++ .../Integration/OpenMP/private-global.f90 | 46 ++ .../OpenMP/OpenMPToLLVMIRTranslation.cpp | 414 +++++++----------- .../Target/LLVMIR/openmp-firstprivate.mlir | 25 +- mlir/test/Target/LLVMIR/openmp-private.mlir | 6 +- 5 files changed, 490 insertions(+), 263 deletions(-) create mode 100644 flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 create mode 100644 flang/test/Integration/OpenMP/private-global.f90 diff --git a/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 new file mode 100644 index 000000000000..3aa5d0424639 --- /dev/null +++ b/flang/test/Integration/OpenMP/parallel-private-reduction-worstcase.f90 @@ -0,0 +1,262 @@ +! RUN: %flang_fc1 -fopenmp -emit-llvm %s -o - | FileCheck %s + +! Combinational testing of control flow graph and builder insertion points +! in mlir-to-llvm conversion: +! - mixing multiple delayed privatizations and multiple reductions +! - multiple blocks in the private alloc region +! - private alloc region has to read from the mold variable +! - firstprivate +! - multiple blocks in the private copy region +! - multiple blocks in the reduction init region +! - reduction init region has to read from the mold variable +! - re-used omp.private ops +! - re-used omp.reduction.declare ops +! - unstructured code inside of the parallel region +! - needs private dealloc region, and this has multiple blocks +! - needs reduction cleanup region, and this has multiple blocks + +! This maybe belongs in the mlir tests, but what we are doing here is complex +! enough that I find the kind of minimised mlir code preferred by mlir reviewers +! hard to read without some fortran here for reference. Nothing like this would +! be generated by other upstream users of the MLIR OpenMP dialect. + +subroutine worst_case(a, b, c, d) + real, allocatable :: a(:), b(:), c(:), d(:) + integer i + + !$omp parallel firstprivate(a,b) reduction(+:c,d) + if (sum(a) == 1) stop 1 + !$omp end parallel +end subroutine + +! CHECK-LABEL: define internal void @worst_case_..omp_par +! CHECK-NEXT: omp.par.entry: +! [reduction alloc regions inlined here] +! CHECK: br label %omp.private.latealloc + +! CHECK: omp.private.latealloc: ; preds = %omp.par.entry +! CHECK-NEXT: br label %omp.private.alloc5 + +! CHECK: omp.private.alloc5: ; preds = %omp.private.latealloc +! [begin private alloc for first var] +! [read the length from the mold argument] +! [if it is non-zero...] +! CHECK: br i1 {{.*}}, label %omp.private.alloc6, label %omp.private.alloc7 + +! CHECK: omp.private.alloc7: ; preds = %omp.private.alloc5 +! [finish private alloc for first var with zero extent] +! CHECK: br label %omp.private.alloc8 + +! CHECK: omp.private.alloc8: ; preds = %omp.private.alloc6, %omp.private.alloc7 +! CHECK-NEXT: br label %omp.region.cont4 + +! CHECK: omp.region.cont4: ; preds = %omp.private.alloc8 +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: br label %omp.private.alloc + +! CHECK: omp.private.alloc: ; preds = %omp.region.cont4 +! [begin private alloc for first var] +! [read the length from the mold argument] +! [if it is non-zero...] +! CHECK: br i1 %{{.*}}, label %omp.private.alloc1, label %omp.private.alloc2 + +! CHECK: omp.private.alloc2: ; preds = %omp.private.alloc +! [finish private alloc for second var with zero extent] +! CHECK: br label %omp.private.alloc3 + +! 
CHECK: omp.private.alloc3: ; preds = %omp.private.alloc1, %omp.private.alloc2 +! CHECK-NEXT: br label %omp.region.cont + +! CHECK: omp.region.cont: ; preds = %omp.private.alloc3 +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: br label %omp.private.copy + +! CHECK: omp.private.copy: ; preds = %omp.region.cont +! CHECK-NEXT: br label %omp.private.copy10 + +! CHECK: omp.private.copy10: ; preds = %omp.private.copy +! [begin firstprivate copy for first var] +! [read the length, is it non-zero?] +! CHECK: br i1 %{{.*}}, label %omp.private.copy11, label %omp.private.copy12 + +! CHECK: omp.private.copy12: ; preds = %omp.private.copy11, %omp.private.copy10 +! CHECK-NEXT: br label %omp.region.cont9 + +! CHECK: omp.region.cont9: ; preds = %omp.private.copy12 +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: br label %omp.private.copy14 + +! CHECK: omp.private.copy14: ; preds = %omp.region.cont9 +! [begin firstprivate copy for second var] +! [read the length, is it non-zero?] +! CHECK: br i1 %{{.*}}, label %omp.private.copy15, label %omp.private.copy16 + +! CHECK: omp.private.copy16: ; preds = %omp.private.copy15, %omp.private.copy14 +! CHECK-NEXT: br label %omp.region.cont13 + +! CHECK: omp.region.cont13: ; preds = %omp.private.copy16 +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: br label %omp.reduction.init + +! CHECK: omp.reduction.init: ; preds = %omp.region.cont13 +! [deffered stores for results of reduction alloc regions] +! CHECK: br label %[[VAL_96:.*]] + +! CHECK: omp.reduction.neutral: ; preds = %omp.reduction.init +! [start of reduction initialization region] +! [null check:] +! CHECK: br i1 %{{.*}}, label %omp.reduction.neutral18, label %omp.reduction.neutral19 + +! CHECK: omp.reduction.neutral19: ; preds = %omp.reduction.neutral +! [malloc and assign the default value to the reduction variable] +! CHECK: br label %omp.reduction.neutral20 + +! CHECK: omp.reduction.neutral20: ; preds = %omp.reduction.neutral18, %omp.reduction.neutral19 +! CHECK-NEXT: br label %omp.region.cont17 + +! CHECK: omp.region.cont17: ; preds = %omp.reduction.neutral20 +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: br label %omp.reduction.neutral22 + +! CHECK: omp.reduction.neutral22: ; preds = %omp.region.cont17 +! [start of reduction initialization region] +! [null check:] +! CHECK: br i1 %{{.*}}, label %omp.reduction.neutral23, label %omp.reduction.neutral24 + +! CHECK: omp.reduction.neutral24: ; preds = %omp.reduction.neutral22 +! [malloc and assign the default value to the reduction variable] +! CHECK: br label %omp.reduction.neutral25 + +! CHECK: omp.reduction.neutral25: ; preds = %omp.reduction.neutral23, %omp.reduction.neutral24 +! CHECK-NEXT: br label %omp.region.cont21 + +! CHECK: omp.region.cont21: ; preds = %omp.reduction.neutral25 +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: br label %omp.par.region + +! CHECK: omp.par.region: ; preds = %omp.region.cont21 +! CHECK-NEXT: br label %omp.par.region27 + +! CHECK: omp.par.region27: ; preds = %omp.par.region +! [call SUM runtime function] +! [if (sum(a) == 1)] +! CHECK: br i1 %{{.*}}, label %omp.par.region28, label %omp.par.region29 + +! CHECK: omp.par.region29: ; preds = %omp.par.region27 +! CHECK-NEXT: br label %omp.region.cont26 + +! CHECK: omp.region.cont26: ; preds = %omp.par.region28, %omp.par.region29 +! [omp parallel region done, call into the runtime to complete reduction] +! CHECK: %[[VAL_233:.*]] = call i32 @__kmpc_reduce( +! CHECK: switch i32 %[[VAL_233]], label %reduce.finalize [ +! CHECK-NEXT: i32 1, label %reduce.switch.nonatomic +! 
CHECK-NEXT: i32 2, label %reduce.switch.atomic +! CHECK-NEXT: ] + +! CHECK: reduce.switch.atomic: ; preds = %omp.region.cont26 +! CHECK-NEXT: unreachable + +! CHECK: reduce.switch.nonatomic: ; preds = %omp.region.cont26 +! CHECK-NEXT: %[[red_private_value_0:.*]] = load ptr, ptr %{{.*}}, align 8 +! CHECK-NEXT: br label %omp.reduction.nonatomic.body + +! [various blocks implementing the reduction] + +! CHECK: omp.region.cont35: ; preds = +! CHECK-NEXT: %{{.*}} = phi ptr +! CHECK-NEXT: call void @__kmpc_end_reduce( +! CHECK-NEXT: br label %reduce.finalize + +! CHECK: reduce.finalize: ; preds = +! CHECK-NEXT: br label %omp.par.pre_finalize + +! CHECK: omp.par.pre_finalize: ; preds = %reduce.finalize +! CHECK-NEXT: %{{.*}} = load ptr, ptr +! CHECK-NEXT: br label %omp.reduction.cleanup + +! CHECK: omp.reduction.cleanup: ; preds = %omp.par.pre_finalize +! [null check] +! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup41, label %omp.reduction.cleanup42 + +! CHECK: omp.reduction.cleanup42: ; preds = %omp.reduction.cleanup41, %omp.reduction.cleanup +! CHECK-NEXT: br label %omp.region.cont40 + +! CHECK: omp.region.cont40: ; preds = %omp.reduction.cleanup42 +! CHECK-NEXT: %{{.*}} = load ptr, ptr +! CHECK-NEXT: br label %omp.reduction.cleanup44 + +! CHECK: omp.reduction.cleanup44: ; preds = %omp.region.cont40 +! [null check] +! CHECK: br i1 %{{.*}}, label %omp.reduction.cleanup45, label %omp.reduction.cleanup46 + +! CHECK: omp.reduction.cleanup46: ; preds = %omp.reduction.cleanup45, %omp.reduction.cleanup44 +! CHECK-NEXT: br label %omp.region.cont43 + +! CHECK: omp.region.cont43: ; preds = %omp.reduction.cleanup46 +! CHECK-NEXT: br label %omp.private.dealloc + +! CHECK: omp.private.dealloc: ; preds = %omp.region.cont43 +! [null check] +! CHECK: br i1 %{{.*}}, label %omp.private.dealloc48, label %omp.private.dealloc49 + +! CHECK: omp.private.dealloc49: ; preds = %omp.private.dealloc48, %omp.private.dealloc +! CHECK-NEXT: br label %omp.region.cont47 + +! CHECK: omp.region.cont47: ; preds = %omp.private.dealloc49 +! CHECK-NEXT: br label %omp.private.dealloc51 + +! CHECK: omp.private.dealloc51: ; preds = %omp.region.cont47 +! [null check] +! CHECK: br i1 %{{.*}}, label %omp.private.dealloc52, label %omp.private.dealloc53 + +! CHECK: omp.private.dealloc53: ; preds = %omp.private.dealloc52, %omp.private.dealloc51 +! CHECK-NEXT: br label %omp.region.cont50 + +! CHECK: omp.region.cont50: ; preds = %omp.private.dealloc53 +! CHECK-NEXT: br label %omp.par.outlined.exit.exitStub + +! CHECK: omp.private.dealloc52: ; preds = %omp.private.dealloc51 +! [dealloc memory] +! CHECK: br label %omp.private.dealloc53 + +! CHECK: omp.private.dealloc48: ; preds = %omp.private.dealloc +! [dealloc memory] +! CHECK: br label %omp.private.dealloc49 + +! CHECK: omp.reduction.cleanup45: ; preds = %omp.reduction.cleanup44 +! CHECK-NEXT: call void @free( +! CHECK-NEXT: br label %omp.reduction.cleanup46 + +! CHECK: omp.reduction.cleanup41: ; preds = %omp.reduction.cleanup +! CHECK-NEXT: call void @free( +! CHECK-NEXT: br label %omp.reduction.cleanup42 + +! CHECK: omp.par.region28: ; preds = %omp.par.region27 +! CHECK-NEXT: call {} @_FortranAStopStatement + +! CHECK: omp.reduction.neutral23: ; preds = %omp.reduction.neutral22 +! [source length was zero: finish initializing array] +! CHECK: br label %omp.reduction.neutral25 + +! CHECK: omp.reduction.neutral18: ; preds = %omp.reduction.neutral +! [source length was zero: finish initializing array] +! CHECK: br label %omp.reduction.neutral20 + +! 
CHECK: omp.private.copy15: ; preds = %omp.private.copy14 +! [source length was non-zero: call assign runtime] +! CHECK: br label %omp.private.copy16 + +! CHECK: omp.private.copy11: ; preds = %omp.private.copy10 +! [source length was non-zero: call assign runtime] +! CHECK: br label %omp.private.copy12 + +! CHECK: omp.private.alloc1: ; preds = %omp.private.alloc +! [var extent was non-zero: malloc a private array] +! CHECK: br label %omp.private.alloc3 + +! CHECK: omp.private.alloc6: ; preds = %omp.private.alloc5 +! [var extent was non-zero: malloc a private array] +! CHECK: br label %omp.private.alloc8 + +! CHECK: omp.par.outlined.exit.exitStub: ; preds = %omp.region.cont50 +! CHECK-NEXT: ret void diff --git a/flang/test/Integration/OpenMP/private-global.f90 b/flang/test/Integration/OpenMP/private-global.f90 new file mode 100644 index 000000000000..62d0a3faf0c5 --- /dev/null +++ b/flang/test/Integration/OpenMP/private-global.f90 @@ -0,0 +1,46 @@ +!RUN: %flang_fc1 -emit-llvm -fopenmp %s -o - | FileCheck %s + +! Regression test for https://github.com/llvm/llvm-project/issues/106297 + +program bug + implicit none + integer :: table(10) + !$OMP PARALLEL PRIVATE(table) + table = 50 + if (any(table/=50)) then + stop 'fail 3' + end if + !$OMP END PARALLEL + print *,'ok' +End Program + + +! CHECK-LABEL: define internal void {{.*}}..omp_par( +! CHECK: omp.par.entry: +! CHECK: %[[VAL_9:.*]] = alloca i32, align 4 +! CHECK: %[[VAL_10:.*]] = load i32, ptr %[[VAL_11:.*]], align 4 +! CHECK: store i32 %[[VAL_10]], ptr %[[VAL_9]], align 4 +! CHECK: %[[VAL_12:.*]] = load i32, ptr %[[VAL_9]], align 4 +! CHECK: %[[PRIV_TABLE:.*]] = alloca [10 x i32], i64 1, align 4 +! ... +! check that we use the private copy of table for the assignment +! CHECK: omp.par.region1: +! CHECK: %[[ELEMENTAL_TMP:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 +! CHECK: %[[TABLE_BOX_ADDR:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, align 8 +! CHECK: %[[BOXED_FIFTY:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8 }, align 8 +! CHECK: %[[TABLE_BOX_ADDR2:.*]] = alloca { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, i64 1, align 8 +! CHECK: %[[TABLE_BOX_VAL:.*]] = insertvalue { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } { ptr undef, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64), i32 20240719, i8 1, i8 9, i8 0, i8 0, [1 x [3 x i64]] {{\[\[}}3 x i64] [i64 1, i64 10, i64 ptrtoint (ptr getelementptr (i32, ptr null, i32 1) to i64)]] }, ptr %[[PRIV_TABLE]], 0 +! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL]], ptr %[[TABLE_BOX_ADDR]], align 8 +! CHECK: %[[TABLE_BOX_VAL2:.*]] = load { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] }, ptr %[[TABLE_BOX_ADDR]], align 8 +! CHECK: store { ptr, i64, i32, i8, i8, i8, i8, [1 x [3 x i64]] } %[[TABLE_BOX_VAL2]], ptr %[[TABLE_BOX_ADDR2]], align 8 +! CHECK: %[[VAL_26:.*]] = call {} @_FortranAAssign(ptr %[[TABLE_BOX_ADDR2]], ptr %[[BOXED_FIFTY]], ptr @{{.*}}, i32 9) +! ... +! check that we use the private copy of table for table/=50 +! CHECK: omp.par.region3: +! CHECK: %[[VAL_44:.*]] = sub nsw i64 %{{.*}}, 1 +! CHECK: %[[VAL_45:.*]] = mul nsw i64 %[[VAL_44]], 1 +! CHECK: %[[VAL_46:.*]] = mul nsw i64 %[[VAL_45]], 1 +! CHECK: %[[VAL_47:.*]] = add nsw i64 %[[VAL_46]], 0 +! CHECK: %[[VAL_48:.*]] = getelementptr i32, ptr %[[PRIV_TABLE]], i64 %[[VAL_47]] +! CHECK: %[[VAL_49:.*]] = load i32, ptr %[[VAL_48]], align 4 +! 
CHECK: %[[VAL_50:.*]] = icmp ne i32 %[[VAL_49]], 50 diff --git a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp index cb7dd3cd874d..7c45e89cd8ac 100644 --- a/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp +++ b/mlir/lib/Target/LLVMIR/Dialect/OpenMP/OpenMPToLLVMIRTranslation.cpp @@ -371,20 +371,46 @@ convertOmpCritical(Operation &opInst, llvm::IRBuilderBase &builder, return success(); } -/// Populates `reductions` with reduction declarations used in the given loop. +/// Looks up from the operation from and returns the PrivateClauseOp with +/// name symbolName +static omp::PrivateClauseOp findPrivatizer(Operation *from, + SymbolRefAttr symbolName) { + omp::PrivateClauseOp privatizer = + SymbolTable::lookupNearestSymbolFrom(from, + symbolName); + assert(privatizer && "privatizer not found in the symbol table"); + return privatizer; +} + +/// Populates `privatizations` with privatization declarations used for the +/// given op. +/// TODO: generalise beyond ParallelOp +static void collectPrivatizationDecls( + omp::ParallelOp op, SmallVectorImpl &privatizations) { + std::optional attr = op.getPrivateSyms(); + if (!attr) + return; + + privatizations.reserve(privatizations.size() + attr->size()); + for (auto symbolRef : attr->getAsRange()) { + privatizations.push_back(findPrivatizer(op, symbolRef)); + } +} + +/// Populates `reductions` with reduction declarations used in the given op. template static void -collectReductionDecls(T loop, +collectReductionDecls(T op, SmallVectorImpl &reductions) { - std::optional attr = loop.getReductionSyms(); + std::optional attr = op.getReductionSyms(); if (!attr) return; - reductions.reserve(reductions.size() + loop.getNumReductionVars()); + reductions.reserve(reductions.size() + op.getNumReductionVars()); for (auto symbolRef : attr->getAsRange()) { reductions.push_back( SymbolTable::lookupNearestSymbolFrom( - loop, symbolRef)); + op, symbolRef)); } } @@ -609,7 +635,7 @@ static LogicalResult allocReductionVars(T loop, ArrayRef reductionArgs, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation, - llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, + const llvm::OpenMPIRBuilder::InsertPointTy &allocaIP, SmallVectorImpl &reductionDecls, SmallVectorImpl &privateReductionVariables, DenseMap &reductionVariableMap, @@ -1317,76 +1343,11 @@ convertOmpWsloop(Operation &opInst, llvm::IRBuilderBase &builder, privateReductionVariables, isByRef); } -/// A RAII class that on construction replaces the region arguments of the -/// parallel op (which correspond to private variables) with the actual private -/// variables they correspond to. This prepares the parallel op so that it -/// matches what is expected by the OMPIRBuilder. -/// -/// On destruction, it restores the original state of the operation so that on -/// the MLIR side, the op is not affected by conversion to LLVM IR. 
-class OmpParallelOpConversionManager { -public: - OmpParallelOpConversionManager(omp::ParallelOp opInst) - : region(opInst.getRegion()), - privateBlockArgs(cast(*opInst) - .getPrivateBlockArgs()), - privateVars(opInst.getPrivateVars()) { - for (auto [blockArg, var] : llvm::zip_equal(privateBlockArgs, privateVars)) - mlir::replaceAllUsesInRegionWith(blockArg, var, region); - } - - ~OmpParallelOpConversionManager() { - for (auto [blockArg, var] : llvm::zip_equal(privateBlockArgs, privateVars)) - mlir::replaceAllUsesInRegionWith(var, blockArg, region); - } - -private: - Region ®ion; - llvm::MutableArrayRef privateBlockArgs; - OperandRange privateVars; -}; - -// Looks up from the operation from and returns the PrivateClauseOp with -// name symbolName -static omp::PrivateClauseOp findPrivatizer(Operation *from, - SymbolRefAttr symbolName) { - omp::PrivateClauseOp privatizer = - SymbolTable::lookupNearestSymbolFrom(from, - symbolName); - assert(privatizer && "privatizer not found in the symbol table"); - return privatizer; -} -// clones the given privatizer. The original privatizer is used as -// the insert point for the clone. -static omp::PrivateClauseOp -clonePrivatizer(LLVM::ModuleTranslation &moduleTranslation, - omp::PrivateClauseOp privatizer, Operation *fromOperation) { - MLIRContext &context = moduleTranslation.getContext(); - mlir::IRRewriter opCloner(&context); - opCloner.setInsertionPoint(privatizer); - auto clone = - llvm::cast(opCloner.clone(*privatizer)); - - // Unique the clone name to avoid clashes in the symbol table. - unsigned counter = 0; - SmallString<256> cloneName = SymbolTable::generateSymbolName<256>( - privatizer.getSymName(), - [&](llvm::StringRef candidate) { - return SymbolTable::lookupNearestSymbolFrom( - fromOperation, StringAttr::get(&context, candidate)) != - nullptr; - }, - counter); - - clone.setSymName(cloneName); - return clone; -} /// Converts the OpenMP parallel operation to LLVM IR. 
static LogicalResult convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, LLVM::ModuleTranslation &moduleTranslation) { using InsertPointTy = llvm::OpenMPIRBuilder::InsertPointTy; - OmpParallelOpConversionManager raii(opInst); ArrayRef isByRef = getIsByRef(opInst.getReductionByref()); assert(isByRef.size() == opInst.getNumReductionVars()); @@ -1395,6 +1356,15 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, LogicalResult bodyGenStatus = success(); llvm::OpenMPIRBuilder *ompBuilder = moduleTranslation.getOpenMPBuilder(); + // Collect delayed privatization declarations + MutableArrayRef privateBlockArgs = + cast(*opInst).getPrivateBlockArgs(); + SmallVector llvmPrivateVars; + SmallVector privateDecls; + llvmPrivateVars.reserve(privateBlockArgs.size()); + privateDecls.reserve(privateBlockArgs.size()); + collectPrivatizationDecls(opInst, privateDecls); + // Collect reduction declarations SmallVector reductionDecls; collectReductionDecls(opInst, reductionDecls); @@ -1403,6 +1373,66 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, SmallVector deferredStores; auto bodyGenCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP) { + // Allocate private vars + llvm::BranchInst *allocaTerminator = + llvm::cast(allocaIP.getBlock()->getTerminator()); + builder.SetInsertPoint(allocaTerminator); + assert(allocaTerminator->getNumSuccessors() == 1 && + "This is an unconditional branch created by OpenMPIRBuilder"); + llvm::BasicBlock *afterAllocas = allocaTerminator->getSuccessor(0); + + // FIXME: Some of the allocation regions do more than just allocating. + // They read from their block argument (amongst other non-alloca things). + // When OpenMPIRBuilder outlines the parallel region into a different + // function it places the loads for live in-values (such as these block + // arguments) at the end of the entry block (because the entry block is + // assumed to contain only allocas). Therefore, if we put these complicated + // alloc blocks in the entry block, these will not dominate the availability + // of the live-in values they are using. Fix this by adding a latealloc + // block after the entry block to put these in (this also helps to avoid + // mixing non-alloca code with allocas). + // Alloc regions which do not use the block argument can still be placed in + // the entry block (therefore keeping the allocas together). + llvm::BasicBlock *privAllocBlock = nullptr; + if (!privateBlockArgs.empty()) + privAllocBlock = splitBB(builder, true, "omp.private.latealloc"); + for (unsigned i = 0; i < privateBlockArgs.size(); ++i) { + Region &allocRegion = privateDecls[i].getAllocRegion(); + + // map allocation region block argument + llvm::Value *nonPrivateVar = + moduleTranslation.lookupValue(opInst.getPrivateVars()[i]); + assert(nonPrivateVar); + moduleTranslation.mapValue(privateDecls[i].getAllocMoldArg(), + nonPrivateVar); + + // in-place convert the private allocation region + SmallVector phis; + if (privateDecls[i].getAllocMoldArg().getUses().empty()) { + // TODO this should use + // allocaIP.getBlock()->getFirstNonPHIOrDbgOrAlloca() so it goes before + // the code for fetching the thread id. Not doing this for now to avoid + // test churn. 
+ builder.SetInsertPoint(allocaIP.getBlock()->getTerminator()); + } else { + builder.SetInsertPoint(privAllocBlock->getTerminator()); + } + if (failed(inlineConvertOmpRegions(allocRegion, "omp.private.alloc", + builder, moduleTranslation, &phis))) { + bodyGenStatus = failure(); + return; + } + assert(phis.size() == 1 && "expected one allocation to be yielded"); + + moduleTranslation.mapValue(privateBlockArgs[i], phis[0]); + llvmPrivateVars.push_back(phis[0]); + + // clear alloc region block argument mapping in case it needs to be + // re-created with a different source for another use of the same + // reduction decl + moduleTranslation.forgetMapping(allocRegion); + } + // Allocate reduction vars DenseMap reductionVariableMap; @@ -1419,12 +1449,64 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, deferredStores, isByRef))) bodyGenStatus = failure(); + // Apply copy region for firstprivate. + bool needsFirstprivate = + llvm::any_of(privateDecls, [](omp::PrivateClauseOp &privOp) { + return privOp.getDataSharingType() == + omp::DataSharingClauseType::FirstPrivate; + }); + if (needsFirstprivate) { + // Find the end of the allocation blocks + assert(afterAllocas->getSinglePredecessor()); + builder.SetInsertPoint( + afterAllocas->getSinglePredecessor()->getTerminator()); + llvm::BasicBlock *copyBlock = + splitBB(builder, /*CreateBranch=*/true, "omp.private.copy"); + builder.SetInsertPoint(copyBlock->getFirstNonPHIOrDbgOrAlloca()); + } + for (unsigned i = 0; i < privateBlockArgs.size(); ++i) { + if (privateDecls[i].getDataSharingType() != + omp::DataSharingClauseType::FirstPrivate) + continue; + + // copyRegion implements `lhs = rhs` + Region ©Region = privateDecls[i].getCopyRegion(); + + // map copyRegion rhs arg + llvm::Value *nonPrivateVar = + moduleTranslation.lookupValue(opInst.getPrivateVars()[i]); + assert(nonPrivateVar); + moduleTranslation.mapValue(privateDecls[i].getCopyMoldArg(), + nonPrivateVar); + + // map copyRegion lhs arg + moduleTranslation.mapValue(privateDecls[i].getCopyPrivateArg(), + llvmPrivateVars[i]); + + // in-place convert copy region + builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator()); + if (failed(inlineConvertOmpRegions(copyRegion, "omp.private.copy", + builder, moduleTranslation))) { + bodyGenStatus = failure(); + return; + } + + // ignore unused value yielded from copy region + + // clear copy region block argument mapping in case it needs to be + // re-created with different sources for reuse of the same reduction + // decl + moduleTranslation.forgetMapping(copyRegion); + } + // Initialize reduction vars - builder.restoreIP(allocaIP); + builder.SetInsertPoint(builder.GetInsertBlock()->getTerminator()); llvm::BasicBlock *initBlock = splitBB(builder, true, "omp.reduction.init"); allocaIP = InsertPointTy(allocaIP.getBlock(), allocaIP.getBlock()->getTerminator()->getIterator()); + + builder.restoreIP(allocaIP); SmallVector byRefVars(opInst.getNumReductionVars()); for (unsigned i = 0; i < opInst.getNumReductionVars(); ++i) { if (isByRef[i]) { @@ -1534,183 +1616,11 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, } }; - SmallVector mlirPrivatizerClones; - SmallVector llvmPrivateVars; - - // TODO: Perform appropriate actions according to the data-sharing - // attribute (shared, private, firstprivate, ...) of variables. - // Currently shared and private are supported. 
- auto privCB = [&](InsertPointTy allocaIP, InsertPointTy codeGenIP, - llvm::Value &, llvm::Value &llvmOmpRegionInput, - llvm::Value *&llvmReplacementValue) -> InsertPointTy { - llvmReplacementValue = &llvmOmpRegionInput; - - // If this is a private value, this lambda will return the corresponding - // mlir value and its `PrivateClauseOp`. Otherwise, empty values are - // returned. - auto [mlirPrivVar, mlirPrivatizerClone] = - [&]() -> std::pair { - if (!opInst.getPrivateVars().empty()) { - auto mlirPrivVars = opInst.getPrivateVars(); - auto mlirPrivSyms = opInst.getPrivateSyms(); - - // Try to find a privatizer that corresponds to the LLVM value being - // privatized. - for (auto [mlirPrivVar, mlirPrivatizerAttr] : - llvm::zip_equal(mlirPrivVars, *mlirPrivSyms)) { - // Find the MLIR private variable corresponding to the LLVM value - // being privatized. - llvm::Value *mlirToLLVMPrivVar = - moduleTranslation.lookupValue(mlirPrivVar); - - // Check if the LLVM value being privatized matches the LLVM value - // mapped to privVar. In some cases, this is not trivial ... - auto isMatch = [&]() { - if (mlirToLLVMPrivVar == nullptr) - return false; - - // If both values are trivially equal, we found a match. - if (mlirToLLVMPrivVar == &llvmOmpRegionInput) - return true; - - // Otherwise, we check if both llvmOmpRegionInputPtr and - // mlirToLLVMPrivVar refer to the same memory (through a load/store - // pair). This happens if a struct (i.e. multi-field value) is being - // privatized. - // - // For example, if the privatized value is defined by: - // ``` - // %priv_val = alloca { ptr, i64 }, align 8 - // ``` - // - // The initialization of this value (outside the omp region) will be - // something like this: - // - // clang-format off - // ``` - // %partially_init_priv_val = insertvalue { ptr, i64 } undef, - // ptr %some_ptr, 0 - // %fully_init_priv_val = insertvalue { ptr, i64 } %partially_init_priv_val, - // i64 %some_i64, 1 - // ... - // store { ptr, i64 } %fully_init_priv_val, ptr %priv_val, align 8 - // ``` - // clang-format on - // - // Now, we enter the OMP region, in order to access this privatized - // value, we need to load from the allocated memory: - // ``` - // omp.par.entry: - // %priv_val_load = load { ptr, i64 }, ptr %priv_val, align 8 - // ``` - // - // The 2 LLVM values tracked here map as follows: - // - `mlirToLLVMPrivVar` -> `%fully_init_priv_val` - // - `llvmOmpRegionInputPtr` -> `%priv_val_load` - // - // Even though they eventually refer to the same memory reference - // (the memory being privatized), they are 2 distinct LLVM values. - // Therefore, we need to discover their correspondence by finding - // out if they store into and load from the same mem ref. - auto *llvmOmpRegionInputPtrLoad = - llvm::dyn_cast_if_present(&llvmOmpRegionInput); - - if (llvmOmpRegionInputPtrLoad == nullptr) - return false; - - for (auto &use : mlirToLLVMPrivVar->uses()) { - auto *mlirToLLVMPrivVarStore = - llvm::dyn_cast_if_present(use.getUser()); - if (mlirToLLVMPrivVarStore && - (llvmOmpRegionInputPtrLoad->getPointerOperand() == - mlirToLLVMPrivVarStore->getPointerOperand())) - return true; - } - - return false; - }; - - if (!isMatch()) - continue; - - SymbolRefAttr privSym = llvm::cast(mlirPrivatizerAttr); - omp::PrivateClauseOp privatizer = findPrivatizer(opInst, privSym); - - // Clone the privatizer in case it is used by more than one parallel - // region. 
The privatizer is processed in-place (see below) before it - // gets inlined in the parallel region and therefore processing the - // original op is dangerous. - return {mlirPrivVar, - clonePrivatizer(moduleTranslation, privatizer, opInst)}; - } - } - - return {mlir::Value(), omp::PrivateClauseOp()}; - }(); - - if (mlirPrivVar) { - Region &allocRegion = mlirPrivatizerClone.getAllocRegion(); - - // If this is a `firstprivate` clause, prepare the `omp.private` op by: - if (mlirPrivatizerClone.getDataSharingType() == - omp::DataSharingClauseType::FirstPrivate) { - auto oldAllocBackBlock = std::prev(allocRegion.end()); - omp::YieldOp oldAllocYieldOp = - llvm::cast(oldAllocBackBlock->getTerminator()); - - Region ©Region = mlirPrivatizerClone.getCopyRegion(); - - mlir::IRRewriter copyCloneBuilder(&moduleTranslation.getContext()); - // 1. Cloning the `copy` region to the end of the `alloc` region. - copyCloneBuilder.cloneRegionBefore(copyRegion, allocRegion, - allocRegion.end()); - - auto newCopyRegionFrontBlock = std::next(oldAllocBackBlock); - // 2. Merging the last `alloc` block with the first block in the `copy` - // region clone. - // 3. Re-mapping the first argument of the `copy` region to be the - // argument of the `alloc` region and the second argument of the `copy` - // region to be the yielded value of the `alloc` region (this is the - // private clone of the privatized value). - copyCloneBuilder.mergeBlocks(&*newCopyRegionFrontBlock, - &*oldAllocBackBlock, - {mlirPrivatizerClone.getAllocMoldArg(), - oldAllocYieldOp.getOperand(0)}); - - // 4. The old terminator of the `alloc` region is not needed anymore, so - // delete it. - oldAllocYieldOp.erase(); - } - - // Replace the privatizer block argument with mlir value being privatized. - // This way, the body of the privatizer will be changed from using the - // region/block argument to the value being privatized. - replaceAllUsesInRegionWith(mlirPrivatizerClone.getAllocMoldArg(), - mlirPrivVar, allocRegion); - - auto oldIP = builder.saveIP(); - builder.restoreIP(allocaIP); - - SmallVector yieldedValues; - if (failed(inlineConvertOmpRegions(allocRegion, "omp.privatizer", builder, - moduleTranslation, &yieldedValues))) { - opInst.emitError("failed to inline `alloc` region of an `omp.private` " - "op in the parallel region"); - bodyGenStatus = failure(); - mlirPrivatizerClone.erase(); - } else { - assert(yieldedValues.size() == 1); - llvmReplacementValue = yieldedValues.front(); - - // Keep the LLVM replacement value and the op clone in case we need to - // emit cleanup (i.e. deallocation) logic. - llvmPrivateVars.push_back(llvmReplacementValue); - mlirPrivatizerClones.push_back(mlirPrivatizerClone); - } - - builder.restoreIP(oldIP); - } - + auto privCB = [](InsertPointTy allocaIP, InsertPointTy codeGenIP, + llvm::Value &, llvm::Value &val, llvm::Value *&replVal) { + // tell OpenMPIRBuilder not to do anything. We handled Privatisation in + // bodyGenCB. 
+ replVal = &val; return codeGenIP; }; @@ -1733,8 +1643,7 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, bodyGenStatus = failure(); SmallVector privateCleanupRegions; - llvm::transform(mlirPrivatizerClones, - std::back_inserter(privateCleanupRegions), + llvm::transform(privateDecls, std::back_inserter(privateCleanupRegions), [](omp::PrivateClauseOp privatizer) { return &privatizer.getDeallocRegion(); }); @@ -1767,9 +1676,6 @@ convertOmpParallel(omp::ParallelOp opInst, llvm::IRBuilderBase &builder, ompBuilder->createParallel(ompLoc, allocaIP, bodyGenCB, privCB, finiCB, ifCond, numThreads, pbKind, isCancellable)); - for (mlir::omp::PrivateClauseOp privatizerClone : mlirPrivatizerClones) - privatizerClone.erase(); - return bodyGenStatus; } diff --git a/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir b/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir index 02ce6b5b19ce..79412fb69f75 100644 --- a/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir +++ b/mlir/test/Target/LLVMIR/openmp-firstprivate.mlir @@ -74,27 +74,38 @@ llvm.func @parallel_op_firstprivate_multi_block(%arg0: !llvm.ptr) { // CHECK: [[PRIV_BB2]]: // CHECK-NEXT: %[[C1:.*]] = phi i32 [ 1, %[[PRIV_BB1]] ] // CHECK-NEXT: %[[PRIV_ALLOC:.*]] = alloca float, i32 %[[C1]], align 4 -// The entry block of the `copy` region is merged into the exit block of the -// `alloc` region. So check for that. +// CHECK-NEXT: br label %omp.region.cont + +// CHECK: omp.region.cont: +// CHECK-NEXT: %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB2]] ] +// CHECK-NEXT: br label %omp.private.latealloc + +// CHECK: omp.private.latealloc: +// CHECK-NEXT: br label %omp.private.copy + +// CHECK: omp.private.copy: +// CHECK-NEXT: br label %omp.private.copy3 + +// CHECK: omp.private.copy3: // CHECK-NEXT: %[[ORIG_VAL:.*]] = load float, ptr %[[ORIG_PTR]], align 4 // CHECK-NEXT: br label %[[PRIV_BB3:.*]] // Check contents of the 2nd block in the `copy` region. // CHECK: [[PRIV_BB3]]: -// CHECK-NEXT: %[[ORIG_VAL2:.*]] = phi float [ %[[ORIG_VAL]], %[[PRIV_BB2]] ] -// CHECK-NEXT: %[[PRIV_ALLOC2:.*]] = phi ptr [ %[[PRIV_ALLOC]], %[[PRIV_BB2]] ] -// CHECK-NEXT: store float %[[ORIG_VAL2]], ptr %[[PRIV_ALLOC2]], align 4 +// CHECK-NEXT: %[[ORIG_VAL2:.*]] = phi float [ %[[ORIG_VAL]], %omp.private.copy3 ] +// CHECK-NEXT: %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %omp.private.copy3 ] +// CHECK-NEXT: store float %[[ORIG_VAL2]], ptr %[[PRIV_ALLOC3]], align 4 // CHECK-NEXT: br label %[[PRIV_CONT:.*]] // Check that the privatizer's continuation block yileds the private clone's // address. // CHECK: [[PRIV_CONT]]: -// CHECK-NEXT: %[[PRIV_ALLOC3:.*]] = phi ptr [ %[[PRIV_ALLOC2]], %[[PRIV_BB3]] ] +// CHECK-NEXT: %[[PRIV_ALLOC4:.*]] = phi ptr [ %[[PRIV_ALLOC3]], %[[PRIV_BB3]] ] // CHECK-NEXT: br label %[[PAR_REG:.*]] // Check that the body of the parallel region loads from the private clone. 
// CHECK: [[PAR_REG]]: -// CHECK: %{{.*}} = load float, ptr %[[PRIV_ALLOC3]], align 4 +// CHECK: %{{.*}} = load float, ptr %[[PRIV_ALLOC2]], align 4 omp.private {type = firstprivate} @multi_block.privatizer : !llvm.ptr alloc { ^bb0(%arg0: !llvm.ptr): diff --git a/mlir/test/Target/LLVMIR/openmp-private.mlir b/mlir/test/Target/LLVMIR/openmp-private.mlir index 6153e5685c29..5407f97286eb 100644 --- a/mlir/test/Target/LLVMIR/openmp-private.mlir +++ b/mlir/test/Target/LLVMIR/openmp-private.mlir @@ -104,6 +104,9 @@ llvm.func @parallel_op_private_multi_block(%arg0: !llvm.ptr) { // CHECK: omp.par.entry: // CHECK: %[[ORIG_PTR_PTR:.*]] = getelementptr { ptr }, ptr %{{.*}}, i32 0, i32 0 // CHECK: %[[ORIG_PTR:.*]] = load ptr, ptr %[[ORIG_PTR_PTR]], align 8 +// CHECK: br label %omp.private.latealloc + +// CHECK: omp.private.latealloc: // CHECK: br label %[[PRIV_BB1:.*]] // Check contents of the first block in the `alloc` region. @@ -151,8 +154,7 @@ omp.private {type = private} @multi_block.privatizer : !llvm.ptr alloc { // CHECK: omp.par.region: // CHECK: br label %[[PAR_REG_BEG:.*]] // CHECK: [[PAR_REG_BEG]]: -// CHECK: %[[PRIVATIZER_GEP:.*]] = getelementptr double, ptr @_QQfoo, i64 111 -// CHECK: call void @bar(ptr %[[PRIVATIZER_GEP]]) +// CHECK: call void @bar(ptr getelementptr (double, ptr @_QQfoo, i64 111)) // CHECK: call void @bar(ptr getelementptr (double, ptr @_QQfoo, i64 222)) llvm.func @lower_region_with_addressof() { %0 = llvm.mlir.constant(1 : i64) : i64 -- GitLab From ab2b17512cda90305d5bea77b8e8fa119ab78f25 Mon Sep 17 00:00:00 2001 From: David Truby Date: Wed, 16 Oct 2024 14:48:59 +0100 Subject: [PATCH 134/329] [flang] Link to libatomic with openmp and rtlib=libgcc (#112202) Currently when using OpenMP atomics we depend on some symbols from libatomic. These symbols are provided in a separate library for the libgcc runtime, so we should link to that when rtlib=libgcc. For the compiler-rt case, the presence and location of the symbols is dependent on how compiler-rt itself was built so we cannot make that decision for the user. As such no extra flags are added in that case. 
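A minimal sketch of the kind of construct that pulls in these symbols
(illustrative only; assumes a target that provides __int128, and any
atomic operation wider than the target's lock-free width behaves the
same way):

    // A 16-byte atomic update usually cannot be inlined as a lock-free
    // instruction sequence, so the compiler emits __atomic_* libcalls;
    // with the libgcc runtime those live in the separate libatomic
    // library rather than in libgcc itself.
    static __int128 counter;

    void bump(void) {
    #pragma omp atomic update
      counter += 1; // may lower to __atomic_compare_exchange at link time
    }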
--- clang/lib/Driver/ToolChains/CommonArgs.cpp | 10 ++++++++++ flang/test/Driver/atomic.f90 | 5 +++++ 2 files changed, 15 insertions(+) create mode 100644 flang/test/Driver/atomic.f90 diff --git a/clang/lib/Driver/ToolChains/CommonArgs.cpp b/clang/lib/Driver/ToolChains/CommonArgs.cpp index 3dd86ab7b99c..e662c3f0d2fa 100644 --- a/clang/lib/Driver/ToolChains/CommonArgs.cpp +++ b/clang/lib/Driver/ToolChains/CommonArgs.cpp @@ -1294,6 +1294,16 @@ void tools::addFortranRuntimeLibs(const ToolChain &TC, const ArgList &Args, CmdArgs.push_back("-lFortranRuntime"); CmdArgs.push_back("-lFortranDecimal"); } + + // libomp needs libatomic for atomic operations if using libgcc + if (Args.hasFlag(options::OPT_fopenmp, options::OPT_fopenmp_EQ, + options::OPT_fno_openmp, false)) { + Driver::OpenMPRuntimeKind OMPRuntime = + TC.getDriver().getOpenMPRuntime(Args); + ToolChain::RuntimeLibType RuntimeLib = TC.GetRuntimeLibType(Args); + if (OMPRuntime == Driver::OMPRT_OMP && RuntimeLib == ToolChain::RLT_Libgcc) + CmdArgs.push_back("-latomic"); + } } void tools::addFortranRuntimeLibraryPath(const ToolChain &TC, diff --git a/flang/test/Driver/atomic.f90 b/flang/test/Driver/atomic.f90 new file mode 100644 index 000000000000..0fb3b428f694 --- /dev/null +++ b/flang/test/Driver/atomic.f90 @@ -0,0 +1,5 @@ +!RUN: %flang --target=aarch64-unknown-linux-gnu -fuse-ld=ld -fopenmp -rtlib=libgcc -### %s 2>&1 | FileCheck --check-prefixes=GCC %s +!RUN: %flang --target=aarch64-unknown-linux-gnu -fuse-ld=ld -fopenmp -rtlib=compiler-rt -### %s 2>&1 | FileCheck --check-prefixes=CRT %s + +!GCC: -latomic +!CRT-NOT: -latomic -- GitLab From 91b5bef358e6763c5e18e34b1bc37e64114b3e04 Mon Sep 17 00:00:00 2001 From: David Truby Date: Wed, 16 Oct 2024 14:49:30 +0100 Subject: [PATCH 135/329] [flang] Tighten requirements on some glibc float128 functions (#110651) j0l, j1l, jnl, y0l, y1l and ynl are glibc extensions rather than standard POSIX functions, and so are not available in every Linux libc. This patch checks if `__GLIBC__` and `_GNU_SOURCE` are defined before using these functions. This patch allows the float128 runtime to build with musl libc on Linux. 
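A minimal sketch of the guard pattern (illustrative only; assumes a
glibc-style math.h, where the long double Bessel entry points are only
declared under _GNU_SOURCE):

    #include <math.h>

    // j0l and the other long double Bessel functions are GNU extensions,
    // not POSIX, so only reference them when glibc declares them; on
    // libcs such as musl the symbol is simply never referenced.
    #if defined(__GLIBC__) && defined(_GNU_SOURCE)
    static long double BesselJ0(long double x) { return j0l(x); }
    #endif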
--- flang/lib/Evaluate/intrinsics-library.cpp | 2 +- flang/runtime/Float128Math/math-entries.h | 9 ++++++--- 2 files changed, 7 insertions(+), 4 deletions(-) diff --git a/flang/lib/Evaluate/intrinsics-library.cpp b/flang/lib/Evaluate/intrinsics-library.cpp index ee4df2dbd113..bb439a6bb3a7 100644 --- a/flang/lib/Evaluate/intrinsics-library.cpp +++ b/flang/lib/Evaluate/intrinsics-library.cpp @@ -417,7 +417,7 @@ template <> struct HostRuntimeLibrary { static_assert(map.Verify(), "map must be sorted"); }; -#if HAS_FLOAT80 || HAS_LDBL128 +#if defined(__GLIBC__) && (HAS_FLOAT80 || HAS_LDBL128) template <> struct HostRuntimeLibrary { using F = FuncPointer; diff --git a/flang/runtime/Float128Math/math-entries.h b/flang/runtime/Float128Math/math-entries.h index 90a983b787f5..4600c726d728 100644 --- a/flang/runtime/Float128Math/math-entries.h +++ b/flang/runtime/Float128Math/math-entries.h @@ -187,9 +187,6 @@ DEFINE_SIMPLE_ALIAS(Hypot, std::hypot) DEFINE_SIMPLE_ALIAS(Ilogb, std::ilogb) DEFINE_SIMPLE_ALIAS(Isinf, std::isinf) DEFINE_SIMPLE_ALIAS(Isnan, std::isnan) -DEFINE_SIMPLE_ALIAS(J0, j0l) -DEFINE_SIMPLE_ALIAS(J1, j1l) -DEFINE_SIMPLE_ALIAS(Jn, jnl) DEFINE_SIMPLE_ALIAS(Ldexp, std::ldexp) DEFINE_SIMPLE_ALIAS(Lgamma, std::lgamma) DEFINE_SIMPLE_ALIAS(Llround, std::llround) @@ -207,9 +204,15 @@ DEFINE_SIMPLE_ALIAS(Tan, std::tan) DEFINE_SIMPLE_ALIAS(Tanh, std::tanh) DEFINE_SIMPLE_ALIAS(Tgamma, std::tgamma) DEFINE_SIMPLE_ALIAS(Trunc, std::trunc) + +#if defined(__GLIBC__) && defined(_GNU_SOURCE) +DEFINE_SIMPLE_ALIAS(J0, j0l) +DEFINE_SIMPLE_ALIAS(J1, j1l) +DEFINE_SIMPLE_ALIAS(Jn, jnl) DEFINE_SIMPLE_ALIAS(Y0, y0l) DEFINE_SIMPLE_ALIAS(Y1, y1l) DEFINE_SIMPLE_ALIAS(Yn, ynl) +#endif // Use numeric_limits to produce infinity of the right type. #define F128_RT_INFINITY \ -- GitLab From 87f126243beb69b8b02e5cd4df762bc8a6f1f8cc Mon Sep 17 00:00:00 2001 From: Dmitry Vasilyev Date: Wed, 16 Oct 2024 17:52:16 +0400 Subject: [PATCH 136/329] [lldb][test] Skip Test*FromStdModule tests on Linux for now (#112530) This is the alternative to #98701. 
See for more details: https://reviews.llvm.org/D139361 https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095 --- .../expression/import-std-module/array/TestArrayFromStdModule.py | 1 + .../TestDbgInfoContentVectorFromStdModule.py | 1 + .../vector-of-vectors/TestVectorOfVectorsFromStdModule.py | 1 + 3 files changed, 3 insertions(+) diff --git a/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py index 13ab6b0c9ac1..bafc76282962 100644 --- a/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py +++ b/lldb/test/API/commands/expression/import-std-module/array/TestArrayFromStdModule.py @@ -10,6 +10,7 @@ from lldbsuite.test import lldbutil class TestCase(TestBase): @add_test_categories(["libc++"]) @skipIf(compiler=no_match("clang")) + @skipIfLinux # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095 def test(self): self.build() diff --git a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py index 1c32222e64f1..71eaeef20e79 100644 --- a/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py +++ b/lldb/test/API/commands/expression/import-std-module/vector-dbg-info-content/TestDbgInfoContentVectorFromStdModule.py @@ -14,6 +14,7 @@ class TestDbgInfoContentVector(TestBase): @skipIf(compiler="clang", compiler_version=["<", "12.0"]) @skipIf(macos_version=["<", "14.0"]) @skipIfDarwin # https://github.com/llvm/llvm-project/issues/106475 + @skipIfLinux # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095 def test(self): self.build() diff --git a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py index a1f33271f39d..e9415fd53651 100644 --- a/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py +++ b/lldb/test/API/commands/expression/import-std-module/vector-of-vectors/TestVectorOfVectorsFromStdModule.py @@ -10,6 +10,7 @@ from lldbsuite.test import lldbutil class TestVectorOfVectors(TestBase): @add_test_categories(["libc++"]) @skipIf(compiler=no_match("clang")) + @skipIfLinux # https://discourse.llvm.org/t/lldb-test-failures-on-linux/80095 def test(self): self.build() -- GitLab From b333edd0d6da744c099ad3ff3b5fbd2d4e4dd45a Mon Sep 17 00:00:00 2001 From: Boaz Brickner Date: Wed, 16 Oct 2024 16:02:25 +0200 Subject: [PATCH 137/329] [clang] When checking for covariant return types, make sure the pointers or references are to *classes* (#111856) https://eel.is/c++draft/class.virtual#8.1 This prevents overriding methods with non class return types that have less cv-qualification. Fixes: #111742 --- clang/docs/ReleaseNotes.rst | 13 +++++++++++++ clang/lib/Sema/SemaDeclCXX.cpp | 2 +- clang/test/SemaCXX/virtual-override.cpp | 6 ++++++ 3 files changed, 20 insertions(+), 1 deletion(-) diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 33eb9a2b5804..dc5564b6db11 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -99,6 +99,19 @@ C++ Specific Potentially Breaking Changes // Was error, now evaluates to false. 
constexpr bool b = f() == g(); +- Clang will now correctly not consider pointers to non classes for covariance. + + .. code-block:: c++ + + struct A { + virtual const int *f() const; + }; + struct B : A { + // Return type has less cv-qualification but doesn't point to a class. + // Error will be generated. + int *f() const override; + }; + - The warning ``-Wdeprecated-literal-operator`` is now on by default, as this is something that WG21 has shown interest in removing from the language. The result is that anyone who is compiling with ``-Werror`` should see this diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 75d82c12e0c1..38f808a470aa 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -18273,7 +18273,7 @@ bool Sema::CheckOverridingFunctionReturnType(const CXXMethodDecl *New, } // The return types aren't either both pointers or references to a class type. - if (NewClassTy.isNull()) { + if (NewClassTy.isNull() || !NewClassTy->isStructureOrClassType()) { Diag(New->getLocation(), diag::err_different_return_type_for_overriding_virtual_function) << New->getDeclName() << NewTy << OldTy diff --git a/clang/test/SemaCXX/virtual-override.cpp b/clang/test/SemaCXX/virtual-override.cpp index 72abfc3cf51e..d37c275d46ba 100644 --- a/clang/test/SemaCXX/virtual-override.cpp +++ b/clang/test/SemaCXX/virtual-override.cpp @@ -19,10 +19,12 @@ struct b { }; class A { virtual a* f(); // expected-note{{overridden virtual function is here}} + virtual int *g(); // expected-note{{overridden virtual function is here}} }; class B : A { virtual b* f(); // expected-error{{return type of virtual function 'f' is not covariant with the return type of the function it overrides ('b *' is not derived from 'a *')}} + virtual char *g(); // expected-error{{virtual function 'g' has a different return type ('char *') than the function it overrides (which has return type 'int *')}} }; } @@ -83,11 +85,15 @@ struct a { }; class A { virtual const a* f(); virtual a* g(); // expected-note{{overridden virtual function is here}} + virtual const int* h(); // expected-note{{overridden virtual function is here}} + virtual int* i(); // expected-note{{overridden virtual function is here}} }; class B : A { virtual a* f(); virtual const a* g(); // expected-error{{return type of virtual function 'g' is not covariant with the return type of the function it overrides (class type 'const a *' is more qualified than class type 'a *'}} + virtual int* h(); // expected-error{{virtual function 'h' has a different return type ('int *') than the function it overrides (which has return type 'const int *')}} + virtual const int* i(); // expected-error{{virtual function 'i' has a different return type ('const int *') than the function it overrides (which has return type 'int *')}} }; } -- GitLab From cba7b369b2a511082897fc5dc5a9c95a36c2743d Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 16 Oct 2024 07:20:18 -0700 Subject: [PATCH 138/329] [Clang][TableGen] Use const pointers for various Init objects in MveEmitter (#112320) Use const pointers for various Init objects in MveEmitter. 
This is a part of effort to have better const correctness in TableGen backends: https://discourse.llvm.org/t/psa-planned-changes-to-tablegen-getallderiveddefinitions-api-potential-downstream-breakages/81089 --- clang/utils/TableGen/MveEmitter.cpp | 51 +++++++++++++++-------------- 1 file changed, 26 insertions(+), 25 deletions(-) diff --git a/clang/utils/TableGen/MveEmitter.cpp b/clang/utils/TableGen/MveEmitter.cpp index 915e914d6b92..51e570944b49 100644 --- a/clang/utils/TableGen/MveEmitter.cpp +++ b/clang/utils/TableGen/MveEmitter.cpp @@ -1033,15 +1033,15 @@ public: // to expand Tablegen classes like 'Vector' which mean something different in // each member of a parametric family. const Type *getType(const Record *R, const Type *Param); - const Type *getType(DagInit *D, const Type *Param); - const Type *getType(Init *I, const Type *Param); + const Type *getType(const DagInit *D, const Type *Param); + const Type *getType(const Init *I, const Type *Param); // Functions that translate the Tablegen representation of an intrinsic's // code generation into a collection of Value objects (which will then be // reprocessed to read out the actual C++ code included by CGBuiltin.cpp). - Result::Ptr getCodeForDag(DagInit *D, const Result::Scope &Scope, + Result::Ptr getCodeForDag(const DagInit *D, const Result::Scope &Scope, const Type *Param); - Result::Ptr getCodeForDagArg(DagInit *D, unsigned ArgNum, + Result::Ptr getCodeForDagArg(const DagInit *D, unsigned ArgNum, const Result::Scope &Scope, const Type *Param); Result::Ptr getCodeForArg(unsigned ArgNum, const Type *ArgType, bool Promote, bool Immediate); @@ -1060,10 +1060,10 @@ public: void EmitBuiltinAliases(raw_ostream &OS); }; -const Type *EmitterBase::getType(Init *I, const Type *Param) { - if (auto Dag = dyn_cast(I)) +const Type *EmitterBase::getType(const Init *I, const Type *Param) { + if (const auto *Dag = dyn_cast(I)) return getType(Dag, Param); - if (auto Def = dyn_cast(I)) + if (const auto *Def = dyn_cast(I)) return getType(Def->getDef(), Param); PrintFatalError("Could not convert this value into a type"); @@ -1088,7 +1088,7 @@ const Type *EmitterBase::getType(const Record *R, const Type *Param) { PrintFatalError(R->getLoc(), "Could not convert this record into a type"); } -const Type *EmitterBase::getType(DagInit *D, const Type *Param) { +const Type *EmitterBase::getType(const DagInit *D, const Type *Param) { // The meat of the getType system: types in the Tablegen are represented by a // dag whose operators select sub-cases of this function. 
@@ -1156,7 +1156,8 @@ const Type *EmitterBase::getType(DagInit *D, const Type *Param) { PrintFatalError("Bad operator in type dag expression"); } -Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope, +Result::Ptr EmitterBase::getCodeForDag(const DagInit *D, + const Result::Scope &Scope, const Type *Param) { const Record *Op = cast(D->getOperator())->getDef(); @@ -1199,14 +1200,14 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope, Result::Ptr Arg = getCodeForDagArg(D, 0, Scope, Param); const Type *Ty = nullptr; - if (auto *DI = dyn_cast(D->getArg(0))) + if (const auto *DI = dyn_cast(D->getArg(0))) if (auto *PTy = dyn_cast(getType(DI->getOperator(), Param))) Ty = PTy->getPointeeType(); if (!Ty) PrintFatalError("'address' pointer argument should be a pointer"); unsigned Alignment; - if (auto *II = dyn_cast(D->getArg(1))) { + if (const auto *II = dyn_cast(D->getArg(1))) { Alignment = II->getValue(); } else { PrintFatalError("'address' alignment argument should be an integer"); @@ -1267,10 +1268,10 @@ Result::Ptr EmitterBase::getCodeForDag(DagInit *D, const Result::Scope &Scope, } } -Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum, +Result::Ptr EmitterBase::getCodeForDagArg(const DagInit *D, unsigned ArgNum, const Result::Scope &Scope, const Type *Param) { - Init *Arg = D->getArg(ArgNum); + const Init *Arg = D->getArg(ArgNum); StringRef Name = D->getArgNameStr(ArgNum); if (!Name.empty()) { @@ -1286,18 +1287,18 @@ Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum, // Sometimes the Arg is a bit. Prior to multiclass template argument // checking, integers would sneak through the bit declaration, // but now they really are bits. - if (auto *BI = dyn_cast(Arg)) + if (const auto *BI = dyn_cast(Arg)) return std::make_shared(getScalarType("u32"), BI->getValue()); - if (auto *II = dyn_cast(Arg)) + if (const auto *II = dyn_cast(Arg)) return std::make_shared(getScalarType("u32"), II->getValue()); - if (auto *DI = dyn_cast(Arg)) + if (const auto *DI = dyn_cast(Arg)) return getCodeForDag(DI, Scope, Param); - if (auto *DI = dyn_cast(Arg)) { + if (const auto *DI = dyn_cast(Arg)) { const Record *Rec = DI->getDef(); if (Rec->isSubClassOf("Type")) { const Type *T = getType(Rec, Param); @@ -1307,7 +1308,7 @@ Result::Ptr EmitterBase::getCodeForDagArg(DagInit *D, unsigned ArgNum, PrintError("bad DAG argument type for code generation"); PrintNote("DAG: " + D->getAsString()); - if (TypedInit *Typed = dyn_cast(Arg)) + if (const auto *Typed = dyn_cast(Arg)) PrintNote("argument type: " + Typed->getType()->getAsString()); PrintFatalNote("argument number " + Twine(ArgNum) + ": " + Arg->getAsString()); } @@ -1379,13 +1380,13 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R, HeaderOnly = R->getValueAsBit("headerOnly"); // Process the intrinsic's argument list. - DagInit *ArgsDag = R->getValueAsDag("args"); + const DagInit *ArgsDag = R->getValueAsDag("args"); Result::Scope Scope; for (unsigned i = 0, e = ArgsDag->getNumArgs(); i < e; ++i) { - Init *TypeInit = ArgsDag->getArg(i); + const Init *TypeInit = ArgsDag->getArg(i); bool Promote = true; - if (auto TypeDI = dyn_cast(TypeInit)) + if (const auto *TypeDI = dyn_cast(TypeInit)) if (TypeDI->getDef()->isSubClassOf("unpromoted")) Promote = false; @@ -1397,7 +1398,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R, // If the argument is a subclass of Immediate, record the details about // what values it can take, for Sema checking. 
bool Immediate = false; - if (auto TypeDI = dyn_cast(TypeInit)) { + if (const auto *TypeDI = dyn_cast(TypeInit)) { const Record *TypeRec = TypeDI->getDef(); if (TypeRec->isSubClassOf("Immediate")) { Immediate = true; @@ -1444,7 +1445,7 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R, // Finally, go through the codegen dag and translate it into a Result object // (with an arbitrary DAG of depended-on Results hanging off it). - DagInit *CodeDag = R->getValueAsDag("codegen"); + const DagInit *CodeDag = R->getValueAsDag("codegen"); const Record *MainOp = cast(CodeDag->getOperator())->getDef(); if (MainOp->isSubClassOf("CustomCodegen")) { // Or, if it's the special case of CustomCodegen, just accumulate @@ -1456,9 +1457,9 @@ ACLEIntrinsic::ACLEIntrinsic(EmitterBase &ME, const Record *R, StringRef Name = CodeDag->getArgNameStr(i); if (Name.empty()) { PrintFatalError("Operands to CustomCodegen should have names"); - } else if (auto *II = dyn_cast(CodeDag->getArg(i))) { + } else if (const auto *II = dyn_cast(CodeDag->getArg(i))) { CustomCodeGenArgs[std::string(Name)] = itostr(II->getValue()); - } else if (auto *SI = dyn_cast(CodeDag->getArg(i))) { + } else if (const auto *SI = dyn_cast(CodeDag->getArg(i))) { CustomCodeGenArgs[std::string(Name)] = std::string(SI->getValue()); } else { PrintFatalError("Operands to CustomCodegen should be integers"); -- GitLab From 6924fc03260370876f7091ba06cdc350989ac3c5 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 16 Oct 2024 07:21:10 -0700 Subject: [PATCH 139/329] [LLVM] Add `Intrinsic::getDeclarationIfExists` (#112428) Add `Intrinsic::getDeclarationIfExists` to lookup an existing declaration of an intrinsic in a `Module`. --- llvm/include/llvm/IR/Intrinsics.h | 10 ++++++++++ llvm/lib/Analysis/LazyValueInfo.cpp | 2 +- llvm/lib/Analysis/ScalarEvolution.cpp | 12 ++++++------ llvm/lib/IR/Intrinsics.cpp | 10 ++++++++++ llvm/lib/LTO/LTO.cpp | 14 +++++++------- .../Target/AMDGPU/AMDGPULowerKernelArguments.cpp | 4 ++-- .../Target/AMDGPU/AMDGPULowerKernelAttributes.cpp | 3 +-- llvm/lib/Target/AMDGPU/SIISelLowering.cpp | 2 +- llvm/lib/Transforms/IPO/ExpandVariadics.cpp | 5 +++-- llvm/lib/Transforms/IPO/GlobalDCE.cpp | 6 +++--- llvm/lib/Transforms/IPO/GlobalSplit.cpp | 8 ++++---- llvm/lib/Transforms/IPO/LowerTypeTests.cpp | 6 +++--- llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp | 10 +++++----- llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp | 13 +++++++------ .../Instrumentation/IndirectCallPromotion.cpp | 2 +- .../Transforms/Instrumentation/InstrProfiling.cpp | 12 ++++++------ llvm/lib/Transforms/Scalar/GuardWidening.cpp | 8 ++++---- llvm/lib/Transforms/Scalar/IndVarSimplify.cpp | 4 ++-- llvm/lib/Transforms/Scalar/JumpThreading.cpp | 4 ++-- llvm/lib/Transforms/Scalar/LoopPredication.cpp | 6 +++--- llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp | 4 ++-- .../Transforms/Scalar/LowerWidenableCondition.cpp | 4 ++-- llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp | 4 ++-- llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp | 4 ++-- 24 files changed, 89 insertions(+), 68 deletions(-) diff --git a/llvm/include/llvm/IR/Intrinsics.h b/llvm/include/llvm/IR/Intrinsics.h index 49f4fe4c5c3d..e893295e3272 100644 --- a/llvm/include/llvm/IR/Intrinsics.h +++ b/llvm/include/llvm/IR/Intrinsics.h @@ -102,6 +102,16 @@ namespace Intrinsic { inline Function *getDeclaration(Module *M, ID id, ArrayRef Tys = {}) { return getOrInsertDeclaration(M, id, Tys); } + + /// Look up the Function declaration of the intrinsic \p id in the Module + /// \p M and return it if it 
exists. Otherwise, return nullptr. This version + /// supports non-overloaded intrinsics. + Function *getDeclarationIfExists(const Module *M, ID id); + + /// This version supports overloaded intrinsics. + Function *getDeclarationIfExists(Module *M, ID id, ArrayRef Tys, + FunctionType *FT = nullptr); + /// Looks up Name in NameTable via binary search. NameTable must be sorted /// and all entries must start with "llvm.". If NameTable contains an exact /// match for Name or a prefix of Name followed by a dot, its index in diff --git a/llvm/lib/Analysis/LazyValueInfo.cpp b/llvm/lib/Analysis/LazyValueInfo.cpp index 30dc4ae30dbf..10ad4708596c 100644 --- a/llvm/lib/Analysis/LazyValueInfo.cpp +++ b/llvm/lib/Analysis/LazyValueInfo.cpp @@ -1613,7 +1613,7 @@ LazyValueInfoImpl &LazyValueInfo::getOrCreateImpl(const Module *M) { assert(M && "getCache() called with a null Module"); const DataLayout &DL = M->getDataLayout(); Function *GuardDecl = - M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard)); + Intrinsic::getDeclarationIfExists(M, Intrinsic::experimental_guard); PImpl = new LazyValueInfoImpl(AC, DL, GuardDecl); } return *static_cast(PImpl); diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 97ea405a5267..a3ba8e037819 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -11665,8 +11665,8 @@ bool ScalarEvolution::isBasicBlockEntryGuardedByCond(const BasicBlock *BB, } // Check conditions due to any @llvm.experimental.guard intrinsics. - auto *GuardDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_guard); if (GuardDecl) for (const auto *GU : GuardDecl->users()) if (const auto *Guard = dyn_cast(GU)) @@ -13615,8 +13615,8 @@ ScalarEvolution::ScalarEvolution(Function &F, TargetLibraryInfo &TLI, // ScalarEvolution to optimize based on those guards. For now we prefer to be // efficient in lieu of being smart in that rather obscure case. - auto *GuardDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_guard); HasGuards = GuardDecl && !GuardDecl->use_empty(); } @@ -15593,8 +15593,8 @@ ScalarEvolution::LoopGuards::collect(const Loop *L, ScalarEvolution &SE) { } // Second, collect information from llvm.experimental.guards dominating the loop. - auto *GuardDecl = SE.F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + SE.F.getParent(), Intrinsic::experimental_guard); if (GuardDecl) for (const auto *GU : GuardDecl->users()) if (const auto *Guard = dyn_cast(GU)) diff --git a/llvm/lib/IR/Intrinsics.cpp b/llvm/lib/IR/Intrinsics.cpp index ff8b4b7a020c..1b92daf15b46 100644 --- a/llvm/lib/IR/Intrinsics.cpp +++ b/llvm/lib/IR/Intrinsics.cpp @@ -724,6 +724,16 @@ Function *Intrinsic::getOrInsertDeclaration(Module *M, ID id, .getCallee()); } +Function *Intrinsic::getDeclarationIfExists(const Module *M, ID id) { + return M->getFunction(getName(id)); +} + +Function *Intrinsic::getDeclarationIfExists(Module *M, ID id, + ArrayRef Tys, + FunctionType *FT) { + return M->getFunction(getName(id, Tys, M, FT)); +} + // This defines the "Intrinsic::getIntrinsicForClangBuiltin()" method. 
#define GET_LLVM_INTRINSIC_FOR_CLANG_BUILTIN #include "llvm/IR/IntrinsicImpl.inc" diff --git a/llvm/lib/LTO/LTO.cpp b/llvm/lib/LTO/LTO.cpp index 90c4e2c3cd13..0f53c6085121 100644 --- a/llvm/lib/LTO/LTO.cpp +++ b/llvm/lib/LTO/LTO.cpp @@ -1120,13 +1120,13 @@ Error LTO::checkPartiallySplit() { if (!ThinLTO.CombinedIndex.partiallySplitLTOUnits()) return Error::success(); - Function *TypeTestFunc = RegularLTO.CombinedModule->getFunction( - Intrinsic::getName(Intrinsic::type_test)); - Function *TypeCheckedLoadFunc = RegularLTO.CombinedModule->getFunction( - Intrinsic::getName(Intrinsic::type_checked_load)); - Function *TypeCheckedLoadRelativeFunc = - RegularLTO.CombinedModule->getFunction( - Intrinsic::getName(Intrinsic::type_checked_load_relative)); + const Module *Combined = RegularLTO.CombinedModule.get(); + Function *TypeTestFunc = + Intrinsic::getDeclarationIfExists(Combined, Intrinsic::type_test); + Function *TypeCheckedLoadFunc = + Intrinsic::getDeclarationIfExists(Combined, Intrinsic::type_checked_load); + Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists( + Combined, Intrinsic::type_checked_load_relative); // First check if there are type tests / type checked loads in the // merged regular LTO module IR. diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp index d16c96f88e7b..6573176492b7 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelArguments.cpp @@ -171,8 +171,8 @@ public: // Try to allocate SGPRs to preload implicit kernel arguments. void tryAllocImplicitArgPreloadSGPRs(uint64_t ImplicitArgsBaseOffset, IRBuilder<> &Builder) { - StringRef Name = Intrinsic::getName(Intrinsic::amdgcn_implicitarg_ptr); - Function *ImplicitArgPtr = F.getParent()->getFunction(Name); + Function *ImplicitArgPtr = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::amdgcn_implicitarg_ptr); if (!ImplicitArgPtr) return; diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp index 7d66d07c9d0f..1bb5e794da7d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULowerKernelAttributes.cpp @@ -78,8 +78,7 @@ public: Function *getBasePtrIntrinsic(Module &M, bool IsV5OrAbove) { auto IntrinsicId = IsV5OrAbove ? 
Intrinsic::amdgcn_implicitarg_ptr : Intrinsic::amdgcn_dispatch_ptr; - StringRef Name = Intrinsic::getName(IntrinsicId); - return M.getFunction(Name); + return Intrinsic::getDeclarationIfExists(&M, IntrinsicId); } } // end anonymous namespace diff --git a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp index 8c197f231496..de9173e923ab 100644 --- a/llvm/lib/Target/AMDGPU/SIISelLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIISelLowering.cpp @@ -8786,7 +8786,7 @@ SDValue SITargetLowering::LowerINTRINSIC_WO_CHAIN(SDValue Op, const Module *M = MF.getFunction().getParent(); const GlobalValue *GV = - M->getNamedValue(Intrinsic::getName(Intrinsic::amdgcn_groupstaticsize)); + Intrinsic::getDeclarationIfExists(M, Intrinsic::amdgcn_groupstaticsize); SDValue GA = DAG.getTargetGlobalAddress(GV, DL, MVT::i32, 0, SIInstrInfo::MO_ABS32_LO); return {DAG.getMachineNode(AMDGPU::S_MOV_B32, DL, MVT::i32, GA), 0}; diff --git a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp index a7a01ca1055d..3121659edadd 100644 --- a/llvm/lib/Transforms/IPO/ExpandVariadics.cpp +++ b/llvm/lib/Transforms/IPO/ExpandVariadics.cpp @@ -145,9 +145,10 @@ public: // function here in the meantime to decouple from that discussion. Function *getPreexistingDeclaration(Module *M, Intrinsic::ID Id, ArrayRef Tys = {}) { + if (Tys.empty()) + return Intrinsic::getDeclarationIfExists(M, Id); auto *FT = Intrinsic::getType(M->getContext(), Id, Tys); - return M->getFunction(Tys.empty() ? Intrinsic::getName(Id) - : Intrinsic::getName(Id, Tys, M, FT)); + return Intrinsic::getDeclarationIfExists(M, Id, Tys, FT); } class ExpandVariadics : public ModulePass { diff --git a/llvm/lib/Transforms/IPO/GlobalDCE.cpp b/llvm/lib/Transforms/IPO/GlobalDCE.cpp index e36d524d7667..eca36fb31cea 100644 --- a/llvm/lib/Transforms/IPO/GlobalDCE.cpp +++ b/llvm/lib/Transforms/IPO/GlobalDCE.cpp @@ -186,9 +186,9 @@ void GlobalDCEPass::ScanVTableLoad(Function *Caller, Metadata *TypeId, void GlobalDCEPass::ScanTypeCheckedLoadIntrinsics(Module &M) { LLVM_DEBUG(dbgs() << "Scanning type.checked.load intrinsics\n"); Function *TypeCheckedLoadFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load)); - Function *TypeCheckedLoadRelativeFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load_relative)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load); + Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists( + &M, Intrinsic::type_checked_load_relative); auto scan = [&](Function *CheckedLoadFunc) { if (!CheckedLoadFunc) diff --git a/llvm/lib/Transforms/IPO/GlobalSplit.cpp b/llvm/lib/Transforms/IPO/GlobalSplit.cpp index fd49b745fd75..320fd893935f 100644 --- a/llvm/lib/Transforms/IPO/GlobalSplit.cpp +++ b/llvm/lib/Transforms/IPO/GlobalSplit.cpp @@ -174,11 +174,11 @@ static bool splitGlobals(Module &M) { // llvm.type.checked.load intrinsics, which indicates that splitting globals // may be beneficial. 
Function *TypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_test)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test); Function *TypeCheckedLoadFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load)); - Function *TypeCheckedLoadRelativeFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load_relative)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load); + Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists( + &M, Intrinsic::type_checked_load_relative); if ((!TypeTestFunc || TypeTestFunc->use_empty()) && (!TypeCheckedLoadFunc || TypeCheckedLoadFunc->use_empty()) && (!TypeCheckedLoadRelativeFunc || diff --git a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp index 519a4e9314a2..3fcfc6a87677 100644 --- a/llvm/lib/Transforms/IPO/LowerTypeTests.cpp +++ b/llvm/lib/Transforms/IPO/LowerTypeTests.cpp @@ -1970,7 +1970,7 @@ static void dropTypeTests(Module &M, Function &TypeTestFunc) { bool LowerTypeTestsModule::lower() { Function *TypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_test)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test); if (DropTypeTests) { if (TypeTestFunc) @@ -1979,7 +1979,7 @@ bool LowerTypeTestsModule::lower() { // except for in the case where we originally were performing ThinLTO but // decided not to in the backend. Function *PublicTypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::public_type_test)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test); if (PublicTypeTestFunc) dropTypeTests(M, *PublicTypeTestFunc); if (TypeTestFunc || PublicTypeTestFunc) { @@ -2002,7 +2002,7 @@ bool LowerTypeTestsModule::lower() { return false; Function *ICallBranchFunnelFunc = - M.getFunction(Intrinsic::getName(Intrinsic::icall_branch_funnel)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::icall_branch_funnel); if ((!TypeTestFunc || TypeTestFunc->use_empty()) && (!ICallBranchFunnelFunc || ICallBranchFunnelFunc->use_empty()) && !ExportSummary && !ImportSummary) diff --git a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp index 9bf29c46938e..cd0e412bdf35 100644 --- a/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp +++ b/llvm/lib/Transforms/IPO/ThinLTOBitcodeWriter.cpp @@ -123,7 +123,7 @@ void promoteTypeIds(Module &M, StringRef ModuleId) { }; if (Function *TypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_test))) { + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test)) { for (const Use &U : TypeTestFunc->uses()) { auto CI = cast(U.getUser()); ExternalizeTypeId(CI, 1); @@ -131,7 +131,7 @@ void promoteTypeIds(Module &M, StringRef ModuleId) { } if (Function *PublicTypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::public_type_test))) { + Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test)) { for (const Use &U : PublicTypeTestFunc->uses()) { auto CI = cast(U.getUser()); ExternalizeTypeId(CI, 1); @@ -139,15 +139,15 @@ void promoteTypeIds(Module &M, StringRef ModuleId) { } if (Function *TypeCheckedLoadFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load))) { + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load)) { for (const Use &U : TypeCheckedLoadFunc->uses()) { auto CI = cast(U.getUser()); ExternalizeTypeId(CI, 2); } } - if (Function *TypeCheckedLoadRelativeFunc = M.getFunction( - Intrinsic::getName(Intrinsic::type_checked_load_relative))) { + if (Function 
*TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists( + &M, Intrinsic::type_checked_load_relative)) { for (const Use &U : TypeCheckedLoadRelativeFunc->uses()) { auto CI = cast(U.getUser()); ExternalizeTypeId(CI, 2); diff --git a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp index 59f986b4ca26..45d32218f362 100644 --- a/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp +++ b/llvm/lib/Transforms/IPO/WholeProgramDevirt.cpp @@ -851,7 +851,7 @@ void llvm::updateVCallVisibilityInModule( void llvm::updatePublicTypeTestCalls(Module &M, bool WholeProgramVisibilityEnabledInLTO) { Function *PublicTypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::public_type_test)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::public_type_test); if (!PublicTypeTestFunc) return; if (hasWholeProgramVisibility(WholeProgramVisibilityEnabledInLTO)) { @@ -2247,12 +2247,13 @@ bool DevirtModule::run() { return false; Function *TypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_test)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test); Function *TypeCheckedLoadFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load)); - Function *TypeCheckedLoadRelativeFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_checked_load_relative)); - Function *AssumeFunc = M.getFunction(Intrinsic::getName(Intrinsic::assume)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_checked_load); + Function *TypeCheckedLoadRelativeFunc = Intrinsic::getDeclarationIfExists( + &M, Intrinsic::type_checked_load_relative); + Function *AssumeFunc = + Intrinsic::getDeclarationIfExists(&M, Intrinsic::assume); // Normally if there are no users of the devirtualization intrinsics in the // module, this pass has nothing to do. But if we are exporting, we also need diff --git a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp index 86637109d940..43b8d5e6a8ce 100644 --- a/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp +++ b/llvm/lib/Transforms/Instrumentation/IndirectCallPromotion.cpp @@ -944,7 +944,7 @@ computeVirtualCallSiteTypeInfoMap(Module &M, ModuleAnalysisManager &MAM, // Find out virtual calls by looking at users of llvm.type.checked.load in // that case. Function *TypeTestFunc = - M.getFunction(Intrinsic::getName(Intrinsic::type_test)); + Intrinsic::getDeclarationIfExists(&M, Intrinsic::type_test); if (!TypeTestFunc || TypeTestFunc->use_empty()) return; diff --git a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp index 929c78744205..d7d809dfdd5f 100644 --- a/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp +++ b/llvm/lib/Transforms/Instrumentation/InstrProfiling.cpp @@ -902,15 +902,15 @@ static bool needsRuntimeHookUnconditionally(const Triple &TT) { /// Check if the module contains uses of any profiling intrinsics. 
static bool containsProfilingIntrinsics(Module &M) { auto containsIntrinsic = [&](int ID) { - if (auto *F = M.getFunction(Intrinsic::getName(ID))) + if (auto *F = Intrinsic::getDeclarationIfExists(&M, ID)) return !F->use_empty(); return false; }; - return containsIntrinsic(llvm::Intrinsic::instrprof_cover) || - containsIntrinsic(llvm::Intrinsic::instrprof_increment) || - containsIntrinsic(llvm::Intrinsic::instrprof_increment_step) || - containsIntrinsic(llvm::Intrinsic::instrprof_timestamp) || - containsIntrinsic(llvm::Intrinsic::instrprof_value_profile); + return containsIntrinsic(Intrinsic::instrprof_cover) || + containsIntrinsic(Intrinsic::instrprof_increment) || + containsIntrinsic(Intrinsic::instrprof_increment_step) || + containsIntrinsic(Intrinsic::instrprof_timestamp) || + containsIntrinsic(Intrinsic::instrprof_value_profile); } bool InstrLowerer::lower() { diff --git a/llvm/lib/Transforms/Scalar/GuardWidening.cpp b/llvm/lib/Transforms/Scalar/GuardWidening.cpp index e7ff2a14469c..7fa9f4280909 100644 --- a/llvm/lib/Transforms/Scalar/GuardWidening.cpp +++ b/llvm/lib/Transforms/Scalar/GuardWidening.cpp @@ -980,11 +980,11 @@ StringRef GuardWideningImpl::scoreTypeToString(WideningScore WS) { PreservedAnalyses GuardWideningPass::run(Function &F, FunctionAnalysisManager &AM) { // Avoid requesting analyses if there are no guards or widenable conditions. - auto *GuardDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_guard); bool HasIntrinsicGuards = GuardDecl && !GuardDecl->use_empty(); - auto *WCDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_widenable_condition)); + auto *WCDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_widenable_condition); bool HasWidenableConditions = WCDecl && !WCDecl->use_empty(); if (!HasIntrinsicGuards && !HasWidenableConditions) return PreservedAnalyses::all(); diff --git a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp index 2668305e9c84..ad68fc1f21e2 100644 --- a/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp +++ b/llvm/lib/Transforms/Scalar/IndVarSimplify.cpp @@ -598,8 +598,8 @@ bool IndVarSimplify::simplifyAndExtend(Loop *L, LoopInfo *LI) { SmallVector WideIVs; - auto *GuardDecl = L->getBlocks()[0]->getModule()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + L->getBlocks()[0]->getModule(), Intrinsic::experimental_guard); bool HasGuards = GuardDecl && !GuardDecl->use_empty(); SmallVector LoopPhis; diff --git a/llvm/lib/Transforms/Scalar/JumpThreading.cpp b/llvm/lib/Transforms/Scalar/JumpThreading.cpp index 7a0b661a0779..11fdc39464df 100644 --- a/llvm/lib/Transforms/Scalar/JumpThreading.cpp +++ b/llvm/lib/Transforms/Scalar/JumpThreading.cpp @@ -296,8 +296,8 @@ bool JumpThreadingPass::runImpl(Function &F_, FunctionAnalysisManager *FAM_, DTU = std::move(DTU_); BFI = BFI_; BPI = BPI_; - auto *GuardDecl = F->getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + F->getParent(), Intrinsic::experimental_guard); HasGuards = GuardDecl && !GuardDecl->use_empty(); // Reduce the number of instructions duplicated when optimizing strictly for diff --git a/llvm/lib/Transforms/Scalar/LoopPredication.cpp b/llvm/lib/Transforms/Scalar/LoopPredication.cpp index 
209b083a4e91..31694ad1fa50 100644 --- a/llvm/lib/Transforms/Scalar/LoopPredication.cpp +++ b/llvm/lib/Transforms/Scalar/LoopPredication.cpp @@ -1193,10 +1193,10 @@ bool LoopPredication::runOnLoop(Loop *Loop) { // There is nothing to do if the module doesn't use guards auto *GuardDecl = - M->getFunction(Intrinsic::getName(Intrinsic::experimental_guard)); + Intrinsic::getDeclarationIfExists(M, Intrinsic::experimental_guard); bool HasIntrinsicGuards = GuardDecl && !GuardDecl->use_empty(); - auto *WCDecl = M->getFunction( - Intrinsic::getName(Intrinsic::experimental_widenable_condition)); + auto *WCDecl = Intrinsic::getDeclarationIfExists( + M, Intrinsic::experimental_widenable_condition); bool HasWidenableConditions = PredicateWidenableBranchGuards && WCDecl && !WCDecl->use_empty(); if (!HasIntrinsicGuards && !HasWidenableConditions) diff --git a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp index ce35349376c4..5f3e612e73b6 100644 --- a/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp +++ b/llvm/lib/Transforms/Scalar/LowerGuardIntrinsic.cpp @@ -27,8 +27,8 @@ using namespace llvm; static bool lowerGuardIntrinsic(Function &F) { // Check if we can cheaply rule out the possibility of not having any work to // do. - auto *GuardDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_guard); if (!GuardDecl || GuardDecl->use_empty()) return false; diff --git a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp index 3c977b816a05..ea2b419b17a5 100644 --- a/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp +++ b/llvm/lib/Transforms/Scalar/LowerWidenableCondition.cpp @@ -26,8 +26,8 @@ using namespace llvm; static bool lowerWidenableCondition(Function &F) { // Check if we can cheaply rule out the possibility of not having any work to // do. - auto *WCDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_widenable_condition)); + auto *WCDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_widenable_condition); if (!WCDecl || WCDecl->use_empty()) return false; diff --git a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp index b9f88ba4e078..948466c675e9 100644 --- a/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp +++ b/llvm/lib/Transforms/Scalar/MakeGuardsExplicit.cpp @@ -56,8 +56,8 @@ static void turnToExplicitForm(CallInst *Guard, Function *DeoptIntrinsic) { static bool explicifyGuards(Function &F) { // Check if we can cheaply rule out the possibility of not having any work to // do. - auto *GuardDecl = F.getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + F.getParent(), Intrinsic::experimental_guard); if (!GuardDecl || GuardDecl->use_empty()) return false; diff --git a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp index f3f5ffb6b61b..aa3cbc5e4bdd 100644 --- a/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp +++ b/llvm/lib/Transforms/Scalar/SimpleLoopUnswitch.cpp @@ -2920,8 +2920,8 @@ static bool collectUnswitchCandidates( // Whether or not we should also collect guards in the loop. 
bool CollectGuards = false; if (UnswitchGuards) { - auto *GuardDecl = L.getHeader()->getParent()->getParent()->getFunction( - Intrinsic::getName(Intrinsic::experimental_guard)); + auto *GuardDecl = Intrinsic::getDeclarationIfExists( + L.getHeader()->getParent()->getParent(), Intrinsic::experimental_guard); if (GuardDecl && !GuardDecl->use_empty()) CollectGuards = true; } -- GitLab From dcc5ba4a4d94e9550ff02239c252f446ab3fdf19 Mon Sep 17 00:00:00 2001 From: Stefan Pintilie Date: Wed, 16 Oct 2024 10:25:09 -0400 Subject: [PATCH 140/329] [PowerPC] Add missing patterns for lround when i32 is returned. (#111863) The patch adds support for lround when the output type of the rounding is i32. The support for a rounding result of type i64 existed before this patch. --- llvm/lib/Target/PowerPC/PPCInstrVSX.td | 4 + .../CodeGen/PowerPC/scalar-rounding-ops.ll | 84 +++++++++++++++++++ 2 files changed, 88 insertions(+) diff --git a/llvm/lib/Target/PowerPC/PPCInstrVSX.td b/llvm/lib/Target/PowerPC/PPCInstrVSX.td index dd07892794d5..fe9ab22c5763 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrVSX.td +++ b/llvm/lib/Target/PowerPC/PPCInstrVSX.td @@ -3606,6 +3606,10 @@ def : Pat<(i64 (lround f64:$S)), (i64 (MFVSRD (FCTID (XSRDPI $S))))>; def : Pat<(i64 (lround f32:$S)), (i64 (MFVSRD (FCTID (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; +def : Pat<(i32 (lround f64:$S)), + (i32 (MFVSRWZ (FCTIW (XSRDPI $S))))>; +def : Pat<(i32 (lround f32:$S)), + (i32 (MFVSRWZ (FCTIW (XSRDPI (COPY_TO_REGCLASS $S, VSFRC)))))>; def : Pat<(i64 (llround f64:$S)), (i64 (MFVSRD (FCTID (XSRDPI $S))))>; def : Pat<(i64 (llround f32:$S)), diff --git a/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll b/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll index e950c0a2efac..2be370f638d5 100644 --- a/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll +++ b/llvm/test/CodeGen/PowerPC/scalar-rounding-ops.ll @@ -214,6 +214,48 @@ entry: declare i64 @llvm.lround.i64.f64(double) +define dso_local i32 @test_lroundi32f64(double %d) local_unnamed_addr { +; BE-LABEL: test_lroundi32f64: +; BE: # %bb.0: # %entry +; BE-NEXT: mflr r0 +; BE-NEXT: stdu r1, -112(r1) +; BE-NEXT: std r0, 128(r1) +; BE-NEXT: .cfi_def_cfa_offset 112 +; BE-NEXT: .cfi_offset lr, 16 +; BE-NEXT: bl lround +; BE-NEXT: nop +; BE-NEXT: addi r1, r1, 112 +; BE-NEXT: ld r0, 16(r1) +; BE-NEXT: mtlr r0 +; BE-NEXT: blr +; +; CHECK-LABEL: test_lroundi32f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl lround +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; FAST-LABEL: test_lroundi32f64: +; FAST: # %bb.0: # %entry +; FAST-NEXT: xsrdpi f0, f1 +; FAST-NEXT: fctiw f0, f0 +; FAST-NEXT: mffprwz r3, f0 +; FAST-NEXT: blr +entry: + %0 = tail call i32 @llvm.lround.i32.f64(double %d) + ret i32 %0 +} + +declare i32 @llvm.lround.i32.f64(double) + define dso_local i64 @test_lroundf(float %f) local_unnamed_addr { ; BE-LABEL: test_lroundf: ; BE: # %bb.0: # %entry @@ -256,6 +298,48 @@ entry: declare i64 @llvm.lround.i64.f32(float) +define dso_local i32 @test_lroundi32f32(float %d) local_unnamed_addr { +; BE-LABEL: test_lroundi32f32: +; BE: # %bb.0: # %entry +; BE-NEXT: mflr r0 +; BE-NEXT: stdu r1, -112(r1) +; BE-NEXT: std r0, 128(r1) +; BE-NEXT: .cfi_def_cfa_offset 112 +; BE-NEXT: .cfi_offset lr, 16 +; BE-NEXT: bl lroundf +; BE-NEXT: nop +; BE-NEXT: addi r1, r1, 112 +; BE-NEXT: ld r0, 16(r1) 
+; BE-NEXT: mtlr r0 +; BE-NEXT: blr +; +; CHECK-LABEL: test_lroundi32f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: mflr r0 +; CHECK-NEXT: stdu r1, -32(r1) +; CHECK-NEXT: std r0, 48(r1) +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: .cfi_offset lr, 16 +; CHECK-NEXT: bl lroundf +; CHECK-NEXT: nop +; CHECK-NEXT: addi r1, r1, 32 +; CHECK-NEXT: ld r0, 16(r1) +; CHECK-NEXT: mtlr r0 +; CHECK-NEXT: blr +; +; FAST-LABEL: test_lroundi32f32: +; FAST: # %bb.0: # %entry +; FAST-NEXT: xsrdpi f0, f1 +; FAST-NEXT: fctiw f0, f0 +; FAST-NEXT: mffprwz r3, f0 +; FAST-NEXT: blr +entry: + %0 = tail call i32 @llvm.lround.i32.f32(float %d) + ret i32 %0 +} + +declare i32 @llvm.lround.i32.f32(float) + define dso_local i64 @test_llround(double %d) local_unnamed_addr { ; BE-LABEL: test_llround: ; BE: # %bb.0: # %entry -- GitLab From 7b4c8b35d43c0a17f222722487d7a2b4ceee0a26 Mon Sep 17 00:00:00 2001 From: Brox Chen Date: Wed, 16 Oct 2024 10:27:44 -0400 Subject: [PATCH 141/329] [AMDGPU][True16][MC] VOP3 profile in True16 format (#109031) Modify VOP3 profile and pesudo, and add encoding info for VOP3 True16 including DPP and DPP8 in true16 and fake16 format. This patch applies true16/fake16 changes and asm/dasm changes to V_ADD_NC_U16 V_ADD_NC_I16 V_SUB_NC_U16 V_SUB_NC_I16 --- llvm/lib/Target/AMDGPU/SIInstrInfo.td | 8 +- llvm/lib/Target/AMDGPU/VOP2Instructions.td | 4 +- llvm/lib/Target/AMDGPU/VOP3Instructions.td | 84 ++-- llvm/lib/Target/AMDGPU/VOPInstructions.td | 312 ++++++++++-- .../AMDGPU/GlobalISel/inst-select-add.s16.mir | 40 +- .../test/CodeGen/AMDGPU/dpp_combine_gfx11.mir | 32 +- .../isel-amdgpu-cs-chain-preserve-cc.ll | 6 +- llvm/test/MC/AMDGPU/gfx11_asm_vop3.s | 208 +++++--- llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s | 192 +++---- llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp8.s | 124 +++-- llvm/test/MC/AMDGPU/gfx12_asm_vop3.s | 156 +++--- llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s | 328 ++++++------ llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s | 176 ++++--- .../Disassembler/AMDGPU/gfx11_dasm_vop3.txt | 444 ++++++++++++---- .../AMDGPU/gfx11_dasm_vop3_dpp16.txt | 376 +++++++++++--- .../AMDGPU/gfx11_dasm_vop3_dpp8.txt | 196 +++++++- .../Disassembler/AMDGPU/gfx12_dasm_vop3.txt | 372 +++++++++++--- .../AMDGPU/gfx12_dasm_vop3_dpp16.txt | 472 +++++++++++++++--- .../AMDGPU/gfx12_dasm_vop3_dpp8.txt | 292 ++++++++++- 19 files changed, 2896 insertions(+), 926 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.td b/llvm/lib/Target/AMDGPU/SIInstrInfo.td index 087ca1f95446..42a1ffb8a26d 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.td +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.td @@ -2149,6 +2149,8 @@ class getAsmVOP3P { @@ -2733,6 +2736,7 @@ def VOP_F32_F32_F16_F16 : VOPProfile <[f32, f32, f16, f16]>; def VOP_F32_F32_F32_F32 : VOPProfile <[f32, f32, f32, f32]>; def VOP_F64_F64_F64_F64 : VOPProfile <[f64, f64, f64, f64]>; def VOP_I32_I32_I32_I32 : VOPProfile <[i32, i32, i32, i32]>; +def VOP_I32_I32_I32_I16 : VOPProfile <[i32, i32, i32, i16]>; def VOP_I64_I32_I32_I64 : VOPProfile <[i64, i32, i32, i64]>; def VOP_I32_F32_I32_I32 : VOPProfile <[i32, f32, i32, i32]>; def VOP_I64_I64_I32_I64 : VOPProfile <[i64, i64, i32, i64]>; diff --git a/llvm/lib/Target/AMDGPU/VOP2Instructions.td b/llvm/lib/Target/AMDGPU/VOP2Instructions.td index 639f9189cbe7..e83ea57c61df 100644 --- a/llvm/lib/Target/AMDGPU/VOP2Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP2Instructions.td @@ -1664,8 +1664,8 @@ multiclass VOP3Only_Realtriple_gfx11_gfx12 op> : VOP3Only_Realtriple, VOP3Only_Realtriple; multiclass VOP3Only_Realtriple_t16_gfx11_gfx12 
op, string asmName, string OpName = NAME> : - VOP3Only_Realtriple_t16, - VOP3Only_Realtriple_t16; + VOP3_Realtriple_t16_gfx11, + VOP3_Realtriple_t16_gfx12; multiclass VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12 op, string asmName, string OpName = NAME> { defm OpName#"_t16": VOP3Only_Realtriple_t16_gfx11_gfx12; diff --git a/llvm/lib/Target/AMDGPU/VOP3Instructions.td b/llvm/lib/Target/AMDGPU/VOP3Instructions.td index 78ca7a2f258c..34ecdb56e868 100644 --- a/llvm/lib/Target/AMDGPU/VOP3Instructions.td +++ b/llvm/lib/Target/AMDGPU/VOP3Instructions.td @@ -569,16 +569,10 @@ def VOP3_CVT_SR_F8_F32_Profile : VOP3_Profile, getAsmVOP3OpSel<3, HasClamp, HasOMod, HasSrc0FloatMods, HasSrc1FloatMods, HasSrc2FloatMods>.ret); - let AsmVOP3DPP16 = !subst(", $src2_modifiers", "", - getAsmVOP3DPP16.ret>.ret); - let AsmVOP3DPP8 = !subst(", $src2_modifiers", "", - getAsmVOP3DPP8.ret>.ret); + let AsmVOP3Base = !subst(", $src2_modifiers", "", + getAsmVOP3Base.ret); } class VOP3_CVT_SR_F8_ByteSel_Profile : @@ -636,8 +630,8 @@ let SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 in { defm V_MAXIMUM3_F16 : VOP3Inst <"v_maximum3_f16", VOP3_Profile, AMDGPUfmaximum3>; } // End SubtargetPredicate = isGFX12Plus, ReadsModeReg = 0 -defm V_ADD_I16 : VOP3Inst <"v_add_i16", VOP3_Profile>; -defm V_SUB_I16 : VOP3Inst <"v_sub_i16", VOP3_Profile>; +defm V_ADD_I16 : VOP3Inst_t16 <"v_add_i16", VOP_I16_I16_I16>; +defm V_SUB_I16 : VOP3Inst_t16 <"v_sub_i16", VOP_I16_I16_I16>; defm V_MAD_U32_U16 : VOP3Inst <"v_mad_u32_u16", VOP3_Profile>; defm V_MAD_I32_I16 : VOP3Inst <"v_mad_i32_i16", VOP3_Profile>; @@ -752,6 +746,8 @@ def : GCNPat<(DivergentBinFrag (or_oneuse i64:$src0, i64:$src1), i64:$src2), (i32 (EXTRACT_SUBREG $src1, sub1)), (i32 (EXTRACT_SUBREG $src2, sub1))), sub1)>; +} // End SubtargetPredicate = isGFX9Plus + // FIXME: Probably should hardcode clamp bit in pseudo and avoid this. class OpSelBinOpClampPat : GCNPat< @@ -760,9 +756,14 @@ class OpSelBinOpClampPat; -def : OpSelBinOpClampPat; -def : OpSelBinOpClampPat; -} // End SubtargetPredicate = isGFX9Plus +let SubtargetPredicate = isGFX9Plus, True16Predicate = NotHasTrue16BitInsts in { + def : OpSelBinOpClampPat; + def : OpSelBinOpClampPat; +} // End SubtargetPredicate = isGFX9Plus, True16Predicate = NotHasTrue16BitInsts +let True16Predicate = UseFakeTrue16Insts in { + def : OpSelBinOpClampPat; + def : OpSelBinOpClampPat; +} // End True16Predicate = UseFakeTrue16Insts multiclass IMAD32_Pats { def : GCNPat < @@ -871,21 +872,31 @@ let SubtargetPredicate = isGFX10Plus in { def : PermlanePat; } - defm V_ADD_NC_U16 : VOP3Inst <"v_add_nc_u16", VOP3_Profile, add>; - defm V_SUB_NC_U16 : VOP3Inst <"v_sub_nc_u16", VOP3_Profile, sub>; - - def : OpSelBinOpClampPat; - def : OpSelBinOpClampPat; - - // Undo sub x, c -> add x, -c canonicalization since c is more likely - // an inline immediate than -c. - def : GCNPat< - (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)), - (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0) - >; + defm V_ADD_NC_U16 : VOP3Inst_t16 <"v_add_nc_u16", VOP_I16_I16_I16, add>; + defm V_SUB_NC_U16 : VOP3Inst_t16 <"v_sub_nc_u16", VOP_I16_I16_I16, sub>; } // End SubtargetPredicate = isGFX10Plus +let True16Predicate = NotHasTrue16BitInsts, SubtargetPredicate = isGFX10Plus in { + def : OpSelBinOpClampPat; + def : OpSelBinOpClampPat; + // Undo sub x, c -> add x, -c canonicalization since c is more likely + // an inline immediate than -c. 
+ def : GCNPat< + (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)), + (V_SUB_NC_U16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0) + >; +} // End True16Predicate = NotHasTrue16BitInsts, SubtargetPredicate = isGFX10Plus + +let True16Predicate = UseFakeTrue16Insts in { + def : OpSelBinOpClampPat; + def : OpSelBinOpClampPat; + def : GCNPat< + (add i16:$src0, (i16 NegSubInlineIntConst16:$src1)), + (V_SUB_NC_U16_fake16_e64 0, VSrc_b16:$src0, 0, NegSubInlineIntConst16:$src1, 0, 0) + >; +} // End True16Predicate = UseFakeTrue16Insts + let SubtargetPredicate = isGFX12Plus in { let Constraints = "$vdst = $vdst_in", DisableEncoding="$vdst_in" in { defm V_PERMLANE16_VAR_B32 : VOP3Inst<"v_permlane16_var_b32", VOP3_PERMLANE_VAR_Profile>; @@ -1104,6 +1115,17 @@ multiclass VOP3_Realtriple_with_name_gfx11_gfx12 op, string opName, multiclass VOP3Dot_Realtriple_gfx11_gfx12 op> : VOP3Dot_Realtriple, VOP3Dot_Realtriple; +multiclass VOP3_Realtriple_t16_gfx11_gfx12 op, string asmName, string opName = NAME, + string pseudo_mnemonic = "", bit isSingle = 0> : + VOP3_Realtriple_with_name, + VOP3_Realtriple_with_name; + +multiclass VOP3_Realtriple_t16_and_fake16_gfx11_gfx12 op, string asmName, string opName = NAME, + string pseudo_mnemonic = "", bit isSingle = 0> { + defm opName#"_t16": VOP3_Realtriple_t16_gfx11_gfx12; + defm opName#"_fake16": VOP3_Realtriple_t16_gfx11_gfx12; +} + multiclass VOP3be_Real_gfx11_gfx12 op, string opName, string asmName> : VOP3be_Real, VOP3be_Real; @@ -1189,8 +1211,8 @@ defm V_DIV_SCALE_F32 : VOP3be_Real_gfx11_gfx12<0x2fc, "V_DIV_SCALE_F32", " defm V_DIV_SCALE_F64 : VOP3be_Real_gfx11_gfx12<0x2fd, "V_DIV_SCALE_F64", "v_div_scale_f64">; defm V_MAD_U64_U32_gfx11 : VOP3be_Real_gfx11<0x2fe, "V_MAD_U64_U32_gfx11", "v_mad_u64_u32">; defm V_MAD_I64_I32_gfx11 : VOP3be_Real_gfx11<0x2ff, "V_MAD_I64_I32_gfx11", "v_mad_i64_i32">; -defm V_ADD_NC_U16 : VOP3Only_Realtriple_gfx11_gfx12<0x303>; -defm V_SUB_NC_U16 : VOP3Only_Realtriple_gfx11_gfx12<0x304>; +defm V_ADD_NC_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x303, "v_add_nc_u16">; +defm V_SUB_NC_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x304, "v_sub_nc_u16">; defm V_MUL_LO_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x305, "v_mul_lo_u16">; defm V_CVT_PK_I16_F32 : VOP3_Realtriple_gfx11_gfx12<0x306>; defm V_CVT_PK_U16_F32 : VOP3_Realtriple_gfx11_gfx12<0x307>; @@ -1198,8 +1220,8 @@ defm V_MAX_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30 defm V_MAX_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30a, "v_max_i16">; defm V_MIN_U16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30b, "v_min_u16">; defm V_MIN_I16 : VOP3Only_Realtriple_t16_and_fake16_gfx11_gfx12<0x30c, "v_min_i16">; -defm V_ADD_NC_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x30d, "V_ADD_I16", "v_add_nc_i16">; -defm V_SUB_NC_I16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x30e, "V_SUB_I16", "v_sub_nc_i16">; +defm V_ADD_NC_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x30d, "v_add_nc_i16", "V_ADD_I16">; +defm V_SUB_NC_I16 : VOP3_Realtriple_t16_and_fake16_gfx11_gfx12<0x30e, "v_sub_nc_i16", "V_SUB_I16">; defm V_PACK_B32_F16 : VOP3_Realtriple_gfx11_gfx12<0x311>; defm V_CVT_PK_NORM_I16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x312, "V_CVT_PKNORM_I16_F16" , "v_cvt_pk_norm_i16_f16" >; defm V_CVT_PK_NORM_U16_F16 : VOP3_Realtriple_with_name_gfx11_gfx12<0x313, "V_CVT_PKNORM_U16_F16" , "v_cvt_pk_norm_u16_f16" >; diff --git a/llvm/lib/Target/AMDGPU/VOPInstructions.td b/llvm/lib/Target/AMDGPU/VOPInstructions.td index 
05a7d907d237..aab5dc7465d9 100644 --- a/llvm/lib/Target/AMDGPU/VOPInstructions.td +++ b/llvm/lib/Target/AMDGPU/VOPInstructions.td @@ -111,7 +111,7 @@ class VOP3_Pseudo pattern = [], bit HasFP8DstByteSel = P.HasFP8DstByteSel; - let AsmOperands = !if(isVop3OpSel, + let AsmOperands = !if(!and(!not(P.IsTrue16), isVop3OpSel), P.AsmVOP3OpSel, !if(!and(isVOP3P, P.IsPacked), P.AsmVOP3P, P.Asm64)); @@ -178,6 +178,7 @@ class VOP3_Real : Enc64 { let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); } +// To avoid having different version of every type of operand depending on if +// they are part of a True16 instruction or not, the operand encoding should be +// the same for SGPR, imm, and VGPR_32 whether the instruction is True16 or not. +class VOP3a_t16 : Enc64 { + bits<11> vdst; + bits<4> src0_modifiers; + bits<11> src0; + bits<3> src1_modifiers; + bits<11> src1; + bits<3> src2_modifiers; + bits<11> src2; + bits<1> clamp; + bits<2> omod; + + let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); + let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); + let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); + // 16-bit select fields which can be interpreted as OpSel or hi/lo suffix + let Inst{11} = !if(P.HasSrc0Mods, src0_modifiers{2}, 0); + let Inst{12} = !if(P.HasSrc1Mods, src1_modifiers{2}, 0); + let Inst{13} = !if(P.HasSrc2Mods, src2_modifiers{2}, 0); + let Inst{14} = !if(!and(P.HasDst, P.HasSrc0Mods), src0_modifiers{3}, 0); + let Inst{15} = !if(P.HasClamp, clamp{0}, 0); + + let Inst{31-26} = 0x35; + let Inst{40-32} = !if(P.HasSrc0, src0{8-0}, 0); + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); + let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0); + let Inst{60-59} = !if(P.HasOMod, omod, 0); + let Inst{61} = !if(P.HasSrc0Mods, src0_modifiers{0}, 0); + let Inst{62} = !if(P.HasSrc1Mods, src1_modifiers{0}, 0); + let Inst{63} = !if(P.HasSrc2Mods, src2_modifiers{0}, 0); +} + class VOP3a_gfx6_gfx7 op, VOPProfile p> : VOP3a
<p>
{ let Inst{11} = !if(p.HasClamp, clamp{0}, 0); let Inst{25-17} = op; @@ -272,6 +308,10 @@ class VOP3e_gfx10 op, VOPProfile p> : VOP3a_gfx10 { class VOP3e_gfx11_gfx12 op, VOPProfile p> : VOP3e_gfx10; +class VOP3e_t16_gfx11_gfx12 op, VOPProfile p> : VOP3a_t16
<p>
{ + let Inst{25-16} = op; +} + class VOP3e_vi op, VOPProfile P> : VOP3a_vi { bits<8> vdst; let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); @@ -736,7 +776,12 @@ class VOP3_DPPe_Fields : VOP3_DPPe_Fields_Base { bits<8> src0; } +class VOP3_DPPe_Fields_t16 : VOP3_DPPe_Fields_Base { + bits<11> src0; +} + // Common refers to common between DPP and DPP8 +// Base refers to a shared base between T16 and regular instructions class VOP3_DPPe_Common_Base op, VOPProfile P> : Enc96 { bits<4> src0_modifiers; bits<3> src1_modifiers; @@ -748,7 +793,7 @@ class VOP3_DPPe_Common_Base op, VOPProfile P> : Enc96 { let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); - // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs. + // 16-bit select fields which can be interpreted as OpSel or hi/lo suffix let Inst{11} = !if(P.HasOpSel, !if(P.HasSrc0Mods, src0_modifiers{2}, 0), !if(P.IsFP8SrcByteSel, byte_sel{1}, ?)); let Inst{12} = !if(P.HasOpSel, !if(P.HasSrc1Mods, src1_modifiers{2}, 0), @@ -777,6 +822,16 @@ class VOP3_DPPe_Common op, VOPProfile P> : VOP3_DPPe_Common_Base let Inst{58-50} = !if(P.HasSrc2, src2, 0); } +class VOP3_DPPe_Common_t16 op, VOPProfile P> : VOP3_DPPe_Common_Base { + bits<11> vdst; + bits<11> src1; + bits<11> src2; + + let Inst{7-0} = !if(P.EmitDst, vdst{7-0}, 0); + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); + let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0); +} + class VOP3P_DPPe_Common_Base op, VOPProfile P> : Enc96 { bits<4> src0_modifiers; bits<4> src1_modifiers; @@ -786,6 +841,7 @@ class VOP3P_DPPe_Common_Base op, VOPProfile P> : Enc96 { let Inst{8} = !if(P.HasSrc0Mods, src0_modifiers{1}, 0); // neg_hi src0 let Inst{9} = !if(P.HasSrc1Mods, src1_modifiers{1}, 0); // neg_hi src1 let Inst{10} = !if(P.HasSrc2Mods, src2_modifiers{1}, 0); // neg_hi src2 + // OPSEL must be set such that the low result only uses low inputs, and the high result only uses high inputs. 
let Inst{11} = !if(!and(P.HasSrc0, P.HasOpSel), src0_modifiers{2}, 0); // op_sel(0) let Inst{12} = !if(!and(P.HasSrc1, P.HasOpSel), src1_modifiers{2}, 0); // op_sel(1) let Inst{13} = !if(!and(P.HasSrc2, P.HasOpSel), src2_modifiers{2}, 0); // op_sel(2) @@ -810,6 +866,16 @@ class VOP3P_DPPe_Common op, VOPProfile P> : VOP3P_DPPe_Common_Base op, VOPProfile P> : VOP3P_DPPe_Common_Base { + bits<11> vdst; + bits<11> src1; + bits<11> src2; + + let Inst{7-0} = vdst{7-0}; + let Inst{49-41} = !if(P.HasSrc1, src1{8-0}, 0); + let Inst{58-50} = !if(P.HasSrc2, src2{8-0}, 0); +} + class VOP_DPP_Pseudo pattern=[], dag Ins = P.InsDPP, string asmOps = P.AsmDPP> : VOP_Pseudo { @@ -870,6 +936,7 @@ class VOP_DPP_Real : // Copy relevant pseudo op flags let isConvergent = ps.isConvergent; let SubtargetPredicate = ps.SubtargetPredicate; + let True16Predicate = ps.True16Predicate; let AssemblerPredicate = ps.AssemblerPredicate; let OtherPredicates = ps.OtherPredicates; let AsmMatchConverter = ps.AsmMatchConverter; @@ -928,11 +995,29 @@ class VOP3_DPP_Base op, VOPProfile P, bit IsDPP16> : + VOP3_DPPe_Common, + VOP3_DPPe_Fields { + + let Inst{40-32} = 0xfa; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{80-72} = dpp_ctrl; + let Inst{82} = !if(IsDPP16, fi, ?); + let Inst{83} = bound_ctrl; + + // Inst{87-84} ignored by hw + let Inst{91-88} = bank_mask; + let Inst{95-92} = row_mask; +} + class VOP3_DPP op, string OpName, VOPProfile P, bit IsDPP16, dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP), string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> : - VOP3_DPP_Base, VOP3_DPPe_Common, - VOP3_DPPe_Fields { + VOP3_DPP_Base, VOP3_DPP_Enc; + +class VOP3_DPP_Enc_t16 op, VOPProfile P, bit IsDPP16 > + : VOP3_DPPe_Common_t16, + VOP3_DPPe_Fields_t16 { let Inst{40-32} = 0xfa; let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); @@ -945,6 +1030,13 @@ class VOP3_DPP op, string OpName, VOPProfile P, bit IsDPP16, let Inst{95-92} = row_mask; } +class VOP3_DPP_t16 op, string OpName, VOPProfile P, bit IsDPP16, + dag InsDPP = !if (IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP), + string AsmDPP = !if (IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> + : VOP3_DPP_Base, + VOP3_DPP_Enc_t16 { +} + class VOP3P_DPP op, string OpName, VOPProfile P, bit IsDPP16, dag InsDPP = !if(IsDPP16, P.InsVOP3DPP16, P.InsVOP3DPP), string AsmDPP = !if(IsDPP16, P.AsmVOP3DPP16, P.AsmVOP3DPP)> : @@ -979,6 +1071,12 @@ class VOP3_DPP8e_Fields { bits<9> fi; } +class VOP3_DPP8e_Fields_t16 { + bits<11> src0; + bits<24> dpp8; + bits<9> fi; +} + class VOP_DPP8_Base : InstSI { @@ -1011,16 +1109,28 @@ class VOP3_DPP8_Base : let Size = 12; } +class VOP3_DPP8_Enc op, VOPProfile P> : + VOP3_DPPe_Common, + VOP3_DPP8e_Fields { + let Inst{40-32} = fi; + let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); + let Inst{95-72} = dpp8{23-0}; +} class VOP3_DPP8 op, string OpName, VOPProfile P> : - VOP3_DPP8_Base, VOP3_DPPe_Common, - VOP3_DPP8e_Fields { + VOP3_DPP8_Base, VOP3_DPP8_Enc; +class VOP3_DPP8_Enc_t16 op, VOPProfile P> : + VOP3_DPPe_Common_t16, + VOP3_DPP8e_Fields_t16 { let Inst{40-32} = fi; let Inst{71-64} = !if(P.HasSrc0, src0{7-0}, 0); let Inst{95-72} = dpp8{23-0}; } +class VOP3_DPP8_t16 op, string OpName, VOPProfile P> : + VOP3_DPP8_Base, VOP3_DPP8_Enc_t16; + class VOP3P_DPP8 op, string OpName, VOPProfile P> : VOP3_DPP8_Base, VOP3P_DPPe_Common, VOP3_DPP8e_Fields { @@ -1273,6 +1383,30 @@ class VOP3_Profile : VOP3_Pr } +class VOP3_Profile_True16 : VOPProfile_True16

{ + let HasClamp = !if(Features.HasClamp, 1, P.HasClamp); + let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel); + let IsMAI = !if(Features.IsMAI, 1, P.IsMAI); + let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); + + let HasModifiers = + !if (Features.IsMAI, 0, + !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers)); + let IsSingle = 1; +} + +class VOP3_Profile_Fake16 : VOPProfile_Fake16

{ + let HasClamp = !if(Features.HasClamp, 1, P.HasClamp); + let HasOpSel = !if(Features.HasOpSel, 1, P.HasOpSel); + let IsMAI = !if(Features.IsMAI, 1, P.IsMAI); + let IsPacked = !if(Features.IsPacked, 1, P.IsPacked); + + let HasModifiers = + !if (Features.IsMAI, 0, + !or(Features.IsPacked, Features.HasOpSel, P.HasModifiers)); + let IsSingle = 1; +} + // consistently gives instructions a _e64 suffix multiclass VOP3Inst_Pseudo_Wrapper pattern = [], bit VOP3Only = 0> { def _e64 : VOP3_Pseudo; @@ -1325,11 +1459,33 @@ multiclass VOP3PseudoScalarInst; } +multiclass VOP3Inst_t16_with_profiles { + let True16Predicate = NotHasTrue16BitInsts in { + defm NAME : VOP3Inst; + } + let True16Predicate = UseRealTrue16Insts in { + defm _t16 : VOP3Inst; + } + let True16Predicate = UseFakeTrue16Insts in { + defm _fake16 : VOP3Inst; + } +} + +multiclass VOP3Inst_t16 + : VOP3Inst_t16_with_profiles, + VOP3_Profile_True16, VOP3_Profile_Fake16, + node, node_t16>; + //===----------------------------------------------------------------------===// // VOP3 DPP //===----------------------------------------------------------------------===// -class Base_VOP3_DPP16 op, VOP_DPP_Pseudo ps, string opName = ps.OpName> +class VOP3_DPP16_Helper op, VOP_DPP_Pseudo ps, string opName = ps.OpName> : VOP3_DPP { let VOP3_OPSEL = ps.Pfl.HasOpSel; let IsDOT = ps.IsDOT; @@ -1342,17 +1498,43 @@ class Base_VOP3_DPP16 op, VOP_DPP_Pseudo ps, string opName = ps.OpName> let OtherPredicates = ps.OtherPredicates; } +class VOP3_DPP16_t16_Helper op, VOP_DPP_Pseudo ps, + string opName = ps.OpName> + : VOP3_DPP_t16 { + let VOP3_OPSEL = ps.Pfl.HasOpSel; + let IsDOT = ps.IsDOT; + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + let AssemblerPredicate = HasDPP16; + let SubtargetPredicate = HasDPP16; + let OtherPredicates = ps.OtherPredicates; +} + class VOP3_DPP16 op, VOP_DPP_Pseudo ps, int subtarget, string opName = ps.OpName> - : Base_VOP3_DPP16, SIMCInstr; + : VOP3_DPP16_Helper, SIMCInstr; + +class VOP3_DPP16_t16 op, VOP_DPP_Pseudo ps, int subtarget, + string opName = ps.OpName> + : VOP3_DPP16_t16_Helper, SIMCInstr; class VOP3_DPP16_Gen op, VOP_DPP_Pseudo ps, GFXGen Gen, - string opName = ps.OpName> : - VOP3_DPP16 { + string opName = ps.OpName> + : VOP3_DPP16 { let AssemblerPredicate = Gen.AssemblerPredicate; - let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate); - let DecoderNamespace = Gen.DecoderNamespace# - !if(ps.Pfl.IsRealTrue16, "", "_FAKE16"); + let DecoderNamespace = Gen.DecoderNamespace; +} + +class VOP3_DPP16_Gen_t16 op, VOP_DPP_Pseudo ps, GFXGen Gen, + string opName = ps.OpName> + : VOP3_DPP16_t16 { + let True16Predicate = + !if (ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate); + let AssemblerPredicate = Gen.AssemblerPredicate; + let DecoderNamespace = + Gen.DecoderNamespace #!if (ps.Pfl.IsRealTrue16, "", "_FAKE16"); } class Base_VOP3_DPP8 op, VOP_Pseudo ps, string opName = ps.OpName> @@ -1366,11 +1548,25 @@ class Base_VOP3_DPP8 op, VOP_Pseudo ps, string opName = ps.OpName> let SubtargetPredicate = ps.SubtargetPredicate; let OtherPredicates = ps.OtherPredicates; + let True16Predicate = ps.True16Predicate; +} + +class Base_VOP3_DPP8_t16 op, VOP_Pseudo ps, string opName = ps.OpName> + : VOP3_DPP8_t16 { + let VOP3_OPSEL = ps.Pfl.HasOpSel; + let IsDOT = ps.IsDOT; + let hasSideEffects = ps.hasSideEffects; + let Defs = ps.Defs; + let SchedRW = ps.SchedRW; + let Uses = ps.Uses; + + let OtherPredicates = ps.OtherPredicates; + let 
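An illustrative sketch (not part of the change) of how the multiclass above fans one mnemonic out into the three predicate-gated variants; the Python names are invented, while the suffixes and predicates come straight from the defm lines:

    TRUE16_VARIANTS = {
        "NotHasTrue16BitInsts": "",         # defm NAME    : plain 32-bit profile
        "UseRealTrue16Insts":   "_t16",     # defm _t16    : true 16-bit registers
        "UseFakeTrue16Insts":   "_fake16",  # defm _fake16 : 16-bit ops on VGPR_32
    }

    def expand_t16(op_name):
        """Return the record names VOP3Inst_t16 would create for op_name."""
        return [op_name + suffix for suffix in TRUE16_VARIANTS.values()]

    print(expand_t16("V_ADD_NC_U16"))
    # ['V_ADD_NC_U16', 'V_ADD_NC_U16_t16', 'V_ADD_NC_U16_fake16']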
//===----------------------------------------------------------------------===//
// VOP3 DPP
//===----------------------------------------------------------------------===//
 
-class Base_VOP3_DPP16 <bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
+class VOP3_DPP16_Helper <bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
   : VOP3_DPP {
   let VOP3_OPSEL = ps.Pfl.HasOpSel;
   let IsDOT = ps.IsDOT;
@@ -1342,17 +1498,43 @@ class Base_VOP3_DPP16 <bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
   let OtherPredicates = ps.OtherPredicates;
 }
 
+class VOP3_DPP16_t16_Helper <bits<10> op, VOP_DPP_Pseudo ps,
+                             string opName = ps.OpName>
+  : VOP3_DPP_t16<op, opName, ps.Pfl, 1> {
+  let VOP3_OPSEL = ps.Pfl.HasOpSel;
+  let IsDOT = ps.IsDOT;
+  let hasSideEffects = ps.hasSideEffects;
+  let Defs = ps.Defs;
+  let SchedRW = ps.SchedRW;
+  let Uses = ps.Uses;
+  let AssemblerPredicate = HasDPP16;
+  let SubtargetPredicate = HasDPP16;
+  let OtherPredicates = ps.OtherPredicates;
+}
+
 class VOP3_DPP16 <bits<10> op, VOP_DPP_Pseudo ps, int subtarget,
                   string opName = ps.OpName>
-  : Base_VOP3_DPP16, SIMCInstr;
+  : VOP3_DPP16_Helper, SIMCInstr;
+
+class VOP3_DPP16_t16 <bits<10> op, VOP_DPP_Pseudo ps, int subtarget,
+                      string opName = ps.OpName>
+  : VOP3_DPP16_t16_Helper<op, ps, opName>, SIMCInstr<ps.PseudoInstr, subtarget>;
 
 class VOP3_DPP16_Gen <bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen,
-                      string opName = ps.OpName> :
-  VOP3_DPP16 {
+                      string opName = ps.OpName>
+  : VOP3_DPP16 {
   let AssemblerPredicate = Gen.AssemblerPredicate;
-  let True16Predicate = !if(ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate);
-  let DecoderNamespace = Gen.DecoderNamespace#
-                         !if(ps.Pfl.IsRealTrue16, "", "_FAKE16");
+  let DecoderNamespace = Gen.DecoderNamespace;
+}
+
+class VOP3_DPP16_Gen_t16 <bits<10> op, VOP_DPP_Pseudo ps, GFXGen Gen,
+                          string opName = ps.OpName>
+  : VOP3_DPP16_t16<op, ps, Gen.Subtarget, opName> {
+  let True16Predicate =
+      !if (ps.Pfl.IsRealTrue16, UseRealTrue16Insts, NoTrue16Predicate);
+  let AssemblerPredicate = Gen.AssemblerPredicate;
+  let DecoderNamespace =
+      Gen.DecoderNamespace #!if (ps.Pfl.IsRealTrue16, "", "_FAKE16");
 }
 
 class Base_VOP3_DPP8 <bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
@@ -1366,11 +1548,25 @@ class Base_VOP3_DPP8 <bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
   let SubtargetPredicate = ps.SubtargetPredicate;
   let OtherPredicates = ps.OtherPredicates;
+  let True16Predicate = ps.True16Predicate;
+}
+
+class Base_VOP3_DPP8_t16 <bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
+  : VOP3_DPP8_t16<op, opName, ps.Pfl> {
+  let VOP3_OPSEL = ps.Pfl.HasOpSel;
+  let IsDOT = ps.IsDOT;
+  let hasSideEffects = ps.hasSideEffects;
+  let Defs = ps.Defs;
+  let SchedRW = ps.SchedRW;
+  let Uses = ps.Uses;
+
+  let OtherPredicates = ps.OtherPredicates;
+  let True16Predicate = ps.True16Predicate;
 }
 
 class Base_VOP3b_DPP16 <bits<10> op, VOP_DPP_Pseudo ps, string opName = ps.OpName>
-  : Base_VOP3_DPP16 {
+  : VOP3_DPP16_Helper {
   bits<7> sdst;
   let Inst{14 - 8} = sdst;
 }
@@ -1381,6 +1577,12 @@ class VOP3b_DPP8_Base <bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
   let Inst{14 - 8} = sdst;
 }
 
+class VOP3b_DPP8_Base_t16 <bits<10> op, VOP_Pseudo ps, string opName = ps.OpName>
+  : Base_VOP3_DPP8<op, ps, opName> {
+  bits<8> sdst;
+  let Inst{14 - 8} = sdst{7-1};
+}
+
 //===----------------------------------------------------------------------===//
 // VOP3 GFX11, GFX12
 //===----------------------------------------------------------------------===//
@@ -1420,10 +1622,11 @@ multiclass VOP3Dot_Real_Base <GFXGen Gen, bits<10> op, string opName = NAME,
 }
 
 multiclass VOP3_Real_with_name <GFXGen Gen, bits<10> op, string opName,
-                                string asmName, bit isSingle = 0> {
+                                string asmName, string pseudo_mnemonic = "", bit isSingle = 0> {
   defvar ps = !cast(opName#"_e64");
   let AsmString = asmName # ps.AsmOperands,
       IsSingle = !or(isSingle, ps.Pfl.IsSingle) in {
+    // FIXME-TRUE16 support FP8 instructions properly
     if ps.Pfl.IsFP8SrcByteSel then {
       def _e64#Gen.Suffix :
         VOP3_Real_Gen,
@@ -1432,17 +1635,27 @@ multiclass VOP3_Real_with_name <GFXGen Gen, bits<10> op, string opName,
       def _e64#Gen.Suffix :
         VOP3_Real_Gen,
        VOP3FP8OpSel_dst_bytesel_gfx11_gfx12;
-    } else if ps.Pfl.HasOpSel then {
-      def _e64#Gen.Suffix :
-        VOP3_Real_Gen,
-        VOP3OpSel_gfx11_gfx12;
     } else {
-      def _e64#Gen.Suffix :
-        VOP3_Real_Gen,
-        VOP3e_gfx11_gfx12;
+      if ps.Pfl.IsRealTrue16 then {
+        def _e64#Gen.Suffix :
+          VOP3_Real_Gen,
+          VOP3e_t16_gfx11_gfx12;
+      } else {
+        if ps.Pfl.HasOpSel then {
+          def _e64#Gen.Suffix :
+            VOP3_Real_Gen,
+            VOP3OpSel_gfx11_gfx12;
+        } else {
+          def _e64#Gen.Suffix :
+            VOP3_Real_Gen,
+            VOP3e_gfx11_gfx12;
+        }
+      }
     }
   }
-  def Gen.Suffix#"_VOP3_alias" : LetDummies, AMDGPUMnemonicAlias {
+  def Gen.Suffix#"_VOP3_alias" : LetDummies,
+      AMDGPUMnemonicAlias<!if(!empty(pseudo_mnemonic), ps.Mnemonic, pseudo_mnemonic), asmName> {
     let AssemblerPredicate = Gen.AssemblerPredicate;
   }
 }
@@ -1456,8 +1669,13 @@ multiclass VOP3_Real_No_Suffix <GFXGen Gen, bits<10> op, string opName = NAME> {
 }
 
 multiclass VOP3_Real_dpp_Base <GFXGen Gen, bits<10> op, string opName = NAME> {
-  def _e64_dpp#Gen.Suffix :
-    VOP3_DPP16_Gen(opName#"_e64"#"_dpp"), Gen>;
+  defvar ps = !cast<VOP_DPP_Pseudo>(opName#"_e64"#"_dpp");
+  if ps.Pfl.IsTrue16 then
+    def _e64_dpp#Gen.Suffix :
+      VOP3_DPP16_Gen_t16<op, ps, Gen>;
+  else
+    def _e64_dpp#Gen.Suffix :
+      VOP3_DPP16_Gen<op, ps, Gen>;
 }
 
 multiclass VOP3Dot_Real_dpp_Base <GFXGen Gen, bits<10> op, string opName = NAME> {
@@ -1552,18 +1770,14 @@ multiclass VOP3Only_Realtriple <GFXGen Gen, bits<10> op> :
   VOP3_Realtriple;
 
 multiclass VOP3_Realtriple_with_name <GFXGen Gen, bits<10> op, string opName,
-                                      string asmName, bit isSingle = 0> :
-  VOP3_Real_with_name,
+                                      string asmName, string pseudo_mnemonic = "", bit isSingle = 0> :
+  VOP3_Real_with_name<Gen, op, opName, asmName, pseudo_mnemonic, isSingle>,
   VOP3_Real_dpp_with_name,
   VOP3_Real_dpp8_with_name;
 
 multiclass VOP3Only_Realtriple_with_name <GFXGen Gen, bits<10> op, string opName,
                                           string asmName> :
-  VOP3_Realtriple_with_name;
-
-multiclass VOP3Only_Realtriple_t16 <GFXGen Gen, bits<10> op, string asmName,
-                                    string opName = NAME>
-  : VOP3Only_Realtriple_with_name;
+  VOP3_Realtriple_with_name<Gen, op, opName, asmName, "", 1>;
 
 multiclass VOP3be_Realtriple<
     GFXGen Gen, bits<10> op, bit isSingle = 0, string opName = NAME,
@@ -1579,6 +1793,16 @@ multiclass VOP3beOnly_Realtriple <GFXGen Gen, bits<10> op> :
 // VOP3 GFX11
 //===----------------------------------------------------------------------===//
 
+// VOP1 and VOP2 depend on these triple defs
+
+multiclass VOP3_Realtriple_t16_gfx11<bits<10> op, string asmName, string opName = NAME,
+                                     string pseudo_mnemonic = "", bit isSingle = 0> :
+  VOP3_Realtriple_with_name<GFX11Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
+multiclass VOP3Only_Realtriple_t16_gfx11<bits<10> op, string asmName,
+                                         string opName = NAME, string pseudo_mnemonic = "">
+  : VOP3_Realtriple_t16_gfx11<op, asmName, opName, pseudo_mnemonic, 1>;
+
 multiclass VOP3be_Real_gfx11 <bits<10> op, string opName, string asmName,
                               bit isSingle = 0> :
   VOP3be_Real;
@@ -1591,10 +1815,6 @@ multiclass VOP3_Realtriple_gfx11 <bits<10> op, bit isSingle = 0,
                                   string opName = NAME> :
   VOP3_Realtriple;
 
-multiclass VOP3Only_Realtriple_t16_gfx11 <bits<10> op, string asmName,
-                                          string opName = NAME>
-  : VOP3Only_Realtriple_with_name;
-
 //===----------------------------------------------------------------------===//
 // VOP3 GFX12
 //===----------------------------------------------------------------------===//
@@ -1610,6 +1830,16 @@ multiclass VOP3Only_Real_Base_gfx12 <bits<10> op> :
 multiclass VOP3Only_Realtriple_t16_gfx12 <bits<10> op> :
   VOP3Only_Realtriple;
 
+multiclass VOP3_Realtriple_t16_gfx12<bits<10> op, string asmName, string opName = NAME,
+                                     string pseudo_mnemonic = "", bit isSingle = 0> :
+  VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
+
+multiclass VOP3_Realtriple_t16_and_fake16_gfx12<bits<10> op, string asmName, string opName = NAME,
+                                                string pseudo_mnemonic = "", bit isSingle = 0> {
+  defm opName#"_t16":VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_t16", pseudo_mnemonic, isSingle>;
+  defm opName#"_fake16":VOP3_Realtriple_t16_gfx12<op, asmName, opName#"_fake16", pseudo_mnemonic, isSingle>;
+}
+
 multiclass VOP3be_Real_with_name_gfx12 <bits<10> op, string opName,
                                         string asmName, bit isSingle = 0> {
   defvar ps = !cast(opName#"_e64");
@@ -1624,18 +1854,14 @@ multiclass VOP3be_Real_with_name_gfx12 <bits<10> op, string opName,
 }
 
 multiclass VOP3_Realtriple_with_name_gfx12 <bits<10> op, string opName,
-                                            string asmName, bit isSingle = 0> :
-  VOP3_Realtriple_with_name;
+                                            string asmName, string pseudo_mnemonic = "", bit isSingle = 0> :
+  VOP3_Realtriple_with_name<GFX12Gen, op, opName, asmName, pseudo_mnemonic, isSingle>;
 
 multiclass VOP3Only_Realtriple_with_name_gfx11_gfx12 <bits<10> op, string opName,
                                                       string asmName> :
  VOP3Only_Realtriple_with_name,
  VOP3Only_Realtriple_with_name;
 
-multiclass VOP3Only_Realtriple_with_name_t16_gfx12 <bits<10> op, string asmName,
-                                                    string opName = NAME>
-  : VOP3Only_Realtriple_with_name;
-
 //===----------------------------------------------------------------------===//
 
 include "VOPCInstructions.td"
@@ -1705,4 +1931,4 @@ def VOPTrue16Table : GenericTable {
   let PrimaryKey = ["Opcode"];
   let PrimaryKeyName = "getTrue16OpcodeHelper";
-}
\ No newline at end of file
+}
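For readers following the encoding selection in VOP3_Real_with_name above, the same decision chain as a small illustrative Python function (invented helper; the flag names mirror the ps.Pfl fields used in the multiclass):

    def pick_vop3_encoding(pfl):
        if pfl.get("IsFP8SrcByteSel"):
            return "VOP3FP8OpSel_gfx11_gfx12"
        if pfl.get("IsFP8DstByteSel"):
            return "VOP3FP8OpSel_dst_bytesel_gfx11_gfx12"
        if pfl.get("IsRealTrue16"):
            return "VOP3e_t16_gfx11_gfx12"   # new t16 encoding from this patch
        if pfl.get("HasOpSel"):
            return "VOP3OpSel_gfx11_gfx12"
        return "VOP3e_gfx11_gfx12"

    assert pick_vop3_encoding({"IsRealTrue16": True}) == "VOP3e_t16_gfx11_gfx12"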
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
index 11411c691c39..1971cd80d568 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-add.s16.mir
@@ -2,7 +2,7 @@
 # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx900 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX6 %s
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX10 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck -check-prefix=GFX11 %s
 
 # Note: 16-bit instructions generally produce a 0 result in the high 16-bits on GFX8 and GFX9 and preserve high 16 bits on GFX10+
 
@@ -23,6 +23,7 @@ body: |
    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
    ; GFX6-NEXT: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
    ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
+   ;
    ; GFX10-LABEL: name: add_s16
    ; GFX10: liveins: $vgpr0, $vgpr1
    ; GFX10-NEXT: {{  $}}
@@ -30,6 +31,14 @@ body: |
    ; GFX10-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
    ; GFX10-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
    ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_ADD_NC_U16_e64_]]
+   ;
+   ; GFX11-LABEL: name: add_s16
+   ; GFX11: liveins: $vgpr0, $vgpr1
+   ; GFX11-NEXT: {{  $}}
+   ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+   ; GFX11-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+   ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_ADD_NC_U16_fake16_e64_]]
    %0:vgpr(s32) = COPY $vgpr0
    %1:vgpr(s32) = COPY $vgpr1
    %2:vgpr(s16) = G_TRUNC %0
@@ -56,6 +65,7 @@ body: |
    ; GFX6-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
    ; GFX6-NEXT: [[V_ADD_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_U16_e64 [[COPY]], [[COPY1]], 0, implicit $exec
    ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_ADD_U16_e64_]]
+   ;
    ; GFX10-LABEL: name: add_s16_zext_to_s32
    ; GFX10: liveins: $vgpr0, $vgpr1
    ; GFX10-NEXT: {{  $}}
@@ -65,6 +75,16 @@ body: |
    ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
    ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ADD_NC_U16_e64_]], implicit $exec
    ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+   ;
+   ; GFX11-LABEL: name: add_s16_zext_to_s32
+   ; GFX11: liveins: $vgpr0, $vgpr1
+   ; GFX11-NEXT: {{  $}}
+   ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
+   ; GFX11-NEXT: [[V_ADD_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY]], 0, [[COPY1]], 0, 0, implicit $exec
+   ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+   ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_ADD_NC_U16_fake16_e64_]], implicit $exec
+   ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
    %0:vgpr(s32) = COPY $vgpr0
    %1:vgpr(s32) = COPY $vgpr1
    %2:vgpr(s16) = G_TRUNC %0
@@ -91,12 +111,20 @@ body: |
    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6-NEXT: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
    ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
+   ;
    ; GFX10-LABEL: name: add_s16_neg_inline_const_64
    ; GFX10: liveins: $vgpr0
    ; GFX10-NEXT: {{  $}}
    ; GFX10-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX10-NEXT: [[V_SUB_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
    ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_SUB_NC_U16_e64_]]
+   ;
+   ; GFX11-LABEL: name: add_s16_neg_inline_const_64
+   ; GFX11: liveins: $vgpr0
+   ; GFX11-NEXT: {{  $}}
+   ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX11-NEXT: [[V_SUB_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_fake16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
+   ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_SUB_NC_U16_fake16_e64_]]
    %0:vgpr(s32) = COPY $vgpr0
    %1:vgpr(s16) = G_TRUNC %0
    %2:vgpr(s16) = G_CONSTANT i16 -64
@@ -121,6 +149,7 @@ body: |
    ; GFX6-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
    ; GFX6-NEXT: [[V_SUB_U16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_U16_e64 [[COPY]], 64, 0, implicit $exec
    ; GFX6-NEXT: S_ENDPGM 0, implicit [[V_SUB_U16_e64_]]
+   ;
    ; GFX10-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32
    ; GFX10: liveins: $vgpr0
    ; GFX10-NEXT: {{  $}}
@@ -129,6 +158,15 @@ body: |
    ; GFX10-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
    ; GFX10-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_SUB_NC_U16_e64_]], implicit $exec
    ; GFX10-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
+   ;
+   ; GFX11-LABEL: name: add_s16_neg_inline_const_64_zext_to_s32
+   ; GFX11: liveins: $vgpr0
+   ; GFX11-NEXT: {{  $}}
+   ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
+   ; GFX11-NEXT: [[V_SUB_NC_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_SUB_NC_U16_fake16_e64 0, [[COPY]], 0, 64, 0, 0, implicit $exec
+   ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 65535
+   ; GFX11-NEXT: [[V_AND_B32_e64_:%[0-9]+]]:vgpr_32 = V_AND_B32_e64 [[S_MOV_B32_]], [[V_SUB_NC_U16_fake16_e64_]], implicit $exec
+   ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_AND_B32_e64_]]
    %0:vgpr(s32) = COPY $vgpr0
    %1:vgpr(s16) = G_TRUNC %0
    %2:vgpr(s16) = G_CONSTANT i16 -64
diff --git a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
index 1151bde02ef6..41b61f2e09a3 100644
--- a/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
+++ b/llvm/test/CodeGen/AMDGPU/dpp_combine_gfx11.mir
@@ -82,9 +82,9 @@ body: |
 # Regression test for src_modifiers on base u16 opcode
 # GCN-LABEL: name: vop3_u16
-# GCN: %5:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec
-# GCN: %7:vgpr_32 = V_ADD_NC_U16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec
-# GCN: %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec
+# GCN: %5:vgpr_32 = V_ADD_NC_U16_fake16_e64_dpp %3, 0, %1, 0, %3, 0, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %7:vgpr_32 = V_ADD_NC_U16_fake16_e64_dpp %3, 1, %5, 2, %5, 0, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %9:vgpr_32 = V_ADD_NC_U16_fake16_e64 4, %8, 8, %7, 0, 0, implicit $exec
 name: vop3_u16
 tracksRegLiveness: true
 body: |
@@ -96,11 +96,11 @@ body: |
    %2:vgpr_32 = COPY $vgpr2
    %3:vgpr_32 = IMPLICIT_DEF
    %4:vgpr_32 = V_MOV_B32_dpp %3, %1, 1, 15, 15, 1, implicit $exec
-   %5:vgpr_32 = V_ADD_NC_U16_e64 0, %4, 0, %3, 0, 0, implicit $exec
+   %5:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, %4, 0, %3, 0, 0, implicit $exec
    %6:vgpr_32 = V_MOV_B32_dpp %3, %5, 1, 15, 15, 1, implicit $exec
-   %7:vgpr_32 = V_ADD_NC_U16_e64 1, %6, 2, %5, 0, 0, implicit $exec
+   %7:vgpr_32 = V_ADD_NC_U16_fake16_e64 1, %6, 2, %5, 0, 0, implicit $exec
    %8:vgpr_32 = V_MOV_B32_dpp %3, %7, 1, 15, 15, 1, implicit $exec
-   %9:vgpr_32 = V_ADD_NC_U16_e64 4, %8, 8, %7, 0, 0, implicit $exec
+   %9:vgpr_32 = V_ADD_NC_U16_fake16_e64 4, %8, 8, %7, 0, 0, implicit $exec
 ...
 
 name: vop3p
@@ -880,11 +880,11 @@ body: |
 # Check op_sel is all 0s when combining
 # GCN-LABEL: name: opsel_vop3
-# GCN: %4:vgpr_32 = V_ADD_I16_e64_dpp %2, 0, %0, 0, %1, 0, 0, 1, 15, 15, 1, implicit $exec
-# GCN: %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec
-# GCN: %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec
-# GCN: %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec
-# GCN: %12:vgpr_32 = V_ADD_I16_e64 8, %11, 0, %1, 0, 0, implicit $exec
+# GCN: %4:vgpr_32 = V_ADD_I16_fake16_e64_dpp %2, 0, %0, 0, %1, 0, 0, 1, 15, 15, 1, implicit $exec
+# GCN: %6:vgpr_32 = V_ADD_I16_fake16_e64 4, %5, 0, %1, 0, 0, implicit $exec
+# GCN: %8:vgpr_32 = V_ADD_I16_fake16_e64 0, %7, 4, %1, 0, 0, implicit $exec
+# GCN: %10:vgpr_32 = V_ADD_I16_fake16_e64 4, %9, 4, %1, 0, 0, implicit $exec
+# GCN: %12:vgpr_32 = V_ADD_I16_fake16_e64 8, %11, 0, %1, 0, 0, implicit $exec
 name: opsel_vop3
 tracksRegLiveness: true
 body: |
@@ -897,23 +897,23 @@ body: |
    ; Combine for op_sel:[0,0,0]
    %3:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
-   %4:vgpr_32 = V_ADD_I16_e64 0, %3, 0, %1, 0, 0, implicit $exec
+   %4:vgpr_32 = V_ADD_I16_fake16_e64 0, %3, 0, %1, 0, 0, implicit $exec
 
    ; Do not combine for op_sel:[1,0,0]
    %5:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
-   %6:vgpr_32 = V_ADD_I16_e64 4, %5, 0, %1, 0, 0, implicit $exec
+   %6:vgpr_32 = V_ADD_I16_fake16_e64 4, %5, 0, %1, 0, 0, implicit $exec
 
    ; Do not combine for op_sel:[0,1,0]
    %7:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
-   %8:vgpr_32 = V_ADD_I16_e64 0, %7, 4, %1, 0, 0, implicit $exec
+   %8:vgpr_32 = V_ADD_I16_fake16_e64 0, %7, 4, %1, 0, 0, implicit $exec
 
    ; Do not combine for op_sel:[1,1,0]
    %9:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
-   %10:vgpr_32 = V_ADD_I16_e64 4, %9, 4, %1, 0, 0, implicit $exec
+   %10:vgpr_32 = V_ADD_I16_fake16_e64 4, %9, 4, %1, 0, 0, implicit $exec
 
    ; Do not combine for op_sel:[0,0,1] (dst_op_sel only)
    %11:vgpr_32 = V_MOV_B32_dpp %2, %0, 1, 15, 15, 1, implicit $exec
-   %12:vgpr_32 = V_ADD_I16_e64 8, %11, 0, %1, 0, 0, implicit $exec
+   %12:vgpr_32 = V_ADD_I16_fake16_e64 8, %11, 0, %1, 0, 0, implicit $exec
 ...
 
 # Check op_sel is all 0s and op_sel_hi is all 1s when combining
diff --git a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
index c62b4e565078..2e2a1094ba99 100644
--- a/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
+++ b/llvm/test/CodeGen/AMDGPU/isel-amdgpu-cs-chain-preserve-cc.ll
@@ -996,7 +996,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
 ; GISEL-GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr8
 ; GISEL-GFX11-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
 ; GISEL-GFX11-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
-  ; GISEL-GFX11-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec
+  ; GISEL-GFX11-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY2]], 0, [[COPY1]], 0, 0, implicit $exec
 ; GISEL-GFX11-NEXT: [[COPY3:%[0-9]+]]:vreg_64 = COPY [[DEF]]
 ; GISEL-GFX11-NEXT: FLAT_STORE_SHORT [[COPY3]], [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
 ; GISEL-GFX11-NEXT: S_ENDPGM 0
@@ -1020,7 +1020,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
 ; DAGISEL-GFX11-WF32-NEXT: {{  $}}
 ; DAGISEL-GFX11-WF32-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
 ; DAGISEL-GFX11-WF32-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF32-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
 ; DAGISEL-GFX11-WF32-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
 ; DAGISEL-GFX11-WF32-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
 ; DAGISEL-GFX11-WF32-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
@@ -1032,7 +1032,7 @@ define amdgpu_cs_chain_preserve void @amdgpu_cs_chain_preserve_cc_i16(i16 inreg
 ; DAGISEL-GFX11-WF64-NEXT: {{  $}}
 ; DAGISEL-GFX11-WF64-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr8
 ; DAGISEL-GFX11-WF64-NEXT: [[COPY1:%[0-9]+]]:sgpr_32 = COPY $sgpr0
-  ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
+  ; DAGISEL-GFX11-WF64-NEXT: [[V_ADD_NC_U16_e64_:%[0-9]+]]:vgpr_32 = V_ADD_NC_U16_fake16_e64 0, [[COPY1]], 0, [[COPY]], 0, 0, implicit $exec
 ; DAGISEL-GFX11-WF64-NEXT: [[DEF:%[0-9]+]]:sreg_64 = IMPLICIT_DEF
 ; DAGISEL-GFX11-WF64-NEXT: [[COPY2:%[0-9]+]]:vreg_64 = COPY [[DEF]]
 ; DAGISEL-GFX11-WF64-NEXT: FLAT_STORE_SHORT killed [[COPY2]], killed [[V_ADD_NC_U16_e64_]], 0, 0, implicit $exec, implicit $flat_scr :: (store (s16) into `ptr poison`)
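The updated checks in the assembler tests that follow encode the .l/.h suffixes in the second byte of the VOP3 word. An illustrative Python decoder (invented helper; the bit positions follow the VOP3a_t16 class earlier in this patch) that reproduces the bytes seen in these tests — 0x08 for a v1.h source, 0x10 for a v2.h source, 0x58 for op_sel:[1,1,1], 0xc0 for a .h destination plus clamp:

    def decode_opsel_byte(byte1):
        return {
            "src0_hi": bool(byte1 & 0x08),  # Inst{11}
            "src1_hi": bool(byte1 & 0x10),  # Inst{12}
            "src2_hi": bool(byte1 & 0x20),  # Inst{13}
            "dst_hi":  bool(byte1 & 0x40),  # Inst{14}
            "clamp":   bool(byte1 & 0x80),  # Inst{15}
        }

    # v_add_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1] -> second byte 0x58
    assert decode_opsel_byte(0x58) == {
        "src0_hi": True, "src1_hi": True, "src2_hi": False,
        "dst_hi": True, "clamp": False,
    }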
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
index 67038f4c8eec..210d55898367 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3.s
@@ -244,49 +244,67 @@ v_add_lshl_u32 v5, src_scc, vcc_lo, -1
 v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null
 // GFX11: encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf]
 
-v_add_nc_i16 v5, v1, v2
-// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00]
+v_add_nc_i16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
 
-v_add_nc_i16 v5, v255, v255
-// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00]
+v_add_nc_i16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
 
-v_add_nc_i16 v5, s1, s2
+v_add_nc_i16 v5.l, s1, s2
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00]
 
-v_add_nc_i16 v5, s105, s105
+v_add_nc_i16 v5.l, s105, s105
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00]
 
-v_add_nc_i16 v5, vcc_lo, ttmp15
+v_add_nc_i16 v5.l, vcc_lo, ttmp15
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00]
 
-v_add_nc_i16 v5, vcc_hi, 0xfe0b
+v_add_nc_i16 v5.l, vcc_hi, 0xfe0b
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_add_nc_i16 v5, ttmp15, src_scc
+v_add_nc_i16 v5.l, ttmp15, src_scc
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00]
 
-v_add_nc_i16 v5, m0, 0.5
+v_add_nc_i16 v5.l, m0, 0.5
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00]
 
-v_add_nc_i16 v5, exec_lo, -1
+v_add_nc_i16 v5.l, exec_lo, -1
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00]
 
-v_add_nc_i16 v5, exec_hi, null
+v_add_nc_i16 v5.l, exec_hi, null
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00]
 
-v_add_nc_i16 v5, null, exec_lo op_sel:[1,1,1]
+v_add_nc_i16 v5.l, null, exec_lo
+// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_add_nc_i16 v5.l, -1, exec_hi
+// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_add_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1]
 // GFX11: encoding: [0x05,0x58,0x0d,0xd7,0x7c,0xfc,0x00,0x00]
 
-v_add_nc_i16 v5, -1, exec_hi op_sel:[0,0,0]
+v_add_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0]
 // GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00]
 
-v_add_nc_i16 v5, 0.5, m0 op_sel:[1,0,0]
+v_add_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0]
 // GFX11: encoding: [0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00]
 
-v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
 // GFX11: encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
 
-v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX11: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_add_nc_i16 v5.l, src_scc, vcc_lo
+// GFX11: encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00]
+
+v_add_nc_i16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_i16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00]
+
+v_add_nc_i16 v255.h, 0xfe0b, vcc_hi clamp
 // GFX11: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_add_nc_i32 v5, v1, v2
@@ -334,49 +352,67 @@ v_add_nc_i32 v5, src_scc, vcc_lo
 v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp
 // GFX11: encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_add_nc_u16 v5, v1, v2
-// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00]
+v_add_nc_u16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
 
-v_add_nc_u16 v5, v255, v255
-// GFX11: encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00]
+v_add_nc_u16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
 
-v_add_nc_u16 v5, s1, s2
+v_add_nc_u16 v5.l, s1, s2
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00]
 
-v_add_nc_u16 v5, s105, s105
+v_add_nc_u16 v5.l, s105, s105
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00]
 
-v_add_nc_u16 v5, vcc_lo, ttmp15
+v_add_nc_u16 v5.l, vcc_lo, ttmp15
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00]
 
-v_add_nc_u16 v5, vcc_hi, 0xfe0b
+v_add_nc_u16 v5.l, vcc_hi, 0xfe0b
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_add_nc_u16 v5, ttmp15, src_scc
+v_add_nc_u16 v5.l, ttmp15, src_scc
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00]
 
-v_add_nc_u16 v5, m0, 0.5
+v_add_nc_u16 v5.l, m0, 0.5
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00]
 
-v_add_nc_u16 v5, exec_lo, -1
+v_add_nc_u16 v5.l, exec_lo, -1
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00]
 
-v_add_nc_u16 v5, exec_hi, null
+v_add_nc_u16 v5.l, exec_hi, null
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00]
 
-v_add_nc_u16 v5, null, exec_lo op_sel:[1,1,1]
+v_add_nc_u16 v5.l, null, exec_lo
+// GFX11: encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_add_nc_u16 v5.l, -1, exec_hi
+// GFX11: encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_add_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1]
 // GFX11: encoding: [0x05,0x58,0x03,0xd7,0x7c,0xfc,0x00,0x00]
 
-v_add_nc_u16 v5, -1, exec_hi op_sel:[0,0,0]
+v_add_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0]
 // GFX11: encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00]
 
-v_add_nc_u16 v5, 0.5, m0 op_sel:[1,0,0]
+v_add_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0]
 // GFX11: encoding: [0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00]
 
-v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
 // GFX11: encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00]
 
-v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX11: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_add_nc_u16 v5.l, src_scc, vcc_lo
+// GFX11: encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00]
+
+v_add_nc_u16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00]
+
+v_add_nc_u16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00]
+
+v_add_nc_u16 v255.h, 0xfe0b, vcc_hi clamp
 // GFX11: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_alignbit_b32 v5, v1, v2, s3
@@ -5801,49 +5837,67 @@ v_sub_co_u32 v5, ttmp[14:15], src_scc, vcc_lo
 v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp
 // GFX11: encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_sub_nc_i16 v5, v1, v2
-// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00]
+v_sub_nc_i16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
 
-v_sub_nc_i16 v5, v255, v255
-// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00]
+v_sub_nc_i16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
 
-v_sub_nc_i16 v5, s1, s2
+v_sub_nc_i16 v5.l, s1, s2
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00]
 
-v_sub_nc_i16 v5, s105, s105
+v_sub_nc_i16 v5.l, s105, s105
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00]
 
-v_sub_nc_i16 v5, vcc_lo, ttmp15
+v_sub_nc_i16 v5.l, vcc_lo, ttmp15
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00]
 
-v_sub_nc_i16 v5, vcc_hi, 0xfe0b
+v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sub_nc_i16 v5, ttmp15, src_scc
+v_sub_nc_i16 v5.l, ttmp15, src_scc
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00]
 
-v_sub_nc_i16 v5, m0, 0.5
+v_sub_nc_i16 v5.l, m0, 0.5
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00]
 
-v_sub_nc_i16 v5, exec_lo, -1
+v_sub_nc_i16 v5.l, exec_lo, -1
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00]
 
-v_sub_nc_i16 v5, exec_hi, null
+v_sub_nc_i16 v5.l, exec_hi, null
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00]
 
-v_sub_nc_i16 v5, null, exec_lo op_sel:[1,1,1]
+v_sub_nc_i16 v5.l, null, exec_lo
+// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_sub_nc_i16 v5.l, -1, exec_hi
+// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_sub_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1]
 // GFX11: encoding: [0x05,0x58,0x0e,0xd7,0x7c,0xfc,0x00,0x00]
 
-v_sub_nc_i16 v5, -1, exec_hi op_sel:[0,0,0]
+v_sub_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0]
 // GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00]
 
-v_sub_nc_i16 v5, 0.5, m0 op_sel:[1,0,0]
+v_sub_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0]
 // GFX11: encoding: [0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00]
 
-v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
 // GFX11: encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
 
-v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX11: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sub_nc_i16 v5.l, src_scc, vcc_lo
+// GFX11: encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00]
+
+v_sub_nc_i16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_i16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00]
+
+v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi clamp
 // GFX11: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_sub_nc_i32 v5, v1, v2
@@ -5891,49 +5945,67 @@ v_sub_nc_i32 v5, src_scc, vcc_lo
 v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp
 // GFX11: encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf]
 
-v_sub_nc_u16 v5, v1, v2
-// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00]
+v_sub_nc_u16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
 
-v_sub_nc_u16 v5, v255, v255
-// GFX11: encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00]
+v_sub_nc_u16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
 
-v_sub_nc_u16 v5, s1, s2
+v_sub_nc_u16 v5.l, s1, s2
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00]
 
-v_sub_nc_u16 v5, s105, s105
+v_sub_nc_u16 v5.l, s105, s105
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00]
 
-v_sub_nc_u16 v5, vcc_lo, ttmp15
+v_sub_nc_u16 v5.l, vcc_lo, ttmp15
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00]
 
-v_sub_nc_u16 v5, vcc_hi, 0xfe0b
+v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00]
 
-v_sub_nc_u16 v5, ttmp15, src_scc
+v_sub_nc_u16 v5.l, ttmp15, src_scc
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00]
 
-v_sub_nc_u16 v5, m0, 0.5
+v_sub_nc_u16 v5.l, m0, 0.5
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00]
 
-v_sub_nc_u16 v5, exec_lo, -1
+v_sub_nc_u16 v5.l, exec_lo, -1
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00]
 
-v_sub_nc_u16 v5, exec_hi, null
+v_sub_nc_u16 v5.l, exec_hi, null
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00]
 
-v_sub_nc_u16 v5, null, exec_lo op_sel:[1,1,1]
+v_sub_nc_u16 v5.l, null, exec_lo
+// GFX11: encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00]
+
+v_sub_nc_u16 v5.l, -1, exec_hi
+// GFX11: encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
+
+v_sub_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1]
 // GFX11: encoding: [0x05,0x58,0x04,0xd7,0x7c,0xfc,0x00,0x00]
 
-v_sub_nc_u16 v5, -1, exec_hi op_sel:[0,0,0]
+v_sub_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0]
 // GFX11: encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00]
 
-v_sub_nc_u16 v5, 0.5, m0 op_sel:[1,0,0]
+v_sub_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0]
 // GFX11: encoding: [0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00]
 
-v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0]
+v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0]
 // GFX11: encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00]
 
-v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp
+// GFX11: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
+
+v_sub_nc_u16 v5.l, src_scc, vcc_lo
+// GFX11: encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00]
+
+v_sub_nc_u16 v5.l, v1.h, v2.l
+// GFX11: encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00]
+
+v_sub_nc_u16 v5.l, v255.l, v255.h
+// GFX11: encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00]
+
+v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi clamp
 // GFX11: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00]
 
 v_subrev_co_u32 v5, s6, v1, v2
diff --git a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
index 3c693c556194..c82b61e21edf 100644
--- a/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
+++ b/llvm/test/MC/AMDGPU/gfx11_asm_vop3_dpp16.s
@@ -194,47 +194,47 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bo
 v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: [0xff,0x00,0x47,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15
 // GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-v_add_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
 
-v_add_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
 v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x26,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -278,47 +278,47 @@ v_add_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
 v_add_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: [0xff,0x80,0x26,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15
 // GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_add_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
 
-v_add_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
 v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff]
@@ -4116,47 +4116,47 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask:
 v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: [0xff,0xfc,0x01,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15
 // GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
 
-v_sub_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
 v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -4200,47 +4200,47 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr
 v_sub_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: [0xff,0x80,0x25,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0]
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3]
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15
 // GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf
+// GFX11: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1
+// GFX11: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 
-v_sub_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
-// GFX11: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0
+// GFX11: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13]
 
-v_sub_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
-// GFX11: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
+// GFX11: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
 v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0]
 // W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -4475,30 +4475,6 @@ v_xor_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1
 v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1
 // GFX11: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30]
 
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
 v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
 // GFX11: [0x05,0x0a,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13]
@@ -4724,30 +4700,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 ban
 v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
 // GFX11: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf
-// GFX11: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1
-// GFX11: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
-
-v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3
-// GFX11: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
-
-v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1
-// GFX11: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
-
 v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1
 // GFX11: encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00]
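Before the DPP8 assembler tests, a note on the lane-select operand: dpp8:[...] packs eight 3-bit selects little-endian into Inst{95-72}, the last three bytes of the 12-byte encoding. An illustrative Python sketch (invented helper) that reproduces the 0x77,0x39,0x05 tail seen throughout the checks below:

    def pack_dpp8(sels):
        assert len(sels) == 8 and all(0 <= s <= 7 for s in sels)
        word = 0
        for lane, sel in enumerate(sels):
            word |= sel << (3 * lane)   # 3 bits per lane, lane 0 lowest
        return word.to_bytes(3, "little")

    # Matches the trailing bytes in the GFX11 encodings below.
    assert pack_dpp8([7, 6, 5, 4, 3, 2, 1, 0]) == bytes([0x77, 0x39, 0x05])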
+// GFX11: [0x05,0x10,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

-v_add_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc0,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05]
@@ -2601,14 +2619,23 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: [0xff,0xfc,0x01,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

-v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

-v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: [0x05,0x00,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x00,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x10,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

-v_sub_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc0,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0x25,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
@@ -2619,14 +2646,23 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
v_sub_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
// GFX11: [0xff,0x80,0x25,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

-v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0]
// GFX11: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

-v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1
-// GFX11: [0x05,0x00,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x00,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0]
+// GFX11: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05]
+
+v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1
+// GFX11: [0x05,0x10,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05]

-v_sub_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
-// GFX11: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]
+v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0
+// GFX11: [0xff,0xc0,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00]

v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0]
// W32: 
[0x05,0x06,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -2748,30 +2784,6 @@ v_xor_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX11: [0xff,0x00,0x64,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX11: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX11: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] // GFX11: [0x05,0x0a,0x12,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] @@ -2997,30 +3009,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 // GFX11: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX11: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX11: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX11: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] // GFX11: encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s index f28933ec3a89..1ae1eaf1ceea 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3.s @@ -208,49 +208,58 @@ v_add_lshl_u32 v5, src_scc, vcc_lo, -1 v_add_lshl_u32 v255, 0xaf123456, vcc_hi, 
null // GFX12: encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] -v_add_nc_i16 v5, v1, v2 +v_add_nc_i16 v5.l, v1.l, v2.l // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00] -v_add_nc_i16 v5, v255, v255 +v_add_nc_i16 v5.l, v1.h, v2.l +// GFX12: encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_i16 v5.l, v255.l, v255.l // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00] -v_add_nc_i16 v5, s1, s2 +v_add_nc_i16 v5.l, v255.l, v255.h +// GFX12: encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] + +v_add_nc_i16 v5.l, s1, s2 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] -v_add_nc_i16 v5, s105, s105 +v_add_nc_i16 v5.l, s105, s105 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] -v_add_nc_i16 v5, vcc_lo, ttmp15 +v_add_nc_i16 v5.l, vcc_lo, ttmp15 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] -v_add_nc_i16 v5, vcc_hi, 0xfe0b +v_add_nc_i16 v5.l, vcc_hi, 0xfe0b // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -v_add_nc_i16 v5, ttmp15, src_scc +v_add_nc_i16 v5.l, ttmp15, src_scc // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] -v_add_nc_i16 v5, m0, 0.5 +v_add_nc_i16 v5.l, m0, 0.5 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00] -v_add_nc_i16 v5, exec_lo, -1 +v_add_nc_i16 v5.l, exec_lo, -1 // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] -v_add_nc_i16 v5, exec_hi, null +v_add_nc_i16 v5.l, exec_hi, null // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] -v_add_nc_i16 v5, null, exec_lo op_sel:[1,1,1] +v_add_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1] // GFX12: encoding: [0x05,0x58,0x0d,0xd7,0x7c,0xfc,0x00,0x00] -v_add_nc_i16 v5, -1, exec_hi op_sel:[0,0,0] +v_add_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0] // GFX12: encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00] -v_add_nc_i16 v5, 0.5, m0 op_sel:[1,0,0] +v_add_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0] // GFX12: encoding: [0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00] -v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] +v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] // GFX12: encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00] -v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX12: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_add_nc_i16 v255.h, 0xfe0b, vcc_hi clamp // GFX12: encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_add_nc_i32 v5, v1, v2 @@ -298,49 +307,58 @@ v_add_nc_i32 v5, src_scc, vcc_lo v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp // GFX12: encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -v_add_nc_u16 v5, v1, v2 +v_add_nc_u16 v5.l, v1.l, v2.l // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00] -v_add_nc_u16 v5, v255, v255 +v_add_nc_u16 v5.l, v1.h, v2.l +// GFX12: encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] + +v_add_nc_u16 v5.l, v255.l, v255.l // GFX12: encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00] -v_add_nc_u16 v5, s1, s2 +v_add_nc_u16 v5.l, v255.l, v255.h +// GFX12: encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] + +v_add_nc_u16 v5.l, s1, s2 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] -v_add_nc_u16 v5, s105, s105 +v_add_nc_u16 v5.l, s105, s105 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] -v_add_nc_u16 v5, vcc_lo, ttmp15 +v_add_nc_u16 v5.l, vcc_lo, ttmp15 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] -v_add_nc_u16 v5, vcc_hi, 0xfe0b +v_add_nc_u16 v5.l, 
vcc_hi, 0xfe0b // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -v_add_nc_u16 v5, ttmp15, src_scc +v_add_nc_u16 v5.l, ttmp15, src_scc // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] -v_add_nc_u16 v5, m0, 0.5 +v_add_nc_u16 v5.l, m0, 0.5 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00] -v_add_nc_u16 v5, exec_lo, -1 +v_add_nc_u16 v5.l, exec_lo, -1 // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] -v_add_nc_u16 v5, exec_hi, null +v_add_nc_u16 v5.l, exec_hi, null // GFX12: encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] -v_add_nc_u16 v5, null, exec_lo op_sel:[1,1,1] +v_add_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1] // GFX12: encoding: [0x05,0x58,0x03,0xd7,0x7c,0xfc,0x00,0x00] -v_add_nc_u16 v5, -1, exec_hi op_sel:[0,0,0] +v_add_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0] // GFX12: encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00] -v_add_nc_u16 v5, 0.5, m0 op_sel:[1,0,0] +v_add_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0] // GFX12: encoding: [0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00] -v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] +v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] // GFX12: encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00] -v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX12: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_add_nc_u16 v255.h, 0xfe0b, vcc_hi clamp // GFX12: encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_alignbit_b32 v5, v1, v2, s3 @@ -5696,49 +5714,58 @@ v_sub_co_u32 v5, ttmp[14:15], src_scc, vcc_lo v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp // GFX12: encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -v_sub_nc_i16 v5, v1, v2 +v_sub_nc_i16 v5.l, v1.l, v2.l // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] -v_sub_nc_i16 v5, v255, v255 +v_sub_nc_i16 v5.l, v1.h, v2.l +// GFX12: encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_i16 v5.l, v255.l, v255.l // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] -v_sub_nc_i16 v5, s1, s2 +v_sub_nc_i16 v5.l, v255.l, v255.h +// GFX12: encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] + +v_sub_nc_i16 v5.l, s1, s2 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] -v_sub_nc_i16 v5, s105, s105 +v_sub_nc_i16 v5.l, s105, s105 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] -v_sub_nc_i16 v5, vcc_lo, ttmp15 +v_sub_nc_i16 v5.l, vcc_lo, ttmp15 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] -v_sub_nc_i16 v5, vcc_hi, 0xfe0b +v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -v_sub_nc_i16 v5, ttmp15, src_scc +v_sub_nc_i16 v5.l, ttmp15, src_scc // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] -v_sub_nc_i16 v5, m0, 0.5 +v_sub_nc_i16 v5.l, m0, 0.5 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00] -v_sub_nc_i16 v5, exec_lo, -1 +v_sub_nc_i16 v5.l, exec_lo, -1 // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] -v_sub_nc_i16 v5, exec_hi, null +v_sub_nc_i16 v5.l, exec_hi, null // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] -v_sub_nc_i16 v5, null, exec_lo op_sel:[1,1,1] +v_sub_nc_i16 v5.h, null, exec_lo op_sel:[1,1,1] // GFX12: encoding: [0x05,0x58,0x0e,0xd7,0x7c,0xfc,0x00,0x00] -v_sub_nc_i16 v5, -1, exec_hi op_sel:[0,0,0] +v_sub_nc_i16 v5.l, -1, exec_hi op_sel:[0,0,0] // GFX12: encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00] -v_sub_nc_i16 
v5, 0.5, m0 op_sel:[1,0,0] +v_sub_nc_i16 v5.l, 0.5, m0 op_sel:[1,0,0] // GFX12: encoding: [0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00] -v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] +v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] // GFX12: encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] -v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX12: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi clamp // GFX12: encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_sub_nc_i32 v5, v1, v2 @@ -5786,49 +5813,58 @@ v_sub_nc_i32 v5, src_scc, vcc_lo v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp // GFX12: encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] -v_sub_nc_u16 v5, v1, v2 +v_sub_nc_u16 v5.l, v1.l, v2.l // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] -v_sub_nc_u16 v5, v255, v255 +v_sub_nc_u16 v5.l, v1.h, v2.l +// GFX12: encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] + +v_sub_nc_u16 v5.l, v255.l, v255.l // GFX12: encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] -v_sub_nc_u16 v5, s1, s2 +v_sub_nc_u16 v5.l, v255.l, v255.h +// GFX12: encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] + +v_sub_nc_u16 v5.l, s1, s2 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] -v_sub_nc_u16 v5, s105, s105 +v_sub_nc_u16 v5.l, s105, s105 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] -v_sub_nc_u16 v5, vcc_lo, ttmp15 +v_sub_nc_u16 v5.l, vcc_lo, ttmp15 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] -v_sub_nc_u16 v5, vcc_hi, 0xfe0b +v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] -v_sub_nc_u16 v5, ttmp15, src_scc +v_sub_nc_u16 v5.l, ttmp15, src_scc // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] -v_sub_nc_u16 v5, m0, 0.5 +v_sub_nc_u16 v5.l, m0, 0.5 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00] -v_sub_nc_u16 v5, exec_lo, -1 +v_sub_nc_u16 v5.l, exec_lo, -1 // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] -v_sub_nc_u16 v5, exec_hi, null +v_sub_nc_u16 v5.l, exec_hi, null // GFX12: encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] -v_sub_nc_u16 v5, null, exec_lo op_sel:[1,1,1] +v_sub_nc_u16 v5.h, null, exec_lo op_sel:[1,1,1] // GFX12: encoding: [0x05,0x58,0x04,0xd7,0x7c,0xfc,0x00,0x00] -v_sub_nc_u16 v5, -1, exec_hi op_sel:[0,0,0] +v_sub_nc_u16 v5.l, -1, exec_hi op_sel:[0,0,0] // GFX12: encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00] -v_sub_nc_u16 v5, 0.5, m0 op_sel:[1,0,0] +v_sub_nc_u16 v5.l, 0.5, m0 op_sel:[1,0,0] // GFX12: encoding: [0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00] -v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] +v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] // GFX12: encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] -v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp +// GFX12: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] + +v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi clamp // GFX12: encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] v_subrev_co_u32 v5, s6, v1, v2 diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s index adf37901fc85..56bd0ee4b474 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp16.s @@ -214,47 +214,71 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 
0.5 row_xmask:0 row_mask:0x1 bank_mask:0x3 bo v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x00,0x47,0xd6,0xfa,0xfe,0xf7,0x03,0xff,0x6f,0x05,0x30] -v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX12: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_add_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX12: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 -// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_add_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_add_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX12: 
[0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 // GFX12: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_add_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: [0xff,0x80,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_add_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x26,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -298,47 +322,71 @@ v_add_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_add_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x80,0x26,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX12: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_add_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX12: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: 
[0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 -// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_add_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_add_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 // GFX12: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_add_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: [0xff,0x80,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_alignbit_b32_e64_dpp v5, v1, v2, v3 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x16,0xd6,0xfa,0x04,0x0e,0x04,0x01,0x1b,0x00,0xff] @@ -4622,47 +4670,71 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 row_xmask:0 row_mask:0x1 bank_mask: v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0xfc,0x01,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] -// GFX12: 
[0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX12: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_sub_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX12: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 -// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_sub_nc_i16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, 
v1.l, v2.l row_shr:15 +// GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 // GFX12: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_sub_nc_i16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: [0xff,0x80,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_sub_nc_i32_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] // GFX12: [0x05,0x00,0x25,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -4706,47 +4778,71 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctr v_sub_nc_i32_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x80,0x25,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 +// GFX12: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] -v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +v_sub_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 +// GFX12: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 -// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 -// GFX12: 
[0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:0 row_mask:0xf bank_mask:0xf +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:0 row_mask:0xf bank_mask:0xf // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_share:15 row_mask:0x0 bank_mask:0x1 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] -v_sub_nc_u16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 +// GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 // GFX12: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] -v_sub_nc_u16_e64_dpp v255, v255, v255 clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 -// GFX12: [0xff,0x80,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] +v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h row_share:0 row_mask:0xf bank_mask:0xf +// GFX12: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] + +v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l row_share:15 row_mask:0x0 bank_mask:0x1 +// GFX12: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 fi:0 +// GFX12: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x09,0x13] + +v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 +// GFX12: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] v_subrev_co_u32_e64_dpp v5, s6, v1, v2 quad_perm:[3,2,1,0] // W32: [0x05,0x06,0x02,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -5001,30 +5097,6 @@ v_xor_b16_e64_dpp v5, v1, v2 row_xmask:0 row_mask:0x1 bank_mask:0x3 bound_ctrl:1 v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:0 fi:1 // GFX12: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x05,0x30] -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX12: 
[0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] - -v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX12: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX12: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] - -v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX12: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] - v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 // GFX12: [0x05,0x0a,0x12,0xd7,0xfa,0x04,0x02,0x20,0x01,0x60,0x01,0x13] @@ -5250,30 +5322,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] row_xmask:0 row_mask:0x1 ban v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 // GFX12: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] - -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] - -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX12: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] - -v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX12: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf -// GFX12: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 -// GFX12: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 -// GFX12: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] - -v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 -// GFX12: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] - v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 // GFX12: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] diff --git a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s index 1be122faccbc..6331d22c6976 100644 --- a/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s +++ b/llvm/test/MC/AMDGPU/gfx12_asm_vop3_dpp8.s @@ -134,14 +134,38 @@ v_add_lshl_u32_e64_dpp v5, v1, v2, 0.5 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_add_lshl_u32_e64_dpp v255, v255, v255, src_scc dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x00,0x47,0xd6,0xe9,0xfe,0xf7,0x03,0xff,0x00,0x00,0x00] -v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +v_add_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: 
[0xff,0x80,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX12: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX12: [0x05,0x00,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: [0xff,0x80,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: [0x05,0x10,0x0d,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0xc0,0x0d,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x26,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -152,14 +176,38 @@ v_add_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_add_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x80,0x26,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +v_add_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX12: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX12: [0x05,0x00,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_add_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: [0xff,0x80,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + 
+v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: [0x05,0x10,0x03,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0xc0,0x03,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_alignbit_b32_e64_dpp v5, v1, v2, v3 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x16,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x77,0x39,0x05] @@ -3043,14 +3091,38 @@ v_sub_co_u32_e64_dpp v5, ttmp[14:15], v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sub_co_u32_e64_dpp v255, null, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0xfc,0x01,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +v_sub_nc_i16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX12: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX12: [0x05,0x00,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_sub_nc_i16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: [0xff,0x80,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: [0x05,0x10,0x0e,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0xc0,0x0e,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x00,0x25,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -3061,14 +3133,38 @@ v_sub_nc_i32_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_sub_nc_i32_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x80,0x25,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] +v_sub_nc_u16_e64_dpp v255.l, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 +// GFX12: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] // GFX12: 
[0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] fi:1 // GFX12: [0x05,0x00,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] -v_sub_nc_u16_e64_dpp v255, v255, v255 clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 -// GFX12: [0xff,0x80,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l dpp8:[7,6,5,4,3,2,1,0] +// GFX12: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h dpp8:[7,6,5,4,3,2,1,0] fi:1 +// GFX12: [0x05,0x10,0x04,0xd7,0xea,0x04,0x02,0x00,0x01,0x77,0x39,0x05] + +v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l clamp dpp8:[0,0,0,0,0,0,0,0] fi:0 +// GFX12: [0xff,0xc0,0x04,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] v_subrev_co_u32_e64_dpp v5, s6, v1, v2 dpp8:[7,6,5,4,3,2,1,0] // W32: [0x05,0x06,0x02,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -3210,30 +3306,6 @@ v_xor_b16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] fi:1 v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:0 // GFX12: [0xff,0x00,0x64,0xd7,0xe9,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX12: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX12: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - v_cvt_pk_norm_i16_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] // GFX12: [0x05,0x0a,0x12,0xd7,0xe9,0x04,0x02,0x20,0x01,0x77,0x39,0x05] @@ -3459,30 +3531,6 @@ v_pack_b32_f16_e64_dpp v5, -v1, |v2| op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 // GFX12: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - 
-v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX12: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] -// GFX12: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] - -v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 -// GFX12: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] - v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] // GFX12: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt index 07058a645159..365caa5f9b6d 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3.txt @@ -189,49 +189,112 @@ # GFX11: v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] 0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf -# GFX11: v_add_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00] -0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00 - -# GFX11: v_add_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00] -0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00 - -# GFX11: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_add_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00 -# GFX11: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: 
v_add_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00 -# GFX11: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00 -# GFX11: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00 -# GFX11: v_add_nc_i16 v5, m0, 0x3800 +# W32-REAL16: v_add_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00 -# GFX11: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00 -# GFX11: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] +# 
W64-REAL16: v_add_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00 -# GFX11: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00 -# GFX11: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00] -0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00 - -# GFX11: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] -0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00 - -# GFX11: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00] -0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00 +# W32-REAL16: v_add_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00] +0x05,0x00,0x0d,0xd7,0xc1,0xfe,0x00,0x00 + +# W32-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +0x05,0x00,0x0d,0xd7,0xf0,0xfa,0x00,0x00 + +# W32-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00] +0x05,0x00,0x0d,0xd7,0xfd,0xd4,0x00,0x00 + +# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: 
[0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_add_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x26,0xd7,0x01,0x05,0x02,0x00] @@ -279,49 +342,112 @@ # GFX11: v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX11: v_add_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00] -0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00 - -# GFX11: v_add_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00] -0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00 - -# GFX11: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_add_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] 0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00 -# GFX11: v_add_nc_u16 v5, s105, s105 ; 
encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00 -# GFX11: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00 -# GFX11: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00 -# GFX11: v_add_nc_u16 v5, m0, 0x3800 +# W32-REAL16: v_add_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00 -# GFX11: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00 -# GFX11: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, 
exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] 0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00 -# GFX11: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] 0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00 -# GFX11: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00] -0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00 - -# GFX11: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] -0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00 - -# GFX11: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00] -0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00 +# W32-REAL16: v_add_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00] +0x05,0x00,0x03,0xd7,0xc1,0xfe,0x00,0x00 + +# W32-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +0x05,0x00,0x03,0xd7,0xf0,0xfa,0x00,0x00 + +# W32-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00] +0x05,0x00,0x03,0xd7,0xfd,0xd4,0x00,0x00 + +# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# 
W32-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_alignbit_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x16,0xd6,0x01,0x05,0x0e,0x00] @@ -5871,49 +5997,112 @@ # GFX11: v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX11: v_sub_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] -0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00 - -# GFX11: v_sub_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] -0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00 - -# GFX11: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] 
0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00 -# GFX11: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00 -# GFX11: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00 -# GFX11: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00 -# GFX11: v_sub_nc_i16 v5, m0, 0x3800 +# W32-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00 -# GFX11: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00 -# GFX11: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, exec_hi, null ; encoding: 
[0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00 -# GFX11: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00 -# GFX11: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00] -0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00 - -# GFX11: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] -0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00 - -# GFX11: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] -0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00 +# W32-REAL16: v_sub_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +0x05,0x00,0x0e,0xd7,0xc1,0xfe,0x00,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +0x05,0x00,0x0e,0xd7,0xf0,0xfa,0x00,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +0x05,0x00,0x0e,0xd7,0xfd,0xd4,0x00,0x00 + +# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, v1.h, 
v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX11: v_sub_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x25,0xd7,0x01,0x05,0x02,0x00] @@ -5961,49 +6150,112 @@ # GFX11: v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX11: v_sub_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] -0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00 - -# GFX11: v_sub_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] -0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00 - -# GFX11: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, s1, s2 
; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] 0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00 -# GFX11: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00 -# GFX11: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00 -# GFX11: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00 -# GFX11: v_sub_nc_u16 v5, m0, 0x3800 +# W32-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00 -# GFX11: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00 -# GFX11: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# 
W32-REAL16: v_sub_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] 0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00 -# GFX11: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] 0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00 -# GFX11: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00] -0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00 - -# GFX11: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] -0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00 - -# GFX11: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] -0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00 +# W32-REAL16: v_sub_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, -1, exec_hi ; encoding: [0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00] +0x05,0x00,0x04,0xd7,0xc1,0xfe,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +0x05,0x00,0x04,0xd7,0xf0,0xfa,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo ; encoding: [0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00] +0x05,0x00,0x04,0xd7,0xfd,0xd4,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 -# GFX11: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: 
[0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # W32: v_subrev_co_u32 v5, s12, v1, v2 ; encoding: [0x05,0x0c,0x02,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt index 4ae8b053f0e0..d0bd6398ad10 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp16.txt @@ -3824,88 +3824,220 @@ # W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 
row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: 
v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: 
[0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, 
v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: 
v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: 
[0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX11: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] @@ -4664,88 +4796,220 @@ # GFX11: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30] 0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30 -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf 
bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 
0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_i16_e64_dpp 
v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# 
W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 
0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 
+0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 
bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX11: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt index b44dba748666..cbf5a3d11e50 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx11_dasm_vop3_dpp8.txt @@ -2168,34 +2168,112 @@ # W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp 
dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, 
v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l 
op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, 
v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX11: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -2840,34 +2918,112 @@ # GFX11: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX11: v_sub_nc_u16_e64_dpp v255, v255, v255 
op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX11: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt index af04a31423b6..c87c8855f5cd 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3.txt @@ -153,49 +153,112 @@ # GFX12: v_add_lshl_u32 v255, 0xaf123456, vcc_hi, null ; encoding: [0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf] 0xff,0x00,0x47,0xd6,0xff,0xd6,0xf0,0x01,0x56,0x34,0x12,0xaf -# GFX12: v_add_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_add_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_add_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00] 0x05,0x00,0x0d,0xd7,0x01,0x05,0x02,0x00 -# GFX12: v_add_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_add_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_add_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00] 0x05,0x00,0x0d,0xd7,0xff,0xff,0x03,0x00 -# GFX12: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x01,0x04,0x00,0x00 -# GFX12: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, s105, s105 ; encoding: 
[0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x69,0xd2,0x00,0x00 -# GFX12: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x6a,0xf6,0x00,0x00 -# GFX12: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX12: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_add_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x0d,0xd7,0x7b,0xfa,0x01,0x00 -# GFX12: v_add_nc_i16 v5, m0, 0x3800 +# W32-REAL16: v_add_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0d,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x7d,0xe0,0x01,0x00 -# GFX12: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_add_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x0d,0xd7,0x7e,0x82,0x01,0x00 -# GFX12: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x7f,0xf8,0x00,0x00 -# GFX12: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] +# 
W32-REAL16: v_add_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00] 0x05,0x00,0x0d,0xd7,0x7c,0xfc,0x00,0x00 -# GFX12: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00] 0x05,0x58,0x0d,0xd7,0xc1,0xfe,0x00,0x00 -# GFX12: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] +# W32-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x08,0x0d,0xd7,0xf0,0xfa,0x00,0x00 -# GFX12: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00] 0x05,0x10,0x0d,0xd7,0xfd,0xd4,0x00,0x00 -# GFX12: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 + +# W32-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_add_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00] 
+0x05,0x08,0x0d,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_add_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_add_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x0d,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x0d,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_add_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x26,0xd7,0x01,0x05,0x02,0x00] @@ -243,49 +306,112 @@ # GFX12: v_add_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x80,0x26,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX12: v_add_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_add_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_add_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00] 0x05,0x00,0x03,0xd7,0x01,0x05,0x02,0x00 -# GFX12: v_add_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_add_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_add_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00] 0x05,0x00,0x03,0xd7,0xff,0xff,0x03,0x00 -# GFX12: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00] 0x05,0x00,0x03,0xd7,0x01,0x04,0x00,0x00 -# GFX12: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x03,0xd7,0x69,0xd2,0x00,0x00 
-# GFX12: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x03,0xd7,0x6a,0xf6,0x00,0x00 -# GFX12: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x03,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX12: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_add_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x03,0xd7,0x7b,0xfa,0x01,0x00 -# GFX12: v_add_nc_u16 v5, m0, 0x3800 +# W32-REAL16: v_add_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x03,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x03,0xd7,0x7d,0xe0,0x01,0x00 -# GFX12: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_add_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x03,0xd7,0x7e,0x82,0x01,0x00 -# GFX12: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00] 0x05,0x00,0x03,0xd7,0x7f,0xf8,0x00,0x00 -# GFX12: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, null, exec_lo ; encoding: 
[0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00] 0x05,0x00,0x03,0xd7,0x7c,0xfc,0x00,0x00 -# GFX12: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00] 0x05,0x58,0x03,0xd7,0xc1,0xfe,0x00,0x00 -# GFX12: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] +# W32-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x08,0x03,0xd7,0xf0,0xfa,0x00,0x00 -# GFX12: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00] 0x05,0x10,0x03,0xd7,0xfd,0xd4,0x00,0x00 -# GFX12: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 + +# W32-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_add_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x03,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h 
op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_add_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_add_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x03,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_add_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_add_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x03,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_alignbit_b32 v5, v1, v2, s3 ; encoding: [0x05,0x00,0x16,0xd6,0x01,0x05,0x0e,0x00] @@ -5797,49 +5923,112 @@ # GFX12: v_sub_co_u32 v255, null, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0xfc,0x01,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX12: v_sub_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00] 0x05,0x00,0x0e,0xd7,0x01,0x05,0x02,0x00 -# GFX12: v_sub_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 ; encoding: [0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00] 0x05,0x00,0x0e,0xd7,0xff,0xff,0x03,0x00 -# GFX12: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, s1, s2 ; encoding: [0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x01,0x04,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, s105, s105 ; encoding: [0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x69,0xd2,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: 
[0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x6a,0xf6,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x0e,0xd7,0x7b,0xfa,0x01,0x00 -# GFX12: v_sub_nc_i16 v5, m0, 0x3800 +# W32-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x0e,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x7d,0xe0,0x01,0x00 -# GFX12: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x0e,0xd7,0x7e,0x82,0x01,0x00 -# GFX12: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, exec_hi, null ; encoding: [0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x7f,0xf8,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, 
null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, null, exec_lo ; encoding: [0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00] 0x05,0x00,0x0e,0xd7,0x7c,0xfc,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00] 0x05,0x58,0x0e,0xd7,0xc1,0xfe,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] +# W32-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x08,0x0e,0xd7,0xf0,0xfa,0x00,0x00 -# GFX12: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00] 0x05,0x10,0x0e,0xd7,0xfd,0xd4,0x00,0x00 -# GFX12: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x0e,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# 
W32-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_i16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_i16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x0e,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_i16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x0e,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # GFX12: v_sub_nc_i32 v5, v1, v2 ; encoding: [0x05,0x00,0x25,0xd7,0x01,0x05,0x02,0x00] @@ -5887,49 +6076,112 @@ # GFX12: v_sub_nc_i32 v255, 0xaf123456, vcc_hi clamp ; encoding: [0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf] 0xff,0x80,0x25,0xd7,0xff,0xd6,0x00,0x00,0x56,0x34,0x12,0xaf -# GFX12: v_sub_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v1.l, v2.l ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00] 0x05,0x00,0x04,0xd7,0x01,0x05,0x02,0x00 -# GFX12: v_sub_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.l ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 ; encoding: [0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00] 0x05,0x00,0x04,0xd7,0xff,0xff,0x03,0x00 -# GFX12: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, s1, s2 ; encoding: [0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00] 0x05,0x00,0x04,0xd7,0x01,0x04,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, s105, s105 ; encoding: [0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00] 0x05,0x00,0x04,0xd7,0x69,0xd2,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: 
[0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, vcc_lo, ttmp15 ; encoding: [0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00] 0x05,0x00,0x04,0xd7,0x6a,0xf6,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, vcc_hi, 0xfe0b ; encoding: [0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00] 0x05,0x00,0x04,0xd7,0x6b,0xfe,0x01,0x00,0x0b,0xfe,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, ttmp15, src_scc ; encoding: [0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00] 0x05,0x00,0x04,0xd7,0x7b,0xfa,0x01,0x00 -# GFX12: v_sub_nc_u16 v5, m0, 0x3800 +# W32-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, m0, 0x3800 ; encoding: [0x05,0x00,0x04,0xd7,0x7d,0xfe,0x01,0x00,0x00,0x38,0x00,0x00] 0x05,0x00,0x04,0xd7,0x7d,0xe0,0x01,0x00 -# GFX12: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, exec_lo, -1 ; encoding: [0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00] 0x05,0x00,0x04,0xd7,0x7e,0x82,0x01,0x00 -# GFX12: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, exec_hi, null ; encoding: [0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00] 0x05,0x00,0x04,0xd7,0x7f,0xf8,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, null, 
exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, null, exec_lo ; encoding: [0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00] 0x05,0x00,0x04,0xd7,0x7c,0xfc,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.h, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, -1, exec_hi op_sel:[1,1,1] ; encoding: [0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00] 0x05,0x58,0x04,0xd7,0xc1,0xfe,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] +# W32-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, 0x3800, m0 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0xff,0xfa,0x00,0x00,0x00,0x38,0x00,0x00] 0x05,0x08,0x04,0xd7,0xf0,0xfa,0x00,0x00 -# GFX12: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, src_scc, vcc_lo op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00] 0x05,0x10,0x04,0xd7,0xfd,0xd4,0x00,0x00 -# GFX12: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-REAL16: v_sub_nc_u16 v5.l, v1.h, v2.l op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v1, v2 op_sel:[1,0,0] ; encoding: [0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00] +0x05,0x08,0x04,0xd7,0x01,0x05,0x02,0x00 + +# W32-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W32-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] 
+# W64-REAL16: v_sub_nc_u16 v5.l, v255.l, v255.h op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +# W64-FAKE16: v_sub_nc_u16 v5, v255, v255 op_sel:[0,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00] +0x05,0x10,0x04,0xd7,0xff,0xff,0x03,0x00 + +# W32-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-REAL16: v_sub_nc_u16 v255.h, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16 v255, 0xfe0b, vcc_hi op_sel:[0,0,1] clamp ; encoding: [0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00] 0xff,0xc0,0x04,0xd7,0xff,0xd6,0x00,0x00,0x0b,0xfe,0x00,0x00 # W32: v_subrev_co_u32 v5, s12, v1, v2 ; encoding: [0x05,0x0c,0x02,0xd7,0x01,0x05,0x02,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt index 65cfdd5ef7de..5081b9811e43 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp16.txt @@ -4115,88 +4115,268 @@ # W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0x00,0x64,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# 
W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff

-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff

-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff

-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff

-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff

-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff

-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff

-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff]
 0x05,0x00,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff

-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
 0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff

-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
 0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01

-# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
 0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13

-# GFX12: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30

-# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff]
+0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01]
+0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13]
+0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30
+
+# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf
bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x0d,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x0d,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, 
v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: 
v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 
fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x03,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 
fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
+# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30]
 0xff,0xc0,0x03,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30

 # GFX12: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x12,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
@@ -5000,88 +5180,268 @@
 # GFX12: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30]
 0xff,0x13,0x11,0xd7,0xfa,0xfe,0x03,0x60,0xff,0x6f,0x0d,0x30

-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff

-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
+# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff]
 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff

-# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff]
+# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l,
v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: 
[0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; 
encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 
row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x0e,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x0e,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[3,2,1,0] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1b,0x00,0xff -# GFX12: 
v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 quad_perm:[0,1,2,3] row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0xe4,0x00,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x40,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_half_mirror row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x41,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x01,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 
row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shl:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x0f,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x11,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_shr:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x1f,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:1 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x21,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# 
W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 row_ror:15 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff] 0x05,0x00,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x2f,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; 
encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, 
v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] row_share:0 row_mask:0xf bank_mask:0xf ; encoding: [0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff] 0x05,0x58,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x50,0x01,0xff -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] row_share:15 row_mask:0x0 bank_mask:0x1 ; encoding: [0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01] 0x05,0x08,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x5f,0x01,0x01 -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] row_xmask:0 row_mask:0x1 bank_mask:0x3 ; encoding: [0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13] 0x05,0x10,0x04,0xd7,0xfa,0x04,0x02,0x00,0x01,0x60,0x01,0x13 -# GFX12: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, 
v255.l, v255.l op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp row_xmask:15 row_mask:0x3 bank_mask:0x0 bound_ctrl:1 fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30] 0xff,0xc0,0x04,0xd7,0xfa,0xfe,0x03,0x00,0xff,0x6f,0x0d,0x30 # GFX12: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 quad_perm:[0,1,2,3] row_mask:0x0 bank_mask:0x0 fi:1 ; encoding: [0x00,0x00,0x66,0xd6,0xfa,0x04,0x0e,0x04,0x01,0xe4,0x04,0x00] diff --git a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt index 4640b967cbc0..77f05027d1cf 100644 --- a/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt +++ b/llvm/test/MC/Disassembler/AMDGPU/gfx12_dasm_vop3_dpp8.txt @@ -2393,34 +2393,160 @@ # W64-FAKE16: v_xor_b16_e64_dpp v255, v255, v255 dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0x00,0x64,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h 
op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
+0x05,0x10,0x0d,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x0d,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 
+# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l 
op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x58,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x08,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_add_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_add_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x10,0x03,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_add_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_add_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_add_nc_u16_e64_dpp v255, v255, 
v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0xc0,0x03,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12: v_cvt_pk_norm_i16_f16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x12,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] @@ -3113,34 +3239,160 @@ # GFX12: v_pack_b32_f16_e64_dpp v255, -|v255|, -|v255| op_sel:[0,1,0] dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00] 0xff,0x13,0x11,0xd7,0xea,0xfe,0x03,0x60,0xff,0x00,0x00,0x00 -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_i16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x0e,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_i16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_i16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x0e,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.l dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x00,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: 
[0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 + +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: 
[0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.h, v1.h, v2.h op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,1,1] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x58,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.h, v2.l op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[1,0,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x08,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-REAL16: v_sub_nc_u16_e64_dpp v5.l, v1.l, v2.h op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v5, v1, v2 op_sel:[0,1,0] dpp8:[7,6,5,4,3,2,1,0] ; encoding: [0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05] 0x05,0x10,0x04,0xd7,0xe9,0x04,0x02,0x00,0x01,0x77,0x39,0x05 -# GFX12: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W32-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-REAL16: v_sub_nc_u16_e64_dpp v255.h, v255.l, v255.l op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] +# W64-FAKE16: v_sub_nc_u16_e64_dpp v255, v255, v255 op_sel:[0,0,1] clamp dpp8:[0,0,0,0,0,0,0,0] fi:1 ; encoding: [0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00] 0xff,0xc0,0x04,0xd7,0xea,0xfe,0x03,0x00,0xff,0x00,0x00,0x00 # GFX12: v_dot2_f16_f16_e64_dpp v0, v1, v2, v3 dpp8:[0,1,2,3,4,4,4,4] ; encoding: [0x00,0x00,0x66,0xd6,0xe9,0x04,0x0e,0x04,0x01,0x88,0x46,0x92] -- GitLab From 95c24cb9de54f81b07ee4abd594fc32905063c68 Mon Sep 17 00:00:00 2001 From: OverMighty Date: Wed, 16 Oct 2024 16:33:13 
+0200
Subject: [PATCH 142/329] [libc][math][c23] Add exp10m1f16 C23 math function (#105706)

Part of #95250.
---
 libc/config/gpu/entrypoints.txt              |   1 +
 libc/config/linux/x86_64/entrypoints.txt     |   1 +
 libc/docs/math/index.rst                     |   2 +-
 libc/spec/stdc.td                            |   2 +
 libc/src/math/CMakeLists.txt                 |   2 +
 libc/src/math/exp10m1f16.h                   |  21 +++
 libc/src/math/generic/CMakeLists.txt         |  23 +++
 libc/src/math/generic/exp10f16.cpp           |  47 +-----
 libc/src/math/generic/exp10m1f16.cpp         | 163 +++++++++++++++++++
 libc/src/math/generic/expxf16.h              |  47 ++++++
 libc/test/src/math/CMakeLists.txt            |  11 ++
 libc/test/src/math/exp10m1f16_test.cpp       |  40 +++++
 libc/test/src/math/smoke/CMakeLists.txt      |  13 ++
 libc/test/src/math/smoke/exp10m1f16_test.cpp | 113 +++++++++++++
 libc/utils/MPFRWrapper/MPFRUtils.cpp         |  25 +++
 libc/utils/MPFRWrapper/MPFRUtils.h           |   1 +
 16 files changed, 467 insertions(+), 45 deletions(-)
 create mode 100644 libc/src/math/exp10m1f16.h
 create mode 100644 libc/src/math/generic/exp10m1f16.cpp
 create mode 100644 libc/test/src/math/exp10m1f16_test.cpp
 create mode 100644 libc/test/src/math/smoke/exp10m1f16_test.cpp

diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt
index b4cfe47f4505..251ad43ece8d 100644
--- a/libc/config/gpu/entrypoints.txt
+++ b/libc/config/gpu/entrypoints.txt
@@ -522,6 +522,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.ceilf16
     libc.src.math.copysignf16
     libc.src.math.exp10f16
+    libc.src.math.exp10m1f16
     libc.src.math.exp2f16
     libc.src.math.expf16
     libc.src.math.f16add
diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt
index 2589da3756e1..3ca14ec03de3 100644
--- a/libc/config/linux/x86_64/entrypoints.txt
+++ b/libc/config/linux/x86_64/entrypoints.txt
@@ -611,6 +611,7 @@ if(LIBC_TYPES_HAS_FLOAT16)
     libc.src.math.ceilf16
     libc.src.math.copysignf16
     libc.src.math.exp10f16
+    libc.src.math.exp10m1f16
     libc.src.math.exp2f16
     libc.src.math.exp2m1f16
     libc.src.math.expf16
diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst
index 72e8f6689a36..95ac7f4f12f9 100644
--- a/libc/docs/math/index.rst
+++ b/libc/docs/math/index.rst
@@ -292,7 +292,7 @@ Higher Math Functions
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | exp10     | |check|          | |check|         |                        | |check|              |                        | 7.12.6.2               | F.10.3.2                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
-| exp10m1   |                  |                 |                        |                      |                        | 7.12.6.3               | F.10.3.3                   |
+| exp10m1   |                  |                 |                        | |check|              |                        | 7.12.6.3               | F.10.3.3                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
 | exp2      | |check|          | |check|         |                        | |check|              |                        | 7.12.6.4               | F.10.3.4                   |
 +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+
diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td
index 1b255690c2e4..ea032ba5f66e 100644
--- a/libc/spec/stdc.td
+++ b/libc/spec/stdc.td
@@ -692,6 +692,8 @@ def StdC : StandardSpec<"stdc"> {
           FunctionSpec<"exp10f", RetValSpec<FloatType>, [ArgSpec<FloatType>]>,
           GuardedFunctionSpec<"exp10f16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
+          GuardedFunctionSpec<"exp10m1f16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">,
+
           FunctionSpec<"remainder", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>,
          FunctionSpec<"remainderf", RetValSpec<FloatType>, [ArgSpec<FloatType>, ArgSpec<FloatType>]>,
           FunctionSpec<"remainderl", RetValSpec<LongDoubleType>, [ArgSpec<LongDoubleType>, ArgSpec<LongDoubleType>]>,
diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt
index 7803369583de..ecf639684814 100644
--- a/libc/src/math/CMakeLists.txt
+++ b/libc/src/math/CMakeLists.txt
@@ -127,6 +127,8 @@ add_math_entrypoint_object(exp10)
 add_math_entrypoint_object(exp10f)
 add_math_entrypoint_object(exp10f16)
 
+add_math_entrypoint_object(exp10m1f16)
+
 add_math_entrypoint_object(expm1)
 add_math_entrypoint_object(expm1f)
 add_math_entrypoint_object(expm1f16)
diff --git a/libc/src/math/exp10m1f16.h b/libc/src/math/exp10m1f16.h
new file mode 100644
index 000000000000..e195bc431f2e
--- /dev/null
+++ b/libc/src/math/exp10m1f16.h
@@ -0,0 +1,21 @@
+//===-- Implementation header for exp10m1f16 --------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#ifndef LLVM_LIBC_SRC_MATH_EXP10M1F16_H
+#define LLVM_LIBC_SRC_MATH_EXP10M1F16_H
+
+#include "src/__support/macros/config.h"
+#include "src/__support/macros/properties/types.h"
+
+namespace LIBC_NAMESPACE_DECL {
+
+float16 exp10m1f16(float16 x);
+
+} // namespace LIBC_NAMESPACE_DECL
+
+#endif // LLVM_LIBC_SRC_MATH_EXP10M1F16_H
diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt
index 1ad611fa168c..ffa74970a2ab 100644
--- a/libc/src/math/generic/CMakeLists.txt
+++ b/libc/src/math/generic/CMakeLists.txt
@@ -1656,6 +1656,29 @@ add_entrypoint_object(
     -O3
 )
 
+add_entrypoint_object(
+  exp10m1f16
+  SRCS
+    exp10m1f16.cpp
+  HDRS
+    ../exp10m1f16.h
+  DEPENDS
+    .expxf16
+    libc.hdr.errno_macros
+    libc.hdr.fenv_macros
+    libc.src.__support.FPUtil.cast
+    libc.src.__support.FPUtil.except_value_utils
+    libc.src.__support.FPUtil.fenv_impl
+    libc.src.__support.FPUtil.fp_bits
+    libc.src.__support.FPUtil.multiply_add
+    libc.src.__support.FPUtil.polyeval
+    libc.src.__support.FPUtil.rounding_mode
+    libc.src.__support.macros.optimization
+    libc.src.__support.macros.properties.cpu_features
+  COMPILE_OPTIONS
+    -O3
+)
+
 add_entrypoint_object(
   expm1
   SRCS
diff --git a/libc/src/math/generic/exp10f16.cpp b/libc/src/math/generic/exp10f16.cpp
index 1c5966c1f1c1..f7a8ee3245ed 100644
--- a/libc/src/math/generic/exp10f16.cpp
+++ b/libc/src/math/generic/exp10f16.cpp
@@ -54,16 +54,6 @@ static constexpr fputil::ExceptValues<float16, N_EXP10F16_EXCEPTS>
 #endif
 }};
 
-// Generated by Sollya with the following commands:
-//   > display = hexadecimal;
-//   > round(log2(10), SG, RN);
-static constexpr float LOG2F_10 = 0x1.a934fp+1f;
-
-// Generated by Sollya with the following commands:
-//   > display = hexadecimal;
-//   > round(log10(2), SG, RN);
-static constexpr float LOG10F_2 = 0x1.344136p-2f;
-
 LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) {
   using FPBits = fputil::FPBits<float16>;
   FPBits x_bits(x);
@@ -132,40 +122,9 @@ LLVM_LIBC_FUNCTION(float16, exp10f16, (float16 x)) {
   if (auto r = EXP10F16_EXCEPTS.lookup(x_u); LIBC_UNLIKELY(r.has_value()))
     return r.value();
 
-  // For -8 < x < 5, to compute 10^x, we perform the following range reduction:
-  // find hi, mid, lo, such that:
-  //   x = (hi + mid) * log2(10) + lo, in which
-  //     hi is an integer,
-  //     mid * 2^3 is an integer,
-  //     -2^(-4) <= lo < 2^(-4).
-  // In particular,
-  //   hi + mid = round(x * 2^3) * 2^(-3).
- // Then, - // 10^x = 10^(hi + mid + lo) = 2^((hi + mid) * log2(10)) + 10^lo - // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid - // by adding hi to the exponent field of 2^mid. 10^lo is computed using a - // degree-4 minimax polynomial generated by Sollya. - - float xf = x; - float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f)); - int x_hi_mid = static_cast(kf); - int x_hi = x_hi_mid >> 3; - int x_mid = x_hi_mid & 0x7; - // lo = x - (hi + mid) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x - float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf); - - uint32_t exp2_hi_mid_bits = - EXP2_MID_BITS[x_mid] + - static_cast(x_hi << fputil::FPBits::FRACTION_LEN); - float exp2_hi_mid = fputil::FPBits(exp2_hi_mid_bits).get_val(); - // Degree-4 minimax polynomial generated by Sollya with the following - // commands: - // > display = hexadecimal; - // > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]); - // > 1 + x * P; - float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f, - 0x1.04b434p+1f, 0x1.2bcf9ep+0f); - return fputil::cast(exp2_hi_mid * exp10_lo); + // 10^x = 2^((hi + mid) * log2(10)) * 10^lo + auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x); + return static_cast(exp2_hi_mid * exp10_lo); } } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/exp10m1f16.cpp b/libc/src/math/generic/exp10m1f16.cpp new file mode 100644 index 000000000000..9f2c1959fa5e --- /dev/null +++ b/libc/src/math/generic/exp10m1f16.cpp @@ -0,0 +1,163 @@ +//===-- Half-precision 10^x - 1 function ----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/exp10m1f16.h" +#include "expxf16.h" +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/multiply_add.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" +#include "src/__support/macros/properties/cpu_features.h" + +namespace LIBC_NAMESPACE_DECL { + +static constexpr fputil::ExceptValues EXP10M1F16_EXCEPTS_LO = {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.5c4p-4, exp10m1f16(x) = 0x1.bacp-3 (RZ) + {0x2d71U, 0x32ebU, 1U, 0U, 0U}, + // x = -0x1.5ep-13, exp10m1f16(x) = -0x1.92cp-12 (RZ) + {0x8978U, 0x8e4bU, 0U, 1U, 0U}, + // x = -0x1.e2p-10, exp10m1f16(x) = -0x1.14cp-8 (RZ) + {0x9788U, 0x9c53U, 0U, 1U, 0U}, +}}; + +#ifdef LIBC_TARGET_CPU_HAS_FMA +static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 3; +#else +static constexpr size_t N_EXP10M1F16_EXCEPTS_HI = 6; +#endif + +static constexpr fputil::ExceptValues + EXP10M1F16_EXCEPTS_HI = {{ + // (input, RZ output, RU offset, RD offset, RN offset) + // x = 0x1.8f4p-2, exp10m1f16(x) = 0x1.744p+0 (RZ) + {0x363dU, 0x3dd1U, 1U, 0U, 0U}, + // x = 0x1.95cp-2, exp10m1f16(x) = 0x1.7d8p+0 (RZ) + {0x3657U, 0x3df6U, 1U, 0U, 0U}, + // x = 0x1.d04p-2, exp10m1f16(x) = 0x1.d7p+0 (RZ) + {0x3741U, 0x3f5cU, 1U, 0U, 1U}, +#ifndef LIBC_TARGET_CPU_HAS_FMA + // x = 0x1.0cp+1, exp10m1f16(x) = 0x1.ec4p+6 (RZ) + {0x4030U, 0x57b1U, 1U, 0U, 1U}, + // x = 0x1.1b8p+1, exp10m1f16(x) = 0x1.45cp+7 (RZ) + {0x406eU, 0x5917U, 1U, 0U, 1U}, + // x = 0x1.2f4p+2, exp10m1f16(x) = 0x1.ab8p+15 (RZ) + {0x44bdU, 0x7aaeU, 1U, 0U, 1U}, +#endif + }}; + +LLVM_LIBC_FUNCTION(float16, exp10m1f16, (float16 x)) { + using FPBits = fputil::FPBits; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + uint16_t x_abs = x_u & 0x7fffU; + + // When |x| <= 2^(-3), or |x| >= 11 * log10(2), or x is NaN. + if (LIBC_UNLIKELY(x_abs <= 0x3000U || x_abs >= 0x429fU)) { + // exp10m1(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // When x >= 16 * log10(2). + if (x_u >= 0x44d1U && x_bits.is_pos()) { + // exp10m1(+inf) = +inf + if (x_bits.is_inf()) + return FPBits::inf().get_val(); + + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_UPWARD: + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT); + return FPBits::inf().get_val(); + default: + return FPBits::max_normal().get_val(); + } + } + + // When x < -11 * log10(2). + if (x_u > 0xc29fU) { + // exp10m1(-inf) = -1 + if (x_bits.is_inf()) + return FPBits::one(Sign::NEG).get_val(); + + // When x >= -0x1.ce4p+1, round(10^x - 1, HP, RN) = -0x1.ffcp-1. + if (x_u <= 0xc339U) { + return fputil::round_result_slightly_down( + fputil::cast(-0x1.ffcp-1)); + } + + // When x < -0x1.ce4p+1, round(10^x - 1, HP, RN) = -1. 
+ switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_DOWNWARD: + return FPBits::one(Sign::NEG).get_val(); + default: + return fputil::cast(-0x1.ffcp-1); + } + } + + // When |x| <= 2^(-3). + if (x_abs <= 0x3000U) { + if (auto r = EXP10M1F16_EXCEPTS_LO.lookup(x_u); + LIBC_UNLIKELY(r.has_value())) + return r.value(); + + float xf = x; + // Degree-5 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax((10^x - 1)/x, 4, [|SG...|], [-2^-3, 2^-3]); + // > x * P; + return fputil::cast( + xf * fputil::polyeval(xf, 0x1.26bb1cp+1f, 0x1.5351c8p+1f, + 0x1.04704p+1f, 0x1.2ce084p+0f, 0x1.14a6bep-1f)); + } + } + + // When x is 1, 2, or 3. These are hard-to-round cases with exact results. + // 10^4 - 1 = 9'999 is not exactly representable as a float16, but luckily the + // polynomial approximation gives the correct result for x = 4 in all + // rounding modes. + if (LIBC_UNLIKELY((x_u & ~(0x3c00U | 0x4000U | 0x4200U | 0x4400U)) == 0)) { + switch (x_u) { + case 0x3c00U: // x = 1.0f16 + return fputil::cast(9.0); + case 0x4000U: // x = 2.0f16 + return fputil::cast(99.0); + case 0x4200U: // x = 3.0f16 + return fputil::cast(999.0); + } + } + + if (auto r = EXP10M1F16_EXCEPTS_HI.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); + + // exp10(x) = exp2((hi + mid) * log2(10)) * exp10(lo) + auto [exp2_hi_mid, exp10_lo] = exp10_range_reduction(x); + // exp10m1(x) = exp2((hi + mid) * log2(lo)) * exp10(lo) - 1 + return fputil::cast( + fputil::multiply_add(exp2_hi_mid, exp10_lo, -1.0f)); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h index 35294130a150..8de329bd2ab0 100644 --- a/libc/src/math/generic/expxf16.h +++ b/libc/src/math/generic/expxf16.h @@ -127,6 +127,53 @@ LIBC_INLINE ExpRangeReduction exp2_range_reduction(float16 x) { return {exp2_hi_mid, exp2_lo}; } +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > round(log2(10), SG, RN); +static constexpr float LOG2F_10 = 0x1.a934fp+1f; + +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > round(log10(2), SG, RN); +static constexpr float LOG10F_2 = 0x1.344136p-2f; + +LIBC_INLINE ExpRangeReduction exp10_range_reduction(float16 x) { + // For -8 < x < 5, to compute 10^x, we perform the following range reduction: + // find hi, mid, lo, such that: + // x = (hi + mid) * log2(10) + lo, in which + // hi is an integer, + // mid * 2^3 is an integer, + // -2^(-4) <= lo < 2^(-4). + // In particular, + // hi + mid = round(x * 2^3) * 2^(-3). + // Then, + // 10^x = 10^(hi + mid + lo) = 2^((hi + mid) * log2(10)) + 10^lo + // We store 2^mid in the lookup table EXP2_MID_BITS, and compute 2^hi * 2^mid + // by adding hi to the exponent field of 2^mid. 10^lo is computed using a + // degree-4 minimax polynomial generated by Sollya. 
+ + float xf = x; + float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f)); + int x_hi_mid = static_cast(kf); + int x_hi = x_hi_mid >> 3; + int x_mid = x_hi_mid & 0x7; + // lo = x - (hi + mid) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x + float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf); + + uint32_t exp2_hi_mid_bits = + EXP2_MID_BITS[x_mid] + + static_cast(x_hi << fputil::FPBits::FRACTION_LEN); + float exp2_hi_mid = fputil::FPBits(exp2_hi_mid_bits).get_val(); + // Degree-4 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax((10^x - 1)/x, 3, [|SG...|], [-2^-4, 2^-4]); + // > 1 + x * P; + float exp10_lo = fputil::polyeval(lo, 0x1p+0f, 0x1.26bb14p+1f, 0x1.53526p+1f, + 0x1.04b434p+1f, 0x1.2bcf9ep+0f); + return {exp2_hi_mid, exp10_lo}; +} + } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 12e1d078b29b..5dff0b49125b 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -1062,6 +1062,17 @@ add_fp_unittest( libc.src.math.exp10f16 ) +add_fp_unittest( + exp10m1f16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + exp10m1f16_test.cpp + DEPENDS + libc.src.math.exp10m1f16 +) + add_fp_unittest( copysign_test SUITE diff --git a/libc/test/src/math/exp10m1f16_test.cpp b/libc/test/src/math/exp10m1f16_test.cpp new file mode 100644 index 000000000000..41bb12f7d097 --- /dev/null +++ b/libc/test/src/math/exp10m1f16_test.cpp @@ -0,0 +1,40 @@ +//===-- Exhaustive test for exp10m1f16 ------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/exp10m1f16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcExp10m1f16Test = LIBC_NAMESPACE::testing::FPTest; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +// Range: [0, Inf]; +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7c00U; + +// Range: [-Inf, 0]; +static constexpr uint16_t NEG_START = 0x8000U; +static constexpr uint16_t NEG_STOP = 0xfc00U; + +TEST_F(LlvmLibcExp10m1f16Test, PositiveRange) { + for (uint16_t v = POS_START; v <= POS_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x, + LIBC_NAMESPACE::exp10m1f16(x), 0.5); + } +} + +TEST_F(LlvmLibcExp10m1f16Test, NegativeRange) { + for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Exp10m1, x, + LIBC_NAMESPACE::exp10m1f16(x), 0.5); + } +} diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 447ea6952713..6b3623dc0d0d 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -1235,6 +1235,19 @@ add_fp_unittest( libc.src.__support.FPUtil.cast ) +add_fp_unittest( + exp10m1f16_test + SUITE + libc-math-smoke-tests + SRCS + exp10m1f16_test.cpp + DEPENDS + libc.hdr.fenv_macros + libc.src.errno.errno + libc.src.math.exp10m1f16 + libc.src.__support.FPUtil.cast +) + add_fp_unittest( copysign_test SUITE diff --git a/libc/test/src/math/smoke/exp10m1f16_test.cpp b/libc/test/src/math/smoke/exp10m1f16_test.cpp new file mode 100644 index 000000000000..dfa7fa477d3d --- /dev/null +++ b/libc/test/src/math/smoke/exp10m1f16_test.cpp @@ -0,0 +1,113 @@ +//===-- Unittests for exp10m1f16 ------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/cast.h" +#include "src/errno/libc_errno.h" +#include "src/math/exp10m1f16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcExp10m1f16Test = LIBC_NAMESPACE::testing::FPTest; + +TEST_F(LlvmLibcExp10m1f16Test, SpecialNumbers) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::exp10m1f16(aNaN)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::exp10m1f16(sNaN), + FE_INVALID); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::exp10m1f16(inf)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast(-1.0), + LIBC_NAMESPACE::exp10m1f16(neg_inf)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::exp10m1f16(zero)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, LIBC_NAMESPACE::exp10m1f16(neg_zero)); + EXPECT_MATH_ERRNO(0); +} + +TEST_F(LlvmLibcExp10m1f16Test, Overflow) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::exp10m1f16(max_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + // round(16 * log10(2), HP, RN); + float16 x = LIBC_NAMESPACE::fputil::cast(0x1.344p+2); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST( + inf, LIBC_NAMESPACE::exp10m1f16(x), FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD( + inf, LIBC_NAMESPACE::exp10m1f16(x), FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD( + max_normal, LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO( + max_normal, LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); +} + +TEST_F(LlvmLibcExp10m1f16Test, ResultNearNegOne) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_WITH_EXCEPTION(LIBC_NAMESPACE::fputil::cast(-1.0), + LIBC_NAMESPACE::exp10m1f16(neg_max_normal), + FE_INEXACT); + + // round(-11 * log10(2), HP, RD); + float16 x = LIBC_NAMESPACE::fputil::cast(-0x1.a8p+1); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST( + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD( + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD( + LIBC_NAMESPACE::fputil::cast(-1.0), + LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO( + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT); + + // Next float16 value below -0x1.ce4p+1. 
+ x = LIBC_NAMESPACE::fputil::cast(-0x1.ce8p+1); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST( + LIBC_NAMESPACE::fputil::cast(-1.0), + LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD( + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD( + LIBC_NAMESPACE::fputil::cast(-1.0), + LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO( + LIBC_NAMESPACE::fputil::cast(-0x1.ffcp-1), + LIBC_NAMESPACE::exp10m1f16(x), FE_INEXACT); +} diff --git a/libc/utils/MPFRWrapper/MPFRUtils.cpp b/libc/utils/MPFRWrapper/MPFRUtils.cpp index eecffc782c1a..bd4fbe294a62 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.cpp +++ b/libc/utils/MPFRWrapper/MPFRUtils.cpp @@ -334,6 +334,29 @@ public: return result; } + MPFRNumber exp10m1() const { + // TODO: Only use mpfr_exp10m1 once CI and buildbots get MPFR >= 4.2.0. +#if MPFR_VERSION_MAJOR > 4 || \ + (MPFR_VERSION_MAJOR == 4 && MPFR_VERSION_MINOR >= 2) + MPFRNumber result(*this); + mpfr_exp10m1(result.value, value, mpfr_rounding); + return result; +#else + unsigned int prec = mpfr_precision * 3; + MPFRNumber result(*this, prec); + + MPFRNumber ln10(10.0f, prec); + // log(10) + mpfr_log(ln10.value, ln10.value, mpfr_rounding); + // x * log(10) + mpfr_mul(result.value, value, ln10.value, mpfr_rounding); + // e^(x * log(10)) - 1 + int ex = mpfr_expm1(result.value, result.value, mpfr_rounding); + mpfr_subnormalize(result.value, ex, mpfr_rounding); + return result; +#endif + } + MPFRNumber expm1() const { MPFRNumber result(*this); mpfr_expm1(result.value, value, mpfr_rounding); @@ -744,6 +767,8 @@ unary_operation(Operation op, InputType input, unsigned int precision, return mpfrInput.exp2m1(); case Operation::Exp10: return mpfrInput.exp10(); + case Operation::Exp10m1: + return mpfrInput.exp10m1(); case Operation::Expm1: return mpfrInput.expm1(); case Operation::Floor: diff --git a/libc/utils/MPFRWrapper/MPFRUtils.h b/libc/utils/MPFRWrapper/MPFRUtils.h index 8d51fa4e4772..9fc12a6adefb 100644 --- a/libc/utils/MPFRWrapper/MPFRUtils.h +++ b/libc/utils/MPFRWrapper/MPFRUtils.h @@ -42,6 +42,7 @@ enum class Operation : int { Exp2, Exp2m1, Exp10, + Exp10m1, Expm1, Floor, Log, -- GitLab From 9381c6fd04cc16a7606633f57c96c11e58181ddb Mon Sep 17 00:00:00 2001 From: Krystian Stasiowski Date: Wed, 16 Oct 2024 08:40:03 -0600 Subject: [PATCH 143/329] [Clang][Sema] Use the correct injected template arguments for partial specializations when collecting multi-level template argument lists (#112381) After #111852 refactored multi-level template argument list collection, the following results in a crash: ``` template struct A; template struct A { void f() requires B; }; template void A::f() requires B { } // crash here ``` This happens because when collecting template arguments for constraint normalization from a partial specialization, we incorrectly use the template argument list of the partial specialization. We should be using the template argument list of the _template-head_ (as defined in [temp.arg.general] p2). Fixes #112222. 
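Spelled out with explicit template parameter lists (the parameter names below are illustrative; only the shape of the declarations matters), the crashing pattern is:

```cpp
// A partial specialization whose member function is constrained by one of
// the partial specialization's own template parameters.
template <typename T, bool B>
struct A;

template <bool B>
struct A<int, B> {
  void f() requires B;
};

// Normalizing the constraint on this out-of-line definition previously
// collected the specialization's argument list ({int, B}) where the
// template-head's list ({B}) was required, causing the crash.
template <bool B>
void A<int, B>::f() requires B {}
```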
--- clang/include/clang/AST/DeclTemplate.h | 18 +- clang/lib/AST/DeclTemplate.cpp | 28 ++ clang/lib/Sema/SemaTemplateInstantiate.cpp | 4 +- .../temp/temp.constr/temp.constr.decl/p4.cpp | 284 ++++++++++-------- 4 files changed, 211 insertions(+), 123 deletions(-) diff --git a/clang/include/clang/AST/DeclTemplate.h b/clang/include/clang/AST/DeclTemplate.h index 141f58c4600a..0f0c0bf6e4ef 100644 --- a/clang/include/clang/AST/DeclTemplate.h +++ b/clang/include/clang/AST/DeclTemplate.h @@ -2085,7 +2085,11 @@ public: class ClassTemplatePartialSpecializationDecl : public ClassTemplateSpecializationDecl { /// The list of template parameters - TemplateParameterList* TemplateParams = nullptr; + TemplateParameterList *TemplateParams = nullptr; + + /// The set of "injected" template arguments used within this + /// partial specialization. + TemplateArgument *InjectedArgs = nullptr; /// The class template partial specialization from which this /// class template partial specialization was instantiated. @@ -2132,6 +2136,10 @@ public: return TemplateParams; } + /// Retrieve the template arguments list of the template parameter list + /// of this template. + ArrayRef getInjectedTemplateArgs(); + /// \brief All associated constraints of this partial specialization, /// including the requires clause and any constraints derived from /// constrained-parameters. @@ -2856,6 +2864,10 @@ class VarTemplatePartialSpecializationDecl /// The list of template parameters TemplateParameterList *TemplateParams = nullptr; + /// The set of "injected" template arguments used within this + /// partial specialization. + TemplateArgument *InjectedArgs = nullptr; + /// The variable template partial specialization from which this /// variable template partial specialization was instantiated. /// @@ -2902,6 +2914,10 @@ public: return TemplateParams; } + /// Retrieve the template arguments list of the template parameter list + /// of this template. + ArrayRef getInjectedTemplateArgs(); + /// \brief All associated constraints of this partial specialization, /// including the requires clause and any constraints derived from /// constrained-parameters. 
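For intuition, a rough sketch of how the two argument lists differ for a partial specialization (illustrative notation, not literal API output):

```cpp
template <typename T, bool B> struct A;
template <bool B> struct A<int, B> {}; // template-head: template <bool B>

// For the partial specialization A<int, B>:
//   getTemplateArgs()         ~ {int, B} // arguments matched against the
//                                        // primary template
//   getInjectedTemplateArgs() ~ {B}      // the template-head's parameters
//                                        // reused as arguments
```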
diff --git a/clang/lib/AST/DeclTemplate.cpp b/clang/lib/AST/DeclTemplate.cpp index d9b67b7bedf5..d2d8907b884e 100644 --- a/clang/lib/AST/DeclTemplate.cpp +++ b/clang/lib/AST/DeclTemplate.cpp @@ -1185,6 +1185,20 @@ SourceRange ClassTemplatePartialSpecializationDecl::getSourceRange() const { return Range; } +ArrayRef +ClassTemplatePartialSpecializationDecl::getInjectedTemplateArgs() { + TemplateParameterList *Params = getTemplateParameters(); + auto *First = cast(getFirstDecl()); + if (!First->InjectedArgs) { + auto &Context = getASTContext(); + SmallVector TemplateArgs; + Context.getInjectedTemplateArgs(Params, TemplateArgs); + First->InjectedArgs = new (Context) TemplateArgument[TemplateArgs.size()]; + std::copy(TemplateArgs.begin(), TemplateArgs.end(), First->InjectedArgs); + } + return llvm::ArrayRef(First->InjectedArgs, Params->size()); +} + //===----------------------------------------------------------------------===// // FriendTemplateDecl Implementation //===----------------------------------------------------------------------===// @@ -1535,6 +1549,20 @@ SourceRange VarTemplatePartialSpecializationDecl::getSourceRange() const { return Range; } +ArrayRef +VarTemplatePartialSpecializationDecl::getInjectedTemplateArgs() { + TemplateParameterList *Params = getTemplateParameters(); + auto *First = cast(getFirstDecl()); + if (!First->InjectedArgs) { + auto &Context = getASTContext(); + SmallVector TemplateArgs; + Context.getInjectedTemplateArgs(Params, TemplateArgs); + First->InjectedArgs = new (Context) TemplateArgument[TemplateArgs.size()]; + std::copy(TemplateArgs.begin(), TemplateArgs.end(), First->InjectedArgs); + } + return llvm::ArrayRef(First->InjectedArgs, Params->size()); +} + static TemplateParameterList * createMakeIntegerSeqParameterList(const ASTContext &C, DeclContext *DC) { // typename T diff --git a/clang/lib/Sema/SemaTemplateInstantiate.cpp b/clang/lib/Sema/SemaTemplateInstantiate.cpp index 8c7f694c0904..8665c099903d 100644 --- a/clang/lib/Sema/SemaTemplateInstantiate.cpp +++ b/clang/lib/Sema/SemaTemplateInstantiate.cpp @@ -237,7 +237,7 @@ struct TemplateInstantiationArgumentCollecter if (Innermost) AddInnermostTemplateArguments(VTPSD); else if (ForConstraintInstantiation) - AddOuterTemplateArguments(VTPSD, VTPSD->getTemplateArgs().asArray(), + AddOuterTemplateArguments(VTPSD, VTPSD->getInjectedTemplateArgs(), /*Final=*/false); if (VTPSD->isMemberSpecialization()) @@ -274,7 +274,7 @@ struct TemplateInstantiationArgumentCollecter if (Innermost) AddInnermostTemplateArguments(CTPSD); else if (ForConstraintInstantiation) - AddOuterTemplateArguments(CTPSD, CTPSD->getTemplateArgs().asArray(), + AddOuterTemplateArguments(CTPSD, CTPSD->getInjectedTemplateArgs(), /*Final=*/false); if (CTPSD->isMemberSpecialization()) diff --git a/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp b/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp index 70064f867e18..f144e14cd122 100644 --- a/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp +++ b/clang/test/CXX/temp/temp.constr/temp.constr.decl/p4.cpp @@ -1,175 +1,219 @@ // RUN: %clang_cc1 -std=c++20 -verify %s // expected-no-diagnostics -template -concept D = true; +namespace Primary { + template + concept D = true; -template -struct A { - template - void f() requires V; + template + struct A { + template + void f() requires V; - template<> - void f(); + template<> + void f(); + + template + void g(); + + template requires V + struct B; + + template requires V + struct B; + + template<> + struct B; + + template + struct C; + + 
template + struct C; + template requires V + static int x; + + template requires V + static int x; + + template<> + int x; + + template + static int y; + + template + static int y; + }; + + template + template + void A::f() requires V { } + + template template - void g(); + void A::g() { } + template template requires V - struct B; + struct A::B { }; + template template requires V - struct B; + struct A::B { }; - template<> - struct B; + template + template requires V + struct A::B { }; + template template - struct C; + struct A::C { }; + template template - struct C; + struct A::C { }; + template template requires V - static int x; + int A::x = 0; + template template requires V - static int x; + int A::x = 0; - template<> - int x; + template + template requires V + int A::x = 0; + template template - static int y; + int A::y = 0; + template template - static int y; -}; - -template -template -void A::f() requires V { } + int A::y = 0; -template -template -void A::g() { } - -template -template requires V -struct A::B { }; + template<> + template + void A::f() requires V; -template -template requires V -struct A::B { }; + template<> + template<> + void A::f(); -template -template requires V -struct A::B { }; + template<> + template<> + void A::f(); -template -template -struct A::C { }; + template<> + template + void A::g(); -template -template -struct A::C { }; + template<> + template requires V + struct A::B; -template -template requires V -int A::x = 0; + template<> + template<> + struct A::B; -template -template requires V -int A::x = 0; + template<> + template<> + struct A::B; -template -template requires V -int A::x = 0; + template<> + template requires V + struct A::B; -template -template -int A::y = 0; + template<> + template requires V + struct A::B; -template -template -int A::y = 0; + template<> + template + struct A::C; -template<> -template -void A::f() requires V; + template<> + template + struct A::C; -template<> -template<> -void A::f(); + template<> + template + struct A::C; -template<> -template<> -void A::f(); + template<> + template requires V + int A::x; -template<> -template -void A::g(); + template<> + template<> + int A::x; -template<> -template requires V -struct A::B; + template<> + template<> + int A::x; -template<> -template<> -struct A::B; + template<> + template requires V + int A::x; -template<> -template<> -struct A::B; + template<> + template requires V + int A::x; -template<> -template requires V -struct A::B; + template<> + template + int A::y; -template<> -template requires V -struct A::B; + template<> + template + int A::y; -template<> -template -struct A::C; + template<> + template + int A::y; +} // namespace Primary -template<> -template -struct A::C; +namespace Partial { + template + struct A; -template<> -template -struct A::C; + template + struct A + { + template requires U + void f(); -template<> -template requires V -int A::x; + template requires U + static const int x; -template<> -template<> -int A::x; + template requires U + struct B; + }; -template<> -template<> -int A::x; + template + template requires U + void A::f() { } -template<> -template requires V -int A::x; + template + template requires U + constexpr int A::x = 0; -template<> -template requires V -int A::x; + template + template requires U + struct A::B { }; -template<> -template -int A::y; + template<> + template requires true + void A::f() { } -template<> -template -int A::y; + template<> + template requires true + constexpr int A::x = 1; -template<> -template -int A::y; + template<> 
+ template requires true + struct A::B { }; +} // namespace Partial -- GitLab From 383df16317eec3b29b93025e2a86ea024b3f59c7 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Wed, 16 Oct 2024 15:40:54 +0100 Subject: [PATCH 144/329] [llvm][llvm-lit] Add total time for each testsuite in JUnit XML output (#112230) Currently we write out a time taken to run all test suites: ``` ``` And one for each test: ``` ``` However, the schema says there should be one for each suite and test, but none for testsuites: https://github.com/windyroad/JUnit-Schema/blob/cfa434d4b8e102a8f55b8727b552a0063ee9044e/JUnit.xsd#L216 I'm leaving the `testsuites` time in though because no one has complained so far, and someone out there probably has a script relying on it by now. Most XML tools handle unknown attributes quite well anyway. I'm adding a per testsuite time to comply with the schema and maybe be more compatible with other JUnit tools. ``` ``` The test suite time is the sum of the time taken for all tests in the suite. This will ignore some overhead in setting up the suite, and means that the sum of the times for all individual suites may not equal the `testsuites` time. As we're usually focusing on the execution time of particular tests, not lit's book keeping, I think this is a reasonable choice. --- llvm/utils/lit/lit/reports.py | 14 +++++++++++--- llvm/utils/lit/tests/shtest-format.py | 2 +- llvm/utils/lit/tests/xunit-output.py | 2 +- 3 files changed, 13 insertions(+), 5 deletions(-) diff --git a/llvm/utils/lit/lit/reports.py b/llvm/utils/lit/lit/reports.py index 2ac44b0c0ce8..d2d719b076bc 100755 --- a/llvm/utils/lit/lit/reports.py +++ b/llvm/utils/lit/lit/reports.py @@ -105,12 +105,20 @@ class XunitReport(object): file.write("\n") def _write_testsuite(self, file, suite, tests): - skipped = sum(1 for t in tests if t.result.code in self.skipped_codes) - failures = sum(1 for t in tests if t.isFailure()) + skipped = 0 + failures = 0 + time = 0.0 + + for t in tests: + if t.result.code in self.skipped_codes: + skipped += 1 + if t.isFailure(): + failures += 1 + time += t.result.elapsed name = suite.config.name.replace(".", "-") file.write( - f'\n' + f'\n' ) for test in tests: self._write_test(file, test, name) diff --git a/llvm/utils/lit/tests/shtest-format.py b/llvm/utils/lit/tests/shtest-format.py index 4a3d65b7bce4..3a1959549e5d 100644 --- a/llvm/utils/lit/tests/shtest-format.py +++ b/llvm/utils/lit/tests/shtest-format.py @@ -107,7 +107,7 @@ # XUNIT: # XUNIT-NEXT: -# XUNIT-NEXT: +# XUNIT-NEXT: # XUNIT: # XUNIT-NEXT: diff --git a/llvm/utils/lit/tests/xunit-output.py b/llvm/utils/lit/tests/xunit-output.py index 67d99849fe36..392cded4653f 100644 --- a/llvm/utils/lit/tests/xunit-output.py +++ b/llvm/utils/lit/tests/xunit-output.py @@ -9,7 +9,7 @@ # CHECK: # CHECK-NEXT: -# CHECK-NEXT: +# CHECK-NEXT: # CHECK-NEXT: # CHECK-NEXT: ]]]]> &"]]> # CHECK-NEXT: -- GitLab From d9c95efb6c102fc9e9c52a558d611bb7aa433dbb Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Oct 2024 15:43:30 +0100 Subject: [PATCH 145/329] [LLVM] Make more use of IRBuilder::CreateIntrinsic. NFC. (#112546) Convert almost every instance of: CreateCall(Intrinsic::getOrInsertDeclaration(...), ...) to the equivalent CreateIntrinsic call. 
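A representative before/after of the mechanical rewrite, simplified from the hunks below (the `ctpop` call is taken from the AutoUpgrade change; surrounding names are illustrative):

```cpp
// Before: explicitly look up (or insert) the intrinsic declaration in the
// module, then emit a call to it.
Value *Popc = Builder.CreateCall(
    Intrinsic::getOrInsertDeclaration(M, Intrinsic::ctpop, {Arg->getType()}),
    {Arg});

// After: CreateIntrinsic resolves the declaration itself, so the Module
// parameter and the separate lookup disappear.
Value *Popc2 =
    Builder.CreateIntrinsic(Intrinsic::ctpop, {Arg->getType()}, {Arg});
```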
--- llvm/lib/CodeGen/SafeStack.cpp | 3 +- llvm/lib/CodeGen/StackProtector.cpp | 6 +- llvm/lib/IR/AutoUpgrade.cpp | 150 +++++++----------- .../Target/AArch64/AArch64ISelLowering.cpp | 3 +- llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 5 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 3 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 7 +- llvm/lib/Target/X86/X86WinEHState.cpp | 21 +-- .../Instrumentation/AddressSanitizer.cpp | 7 +- .../Instrumentation/HWAddressSanitizer.cpp | 19 ++- llvm/lib/Transforms/Instrumentation/KCFI.cpp | 3 +- .../Instrumentation/PGOInstrumentation.cpp | 32 ++-- .../Instrumentation/ThreadSanitizer.cpp | 4 +- 13 files changed, 101 insertions(+), 162 deletions(-) diff --git a/llvm/lib/CodeGen/SafeStack.cpp b/llvm/lib/CodeGen/SafeStack.cpp index a50909af8bfc..ad2037a2c20b 100644 --- a/llvm/lib/CodeGen/SafeStack.cpp +++ b/llvm/lib/CodeGen/SafeStack.cpp @@ -368,8 +368,7 @@ Value *SafeStack::getStackGuard(IRBuilder<> &IRB, Function &F) { if (!StackGuardVar) { TL.insertSSPDeclarations(*M); - return IRB.CreateCall( - Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard)); + return IRB.CreateIntrinsic(Intrinsic::stackguard, {}, {}); } return IRB.CreateLoad(StackPtrTy, StackGuardVar, "StackGuard"); diff --git a/llvm/lib/CodeGen/StackProtector.cpp b/llvm/lib/CodeGen/StackProtector.cpp index a192161bbd94..0ce305c4410d 100644 --- a/llvm/lib/CodeGen/StackProtector.cpp +++ b/llvm/lib/CodeGen/StackProtector.cpp @@ -519,8 +519,7 @@ static Value *getStackGuard(const TargetLoweringBase *TLI, Module *M, if (SupportsSelectionDAGSP) *SupportsSelectionDAGSP = true; TLI->insertSSPDeclarations(*M); - return B.CreateCall( - Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackguard)); + return B.CreateIntrinsic(Intrinsic::stackguard, {}, {}); } /// Insert code into the entry block that stores the stack guard @@ -541,8 +540,7 @@ static bool CreatePrologue(Function *F, Module *M, Instruction *CheckLoc, AI = B.CreateAlloca(PtrTy, nullptr, "StackGuardSlot"); Value *GuardSlot = getStackGuard(TLI, M, B, &SupportsSelectionDAGSP); - B.CreateCall(Intrinsic::getOrInsertDeclaration(M, Intrinsic::stackprotector), - {GuardSlot, AI}); + B.CreateIntrinsic(Intrinsic::stackprotector, {}, {GuardSlot, AI}); return SupportsSelectionDAGSP; } diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 32f66f77f19f..519ff8d74c5a 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1745,8 +1745,7 @@ static Value *upgradeX86VPERMT2Intrinsics(IRBuilder<> &Builder, CallBase &CI, if (!IndexForm) std::swap(Args[0], Args[1]); - Value *V = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args); + Value *V = Builder.CreateIntrinsic(IID, {}, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(Ty) : Builder.CreateBitCast(CI.getArgOperand(1), Ty); @@ -2269,8 +2268,7 @@ static bool upgradeAVX512MaskToSelect(StringRef Name, IRBuilder<> &Builder, SmallVector Args(CI.args()); Args.pop_back(); Args.pop_back(); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI.getModule(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); unsigned NumArgs = CI.arg_size(); Rep = emitX86Select(Builder, CI.getArgOperand(NumArgs - 1), Rep, CI.getArgOperand(NumArgs - 2)); @@ -2325,25 +2323,21 @@ static Value *upgradeNVVMIntrinsicCall(StringRef Name, CallBase *CI, } else if (Name == "clz.ll") { // llvm.nvvm.clz.ll returns an i32, but llvm.ctlz.i64 returns an i64. 
Value *Arg = CI->getArgOperand(0); - Value *Ctlz = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctlz, - {Arg->getType()}), - {Arg, Builder.getFalse()}, "ctlz"); + Value *Ctlz = Builder.CreateIntrinsic(Intrinsic::ctlz, {Arg->getType()}, + {Arg, Builder.getFalse()}, + /*FMFSource=*/nullptr, "ctlz"); Rep = Builder.CreateTrunc(Ctlz, Builder.getInt32Ty(), "ctlz.trunc"); } else if (Name == "popc.ll") { // llvm.nvvm.popc.ll returns an i32, but llvm.ctpop.i64 returns an // i64. Value *Arg = CI->getArgOperand(0); - Value *Popc = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::ctpop, - {Arg->getType()}), - Arg, "ctpop"); + Value *Popc = Builder.CreateIntrinsic(Intrinsic::ctpop, {Arg->getType()}, + Arg, /*FMFSource=*/nullptr, "ctpop"); Rep = Builder.CreateTrunc(Popc, Builder.getInt32Ty(), "ctpop.trunc"); } else if (Name == "h2f") { - Rep = Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::convert_from_fp16, - {Builder.getFloatTy()}), - CI->getArgOperand(0), "h2f"); + Rep = Builder.CreateIntrinsic(Intrinsic::convert_from_fp16, + {Builder.getFloatTy()}, CI->getArgOperand(0), + /*FMFSource=*/nullptr, "h2f"); } else if (Name.consume_front("bitcast.") && (Name == "f2i" || Name == "i2f" || Name == "ll2d" || Name == "d2ll")) { @@ -2493,10 +2487,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, } else if (Name.starts_with("avx.sqrt.p") || Name.starts_with("sse2.sqrt.p") || Name.starts_with("sse.sqrt.p")) { - Rep = - Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::sqrt, CI->getType()), - {CI->getArgOperand(0)}); + Rep = Builder.CreateIntrinsic(Intrinsic::sqrt, CI->getType(), + {CI->getArgOperand(0)}); } else if (Name.starts_with("avx512.mask.sqrt.p")) { if (CI->arg_size() == 4 && (!isa(CI->getArgOperand(3)) || @@ -2505,13 +2497,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, : Intrinsic::x86_avx512_sqrt_pd_512; Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(3)}; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); } else { - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), Intrinsic::sqrt, - CI->getType()), - {CI->getArgOperand(0)}); + Rep = Builder.CreateIntrinsic(Intrinsic::sqrt, CI->getType(), + {CI->getArgOperand(0)}); } Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); @@ -2635,9 +2624,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, break; } - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), - {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = Builder.CreateIntrinsic(IID, {}, + {CI->getOperand(0), CI->getArgOperand(1)}); Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.mask.fpclass.p")) { Type *OpTy = CI->getArgOperand(0)->getType(); @@ -2659,9 +2647,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else llvm_unreachable("Unexpected intrinsic"); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), - {CI->getOperand(0), CI->getArgOperand(1)}); + Rep = Builder.CreateIntrinsic(IID, {}, + {CI->getOperand(0), CI->getArgOperand(1)}); Rep = applyX86MaskOn1BitsVec(Builder, Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.cmp.p")) { 
SmallVector Args(CI->args()); @@ -2689,8 +2676,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, std::swap(Mask, Args.back()); Args.push_back(Mask); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); } else if (Name.starts_with("avx512.mask.cmp.")) { // Integer compare intrinsics. unsigned Imm = cast(CI->getArgOperand(2))->getZExtValue(); @@ -3413,8 +3399,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_add_pd_512; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + Rep = Builder.CreateIntrinsic( + IID, {}, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFAdd(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3429,8 +3415,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_div_pd_512; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + Rep = Builder.CreateIntrinsic( + IID, {}, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFDiv(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3445,8 +3431,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_mul_pd_512; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + Rep = Builder.CreateIntrinsic( + IID, {}, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFMul(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3461,8 +3447,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_sub_pd_512; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + Rep = Builder.CreateIntrinsic( + IID, {}, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); } else { Rep = Builder.CreateFSub(CI->getArgOperand(0), CI->getArgOperand(1)); @@ -3479,16 +3465,15 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, {Intrinsic::x86_avx512_min_ps_512, Intrinsic::x86_avx512_min_pd_512}}; Intrinsic::ID IID = MinMaxTbl[IsMin][IsDouble]; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), + Rep = Builder.CreateIntrinsic( + IID, {}, {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(4)}); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, CI->getArgOperand(2)); } else if (Name.starts_with("avx512.mask.lzcnt.")) { Rep = - Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::ctlz, CI->getType()), - {CI->getArgOperand(0), Builder.getInt1(false)}); + Builder.CreateIntrinsic(Intrinsic::ctlz, CI->getType(), + {CI->getArgOperand(0), Builder.getInt1(false)}); Rep = emitX86Select(Builder, CI->getArgOperand(2), Rep, CI->getArgOperand(1)); } else if (Name.starts_with("avx512.mask.psll")) { @@ -3732,10 +3717,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (NegAcc) Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma, - Ops[0]->getType()), - Ops); + Rep = Builder.CreateIntrinsic(Intrinsic::fma, Ops[0]->getType(), Ops); if (IsScalar) Rep = Builder.CreateInsertElement(CI->getArgOperand(0), 
Rep, (uint64_t)0); @@ -3747,10 +3729,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Ops[1] = Builder.CreateExtractElement(Ops[1], (uint64_t)0); Ops[2] = Builder.CreateExtractElement(Ops[2], (uint64_t)0); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), Intrinsic::fma, - Ops[0]->getType()), - Ops); + Rep = Builder.CreateIntrinsic(Intrinsic::fma, Ops[0]->getType(), Ops); Rep = Builder.CreateInsertElement(Constant::getNullValue(CI->getType()), Rep, (uint64_t)0); @@ -3846,9 +3825,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, else IID = Intrinsic::x86_avx512_vfmadd_pd_512; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), - {A, B, C, CI->getArgOperand(4)}); + Rep = Builder.CreateIntrinsic(IID, {}, {A, B, C, CI->getArgOperand(4)}); } else { Function *FMA = Intrinsic::getOrInsertDeclaration( CI->getModule(), Intrinsic::fma, A->getType()); @@ -3878,8 +3855,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Ops[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops); + Rep = Builder.CreateIntrinsic(IID, {}, Ops); } else if (Name.starts_with("avx512.mask.vfmaddsub.p") || Name.starts_with("avx512.mask3.vfmaddsub.p") || Name.starts_with("avx512.maskz.vfmaddsub.p") || @@ -3902,8 +3878,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (IsSubAdd) Ops[2] = Builder.CreateFNeg(Ops[2]); - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(F->getParent(), IID), Ops); + Rep = Builder.CreateIntrinsic(IID, {}, Ops); } else { int NumElts = cast(CI->getType())->getNumElements(); @@ -3954,8 +3929,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2), CI->getArgOperand(3)}; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(4), Rep, PassThru); @@ -3982,8 +3956,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4018,8 +3991,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); Value *PassThru = ZeroMask ? 
ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4048,8 +4020,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Rep = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); + Rep = Builder.CreateIntrinsic(IID, {}, Args); Value *PassThru = ZeroMask ? ConstantAggregateZero::get(CI->getType()) : CI->getArgOperand(0); Rep = emitX86Select(Builder, CI->getArgOperand(3), Rep, PassThru); @@ -4071,8 +4042,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, // Make a call with 3 operands. Value *Args[] = {CI->getArgOperand(0), CI->getArgOperand(1), CI->getArgOperand(2)}; - Value *NewCall = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(CI->getModule(), IID), Args); + Value *NewCall = Builder.CreateIntrinsic(IID, {}, Args); // Extract the second result and store it. Value *Data = Builder.CreateExtractValue(NewCall, 1); @@ -4127,20 +4097,15 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, if (Name == "mve.vctp64.old") { // Replace the old v4i1 vctp64 with a v2i1 vctp and predicate-casts to the // correct type. - Value *VCTP = - Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::arm_mve_vctp64), - CI->getArgOperand(0), CI->getName()); - Value *C1 = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::arm_mve_pred_v2i, - {VectorType::get(Builder.getInt1Ty(), 2, false)}), - VCTP); - return Builder.CreateCall( - Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::arm_mve_pred_i2v, - {VectorType::get(Builder.getInt1Ty(), 4, false)}), - C1); + Value *VCTP = Builder.CreateIntrinsic(Intrinsic::arm_mve_vctp64, {}, + CI->getArgOperand(0), + /*FMFSource=*/nullptr, CI->getName()); + Value *C1 = Builder.CreateIntrinsic( + Intrinsic::arm_mve_pred_v2i, + {VectorType::get(Builder.getInt1Ty(), 2, false)}, VCTP); + return Builder.CreateIntrinsic( + Intrinsic::arm_mve_pred_i2v, + {VectorType::get(Builder.getInt1Ty(), 4, false)}, C1); } else if (Name == "mve.mull.int.predicated.v2i64.v4i32.v4i1" || Name == "mve.vqdmull.predicated.v2i64.v4i32.v4i1" || Name == "mve.vldr.gather.base.predicated.v2i64.v2i64.v4i1" || @@ -4198,15 +4163,10 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, for (Value *Op : CI->args()) { Type *Ty = Op->getType(); if (Ty->getScalarSizeInBits() == 1) { - Value *C1 = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::arm_mve_pred_v2i, - {VectorType::get(Builder.getInt1Ty(), 4, false)}), - Op); - Op = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::arm_mve_pred_i2v, {V2I1Ty}), - C1); + Value *C1 = Builder.CreateIntrinsic( + Intrinsic::arm_mve_pred_v2i, + {VectorType::get(Builder.getInt1Ty(), 4, false)}, Op); + Op = Builder.CreateIntrinsic(Intrinsic::arm_mve_pred_i2v, {V2I1Ty}, C1); } Ops.push_back(Op); } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index ed06d8a5d630..6ec492227d9f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -27284,8 +27284,7 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, void 
AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilderBase &Builder) const { Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(M, Intrinsic::aarch64_clrex)); + Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {}, {}); } Value *AArch64TargetLowering::emitStoreConditional(IRBuilderBase &Builder, diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp index cfce56f0bfe9..51af16c48f70 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp @@ -921,9 +921,8 @@ void AMDGPUSwLowerLDS::lowerKernelLDSAccesses(Function *Func, FunctionCallee AsanFreeFunc = M.getOrInsertFunction( StringRef("__asan_free_impl"), FunctionType::get(IRB.getVoidTy(), {Int64Ty, Int64Ty}, false)); - Value *ReturnAddr = IRB.CreateCall( - Intrinsic::getOrInsertDeclaration(&M, Intrinsic::returnaddress), - IRB.getInt32(0)); + Value *ReturnAddr = + IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0)); Value *RAPToInt = IRB.CreatePtrToInt(ReturnAddr, Int64Ty); Value *MallocPtrToInt = IRB.CreatePtrToInt(LoadMallocPtr, Int64Ty); IRB.CreateCall(AsanFreeFunc, {MallocPtrToInt, RAPToInt}); diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index a35582bebb08..75a4ccb7b353 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21446,8 +21446,7 @@ void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( if (!Subtarget->hasV7Ops()) return; Module *M = Builder.GetInsertBlock()->getParent()->getParent(); - Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_clrex)); + Builder.CreateIntrinsic(Intrinsic::arm_clrex, {}, {}); } Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder, diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 911d92f0c484..cec1e507f08f 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -12205,11 +12205,8 @@ Instruction *PPCTargetLowering::emitTrailingFence(IRBuilderBase &Builder, // http://www.rdrop.com/users/paulmck/scalability/paper/N2745r.2011.03.04a.html // and http://www.cl.cam.ac.uk/~pes20/cppppc/ for justification. if (isa(Inst)) - return Builder.CreateCall( - Intrinsic::getOrInsertDeclaration( - Builder.GetInsertBlock()->getParent()->getParent(), - Intrinsic::ppc_cfence, {Inst->getType()}), - {Inst}); + return Builder.CreateIntrinsic(Intrinsic::ppc_cfence, {Inst->getType()}, + {Inst}); // FIXME: Can use isync for rmw operation. return callIntrinsic(Builder, Intrinsic::ppc_lwsync); } diff --git a/llvm/lib/Target/X86/X86WinEHState.cpp b/llvm/lib/Target/X86/X86WinEHState.cpp index 05fc6f13129f..bc9fd801f94b 100644 --- a/llvm/lib/Target/X86/X86WinEHState.cpp +++ b/llvm/lib/Target/X86/X86WinEHState.cpp @@ -333,12 +333,10 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { // If using _except_handler4, the EHGuard contains: FramePtr xor Cookie. 
if (UseStackGuard) { Value *Val = Builder.CreateLoad(Int32Ty, Cookie); - Value *FrameAddr = Builder.CreateCall( - Intrinsic::getOrInsertDeclaration( - TheModule, Intrinsic::frameaddress, - Builder.getPtrTy( - TheModule->getDataLayout().getAllocaAddrSpace())), - Builder.getInt32(0), "frameaddr"); + Value *FrameAddr = Builder.CreateIntrinsic( + Intrinsic::frameaddress, + Builder.getPtrTy(TheModule->getDataLayout().getAllocaAddrSpace()), + Builder.getInt32(0), /*FMFSource=*/nullptr, "frameaddr"); Value *FrameAddrI32 = Builder.CreatePtrToInt(FrameAddr, Int32Ty); FrameAddrI32 = Builder.CreateXor(FrameAddrI32, Val); Builder.CreateStore(FrameAddrI32, EHGuardNode); @@ -369,8 +367,7 @@ void WinEHStatePass::emitExceptionRegistrationRecord(Function *F) { } Value *WinEHStatePass::emitEHLSDA(IRBuilder<> &Builder, Function *F) { - return Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(TheModule, Intrinsic::x86_seh_lsda), F); + return Builder.CreateIntrinsic(Intrinsic::x86_seh_lsda, {}, F); } /// Generate a thunk that puts the LSDA of ParentFunc in EAX and then calls @@ -624,17 +621,13 @@ void WinEHStatePass::addStateStores(Function &F, WinEHFuncInfo &FuncInfo) { // that it can recover the original frame pointer. IRBuilder<> Builder(RegNode->getNextNode()); Value *RegNodeI8 = Builder.CreateBitCast(RegNode, Builder.getPtrTy()); - Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - TheModule, Intrinsic::x86_seh_ehregnode), - {RegNodeI8}); + Builder.CreateIntrinsic(Intrinsic::x86_seh_ehregnode, {}, {RegNodeI8}); if (EHGuardNode) { IRBuilder<> Builder(EHGuardNode->getNextNode()); Value *EHGuardNodeI8 = Builder.CreateBitCast(EHGuardNode, Builder.getPtrTy()); - Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - TheModule, Intrinsic::x86_seh_ehguard), - {EHGuardNodeI8}); + Builder.CreateIntrinsic(Intrinsic::x86_seh_ehguard, {}, {EHGuardNodeI8}); } // Calculate state numbers. diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 02d9fab309d8..9d4f05780cb6 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1866,10 +1866,9 @@ void AddressSanitizer::instrumentAddress(Instruction *OrigIns, if (UseCalls && ClOptimizeCallbacks) { const ASanAccessInfo AccessInfo(IsWrite, CompileKernel, AccessSizeIndex); Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - IRB.CreateCall( - Intrinsic::getOrInsertDeclaration(M, Intrinsic::asan_check_memaccess), - {IRB.CreatePointerCast(Addr, PtrTy), - ConstantInt::get(Int32Ty, AccessInfo.Packed)}); + IRB.CreateIntrinsic(Intrinsic::asan_check_memaccess, {}, + {IRB.CreatePointerCast(Addr, PtrTy), + ConstantInt::get(Int32Ty, AccessInfo.Packed)}); return; } diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index 5ec4973ea03d..c55043a9b20f 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1041,19 +1041,18 @@ void HWAddressSanitizer::instrumentMemAccessOutline(Value *Ptr, bool IsWrite, } if (UseFixedShadowIntrinsic) { - IRB.CreateCall( - Intrinsic::getOrInsertDeclaration( - M, UseShortGranules - ? Intrinsic::hwasan_check_memaccess_shortgranules_fixedshadow - : Intrinsic::hwasan_check_memaccess_fixedshadow), + IRB.CreateIntrinsic( + UseShortGranules + ? 
Intrinsic::hwasan_check_memaccess_shortgranules_fixedshadow + : Intrinsic::hwasan_check_memaccess_fixedshadow, + {}, {Ptr, ConstantInt::get(Int32Ty, AccessInfo), ConstantInt::get(Int64Ty, Mapping.offset())}); } else { - IRB.CreateCall(Intrinsic::getOrInsertDeclaration( - M, UseShortGranules - ? Intrinsic::hwasan_check_memaccess_shortgranules - : Intrinsic::hwasan_check_memaccess), - {ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)}); + IRB.CreateIntrinsic( + UseShortGranules ? Intrinsic::hwasan_check_memaccess_shortgranules + : Intrinsic::hwasan_check_memaccess, + {}, {ShadowBase, Ptr, ConstantInt::get(Int32Ty, AccessInfo)}); } } diff --git a/llvm/lib/Transforms/Instrumentation/KCFI.cpp b/llvm/lib/Transforms/Instrumentation/KCFI.cpp index bbe0f4c61781..4b653a83a896 100644 --- a/llvm/lib/Transforms/Instrumentation/KCFI.cpp +++ b/llvm/lib/Transforms/Instrumentation/KCFI.cpp @@ -110,8 +110,7 @@ PreservedAnalyses KCFIPass::run(Function &F, FunctionAnalysisManager &AM) { Instruction *ThenTerm = SplitBlockAndInsertIfThen(Test, Call, false, VeryUnlikelyWeights); Builder.SetInsertPoint(ThenTerm); - Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(&M, Intrinsic::debugtrap)); + Builder.CreateIntrinsic(Intrinsic::debugtrap, {}, {}); ++NumKCFIChecks; } diff --git a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp index e6e474ed3760..919660e7a040 100644 --- a/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp +++ b/llvm/lib/Transforms/Instrumentation/PGOInstrumentation.cpp @@ -918,8 +918,8 @@ void FunctionInstrumenter::instrument() { IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt()); // llvm.instrprof.cover(i8* , i64 , i32 , // i32 ) - Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_cover), + Builder.CreateIntrinsic( + Intrinsic::instrprof_cover, {}, {NormalizedNamePtr, CFGHash, Builder.getInt32(1), Builder.getInt32(0)}); return; } @@ -971,10 +971,10 @@ void FunctionInstrumenter::instrument() { IRBuilder<> Builder(&EntryBB, EntryBB.getFirstInsertionPt()); // llvm.instrprof.timestamp(i8* , i64 , i32 , // i32 ) - Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(&M, Intrinsic::instrprof_timestamp), - {NormalizedNamePtr, CFGHash, Builder.getInt32(NumCounters), - Builder.getInt32(I)}); + Builder.CreateIntrinsic(Intrinsic::instrprof_timestamp, {}, + {NormalizedNamePtr, CFGHash, + Builder.getInt32(NumCounters), + Builder.getInt32(I)}); I += PGOBlockCoverage ? 8 : 1; } @@ -984,12 +984,12 @@ void FunctionInstrumenter::instrument() { "Cannot get the Instrumentation point"); // llvm.instrprof.increment(i8* , i64 , i32 , // i32 ) - Builder.CreateCall(Intrinsic::getOrInsertDeclaration( - &M, PGOBlockCoverage - ? Intrinsic::instrprof_cover - : Intrinsic::instrprof_increment), - {NormalizedNamePtr, CFGHash, - Builder.getInt32(NumCounters), Builder.getInt32(I++)}); + Builder.CreateIntrinsic(PGOBlockCoverage ? 
Intrinsic::instrprof_cover + : Intrinsic::instrprof_increment, + {}, + {NormalizedNamePtr, CFGHash, + Builder.getInt32(NumCounters), + Builder.getInt32(I++)}); } // Now instrument select instructions: @@ -1726,10 +1726,10 @@ void SelectInstVisitor::instrumentOneSelectInst(SelectInst &SI) { auto *NormalizedFuncNameVarPtr = ConstantExpr::getPointerBitCastOrAddrSpaceCast( FuncNameVar, PointerType::get(M->getContext(), 0)); - Builder.CreateCall( - Intrinsic::getOrInsertDeclaration(M, Intrinsic::instrprof_increment_step), - {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash), - Builder.getInt32(TotalNumCtrs), Builder.getInt32(*CurCtrIdx), Step}); + Builder.CreateIntrinsic(Intrinsic::instrprof_increment_step, {}, + {NormalizedFuncNameVarPtr, Builder.getInt64(FuncHash), + Builder.getInt32(TotalNumCtrs), + Builder.getInt32(*CurCtrIdx), Step}); ++(*CurCtrIdx); } diff --git a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp index 388addfab181..915dc70336de 100644 --- a/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/ThreadSanitizer.cpp @@ -572,9 +572,7 @@ bool ThreadSanitizer::sanitizeFunction(Function &F, if ((Res || HasCalls) && ClInstrumentFuncEntryExit) { InstrumentationIRBuilder IRB(F.getEntryBlock().getFirstNonPHI()); Value *ReturnAddress = - IRB.CreateCall(Intrinsic::getOrInsertDeclaration( - F.getParent(), Intrinsic::returnaddress), - IRB.getInt32(0)); + IRB.CreateIntrinsic(Intrinsic::returnaddress, {}, IRB.getInt32(0)); IRB.CreateCall(TsanFuncEntry, ReturnAddress); EscapeEnumerator EE(F, "tsan_cleanup", ClHandleCxxExceptions); -- GitLab From 9255850e89b1e538e11fcc8b71cfd0b320546a75 Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Wed, 16 Oct 2024 16:10:45 +0100 Subject: [PATCH 146/329] [LLVM] Remove unused variables after #112546 --- llvm/lib/Target/AArch64/AArch64ISelLowering.cpp | 1 - llvm/lib/Target/ARM/ARMISelLowering.cpp | 1 - llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp | 1 - llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp | 1 - 4 files changed, 4 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 6ec492227d9f..60150c3328aa 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -27283,7 +27283,6 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder, void AArch64TargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilderBase &Builder) const { - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Builder.CreateIntrinsic(Intrinsic::aarch64_clrex, {}, {}); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 75a4ccb7b353..a49dda871dc3 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21445,7 +21445,6 @@ void ARMTargetLowering::emitAtomicCmpXchgNoStoreLLBalance( IRBuilderBase &Builder) const { if (!Subtarget->hasV7Ops()) return; - Module *M = Builder.GetInsertBlock()->getParent()->getParent(); Builder.CreateIntrinsic(Intrinsic::arm_clrex, {}, {}); } diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp index 9d4f05780cb6..55e9903876b1 100644 --- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp @@ -1865,7 +1865,6 @@ void 
AddressSanitizer::instrumentAddress(Instruction *OrigIns, if (UseCalls && ClOptimizeCallbacks) { const ASanAccessInfo AccessInfo(IsWrite, CompileKernel, AccessSizeIndex); - Module *M = IRB.GetInsertBlock()->getParent()->getParent(); IRB.CreateIntrinsic(Intrinsic::asan_check_memaccess, {}, {IRB.CreatePointerCast(Addr, PtrTy), ConstantInt::get(Int32Ty, AccessInfo.Packed)}); diff --git a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp index c55043a9b20f..21d4d37d7e6c 100644 --- a/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/HWAddressSanitizer.cpp @@ -1025,7 +1025,6 @@ void HWAddressSanitizer::instrumentMemAccessOutline(Value *Ptr, bool IsWrite, insertShadowTagCheck(Ptr, InsertBefore, DTU, LI).TagMismatchTerm; IRBuilder<> IRB(InsertBefore); - Module *M = IRB.GetInsertBlock()->getParent()->getParent(); bool UseFixedShadowIntrinsic = false; // The memaccess fixed shadow intrinsic is only supported on AArch64, // which allows a 16-bit immediate to be left-shifted by 32. -- GitLab From 92ad0397f494a9895385057586cc59a908107f81 Mon Sep 17 00:00:00 2001 From: Wael Yehia Date: Thu, 10 Oct 2024 22:18:54 -0400 Subject: [PATCH 147/329] [AIX][PGO] Enable some profile-rt tests now that -fprofile-instr-generate works on AIX --- compiler-rt/test/profile/Posix/instrprof-visibility.cpp | 1 + compiler-rt/test/profile/coverage-inline.cpp | 1 + compiler-rt/test/profile/coverage_comments.cpp | 1 + compiler-rt/test/profile/coverage_emptylines.cpp | 1 + compiler-rt/test/profile/instrprof-merging.cpp | 1 + .../test/profile/instrprof-set-file-object-merging.c | 1 + compiler-rt/test/profile/instrprof-set-file-object.c | 1 + compiler-rt/test/profile/instrprof-without-libc.c | 1 + compiler-rt/test/profile/instrprof-write-file-only.c | 1 + compiler-rt/test/profile/lit.cfg.py | 8 ++------ 10 files changed, 11 insertions(+), 6 deletions(-) diff --git a/compiler-rt/test/profile/Posix/instrprof-visibility.cpp b/compiler-rt/test/profile/Posix/instrprof-visibility.cpp index bb533050e059..016aaed57e15 100644 --- a/compiler-rt/test/profile/Posix/instrprof-visibility.cpp +++ b/compiler-rt/test/profile/Posix/instrprof-visibility.cpp @@ -1,3 +1,4 @@ +// XFAIL: target={{.*}}-aix{{.*}} // RUN: %clangxx_profgen -fcoverage-mapping %S/Inputs/instrprof-visibility-helper.cpp -o %t %s // RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t // RUN: llvm-profdata merge %t.profraw -o %t.profdata diff --git a/compiler-rt/test/profile/coverage-inline.cpp b/compiler-rt/test/profile/coverage-inline.cpp index e362e566fb4b..a4114363007a 100644 --- a/compiler-rt/test/profile/coverage-inline.cpp +++ b/compiler-rt/test/profile/coverage-inline.cpp @@ -1,3 +1,4 @@ +// XFAIL: target={{.*}}-aix{{.*}} // Test that the instrumentation puts the right linkage on the profile data for // inline functions. 
// RUN: %clang_profgen -g -fcoverage-mapping -c -o %t1.o %s -DOBJECT_1
diff --git a/compiler-rt/test/profile/coverage_comments.cpp b/compiler-rt/test/profile/coverage_comments.cpp
index d206fb608792..8a99d646f581 100644
--- a/compiler-rt/test/profile/coverage_comments.cpp
+++ b/compiler-rt/test/profile/coverage_comments.cpp
@@ -1,3 +1,4 @@
+// XFAIL: target={{.*}}-aix{{.*}}
 // RUN: %clangxx_profgen -fcoverage-mapping -Wno-comment -o %t %s
 // RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t
 // RUN: llvm-profdata merge -o %t.profdata %t.profraw
diff --git a/compiler-rt/test/profile/coverage_emptylines.cpp b/compiler-rt/test/profile/coverage_emptylines.cpp
index 8610d70f3e1b..8006cdee6ec1 100644
--- a/compiler-rt/test/profile/coverage_emptylines.cpp
+++ b/compiler-rt/test/profile/coverage_emptylines.cpp
@@ -1,3 +1,4 @@
+// XFAIL: target={{.*}}-aix{{.*}}
 // Remove comments first.
 // RUN: sed 's/[ \t]*\/\/.*//' %s > %t.stripped.cpp
 // RUN: %clangxx_profgen -fcoverage-mapping -o %t %t.stripped.cpp
diff --git a/compiler-rt/test/profile/instrprof-merging.cpp b/compiler-rt/test/profile/instrprof-merging.cpp
index 6212feb19c2a..4a3f14b044a5 100644
--- a/compiler-rt/test/profile/instrprof-merging.cpp
+++ b/compiler-rt/test/profile/instrprof-merging.cpp
@@ -1,4 +1,5 @@
 // UNSUPPORTED: target={{.*windows.*}}
+// XFAIL: target={{.*}}-aix{{.*}}
 
 // 1) Compile shared code into different object files and into an executable.
 // RUN: %clangxx_profgen -std=c++14 -fcoverage-mapping %s -c -o %t.v1.o \
diff --git a/compiler-rt/test/profile/instrprof-set-file-object-merging.c b/compiler-rt/test/profile/instrprof-set-file-object-merging.c
index 92f5f92e2772..baabb21cd672 100644
--- a/compiler-rt/test/profile/instrprof-set-file-object-merging.c
+++ b/compiler-rt/test/profile/instrprof-set-file-object-merging.c
@@ -24,6 +24,7 @@ int main(int argc, const char *argv[]) {
   return 0;
 }
 
+// XFAIL: target={{.*}}-aix{{.*}}
 // CHECK: 10| |#include <stdio.h>
 // CHECK: 11| |
 // CHECK: 12| |extern void __llvm_profile_set_file_object(FILE *, int);
diff --git a/compiler-rt/test/profile/instrprof-set-file-object.c b/compiler-rt/test/profile/instrprof-set-file-object.c
index 280374acb55d..0d1f96d5d826 100644
--- a/compiler-rt/test/profile/instrprof-set-file-object.c
+++ b/compiler-rt/test/profile/instrprof-set-file-object.c
@@ -17,6 +17,7 @@ int main(int argc, const char *argv[]) {
   __llvm_profile_set_file_object(F, 0);
   return 0;
 }
+// XFAIL: target={{.*}}-aix{{.*}}
 // CHECK: 8| |#include <stdio.h>
 // CHECK: 9| |
 // CHECK: 10| |extern void __llvm_profile_set_file_object(FILE *, int);
diff --git a/compiler-rt/test/profile/instrprof-without-libc.c b/compiler-rt/test/profile/instrprof-without-libc.c
index 3142138cdffc..d0d213b07ba2 100644
--- a/compiler-rt/test/profile/instrprof-without-libc.c
+++ b/compiler-rt/test/profile/instrprof-without-libc.c
@@ -1,3 +1,4 @@
+// XFAIL: target={{.*}}-aix{{.*}}
 // RUN: %clang_profgen -DCHECK_SYMBOLS -O3 -o %t.symbols %s
 // RUN: llvm-nm %t.symbols | FileCheck %s --check-prefix=CHECK-SYMBOLS
 // RUN: %clang_profgen -O3 -o %t %s
diff --git a/compiler-rt/test/profile/instrprof-write-file-only.c b/compiler-rt/test/profile/instrprof-write-file-only.c
index f505cf64a5c7..5edad271f869 100644
--- a/compiler-rt/test/profile/instrprof-write-file-only.c
+++ b/compiler-rt/test/profile/instrprof-write-file-only.c
@@ -1,3 +1,4 @@
+// XFAIL: target={{.*}}-aix{{.*}}
 // RUN: %clang_profgen -o %t -O3 %s
 // RUN: env LLVM_PROFILE_FILE=%t.profraw %run %t
 // RUN: llvm-profdata merge -o %t.profdata %t.profraw
diff --git
a/compiler-rt/test/profile/lit.cfg.py b/compiler-rt/test/profile/lit.cfg.py
index 3b3019a07c30..c8c78a746b4c 100644
--- a/compiler-rt/test/profile/lit.cfg.py
+++ b/compiler-rt/test/profile/lit.cfg.py
@@ -77,12 +77,8 @@ def exclude_unsupported_files_for_aix(dirname):
         f = open(source_path, "r")
         try:
             data = f.read()
-            # -fprofile-instr-generate and rpath are not supported on AIX, exclude all tests with them.
-            if (
-                "%clang_profgen" in data
-                or "%clangxx_profgen" in data
-                or "-rpath" in data
-            ):
+            # rpath is not supported on AIX, exclude all tests with them.
+            if ( "-rpath" in data ):
                 config.excludes += [filename]
         finally:
             f.close()
-- 
GitLab


From 1c154a20b4943e9c94bcff8ee5bba34fdf9e52e5 Mon Sep 17 00:00:00 2001
From: Vivian 
Date: Wed, 16 Oct 2024 08:56:29 -0700
Subject: [PATCH 148/329] [mlir][td] More rename from packPaddings to
 nofoldFlags (#112453)

The pack_paddings attribute has been renamed to nofold_flags in
https://github.com/llvm/llvm-project/pull/111036. There are still some
`packPadding` occurrences remaining unchanged. This PR renames those to keep
them consistent.
---
 .../mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td    | 4 ++--
 mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h      | 2 +-
 2 files changed, 3 insertions(+), 3 deletions(-)

diff --git a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
index 98b915138122..0915bbde3072 100644
--- a/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
+++ b/mlir/include/mlir/Dialect/Linalg/TransformOps/LinalgTransformOps.td
@@ -1055,13 +1055,13 @@ def PadOp : Op<Transform_Dialect, "structured.pad",
     OpBuilder<(ins "Value":$target,
                "ArrayRef<int64_t>":$paddingDimensions,
                CArg<"ArrayRef<int64_t>", "{}">:$staticPadToMultipleOf,
-               CArg<"ArrayRef<bool>", "{}">:$packPaddings,
+               CArg<"ArrayRef<bool>", "{}">:$nofoldFlags,
                CArg<"ArrayRef<Attribute>", "{}">:$transposePaddings,
                CArg<"StringRef", "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copyBackOp)>,
     OpBuilder<(ins "Value":$target,
                "ArrayRef<int64_t>":$paddingDimensions,
                "ArrayRef<OpFoldResult>":$mixedPadToMultipleOf,
-               CArg<"ArrayRef<bool>", "{}">:$packPaddings,
+               CArg<"ArrayRef<bool>", "{}">:$nofoldFlags,
                CArg<"ArrayRef<Attribute>", "{}">:$transposePaddings,
                CArg<"StringRef", "::mlir::bufferization::MaterializeInDestinationOp::getOperationName()">:$copyBackOp)>
   ];
diff --git a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
index 96e0b3c978d5..70b086641bdc 100644
--- a/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
+++ b/mlir/include/mlir/Dialect/Linalg/Transforms/Transforms.h
@@ -297,7 +297,7 @@ struct LinalgPaddingOptions {
   /// A flag for every operand to mark the PadOp as nofold which enables
   /// packing for statically shaped operands.
   SmallVector<bool> nofoldFlags;
-  LinalgPaddingOptions &setPackPaddings(ArrayRef<bool> pp) {
+  LinalgPaddingOptions &setNofoldFlags(ArrayRef<bool> pp) {
     nofoldFlags.assign(pp.begin(), pp.end());
     return *this;
   }
-- 
GitLab


From 87dd5dc8f03e78a34d99630b80024c102e5aee10 Mon Sep 17 00:00:00 2001
From: Jan Voung 
Date: Wed, 16 Oct 2024 12:10:39 -0400
Subject: [PATCH 149/329] [clang][dataflow] Add a lattice to help cache const
 accessor methods (#111006)

By caching const accessor methods we can sometimes treat method call
results as stable (e.g., for issue
https://github.com/llvm/llvm-project/issues/58510).
Users can clear the cache when a non-const method is called that may
modify the state of an object. This is represented as a mixin.
It will be used in a follow-on patch to change
bugprone-unchecked-optional-access's lattice from NoopLattice to
CachedConstAccessorsLattice, along with some additional transfer functions.
---
 .../CachedConstAccessorsLattice.h             | 218 +++++++++++++
 .../Analysis/FlowSensitive/CMakeLists.txt     |   1 +
 .../CachedConstAccessorsLatticeTest.cpp       | 305 ++++++++++++++++++
 3 files changed, 524 insertions(+)
 create mode 100644 clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
 create mode 100644 clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp

diff --git a/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
new file mode 100644
index 000000000000..3c3028eb9452
--- /dev/null
+++ b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
@@ -0,0 +1,218 @@
+//===-- CachedConstAccessorsLattice.h ---------------------------*- C++ -*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This file defines the lattice mixin that additionally maintains a cache of
+// stable method call return values to model const accessor member functions.
+//===----------------------------------------------------------------------===//

+#ifndef LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CACHED_CONST_ACCESSORS_LATTICE_H
+#define LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CACHED_CONST_ACCESSORS_LATTICE_H
+
+#include "clang/AST/Expr.h"
+#include "clang/Analysis/FlowSensitive/DataflowEnvironment.h"
+#include "clang/Analysis/FlowSensitive/DataflowLattice.h"
+#include "clang/Analysis/FlowSensitive/StorageLocation.h"
+#include "clang/Analysis/FlowSensitive/Value.h"
+#include "llvm/ADT/DenseMap.h"
+#include "llvm/ADT/STLFunctionalExtras.h"
+
+namespace clang {
+namespace dataflow {
+
+/// A mixin for a lattice that additionally maintains a cache of stable method
+/// call return values to model const accessors methods. When a non-const method
+/// is called, the cache should be cleared causing the next call to a const
+/// method to be considered a different value. NOTE: The user is responsible for
+/// clearing the cache.
+///
+/// For example:
+///
+/// class Bar {
+/// public:
+///   const std::optional<Foo>& getFoo() const;
+///   void clear();
+/// };
+//
+/// void func(Bar& s) {
+///   if (s.getFoo().has_value()) {
+///     use(s.getFoo().value()); // safe (checked earlier getFoo())
+///     s.clear();
+///     use(s.getFoo().value()); // unsafe (invalidate cache for s)
+///   }
+/// }
+template <typename Base> class CachedConstAccessorsLattice : public Base {
+public:
+  using Base::Base; // inherit all constructors
+
+  /// Creates or returns a previously created `Value` associated with a const
+  /// method call `obj.getFoo()` where `RecordLoc` is the
+  /// `RecordStorageLocation` of `obj`.
+  /// Returns nullptr if unable to find or create a value.
+  ///
+  /// Requirements:
+  ///
+  ///  - `CE` should return a value (not a reference or record type)
+  Value *
+  getOrCreateConstMethodReturnValue(const RecordStorageLocation &RecordLoc,
+                                    const CallExpr *CE, Environment &Env);
+
+  /// Creates or returns a previously created `StorageLocation` associated with
+  /// a const method call `obj.getFoo()` where `RecordLoc` is the
+  /// `RecordStorageLocation` of `obj`.
+  ///
+  /// The callback `Initialize` runs on the storage location if newly created.
+  /// Returns nullptr if unable to find or create a value.
+  ///
+  /// Requirements:
+  ///
+  ///  - `CE` should return a location (GLValue or a record type).
+  StorageLocation *getOrCreateConstMethodReturnStorageLocation(
+      const RecordStorageLocation &RecordLoc, const CallExpr *CE,
+      Environment &Env, llvm::function_ref<void(StorageLocation &)> Initialize);
+
+  void clearConstMethodReturnValues(const RecordStorageLocation &RecordLoc) {
+    ConstMethodReturnValues.erase(&RecordLoc);
+  }
+
+  void clearConstMethodReturnStorageLocations(
+      const RecordStorageLocation &RecordLoc) {
+    ConstMethodReturnStorageLocations.erase(&RecordLoc);
+  }
+
+  bool operator==(const CachedConstAccessorsLattice &Other) const {
+    return Base::operator==(Other);
+  }
+
+  LatticeJoinEffect join(const CachedConstAccessorsLattice &Other);
+
+private:
+  // Maps a record storage location and const method to the value to return
+  // from that const method.
+  using ConstMethodReturnValuesType =
+      llvm::SmallDenseMap<const RecordStorageLocation *,
                          llvm::SmallDenseMap<const FunctionDecl *, Value *>>;
+  ConstMethodReturnValuesType ConstMethodReturnValues;
+
+  // Maps a record storage location and const method to the record storage
+  // location to return from that const method.
+  using ConstMethodReturnStorageLocationsType = llvm::SmallDenseMap<
+      const RecordStorageLocation *,
+      llvm::SmallDenseMap<const FunctionDecl *, StorageLocation *>>;
+  ConstMethodReturnStorageLocationsType ConstMethodReturnStorageLocations;
+};
+
+namespace internal {
+
+template <typename T>
+llvm::SmallDenseMap<const RecordStorageLocation *,
                    llvm::SmallDenseMap<const FunctionDecl *, T *>>
+joinConstMethodMap(
+    const llvm::SmallDenseMap<const RecordStorageLocation *,
                              llvm::SmallDenseMap<const FunctionDecl *, T *>>
+        &Map1,
+    const llvm::SmallDenseMap<const RecordStorageLocation *,
                              llvm::SmallDenseMap<const FunctionDecl *, T *>>
+        &Map2,
+    LatticeEffect &Effect) {
+  llvm::SmallDenseMap<const RecordStorageLocation *,
                      llvm::SmallDenseMap<const FunctionDecl *, T *>>
+      Result;
+  for (auto &[Loc, DeclToT] : Map1) {
+    auto It = Map2.find(Loc);
+    if (It == Map2.end()) {
+      Effect = LatticeJoinEffect::Changed;
+      continue;
+    }
+    const auto &OtherDeclToT = It->second;
+    auto &JoinedDeclToT = Result[Loc];
+    for (auto [Func, Var] : DeclToT) {
+      T *OtherVar = OtherDeclToT.lookup(Func);
+      if (OtherVar == nullptr || OtherVar != Var) {
+        Effect = LatticeJoinEffect::Changed;
+        continue;
+      }
+      JoinedDeclToT.insert({Func, Var});
+    }
+  }
+  return Result;
+}
+
+} // namespace internal
+
+template <typename Base>
+LatticeEffect CachedConstAccessorsLattice<Base>::join(
+    const CachedConstAccessorsLattice<Base> &Other) {
+
+  LatticeEffect Effect = Base::join(Other);
+
+  // For simplicity, we only retain values that are identical, but not ones that
+  // are non-identical but equivalent. This is likely to be sufficient in
+  // practice, and it reduces implementation complexity considerably.
+
+  ConstMethodReturnValues = internal::joinConstMethodMap<Value>(
+      ConstMethodReturnValues, Other.ConstMethodReturnValues, Effect);
+
+  ConstMethodReturnStorageLocations =
+      internal::joinConstMethodMap<StorageLocation>(
+          ConstMethodReturnStorageLocations,
+          Other.ConstMethodReturnStorageLocations, Effect);
+
+  return Effect;
+}
+
+template <typename Base>
+Value *CachedConstAccessorsLattice<Base>::getOrCreateConstMethodReturnValue(
+    const RecordStorageLocation &RecordLoc, const CallExpr *CE,
+    Environment &Env) {
+  QualType Type = CE->getType();
+  assert(!Type.isNull());
+  assert(!Type->isReferenceType());
+  assert(!Type->isRecordType());
+
+  auto &ObjMap = ConstMethodReturnValues[&RecordLoc];
+  const FunctionDecl *DirectCallee = CE->getDirectCallee();
+  if (DirectCallee == nullptr)
+    return nullptr;
+  auto it = ObjMap.find(DirectCallee);
+  if (it != ObjMap.end())
+    return it->second;
+
+  Value *Val = Env.createValue(Type);
+  if (Val != nullptr)
+    ObjMap.insert({DirectCallee, Val});
+  return Val;
+}
+
+template <typename Base>
+StorageLocation *
+CachedConstAccessorsLattice<Base>::getOrCreateConstMethodReturnStorageLocation(
+    const RecordStorageLocation &RecordLoc, const CallExpr *CE,
+    Environment &Env, llvm::function_ref<void(StorageLocation &)> Initialize) {
+  QualType Type = CE->getType();
+  assert(!Type.isNull());
+  assert(CE->isGLValue() || Type->isRecordType());
+  auto &ObjMap = ConstMethodReturnStorageLocations[&RecordLoc];
+  const FunctionDecl *DirectCallee = CE->getDirectCallee();
+  if (DirectCallee == nullptr)
+    return nullptr;
+  auto it = ObjMap.find(DirectCallee);
+  if (it != ObjMap.end())
+    return it->second;
+
+  StorageLocation &Loc =
+      Env.createStorageLocation(CE->getType().getNonReferenceType());
+  Initialize(Loc);
+
+  ObjMap.insert({DirectCallee, &Loc});
+  return &Loc;
+}
+
+} // namespace dataflow
+} // namespace clang
+
+#endif // LLVM_CLANG_ANALYSIS_FLOWSENSITIVE_CACHED_CONST_ACCESSORS_LATTICE_H
diff --git a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
index 12fee5dc2789..4e1819bfa166 100644
--- a/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
+++ b/clang/unittests/Analysis/FlowSensitive/CMakeLists.txt
@@ -7,6 +7,7 @@ add_clang_unittest(ClangAnalysisFlowSensitiveTests
   ArenaTest.cpp
   ASTOpsTest.cpp
   CFGMatchSwitchTest.cpp
+  CachedConstAccessorsLatticeTest.cpp
   ChromiumCheckModelTest.cpp
   DataflowAnalysisContextTest.cpp
   DataflowEnvironmentTest.cpp
diff --git a/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp b/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp
new file mode 100644
index 000000000000..6488833bd14c
--- /dev/null
+++ b/clang/unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp
@@ -0,0 +1,305 @@
+//===- unittests/Analysis/FlowSensitive/CachedConstAccessorsLatticeTest.cpp ==//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h"
+
+#include <cassert>
+#include <memory>
+
+#include "clang/AST/Decl.h"
+#include "clang/AST/DeclBase.h"
+#include "clang/AST/DeclCXX.h"
+#include "clang/AST/Expr.h"
+#include "clang/AST/Type.h"
+#include "clang/ASTMatchers/ASTMatchFinder.h"
+#include "clang/ASTMatchers/ASTMatchers.h"
+#include "clang/Analysis/FlowSensitive/DataflowAnalysisContext.h"
+#include "clang/Analysis/FlowSensitive/DataflowLattice.h"
+#include "clang/Analysis/FlowSensitive/NoopLattice.h"
+#include "clang/Analysis/FlowSensitive/StorageLocation.h"
+#include "clang/Analysis/FlowSensitive/Value.h"
+#include "clang/Analysis/FlowSensitive/WatchedLiteralsSolver.h"
+#include "clang/Basic/LLVM.h"
+#include "clang/Testing/TestAST.h"
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+namespace clang::dataflow {
+namespace {
+
+using ast_matchers::BoundNodes;
+using ast_matchers::callee;
+using ast_matchers::cxxMemberCallExpr;
+using ast_matchers::functionDecl;
+using ast_matchers::hasName;
+using ast_matchers::match;
+using ast_matchers::selectFirst;
+
+using dataflow::DataflowAnalysisContext;
+using dataflow::Environment;
+using dataflow::LatticeJoinEffect;
+using dataflow::RecordStorageLocation;
+using dataflow::Value;
+using dataflow::WatchedLiteralsSolver;
+
+using testing::SizeIs;
+
+NamedDecl *lookup(StringRef Name, const DeclContext &DC) {
+  auto Result = DC.lookup(&DC.getParentASTContext().Idents.get(Name));
+  EXPECT_TRUE(Result.isSingleResult()) << Name;
+  return Result.front();
+}
+
+class CachedConstAccessorsLatticeTest : public ::testing::Test {
+protected:
+  using LatticeT = CachedConstAccessorsLattice<NoopLattice>;
+
+  DataflowAnalysisContext DACtx{std::make_unique<WatchedLiteralsSolver>()};
+  Environment Env{DACtx};
+};
+
+// Basic test AST with two const methods (return a value, and return a ref).
+struct CommonTestInputs {
+  CommonTestInputs()
+      : AST(R"cpp(
+    struct S {
+      int *valProperty() const;
+      int &refProperty() const;
+    };
+    void target() {
+      S s;
+      s.valProperty();
+      S s2;
+      s2.refProperty();
+    }
+  )cpp") {
+    auto *SDecl = cast<CXXRecordDecl>(
+        lookup("S", *AST.context().getTranslationUnitDecl()));
+    SType = AST.context().getRecordType(SDecl);
+    CallVal = selectFirst<CallExpr>(
+        "call",
+        match(cxxMemberCallExpr(callee(functionDecl(hasName("valProperty"))))
+                  .bind("call"),
+              AST.context()));
+    assert(CallVal != nullptr);
+
+    CallRef = selectFirst<CallExpr>(
+        "call",
+        match(cxxMemberCallExpr(callee(functionDecl(hasName("refProperty"))))
+                  .bind("call"),
+              AST.context()));
+    assert(CallRef != nullptr);
+  }
+
+  TestAST AST;
+  QualType SType;
+  const CallExpr *CallVal;
+  const CallExpr *CallRef;
+};
+
+TEST_F(CachedConstAccessorsLatticeTest,
+       SamePrimitiveValBeforeClearOrDiffAfterClear) {
+  CommonTestInputs Inputs;
+  auto *CE = Inputs.CallVal;
+  RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                            {});
+
+  LatticeT Lattice;
+  Value *Val1 = Lattice.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+  Value *Val2 = Lattice.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+  EXPECT_EQ(Val1, Val2);
+
+  Lattice.clearConstMethodReturnValues(Loc);
+  Value *Val3 = Lattice.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+  EXPECT_NE(Val3, Val1);
+  EXPECT_NE(Val3, Val2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, SameLocBeforeClearOrDiffAfterClear) {
+  CommonTestInputs Inputs;
+  auto *CE = Inputs.CallRef;
+  RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                            {});
+
+  LatticeT Lattice;
+  auto NopInit = [](StorageLocation &) {};
+  StorageLocation *Loc1 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, CE, Env, NopInit);
+  auto NotCalled = [](StorageLocation &) {
+    ASSERT_TRUE(false) << "Not reached";
+  };
+  StorageLocation *Loc2 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, CE, Env, NotCalled);
+
+  EXPECT_EQ(Loc1, Loc2);
+
+  Lattice.clearConstMethodReturnStorageLocations(Loc);
+  StorageLocation *Loc3 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, CE, Env, NopInit);
+
+  EXPECT_NE(Loc3, Loc1);
+  EXPECT_NE(Loc3, Loc2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest,
+       SameStructValBeforeClearOrDiffAfterClear) {
+  TestAST AST(R"cpp(
+    struct S {
+      S structValProperty() const;
+    };
+    void target() {
+      S s;
+      s.structValProperty();
+    }
+  )cpp");
+  auto *SDecl =
+      cast<CXXRecordDecl>(lookup("S", *AST.context().getTranslationUnitDecl()));
+  QualType SType = AST.context().getRecordType(SDecl);
+  const CallExpr *CE = selectFirst<CallExpr>(
+      "call", match(cxxMemberCallExpr(
+                        callee(functionDecl(hasName("structValProperty"))))
+                        .bind("call"),
+                    AST.context()));
+  ASSERT_NE(CE, nullptr);
+
+  RecordStorageLocation Loc(SType, RecordStorageLocation::FieldToLoc(), {});
+
+  LatticeT Lattice;
+  // Accessors that return a record by value are modeled by a record storage
+  // location (instead of a Value).
+  auto NopInit = [](StorageLocation &) {};
+  StorageLocation *Loc1 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, CE, Env, NopInit);
+  auto NotCalled = [](StorageLocation &) {
+    ASSERT_TRUE(false) << "Not reached";
+  };
+  StorageLocation *Loc2 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, CE, Env, NotCalled);
+
+  EXPECT_EQ(Loc1, Loc2);
+
+  Lattice.clearConstMethodReturnStorageLocations(Loc);
+  StorageLocation *Loc3 = Lattice.getOrCreateConstMethodReturnStorageLocation(
+      Loc, CE, Env, NopInit);
+
+  EXPECT_NE(Loc3, Loc1);
+  EXPECT_NE(Loc3, Loc1);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, ClearDifferentLocs) {
+  CommonTestInputs Inputs;
+  auto *CE = Inputs.CallRef;
+  RecordStorageLocation LocS1(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                              {});
+  RecordStorageLocation LocS2(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                              {});
+
+  LatticeT Lattice;
+  auto NopInit = [](StorageLocation &) {};
+  StorageLocation *RetLoc1 =
+      Lattice.getOrCreateConstMethodReturnStorageLocation(LocS1, CE, Env,
+                                                          NopInit);
+  Lattice.clearConstMethodReturnStorageLocations(LocS2);
+  auto NotCalled = [](StorageLocation &) {
+    ASSERT_TRUE(false) << "Not reached";
+  };
+  StorageLocation *RetLoc2 =
+      Lattice.getOrCreateConstMethodReturnStorageLocation(LocS1, CE, Env,
+                                                          NotCalled);
+
+  EXPECT_EQ(RetLoc1, RetLoc2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, DifferentValsFromDifferentLocs) {
+  TestAST AST(R"cpp(
+    struct S {
+      int *valProperty() const;
+    };
+    void target() {
+      S s1;
+      s1.valProperty();
+      S s2;
+      s2.valProperty();
+    }
+  )cpp");
+  auto *SDecl =
+      cast<CXXRecordDecl>(lookup("S", *AST.context().getTranslationUnitDecl()));
+  QualType SType = AST.context().getRecordType(SDecl);
+  SmallVector<BoundNodes, 1> valPropertyCalls =
+      match(cxxMemberCallExpr(callee(functionDecl(hasName("valProperty"))))
+                .bind("call"),
+            AST.context());
+  ASSERT_THAT(valPropertyCalls, SizeIs(2));
+
+  const CallExpr *CE1 = selectFirst<CallExpr>("call", valPropertyCalls);
+  ASSERT_NE(CE1, nullptr);
+
+  valPropertyCalls.erase(valPropertyCalls.begin());
+  const CallExpr *CE2 = selectFirst<CallExpr>("call", valPropertyCalls);
+  ASSERT_NE(CE2, nullptr);
+  ASSERT_NE(CE1, CE2);
+
+  RecordStorageLocation LocS1(SType, RecordStorageLocation::FieldToLoc(), {});
+  RecordStorageLocation LocS2(SType, RecordStorageLocation::FieldToLoc(), {});
+
+  LatticeT Lattice;
+  Value *Val1 = Lattice.getOrCreateConstMethodReturnValue(LocS1, CE1, Env);
+  Value *Val2 = Lattice.getOrCreateConstMethodReturnValue(LocS2, CE2, Env);
+
+  EXPECT_NE(Val1, Val2);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, JoinSameNoop) {
+  CommonTestInputs Inputs;
+  auto *CE = Inputs.CallVal;
+  RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                            {});
+
+  LatticeT EmptyLattice;
+  LatticeT EmptyLattice2;
+  EXPECT_EQ(EmptyLattice.join(EmptyLattice2), LatticeJoinEffect::Unchanged);
+
+  LatticeT Lattice1;
+  Lattice1.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+  EXPECT_EQ(Lattice1.join(Lattice1), LatticeJoinEffect::Unchanged);
+}
+
+TEST_F(CachedConstAccessorsLatticeTest, ProducesNewValueAfterJoinDistinct) {
+  CommonTestInputs Inputs;
+  auto *CE = Inputs.CallVal;
+  RecordStorageLocation Loc(Inputs.SType, RecordStorageLocation::FieldToLoc(),
+                            {});
+
+  // L1 w/ v vs L2 empty
+  LatticeT Lattice1;
+  Value *Val1 = Lattice1.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+  LatticeT EmptyLattice;
+
+  EXPECT_EQ(Lattice1.join(EmptyLattice), LatticeJoinEffect::Changed);
+  Value *ValAfterJoin =
+      Lattice1.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
EXPECT_NE(ValAfterJoin, Val1);
+
+  // L1 w/ v1 vs L3 w/ v2
+  LatticeT Lattice3;
+  Value *Val3 = Lattice3.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+  EXPECT_EQ(Lattice1.join(Lattice3), LatticeJoinEffect::Changed);
+  Value *ValAfterJoin2 =
+      Lattice1.getOrCreateConstMethodReturnValue(Loc, CE, Env);
+
+  EXPECT_NE(ValAfterJoin2, ValAfterJoin);
+  EXPECT_NE(ValAfterJoin2, Val3);
+}
+
+} // namespace
+} // namespace clang::dataflow
-- 
GitLab


From cf5e295ec0e05058d0e10a3779fe4093d96074b2 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim 
Date: Wed, 16 Oct 2024 17:15:47 +0100
Subject: [PATCH 150/329] Fix MSVC "not all control paths return a value"
 warning. NFC.
---
 clang/lib/CodeGen/Targets/DirectX.cpp | 1 +
 1 file changed, 1 insertion(+)

diff --git a/clang/lib/CodeGen/Targets/DirectX.cpp b/clang/lib/CodeGen/Targets/DirectX.cpp
index 303a4309d62f..7935f7ae3700 100644
--- a/clang/lib/CodeGen/Targets/DirectX.cpp
+++ b/clang/lib/CodeGen/Targets/DirectX.cpp
@@ -63,6 +63,7 @@ llvm::Type *DirectXTargetCodeGenInfo::getHLSLType(CodeGenModule &CGM,
     llvm_unreachable("dx.Sampler handles are not implemented yet");
     break;
   }
+  llvm_unreachable("Unknown llvm::dxil::ResourceClass enum");
 }
 
 } // namespace
-- 
GitLab


From 7ebe5a1ec00ce463ef421cdb4a3d84500c09e77a Mon Sep 17 00:00:00 2001
From: LLVM GN Syncbot 
Date: Wed, 16 Oct 2024 16:16:54 +0000
Subject: [PATCH 151/329] [gn build] Port 87dd5dc8f03e
---
 .../gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn | 1 +
 1 file changed, 1 insertion(+)

diff --git a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
index 780a69f1f329..1287bdd2bb88 100644
--- a/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
+++ b/llvm/utils/gn/secondary/clang/unittests/Analysis/FlowSensitive/BUILD.gn
@@ -21,6 +21,7 @@ unittest("ClangAnalysisFlowSensitiveTests") {
     "ASTOpsTest.cpp",
    "ArenaTest.cpp",
    "CFGMatchSwitchTest.cpp",
+    "CachedConstAccessorsLatticeTest.cpp",
    "ChromiumCheckModelTest.cpp",
    "DataflowAnalysisContextTest.cpp",
    "DataflowEnvironmentTest.cpp",
-- 
GitLab


From 35e937b4de1890186347a382f7727ba86441dbda Mon Sep 17 00:00:00 2001
From: Brox Chen 
Date: Wed, 16 Oct 2024 12:26:01 -0400
Subject: [PATCH 152/329] [AMDGPU][True16][CodeGen] fp conversion in true/fake16
 format (#101678)

The true16 format of the fp conversion V_CVT_F_F/V_CVT_F_U instructions
was previously implemented using the fake16 profile.
With the MC support in place, correct and support these instructions in
true16/fake16 format in CodeGen
---
 llvm/lib/Target/AMDGPU/SIInstrInfo.cpp        |  27 +-
 llvm/lib/Target/AMDGPU/SIInstructions.td      |  86 +-
 llvm/lib/Target/AMDGPU/VOP1Instructions.td    |  21 +-
 .../inst-select-amdgcn.fcmp.constants.w32.mir | 127 ++-
 .../inst-select-amdgcn.fcmp.constants.w64.mir | 127 ++-
 .../GlobalISel/inst-select-fceil.s16.mir      |  54 +-
 .../GlobalISel/inst-select-ffloor.s16.mir     |  54 +-
 .../AMDGPU/GlobalISel/inst-select-fptosi.mir  | 152 ++-
 .../AMDGPU/GlobalISel/inst-select-fptoui.mir  | 152 ++-
 .../AMDGPU/GlobalISel/inst-select-sitofp.mir  |  51 +-
 .../AMDGPU/GlobalISel/inst-select-uitofp.mir  |  51 +-
 .../CodeGen/AMDGPU/GlobalISel/llvm.powi.ll    |  49 +-
 .../AMDGPU/fix-sgpr-copies-f16-fake16.mir     |  23 +
 .../AMDGPU/fix-sgpr-copies-f16-true16.mir     |  35 +-
 .../CodeGen/AMDGPU/fix-sgpr-copies-f16.mir    |  20 -
 llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll      |  61 +-
 llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll      |  68 +-
 llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll      |  61 +-
 llvm/test/CodeGen/AMDGPU/fpext.f16.ll         | 916 ++++++++++++------
 llvm/test/CodeGen/AMDGPU/fptosi.f16.ll        | 480 ++++++---
 llvm/test/CodeGen/AMDGPU/fptoui.f16.ll        | 477 ++++++---
 llvm/test/CodeGen/AMDGPU/sitofp.f16.ll        | 371 ++++---
 llvm/test/CodeGen/AMDGPU/uitofp.f16.ll        | 371 ++++---
 23 files changed, 2562 insertions(+), 1272 deletions(-)

diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
index d676d561d081..abd6c7cce53c 100644
--- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp
@@ -7366,14 +7366,25 @@ void SIInstrInfo::moveToVALUImpl(SIInstrWorklist &Worklist,
     const DebugLoc &DL = Inst.getDebugLoc();
     Register TmpReg = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
     Register NewDst = MRI.createVirtualRegister(&AMDGPU::VGPR_32RegClass);
-    BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
-        .addImm(16)
-        .add(Inst.getOperand(1));
-    BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
-        .addImm(0) // src0_modifiers
-        .addReg(TmpReg)
-        .addImm(0) // clamp
-        .addImm(0); // omod
+    if (ST.useRealTrue16Insts()) {
+      BuildMI(*MBB, Inst, DL, get(AMDGPU::COPY), TmpReg)
+          .add(Inst.getOperand(1));
+      BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+          .addImm(0) // src0_modifiers
+          .addReg(TmpReg, 0, AMDGPU::hi16)
+          .addImm(0) // clamp
+          .addImm(0) // omod
+          .addImm(0); // op_sel0
+    } else {
+      BuildMI(*MBB, Inst, DL, get(AMDGPU::V_LSHRREV_B32_e64), TmpReg)
+          .addImm(16)
+          .add(Inst.getOperand(1));
+      BuildMI(*MBB, Inst, DL, get(NewOpcode), NewDst)
+          .addImm(0) // src0_modifiers
+          .addReg(TmpReg)
+          .addImm(0) // clamp
+          .addImm(0); // omod
+    }
 
     MRI.replaceRegWith(Inst.getOperand(0).getReg(), NewDst);
     addUsersToMoveToVALUWorklist(NewDst, MRI, Worklist);
diff --git a/llvm/lib/Target/AMDGPU/SIInstructions.td b/llvm/lib/Target/AMDGPU/SIInstructions.td
index 8073aca7f197..faa0b6d6c3f5 100644
--- a/llvm/lib/Target/AMDGPU/SIInstructions.td
+++ b/llvm/lib/Target/AMDGPU/SIInstructions.td
@@ -1094,7 +1094,7 @@ def : Pat <
 // VOP1 Patterns
 //===----------------------------------------------------------------------===//
 
-multiclass f16_fp_Pats<Instruction cvt_f32_f16_inst_e64, Instruction cvt_f16_f32_inst_e64> {
+multiclass f16_to_fp_Pats<Instruction cvt_f32_f16_inst_e64, Instruction cvt_f16_f32_inst_e64> {
   // f16_to_fp patterns
   def : GCNPat <
     (f32 (any_f16_to_fp i32:$src0)),
@@ -1121,25 +1121,42 @@ multiclass f16_fp_Pats
   >;
 
+  // fp_to_fp16 patterns
   def : GCNPat <
-    (f64 (any_fpextend f16:$src)),
-    (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
+    (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+    (cvt_f16_f32_inst_e64
$src0_modifiers, f32:$src0)
   >;
 
-  // fp_to_fp16 patterns
+  // This is only used on targets without half support
+  // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
   def : GCNPat <
-    (i32 (AMDGPUfp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
+    (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
     (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
   >;
+}
+
+let True16Predicate = NotHasTrue16BitInsts in
+defm : f16_to_fp_Pats<V_CVT_F32_F16_e64, V_CVT_F16_F32_e64>;
+
+let True16Predicate = UseFakeTrue16Insts in
+defm : f16_to_fp_Pats<V_CVT_F32_F16_fake16_e64, V_CVT_F16_F32_fake16_e64>;
+
+multiclass f16_fp_Pats<Instruction cvt_f32_f16_inst_e64, Instruction cvt_f16_f32_inst_e64, RegisterOperand VSrc> {
+  def : GCNPat <
+    (f64 (any_fpextend f16:$src)),
+    (V_CVT_F64_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, $src))
+  >;
 
   def : GCNPat <
     (i32 (fp_to_sint f16:$src)),
-    (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
+    (V_CVT_I32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
   >;
 
   def : GCNPat <
     (i32 (fp_to_uint f16:$src)),
-    (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc_b32:$src))
+    (V_CVT_U32_F32_e32 (cvt_f32_f16_inst_e64 SRCMODS.NONE, VSrc:$src))
   >;
 
   def : GCNPat <
@@ -1151,20 +1168,16 @@ multiclass f16_fp_Pats
-
-  // This is only used on targets without half support
-  // TODO: Introduce strict variant of AMDGPUfp_to_f16 and share custom lowering
-  def : GCNPat <
-    (i32 (strict_fp_to_f16 (f32 (VOP3Mods f32:$src0, i32:$src0_modifiers)))),
-    (cvt_f16_f32_inst_e64 $src0_modifiers, f32:$src0)
-  >;
 }
 
 let True16Predicate = NotHasTrue16BitInsts in
-defm : f16_fp_Pats<V_CVT_F32_F16_e64, V_CVT_F16_F32_e64>;
+defm : f16_fp_Pats<V_CVT_F32_F16_e64, V_CVT_F16_F32_e64, VSrc_b32>;
+
+let True16Predicate = UseRealTrue16Insts in
+defm : f16_fp_Pats<V_CVT_F32_F16_t16_e64, V_CVT_F16_F32_t16_e64, VSrcT_b16>;
 
 let True16Predicate = UseFakeTrue16Insts in
-defm : f16_fp_Pats<V_CVT_F32_F16_fake16_e64, V_CVT_F16_F32_fake16_e64>;
+defm : f16_fp_Pats<V_CVT_F32_F16_fake16_e64, V_CVT_F16_F32_fake16_e64, VSrc_b16>;
 
 //===----------------------------------------------------------------------===//
 // VOP2 Patterns
@@ -2774,16 +2787,27 @@ def : GCNPat <
                  SSrc_i1:$src))
 >;
 
-let SubtargetPredicate = HasTrue16BitInsts in
+let True16Predicate = UseRealTrue16Insts in
 def : GCNPat <
   (f16 (sint_to_fp i1:$src)),
-  (V_CVT_F16_F32_fake16_e32 (
-      V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+  (V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0,
+      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                         /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
-                        SSrc_i1:$src))
+                        SSrc_i1:$src),
+      /*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0)
 >;
 
+let True16Predicate = UseFakeTrue16Insts in
+def : GCNPat <
+  (f16 (sint_to_fp i1:$src)),
+  (V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0,
+      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+                        /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_NEG_ONE),
+                        SSrc_i1:$src),
+      /*clamp*/ 0, /*omod*/ 0)
+>;
+
-let SubtargetPredicate = NotHasTrue16BitInsts in
+let True16Predicate = NotHasTrue16BitInsts in
 def : GCNPat <
   (f16 (uint_to_fp i1:$src)),
   (V_CVT_F16_F32_e32 (
       V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
                         /*src1mod*/(i32 0),
/*src1*/(i32 CONST.FP32_ONE),
                        SSrc_i1:$src))
 >;
+
+let True16Predicate = UseRealTrue16Insts in
+def : GCNPat <
+  (f16 (uint_to_fp i1:$src)),
+  (V_CVT_F16_F32_t16_e64 /*src0_modifiers*/ 0,
+      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+                        /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
+                        SSrc_i1:$src),
+      /*clamp*/ 0, /*omod*/ 0, /*op_sel*/ 0)
+>;
+
+let True16Predicate = UseFakeTrue16Insts in
+def : GCNPat <
+  (f16 (uint_to_fp i1:$src)),
+  (V_CVT_F16_F32_fake16_e64 /*src0_modifiers*/ 0,
+      (V_CNDMASK_B32_e64 /*src0mod*/(i32 0), /*src0*/(i32 0),
+                        /*src1mod*/(i32 0), /*src1*/(i32 CONST.FP32_ONE),
+                        SSrc_i1:$src),
+      /*clamp*/ 0, /*omod*/ 0)
 >;
 
 def : GCNPat <
diff --git a/llvm/lib/Target/AMDGPU/VOP1Instructions.td b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
index be98d201a64a..701aeda82c91 100644
--- a/llvm/lib/Target/AMDGPU/VOP1Instructions.td
+++ b/llvm/lib/Target/AMDGPU/VOP1Instructions.td
@@ -503,7 +503,7 @@ let FPDPRounding = 1 in {
   defm V_FRACT_F16 : VOP1Inst_t16 <"v_fract_f16", VOP_F16_F16, AMDGPUfract>;
 } // End FPDPRounding = 1
 
-let OtherPredicates = [Has16BitInsts, NotHasTrue16BitInsts] in {
+let OtherPredicates = [Has16BitInsts], True16Predicate = NotHasTrue16BitInsts in {
 def : GCNPat<
   (f32 (f16_to_fp i16:$src)),
   (V_CVT_F32_F16_e32 $src)
@@ -513,7 +513,7 @@ def : GCNPat<
   (V_CVT_F16_F32_e32 $src)
 >;
 }
-let OtherPredicates = [HasTrue16BitInsts] in {
+let True16Predicate = UseRealTrue16Insts in {
 def : GCNPat<
   (f32 (f16_to_fp i16:$src)),
   (V_CVT_F32_F16_t16_e32 $src)
@@ -523,6 +523,16 @@ def : GCNPat<
   (V_CVT_F16_F32_t16_e32 $src)
 >;
 }
+let True16Predicate = UseFakeTrue16Insts in {
+def : GCNPat<
+  (f32 (f16_to_fp i16:$src)),
+  (V_CVT_F32_F16_fake16_e32 $src)
+>;
+def : GCNPat<
+  (i16 (AMDGPUfp_to_f16 f32:$src)),
+  (V_CVT_F16_F32_fake16_e32 $src)
+>;
+}
 
 def VOP_SWAP_I32 : VOPProfile<[i32, i32, untyped, untyped]> {
   let Outs32 = (outs VGPR_32:$vdst, VRegSrc_32:$vdst1);
@@ -1417,15 +1427,14 @@ def : GCNPat <
 } // End OtherPredicates = [isGFX8Plus, p]
 
-let OtherPredicates = [UseFakeTrue16Insts] in {
+let True16Predicate = UseFakeTrue16Insts in {
 def : GCNPat<
   (i32 (DivergentUnaryFrag<anyext> i16:$src)),
   (COPY $src)
 >;
-} // End OtherPredicates = [UseFakeTrue16Insts]
-
+} // End True16Predicate = UseFakeTrue16Insts
 
-let OtherPredicates = [UseRealTrue16Insts] in {
+let True16Predicate = UseRealTrue16Insts in {
 def : GCNPat<
   (i32 (UniformUnaryFrag<anyext> (i16 SReg_32:$src))),
   (COPY $src)
diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
index 17cdab46c3b9..b5f91b6b8608 100644
--- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
+++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w32.mir
@@ -1,5 +1,6 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py
-# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s
 
 ---
 name: fcmp_false_f16
@@ -10,15 +11,27 @@ tracksRegLiveness: true
 body: |
   bb.0:
     liveins: $vgpr0, $vgpr1
-    ; CHECK-LABEL: name: fcmp_false_f16
-    ; CHECK: liveins: $vgpr0, $vgpr1
-    ; CHECK-NEXT: {{  $}}
-    ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0
-    ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1
-    ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec
-    ; CHECK-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0,
[[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcmp_false_f16 + ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcmp_false_f16 + ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_FPTRUNC %0 @@ -36,15 +49,27 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f16 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcmp_true_f16 + ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] + ; GFX11-TRUE16-NEXT: 
[[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 + ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_FPTRUNC %0 @@ -62,13 +87,13 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f32 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]] + ; GFX11-LABEL: name: fcmp_false_f32 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %4:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 0 @@ -84,13 +109,13 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f32 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]] + ; GFX11-LABEL: name: fcmp_true_f32 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %4:sgpr(s32) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 15 @@ -106,15 +131,15 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f64 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, 
implicit $exec - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]] + ; GFX11-LABEL: name: fcmp_false_f64 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s64) = G_FPEXT %0 @@ -132,15 +157,15 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f64 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]] + ; GFX11-LABEL: name: fcmp_true_f64 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_32 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s64) = G_FPEXT %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir index 158076a3b74a..a67a0b6455fa 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-amdgcn.fcmp.constants.w64.mir @@ -1,5 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64" -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64",+real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s 
+# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr="+wavefrontsize64",-real-true16 -run-pass=instruction-select -verify-machineinstrs -global-isel %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s --- name: fcmp_false_f16 @@ -10,15 +11,27 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f16 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcmp_false_f16 + ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] + ; GFX11-TRUE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcmp_false_f16 + ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CMP_F_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F16_t16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_FPTRUNC %0 @@ -36,15 +49,27 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f16 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = 
V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; GFX11-TRUE16-LABEL: name: fcmp_true_f16 + ; GFX11-TRUE16: liveins: $vgpr0, $vgpr1 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_]] + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_1:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY3:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_F32_t16_e64_1]] + ; GFX11-TRUE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[COPY2]], 0, [[COPY3]], 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] + ; + ; GFX11-FAKE16-LABEL: name: fcmp_true_f16 + ; GFX11-FAKE16: liveins: $vgpr0, $vgpr1 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_1:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CMP_TRU_F16_t16_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F16_t16_e64 0, [[V_CVT_F16_F32_fake16_e64_]], 0, [[V_CVT_F16_F32_fake16_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F16_t16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s16) = G_FPTRUNC %0 @@ -62,13 +87,13 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f32 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]] + ; GFX11-LABEL: name: fcmp_false_f32 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_F_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %4:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 0 @@ -84,13 +109,13 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f32 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]] + ; GFX11-LABEL: name: 
fcmp_true_f32 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CMP_TRU_F32_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F32_e64 0, [[COPY]], 0, [[COPY1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F32_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %4:sgpr(s64) = G_INTRINSIC_CONVERGENT intrinsic(@llvm.amdgcn.fcmp), %0, %1, 15 @@ -106,15 +131,15 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_false_f64 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]] + ; GFX11-LABEL: name: fcmp_false_f64 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CMP_F_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_F_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CMP_F_F64_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s64) = G_FPEXT %0 @@ -132,15 +157,15 @@ tracksRegLiveness: true body: | bb.0: liveins: $vgpr0, $vgpr1 - ; CHECK-LABEL: name: fcmp_true_f64 - ; CHECK: liveins: $vgpr0, $vgpr1 - ; CHECK-NEXT: {{ $}} - ; CHECK-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; CHECK-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec - ; CHECK-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec - ; CHECK-NEXT: S_ENDPGM 0, implicit [[V_CMP_TRU_F64_e64_]] + ; GFX11-LABEL: name: fcmp_true_f64 + ; GFX11: liveins: $vgpr0, $vgpr1 + ; GFX11-NEXT: {{ $}} + ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY $vgpr1 + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CVT_F64_F32_e64_1:%[0-9]+]]:vreg_64 = nofpexcept V_CVT_F64_F32_e64 0, [[COPY1]], 0, 0, implicit $mode, implicit $exec + ; GFX11-NEXT: [[V_CMP_TRU_F64_e64_:%[0-9]+]]:sreg_64 = V_CMP_TRU_F64_e64 0, [[V_CVT_F64_F32_e64_]], 0, [[V_CVT_F64_F32_e64_1]], 0, implicit $mode, implicit $exec + ; GFX11-NEXT: S_ENDPGM 0, implicit 
[[V_CMP_TRU_F64_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s32) = COPY $vgpr1 %2:vgpr(s64) = G_FPEXT %0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir index 0ff633fb4d8b..df2f390124eb 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fceil.s16.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX8 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-TRUE16 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s --- @@ -45,15 +45,15 @@ body: | ; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]] ; - ; GFX11-LABEL: name: fceil_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: fceil_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: fceil_s16_vv ; GFX11-FAKE16: liveins: $vgpr0 @@ -85,14 +85,14 @@ body: | ; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]] ; - ; GFX11-LABEL: name: fceil_s16_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: fceil_s16_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; 
GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: fceil_s16_vs ; GFX11-FAKE16: liveins: $sgpr0 @@ -124,15 +124,15 @@ body: | ; GFX8-NEXT: [[V_CEIL_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CEIL_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec ; GFX8-NEXT: $vgpr0 = COPY [[V_CEIL_F16_e64_]] ; - ; GFX11-LABEL: name: fceil_fneg_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 - ; GFX11-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: fceil_fneg_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CEIL_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CEIL_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CEIL_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: fceil_fneg_s16_vv ; GFX11-FAKE16: liveins: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir index fc8a6aaa1751..df62806b6191 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-ffloor.s16.mir @@ -1,6 +1,6 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefix=VI %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-TRUE16 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -global-isel-abort=0 -verify-machineinstrs -o - %s | FileCheck -check-prefixes=GCN,GFX11-FAKE16 %s --- @@ -54,15 +54,15 @@ body: | ; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]] ; - ; GFX11-LABEL: name: ffloor_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = 
COPY [[COPY]].lo16 - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: ffloor_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: ffloor_s16_vv ; GFX11-FAKE16: liveins: $vgpr0 @@ -94,14 +94,14 @@ body: | ; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]] ; - ; GFX11-LABEL: name: ffloor_s16_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: ffloor_s16_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: ffloor_s16_vs ; GFX11-FAKE16: liveins: $sgpr0 @@ -133,15 +133,15 @@ body: | ; VI-NEXT: [[V_FLOOR_F16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_FLOOR_F16_e64 1, [[COPY]], 0, 0, implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_FLOOR_F16_e64_]] ; - ; GFX11-LABEL: name: ffloor_fneg_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 - ; GFX11-NEXT: [[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF - ; GFX11-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 - ; GFX11-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; GFX11-TRUE16-LABEL: name: ffloor_fneg_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: 
[[V_FLOOR_F16_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_FLOOR_F16_t16_e64 1, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_FLOOR_F16_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] ; ; GFX11-FAKE16-LABEL: name: ffloor_fneg_s16_vv ; GFX11-FAKE16: liveins: $vgpr0 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir index 32a73bc4e24a..03cb907f82a1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptosi.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN # RUN: llc -mtriple=amdgcn -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=VI -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX11 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16 +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 --- name: fptosi_s32_to_s32_vv @@ -135,13 +136,22 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s32_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s32_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s32_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = 
G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -174,13 +184,21 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s32_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s32_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s32_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -217,15 +235,25 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s32_fneg_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 - ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s32_fneg_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: 
$vgpr0 = COPY [[V_CVT_I32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s32_fneg_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 + ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 @@ -259,13 +287,23 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s1_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s1_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_I32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s1_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -299,13 +337,22 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s1_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 
[[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s1_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_I32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY1]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s1_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOSI %1 @@ -343,15 +390,26 @@ body: | ; VI-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] ; - ; GFX11-LABEL: name: fptosi_s16_to_s1_fneg_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 - ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptosi_s16_to_s1_fneg_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_I32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]] + ; + ; GFX11-FAKE16-LABEL: name: fptosi_s16_to_s1_fneg_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 + ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GFX11-FAKE16-NEXT: 
[[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_I32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_I32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_I32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir index 47a091804ce0..521a0e8a2a79 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-fptoui.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GCN # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=fiji -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=VI -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefix=GFX11 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-TRUE16 +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs -o - %s | FileCheck %s -check-prefixes=GFX11,GFX11-FAKE16 --- @@ -85,13 +86,22 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s32_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s32_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s32_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 
%1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOUI %1 @@ -124,13 +134,21 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s32_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s32_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s32_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOUI %1 @@ -167,15 +185,25 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s32_fneg_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 - ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s32_fneg_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; 
GFX11-TRUE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s32_fneg_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 + ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_U32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 @@ -209,13 +237,23 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s1_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s1_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY1]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_U32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s1_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOUI %1 @@ -249,13 +287,22 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s1_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 
[[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s1_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[COPY]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_U32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY1]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s1_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[COPY]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] %0:sgpr(s32) = COPY $sgpr0 %1:sgpr(s16) = G_TRUNC %0 %2:vgpr(s32) = G_FPTOUI %1 @@ -293,15 +340,26 @@ body: | ; VI-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_e64_]], implicit $mode, implicit $exec ; VI-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] ; - ; GFX11-LABEL: name: fptoui_s16_to_s1_fneg_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 - ; GFX11-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec - ; GFX11-NEXT: [[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec - ; GFX11-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] + ; GFX11-TRUE16-LABEL: name: fptoui_s16_to_s1_fneg_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[COPY1:%[0-9]+]]:vgpr_16 = COPY [[COPY]].lo16 + ; GFX11-TRUE16-NEXT: [[V_XOR_B16_t16_e64_:%[0-9]+]]:vgpr_16 = V_XOR_B16_t16_e64 0, 32768, 0, [[COPY1]], 0, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_t16_e64 0, [[V_XOR_B16_t16_e64_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_t16_e64_]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[COPY2:%[0-9]+]]:vgpr_16 = COPY [[V_CVT_U32_F32_e32_]].lo16 + ; GFX11-TRUE16-NEXT: S_ENDPGM 0, implicit [[COPY2]] + ; + ; GFX11-FAKE16-LABEL: name: fptoui_s16_to_s1_fneg_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[S_MOV_B32_:%[0-9]+]]:sreg_32 = S_MOV_B32 32768 + ; GFX11-FAKE16-NEXT: [[V_XOR_B32_e64_:%[0-9]+]]:vgpr_32 = V_XOR_B32_e64 [[S_MOV_B32_]], [[COPY]], implicit $exec + ; GFX11-FAKE16-NEXT: 
[[V_CVT_F32_F16_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F32_F16_fake16_e64 0, [[V_XOR_B32_e64_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_U32_F32_e32_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_U32_F32_e32 [[V_CVT_F32_F16_fake16_e64_]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: S_ENDPGM 0, implicit [[V_CVT_U32_F32_e32_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_TRUNC %0 %2:vgpr(s16) = G_FNEG %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir index 938bb58bafc9..3888ce87b46f 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-sitofp.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s # RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s -# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX11 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +# RUN: llc -mtriple=amdgcn-amd-amdhsa -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s --- @@ -85,13 +86,23 @@ body: | ; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec ; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]] ; - ; GFX11-LABEL: name: sitofp_s32_to_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: sitofp_s32_to_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; GFX11-FAKE16-LABEL: name: sitofp_s32_to_s16_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, 
implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_SITOFP %0 %2:vgpr(s32) = G_ANYEXT %1 @@ -124,13 +135,23 @@ body: | ; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec ; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]] ; - ; GFX11-LABEL: name: sitofp_s32_to_s16_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: sitofp_s32_to_s16_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; GFX11-FAKE16-LABEL: name: sitofp_s32_to_s16_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_I32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_I32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_I32_e32_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s16) = G_SITOFP %0 %2:vgpr(s32) = G_ANYEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir index 9c6fded0d142..35d622dc57d1 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/inst-select-uitofp.mir @@ -1,7 +1,8 @@ # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py # RUN: llc -mtriple=amdgcn -mcpu=hawaii -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE64 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1010 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=WAVE32 %s -# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefix=GFX11 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +# RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=instruction-select -verify-machineinstrs %s -o - | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s --- name: uitofp_s32_to_s32_vv @@ -99,13 +100,23 @@ body: | ; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, 
implicit $mode, implicit $exec ; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]] ; - ; GFX11-LABEL: name: uitofp_s32_to_s16_vv - ; GFX11: liveins: $vgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 - ; GFX11-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: uitofp_s32_to_s16_vv + ; GFX11-TRUE16: liveins: $vgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; GFX11-FAKE16-LABEL: name: uitofp_s32_to_s16_vv + ; GFX11-FAKE16: liveins: $vgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY $vgpr0 + ; GFX11-FAKE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] %0:vgpr(s32) = COPY $vgpr0 %1:vgpr(s16) = G_UITOFP %0 %2:vgpr(s32) = G_ANYEXT %1 @@ -138,13 +149,23 @@ body: | ; WAVE32-NEXT: [[V_CVT_F16_F32_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec ; WAVE32-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_e64_]] ; - ; GFX11-LABEL: name: uitofp_s32_to_s16_vs - ; GFX11: liveins: $sgpr0 - ; GFX11-NEXT: {{ $}} - ; GFX11-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 - ; GFX11-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec - ; GFX11-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec - ; GFX11-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] + ; GFX11-TRUE16-LABEL: name: uitofp_s32_to_s16_vs + ; GFX11-TRUE16: liveins: $sgpr0 + ; GFX11-TRUE16-NEXT: {{ $}} + ; GFX11-TRUE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; GFX11-TRUE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[V_CVT_F16_F32_t16_e64_:%[0-9]+]]:vgpr_16 = nofpexcept V_CVT_F16_F32_t16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, 0, implicit $mode, implicit $exec + ; GFX11-TRUE16-NEXT: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF + ; GFX11-TRUE16-NEXT: [[REG_SEQUENCE:%[0-9]+]]:vgpr_32 = REG_SEQUENCE [[V_CVT_F16_F32_t16_e64_]], %subreg.lo16, [[DEF]], %subreg.hi16 + ; GFX11-TRUE16-NEXT: $vgpr0 = COPY [[REG_SEQUENCE]] + ; + ; GFX11-FAKE16-LABEL: name: uitofp_s32_to_s16_vs + ; GFX11-FAKE16: liveins: $sgpr0 + ; GFX11-FAKE16-NEXT: {{ $}} + ; GFX11-FAKE16-NEXT: [[COPY:%[0-9]+]]:sreg_32 = COPY $sgpr0 + ; 
GFX11-FAKE16-NEXT: [[V_CVT_F32_U32_e32_:%[0-9]+]]:vgpr_32 = V_CVT_F32_U32_e32 [[COPY]], implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: [[V_CVT_F16_F32_fake16_e64_:%[0-9]+]]:vgpr_32 = nofpexcept V_CVT_F16_F32_fake16_e64 0, [[V_CVT_F32_U32_e32_]], 0, 0, implicit $mode, implicit $exec + ; GFX11-FAKE16-NEXT: $vgpr0 = COPY [[V_CVT_F16_F32_fake16_e64_]] %0:sgpr(s32) = COPY $sgpr0 %1:vgpr(s16) = G_UITOFP %0 %2:vgpr(s32) = G_ANYEXT %1 diff --git a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll index 9d586e3e4a09..eeb7b138fde3 100644 --- a/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll +++ b/llvm/test/CodeGen/AMDGPU/GlobalISel/llvm.powi.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=hawaii < %s | FileCheck -check-prefixes=GFX78,GFX7 %s ; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=fiji < %s | FileCheck -check-prefixes=GFX78,GFX8 %s -; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=+real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-TRUE16 %s +; RUN: llc -global-isel -mtriple=amdgcn-mesa-mesa3d -mcpu=gfx1100 -mattr=-real-true16 < %s | FileCheck -check-prefixes=GFX11,GFX11-FAKE16 %s define i16 @v_powi_f16(i16 %l, i32 %r) { ; GFX7-LABEL: v_powi_f16: @@ -36,21 +37,37 @@ define i16 @v_powi_f16(i16 %l, i32 %r) { ; GFX8-NEXT: v_exp_f16_e32 v0, v0 ; GFX8-NEXT: s_setpc_b64 s[30:31] ; -; GFX11-LABEL: v_powi_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) -; GFX11-NEXT: v_log_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: s_waitcnt_depctr 0xfff -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: v_exp_f16_e32 v0, v0 -; GFX11-NEXT: s_setpc_b64 s[30:31] +; GFX11-TRUE16-LABEL: v_powi_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_log_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.h +; GFX11-TRUE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v1, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_exp_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_setpc_b64 s[30:31] +; +; GFX11-FAKE16-LABEL: v_powi_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_log_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(SKIP_3) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: s_waitcnt_depctr 0xfff +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: 
v_cvt_f32_f16_e32 v1, v1
+; GFX11-FAKE16-NEXT: v_mul_dx9_zero_f32_e32 v0, v0, v1
+; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1)
+; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0
+; GFX11-FAKE16-NEXT: v_exp_f16_e32 v0, v0
+; GFX11-FAKE16-NEXT: s_setpc_b64 s[30:31]
   %l.cast = bitcast i16 %l to half
   %res = call half @llvm.powi.f16.i32(half %l.cast, i32 %r)
   %res.cast = bitcast half %res to i16
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir
index 265bdd0cf2f4..30a24c675a76 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-fake16.mir
@@ -1,6 +1,29 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
+# V_CMP_LT_F16 will be replaced with fake16 when its true16/fake16 profile is corrected
+
+---
+name: cmp_f16
+body: |
+  bb.0.entry:
+    ; GCN-LABEL: name: cmp_f16
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F16_U16_fake16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_fake16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[V_CVT_F16_U16_fake16_e64_]], 0, [[DEF1]], 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
+    %0:vgpr_32 = IMPLICIT_DEF
+    %1:sreg_32 = IMPLICIT_DEF
+    %2:vgpr_32 = V_CVT_F16_U16_fake16_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec
+    %3:sreg_32 = COPY %2:vgpr_32
+    nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
+    %4:sreg_32_xm0_xexec = COPY $scc
+    %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
+...
+
+# Needs extra shift instruction to select hi 16 bits

 ---
 name: cvt_hi_f32_f16
 body: |
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
index 03a77dc2b8b5..4604518d71c9 100644
--- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
+++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16-true16.mir
@@ -1,20 +1,39 @@
 # NOTE: Assertions have been autogenerated by utils/update_mir_test_checks.py UTC_ARGS: --version 3
 # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN %s
-# XFAIL: *
-# FIXME-TRUE16. 
reenable after CVT_F16_U16_t16 is supported in CodeGen
+# Re-enabled now that CVT_F16_U16_t16 is supported in CodeGen
+
+---
+name: cmp_f16
+body: |
+  bb.0.entry:
+    ; GCN-LABEL: name: cmp_f16
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]]
+    ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, killed [[COPY]], 0, [[DEF1]], 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec
+    %0:vgpr_16 = IMPLICIT_DEF
+    %1:sreg_32 = IMPLICIT_DEF
+    %2:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
+    %3:sreg_32 = COPY %2:vgpr_16
+    nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode
+    %4:sreg_32_xm0_xexec = COPY $scc
+    %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec
+...

 ---
 name: cvt_hi_f32_f16
 body: |
   bb.0:
     ; GCN-LABEL: name: cvt_hi_f32_f16
-    ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF
-    ; GCN-NEXT: [[V_CVT_F16_U16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec
-    ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF
-    ; GCN-NEXT: [[V_LSHRREV_B32_e64_:%[0-9]+]]:vgpr_32 = V_LSHRREV_B32_e64 16, [[V_CVT_F16_U16_e64_]], implicit $exec
-    ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[V_LSHRREV_B32_e64_]], 0, 0, implicit $mode, implicit $exec
+    ; GCN: [[DEF:%[0-9]+]]:vgpr_16 = IMPLICIT_DEF
+    ; GCN-NEXT: [[V_CVT_F16_U16_t16_e64_:%[0-9]+]]:vgpr_16 = V_CVT_F16_U16_t16_e64 0, [[DEF]], 0, 0, 0, implicit $mode, implicit $exec
+    ; GCN-NEXT: [[COPY:%[0-9]+]]:vgpr_32 = COPY [[V_CVT_F16_U16_t16_e64_]]
+    ; GCN-NEXT: [[COPY1:%[0-9]+]]:vgpr_32 = COPY [[COPY]]
+    ; GCN-NEXT: [[V_CVT_F32_F16_t16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F32_F16_t16_e64 0, [[COPY1]].hi16, 0, 0, 0, implicit $mode, implicit $exec
     %0:vgpr_16 = IMPLICIT_DEF
-    %1:vgpr_16 = V_CVT_F16_U16_t16_e64 %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
+    %1:vgpr_16 = V_CVT_F16_U16_t16_e64 0, %0:vgpr_16, 0, 0, 0, implicit $mode, implicit $exec
     %2:sreg_32 = COPY %1:vgpr_16
     %3:sreg_32 = S_CVT_HI_F32_F16 %2:sreg_32, implicit $mode
 ...
diff --git a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir index 9a727a321d78..e8291f7ab8f7 100644 --- a/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir +++ b/llvm/test/CodeGen/AMDGPU/fix-sgpr-copies-f16.mir @@ -2,26 +2,6 @@ # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,REAL16 %s # RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -run-pass=si-fix-sgpr-copies -verify-machineinstrs -o - %s | FileCheck --check-prefixes=GCN,FAKE16 %s ---- -name: cmp_f16 -body: | - bb.0.entry: - ; GCN-LABEL: name: cmp_f16 - ; GCN: [[DEF:%[0-9]+]]:vgpr_32 = IMPLICIT_DEF - ; GCN-NEXT: [[DEF1:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_CVT_F16_U16_e64_:%[0-9]+]]:vgpr_32 = V_CVT_F16_U16_e64 [[DEF]], 0, 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[DEF2:%[0-9]+]]:sreg_32 = IMPLICIT_DEF - ; GCN-NEXT: [[V_CMP_LT_F16_t16_e64_:%[0-9]+]]:sreg_32_xm0_xexec = nofpexcept V_CMP_LT_F16_t16_e64 0, [[V_CVT_F16_U16_e64_]], 0, [[DEF1]], 0, implicit $mode, implicit $exec - ; GCN-NEXT: [[V_CNDMASK_B32_e64_:%[0-9]+]]:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed [[V_CMP_LT_F16_t16_e64_]], implicit $exec - %0:vgpr_32 = IMPLICIT_DEF - %1:sreg_32 = IMPLICIT_DEF - %2:vgpr_32 = V_CVT_F16_U16_e64 %0:vgpr_32, 0, 0, implicit $mode, implicit $exec - %3:sreg_32 = COPY %2:vgpr_32 - nofpexcept S_CMP_LT_F16 killed %3:sreg_32, %1:sreg_32, implicit-def $scc, implicit $mode - %4:sreg_32_xm0_xexec = COPY $scc - %5:vgpr_32 = V_CNDMASK_B32_e64 0, 0, 0, -1, killed %4, implicit $exec -... - --- name: fmac_f16 body: | diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll index 81859dce0488..064e88873a17 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp32.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s ; RUN: llc -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=CYPRESS %s ; RUN: llc -mtriple=r600 -mcpu=cayman -verify-machineinstrs < %s | FileCheck -check-prefixes=CAYMAN %s @@ -44,25 +45,45 @@ define amdgpu_kernel void @test_convert_fp16_to_fp32(ptr addrspace(1) noalias %o ; GFX8-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; -; GFX11-LABEL: test_convert_fp16_to_fp32: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_convert_fp16_to_fp32: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp32: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ; ; CYPRESS-LABEL: test_convert_fp16_to_fp32: ; CYPRESS: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll index c17be87834ae..6c9f451167b7 100644 --- a/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll +++ b/llvm/test/CodeGen/AMDGPU/fp16_to_fp64.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s ; RUN: llc -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s + declare double @llvm.convert.from.fp16.f64(i16) nounwind readnone @@ -44,27 +46,49 @@ define amdgpu_kernel void @test_convert_fp16_to_fp64(ptr addrspace(1) noalias %o ; GFX8-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; -; GFX11-LABEL: test_convert_fp16_to_fp64: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: 
buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_convert_fp16_to_fp64: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_convert_fp16_to_fp64: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm %val = load i16, ptr addrspace(1) %in, align 2 %cvt = call double @llvm.convert.from.fp16.f64(i16 %val) nounwind readnone store double %cvt, ptr addrspace(1) %out, align 4 diff --git a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll index d8a726f251a0..5bac71007047 100644 --- a/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll +++ b/llvm/test/CodeGen/AMDGPU/fp32_to_fp16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX6 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tonga -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX8 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global 
-verify-machineinstrs < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=r600 -mcpu=cypress -verify-machineinstrs < %s | FileCheck -check-prefixes=CYPRESS %s declare i16 @llvm.convert.to.fp16.f32(float) nounwind readnone @@ -43,25 +44,45 @@ define amdgpu_kernel void @test_convert_fp32_to_fp16(ptr addrspace(1) noalias %o ; GFX8-NEXT: buffer_store_short v0, off, s[4:7], 0 ; GFX8-NEXT: s_endpgm ; -; GFX11-LABEL: test_convert_fp32_to_fp16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: test_convert_fp32_to_fp16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: test_convert_fp32_to_fp16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ; ; CYPRESS-LABEL: test_convert_fp32_to_fp16: ; CYPRESS: ; %bb.0: diff --git a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll index 75f4dff14fcb..a40d678e84d7 100644 --- a/llvm/test/CodeGen/AMDGPU/fpext.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fpext.f16.ll @@ -2,7 +2,8 @@ ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,VI %s ; RUN: llc 
-amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx900 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX89,GFX9 %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -allow-deprecated-dag-overlap -enable-var-scope --check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @fpext_f16_to_f32( ; SI-LABEL: fpext_f16_to_f32: @@ -59,25 +60,45 @@ define amdgpu_kernel void @fpext_f16_to_f32( ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr 
addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -145,27 +166,49 @@ define amdgpu_kernel void @fpext_f16_to_f64( ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fpext_f16_to_f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fpext_f16_to_f64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fpext_f16_to_f64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -234,28 +277,51 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f32( ; GFX9-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fpext_v2f16_to_v2f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; 
GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) #0 { entry: @@ -330,31 +396,57 @@ define amdgpu_kernel void @fpext_v2f16_to_v2f64( ; GFX9-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fpext_v2f16_to_v2f64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX11-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fpext_v2f16_to_v2f64: +; 
GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l +; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fpext_v2f16_to_v2f64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[0:1], v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f64_f32_e32 v[2:3], v2 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -387,19 +479,35 @@ define amdgpu_kernel void @s_fneg_fpext_f16_to_f32(ptr addrspace(1) %r, i32 %a) ; GFX89-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX89-NEXT: s_endpgm ; -; GFX11-LABEL: s_fneg_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, s4 -; GFX11-NEXT: buffer_store_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: s_fneg_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: 
buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_fneg_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, s4 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm entry: %a.trunc = trunc i32 %a to i16 %a.val = bitcast i16 %a.trunc to half @@ -463,25 +571,45 @@ define amdgpu_kernel void @fneg_fpext_f16_to_f32( ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fneg_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fneg_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fneg_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, -v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -547,25 +675,45 @@ define amdgpu_kernel void @fabs_fpext_f16_to_f32( ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_fpext_f16_to_f32: -; GFX11: 
; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v0, |v0| -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l| +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, |v0| +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -631,25 +779,45 @@ define amdgpu_kernel void @fneg_fabs_fpext_f16_to_f32( ; GFX9-NEXT: buffer_store_dword v0, off, s[0:3], 0 ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fneg_fabs_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v0, -|v0| -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fneg_fabs_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt 
lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l| +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fneg_fabs_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0| +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -730,29 +898,55 @@ define amdgpu_kernel void @fneg_multi_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fneg_multi_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; GFX11-NEXT: v_xor_b32_e32 v0, 0x8000, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fneg_multi_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: v_xor_b32_e32 v1, 0x8000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -v0.l +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: 
s_endpgm +; +; GFX11-FAKE16-LABEL: fneg_multi_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX11-FAKE16-NEXT: v_xor_b32_e32 v0, 0x8000, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -833,29 +1027,55 @@ define amdgpu_kernel void @fneg_multi_foldable_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -v0 -; GFX11-NEXT: v_mul_f16_e64 v0, -v0, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, -v0.l, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 
0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -v0 +; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, -v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -935,29 +1155,55 @@ define amdgpu_kernel void @fabs_multi_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_multi_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; GFX11-NEXT: v_and_b32_e32 v0, 0x7fff, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_multi_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0x7fff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, |v0.l| +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_multi_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: 
buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0x7fff, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -1038,29 +1284,55 @@ define amdgpu_kernel void @fabs_multi_foldable_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, |v0| -; GFX11-NEXT: v_mul_f16_e64 v0, |v0|, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, |v0.l|, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, |v0.l| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_multi_foldable_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, |v0| +; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, |v0|, v0 +; 
GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -1140,29 +1412,55 @@ define amdgpu_kernel void @fabs_fneg_multi_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0| -; GFX11-NEXT: v_or_b32_e32 v0, 0x8000, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v1, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v1.l +; GFX11-TRUE16-NEXT: v_or_b32_e32 v1, 0x8000, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v0, -|v0.l| +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_fneg_multi_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX11-FAKE16-NEXT: v_or_b32_e32 v0, 0x8000, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 
+; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -1244,29 +1542,55 @@ define amdgpu_kernel void @fabs_fneg_multi_foldable_use_fpext_f16_to_f32( ; GFX9-NEXT: s_waitcnt vmcnt(0) ; GFX9-NEXT: s_endpgm ; -; GFX11-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e64 v1, -|v0| -; GFX11-NEXT: v_mul_f16_e64 v0, -|v0|, v0 -; GFX11-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc -; GFX11-NEXT: s_waitcnt_vscnt null, 0x0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_mul_f16_e64 v0.h, -|v0.l|, v0.l +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0.l| +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-TRUE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fabs_fneg_multi_foldable_use_fpext_f16_to_f32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e64 v1, -|v0| +; GFX11-FAKE16-NEXT: v_mul_f16_e64 v0, -|v0|, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v1, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 dlc +; GFX11-FAKE16-NEXT: s_waitcnt_vscnt null, 0x0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: diff --git a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll 
b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll index 0e12cca1900c..327f2653c474 100644 --- a/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptosi.f16.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s + define amdgpu_kernel void @fptosi_f16_to_i16( ; SI-LABEL: fptosi_f16_to_i16: @@ -41,25 +43,45 @@ define amdgpu_kernel void @fptosi_f16_to_i16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_f16_to_i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_f16_to_i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_f16_to_i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: 
s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -108,27 +130,49 @@ define amdgpu_kernel void @fptosi_f16_to_i32( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_f16_to_i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_f16_to_i32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_f16_to_i32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -182,28 +226,51 @@ define amdgpu_kernel void @fptosi_f16_to_i64( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_f16_to_i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: 
s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_f16_to_i64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_f16_to_i64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -259,31 +326,60 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_v2f16_to_v2i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_i16_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_i16_f16_e32 v1, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; 
GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_i16_f16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_i16_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -337,31 +433,57 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i32( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_v2f16_to_v2i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: 
v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cvt_i32_f32_e32 v1, v1 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -422,34 +544,63 @@ define amdgpu_kernel void @fptosi_v2f16_to_v2i64( ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptosi_v2f16_to_v2i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 
v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: v_cvt_i32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_i32_f32_e32 v2, v1 -; GFX11-NEXT: v_ashrrev_i32_e32 v1, 31, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_ashrrev_i32_e32 v3, 31, v2 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_v2f16_to_v2i64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_i32_f32_e32 v2, v1 +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_v2f16_to_v2i64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_i32_f32_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v1, 31, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_ashrrev_i32_e32 v3, 31, v2 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -485,21 +636,38 @@ define amdgpu_kernel void @fptosi_f16_to_i1(ptr addrspace(1) %out, half %in) { ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; -; 
GFX11-LABEL: fptosi_f16_to_i1: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptosi_f16_to_i1: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, -1.0, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptosi_f16_to_i1: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e64 s2, -1.0, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm entry: %conv = fptosi half %in to i1 store i1 %conv, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll index abc5c7af13b0..ba540f4948b5 100644 --- a/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/fptoui.f16.ll @@ -1,7 +1,9 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=+real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-real-true16 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s + define amdgpu_kernel void @fptoui_f16_to_i16( ; SI-LABEL: fptoui_f16_to_i16: @@ -41,25 +43,45 @@ define amdgpu_kernel void @fptoui_f16_to_i16( ; VI-NEXT: buffer_store_short v0, off, 
s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_f16_to_i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_f16_to_i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_f16_to_i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_u16_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -108,27 +130,49 @@ define amdgpu_kernel void @fptoui_f16_to_i32( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_f16_to_i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_f16_to_i32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_f16_to_i32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -182,28 +226,51 @@ define amdgpu_kernel void @fptoui_f16_to_i64( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_f16_to_i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_f16_to_i64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_f16_to_i64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -258,31 +325,60 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_v2f16_to_v2i16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_u16_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_u16_f16_e32 v1, v1 -; GFX11-NEXT: v_and_b32_e32 v0, 0xffff, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_lshl_or_b32 v0, v1, 16, v0 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_u16_f16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_and_b32_e32 v1, 0xffff, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_lshl_or_b32 v0, v0, 16, v1 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_u16_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_u16_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_lshl_or_b32 v0, v1, 16, v0 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -336,31 +432,57 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i32( ; VI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_v2f16_to_v2i32: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f32_f16_e32 v1, v1 -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) -; GFX11-NEXT: v_cvt_u32_f32_e32 v1, v1 -; GFX11-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i32: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v1, v1.l +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX11-TRUE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg 
sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i32: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: buffer_store_b64 v[0:1], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -421,33 +543,61 @@ define amdgpu_kernel void @fptoui_v2f16_to_v2i64( ; VI-NEXT: buffer_store_dwordx4 v[0:3], off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_v2f16_to_v2i64: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f32_f16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cvt_f32_f16_e32 v2, v1 -; GFX11-NEXT: v_mov_b32_e32 v1, 0 -; GFX11-NEXT: v_cvt_u32_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) -; GFX11-NEXT: v_cvt_u32_f32_e32 v2, v2 -; GFX11-NEXT: v_mov_b32_e32 v3, v1 -; GFX11-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_v2f16_to_v2i64: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v0, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cvt_f32_f16_e32 v2, v1.l +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | 
instid1(VALU_DEP_3) +; GFX11-TRUE16-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX11-TRUE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-TRUE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_v2f16_to_v2i64: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(SKIP_1) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cvt_f32_f16_e32 v2, v1 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v1, 0 +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_3) | instskip(NEXT) | instid1(VALU_DEP_3) +; GFX11-FAKE16-NEXT: v_cvt_u32_f32_e32 v2, v2 +; GFX11-FAKE16-NEXT: v_mov_b32_e32 v3, v1 +; GFX11-FAKE16-NEXT: buffer_store_b128 v[0:3], off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -484,21 +634,38 @@ define amdgpu_kernel void @fptoui_f16_to_i1(ptr addrspace(1) %out, half %in) { ; VI-NEXT: buffer_store_byte v0, off, s[0:3], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: fptoui_f16_to_i1: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b32 s4, s[2:3], 0x2c -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s3, 0x31016000 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s4 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1, s2 -; GFX11-NEXT: s_mov_b32 s2, -1 -; GFX11-NEXT: buffer_store_b8 v0, off, s[0:3], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: fptoui_f16_to_i1: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, -1 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, s4 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cmp_eq_f16_e32 vcc_lo, 1.0, v0 +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1, vcc_lo +; GFX11-TRUE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: fptoui_f16_to_i1: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b32 s4, s[2:3], 0x2c +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, 0x31016000 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_eq_f16_e64 s2, 1.0, s4 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: 
v_cndmask_b32_e64 v0, 0, 1, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, -1 +; GFX11-FAKE16-NEXT: buffer_store_b8 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm entry: %conv = fptoui half %in to i1 store i1 %conv, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll index b08a35ab8073..9169433cdca5 100644 --- a/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/sitofp.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @sitofp_i16_to_f16( ; SI-LABEL: sitofp_i16_to_f16: @@ -41,25 +42,45 @@ define amdgpu_kernel void @sitofp_i16_to_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sitofp_i16_to_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sitofp_i16_to_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sitofp_i16_to_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 
0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -108,27 +129,49 @@ define amdgpu_kernel void @sitofp_i32_to_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sitofp_i32_to_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sitofp_i32_to_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sitofp_i32_to_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -186,29 +229,56 @@ define amdgpu_kernel void @sitofp_v2i16_to_v2f16( ; 
VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sitofp_v2i16_to_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f16_i16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_i16_e32 v1, v1 -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sitofp_v2i16_to_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_i16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sitofp_v2i16_to_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_i16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -264,31 +334,60 @@ define amdgpu_kernel void @sitofp_v2i32_to_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: sitofp_v2i32_to_v2f16: -; GFX11: ; %bb.0: ; 
%entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_i32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_i32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: sitofp_v2i32_to_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: sitofp_v2i32_to_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_i32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -353,37 +452,69 @@ define amdgpu_kernel void 
@s_sint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: s_sint_to_fp_i1_to_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: s_sint_to_fp_i1_to_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s5 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_sint_to_fp_i1_to_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s7 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, 
s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s5 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, -1.0, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 %acmp = fcmp oge float %a, 0.000000e+00 diff --git a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll index c21ae434f447..c4268c15d9db 100644 --- a/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll +++ b/llvm/test/CodeGen/AMDGPU/uitofp.f16.ll @@ -1,7 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=SI %s ; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=fiji -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=VI %s -; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,+real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-TRUE16 %s +; RUN: llc -amdgpu-scalarize-global-loads=false -mtriple=amdgcn -mcpu=gfx1100 -mattr=-flat-for-global,-real-true16 -verify-machineinstrs -enable-unsafe-fp-math < %s | FileCheck -check-prefixes=GFX11-FAKE16 %s define amdgpu_kernel void @uitofp_i16_to_f16( ; SI-LABEL: uitofp_i16_to_f16: @@ -41,25 +42,45 @@ define amdgpu_kernel void @uitofp_i16_to_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: uitofp_i16_to_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_u16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: uitofp_i16_to_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; 
GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: uitofp_i16_to_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_u16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -108,27 +129,49 @@ define amdgpu_kernel void @uitofp_i32_to_f16( ; VI-NEXT: buffer_store_short v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: uitofp_i32_to_f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: uitofp_i32_to_f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: uitofp_i32_to_f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; 
GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -186,29 +229,56 @@ define amdgpu_kernel void @uitofp_v2i16_to_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: uitofp_v2i16_to_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b32 v0, off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_lshrrev_b32_e32 v1, 16, v0 -; GFX11-NEXT: v_cvt_f16_u16_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_u16_e32 v1, v1 -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: uitofp_v2i16_to_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_u16_e32 v0.h, v1.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: uitofp_v2i16_to_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_lshrrev_b32_e32 v1, 16, v0 +; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_1) 
+; GFX11-FAKE16-NEXT: v_cvt_f16_u16_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -264,31 +334,60 @@ define amdgpu_kernel void @uitofp_v2i32_to_v2f16( ; VI-NEXT: buffer_store_dword v0, off, s[4:7], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: uitofp_v2i32_to_v2f16: -; GFX11: ; %bb.0: ; %entry -; GFX11-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 -; GFX11-NEXT: s_mov_b32 s6, -1 -; GFX11-NEXT: s_mov_b32 s7, 0x31016000 -; GFX11-NEXT: s_mov_b32 s10, s6 -; GFX11-NEXT: s_mov_b32 s11, s7 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s8, s2 -; GFX11-NEXT: s_mov_b32 s9, s3 -; GFX11-NEXT: s_mov_b32 s4, s0 -; GFX11-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 -; GFX11-NEXT: s_mov_b32 s5, s1 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cvt_f32_u32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f32_u32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) -; GFX11-NEXT: v_cvt_f16_f32_e32 v1, v1 -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_pack_b32_f16 v0, v0, v1 -; GFX11-NEXT: buffer_store_b32 v0, off, s[4:7], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: uitofp_v2i32_to_v2f16: +; GFX11-TRUE16: ; %bb.0: ; %entry +; GFX11-TRUE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_mov_b32 s6, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, s7 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s2 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s3 +; GFX11-TRUE16-NEXT: s_mov_b32 s4, s0 +; GFX11-TRUE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s5, s1 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-TRUE16-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.h, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v1.l, v0.l +; GFX11-TRUE16-NEXT: v_mov_b16_e32 v0.l, v0.h +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_pack_b32_f16 v0, v1, v0 +; GFX11-TRUE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: uitofp_v2i32_to_v2f16: +; GFX11-FAKE16: ; %bb.0: ; %entry +; GFX11-FAKE16-NEXT: s_load_b128 s[0:3], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_mov_b32 s6, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s7, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, s7 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s2 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s3 +; GFX11-FAKE16-NEXT: s_mov_b32 s4, s0 +; GFX11-FAKE16-NEXT: buffer_load_b64 v[0:1], off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s5, s1 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_f32_u32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu 
instid0(VALU_DEP_2) | instskip(NEXT) | instid1(VALU_DEP_2) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v1, v1 +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_pack_b32_f16 v0, v0, v1 +; GFX11-FAKE16-NEXT: buffer_store_b32 v0, off, s[4:7], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm ptr addrspace(1) %r, ptr addrspace(1) %a) { entry: @@ -353,37 +452,69 @@ define amdgpu_kernel void @s_uint_to_fp_i1_to_f16(ptr addrspace(1) %out, ptr add ; VI-NEXT: buffer_store_short v0, off, s[8:11], 0 ; VI-NEXT: s_endpgm ; -; GFX11-LABEL: s_uint_to_fp_i1_to_f16: -; GFX11: ; %bb.0: -; GFX11-NEXT: s_clause 0x1 -; GFX11-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 -; GFX11-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 -; GFX11-NEXT: s_mov_b32 s10, -1 -; GFX11-NEXT: s_mov_b32 s11, 0x31016000 -; GFX11-NEXT: s_mov_b32 s2, s10 -; GFX11-NEXT: s_mov_b32 s3, s11 -; GFX11-NEXT: s_mov_b32 s14, s10 -; GFX11-NEXT: s_mov_b32 s15, s11 -; GFX11-NEXT: s_waitcnt lgkmcnt(0) -; GFX11-NEXT: s_mov_b32 s12, s6 -; GFX11-NEXT: s_mov_b32 s13, s7 -; GFX11-NEXT: buffer_load_b32 v0, off, s[0:3], 0 -; GFX11-NEXT: buffer_load_b32 v1, off, s[12:15], 0 -; GFX11-NEXT: s_mov_b32 s8, s4 -; GFX11-NEXT: s_mov_b32 s9, s5 -; GFX11-NEXT: s_waitcnt vmcnt(1) -; GFX11-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 -; GFX11-NEXT: s_waitcnt vmcnt(0) -; GFX11-NEXT: v_cmp_le_f32_e64 s0, 0, v1 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) -; GFX11-NEXT: s_xor_b32 s0, s0, vcc_lo -; GFX11-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 -; GFX11-NEXT: s_delay_alu instid0(VALU_DEP_1) -; GFX11-NEXT: v_cvt_f16_f32_e32 v0, v0 -; GFX11-NEXT: buffer_store_b16 v0, off, s[8:11], 0 -; GFX11-NEXT: s_nop 0 -; GFX11-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) -; GFX11-NEXT: s_endpgm +; GFX11-TRUE16-LABEL: s_uint_to_fp_i1_to_f16: +; GFX11-TRUE16: ; %bb.0: +; GFX11-TRUE16-NEXT: s_clause 0x1 +; GFX11-TRUE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-TRUE16-NEXT: s_load_b64 s[0:1], s[2:3], 0x34 +; GFX11-TRUE16-NEXT: s_mov_b32 s10, -1 +; GFX11-TRUE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-TRUE16-NEXT: s_mov_b32 s2, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s3, s11 +; GFX11-TRUE16-NEXT: s_mov_b32 s14, s10 +; GFX11-TRUE16-NEXT: s_mov_b32 s15, s11 +; GFX11-TRUE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-TRUE16-NEXT: s_mov_b32 s12, s6 +; GFX11-TRUE16-NEXT: s_mov_b32 s13, s7 +; GFX11-TRUE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-TRUE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-TRUE16-NEXT: s_mov_b32 s8, s4 +; GFX11-TRUE16-NEXT: s_mov_b32 s9, s5 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-TRUE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-TRUE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-TRUE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-TRUE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX11-TRUE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-TRUE16-NEXT: v_cvt_f16_f32_e32 v0.l, v0 +; GFX11-TRUE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-TRUE16-NEXT: s_nop 0 +; GFX11-TRUE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-TRUE16-NEXT: s_endpgm +; +; GFX11-FAKE16-LABEL: s_uint_to_fp_i1_to_f16: +; GFX11-FAKE16: ; %bb.0: +; GFX11-FAKE16-NEXT: s_clause 0x1 +; GFX11-FAKE16-NEXT: s_load_b128 s[4:7], s[2:3], 0x24 +; GFX11-FAKE16-NEXT: s_load_b64 s[0:1], s[2:3], 
0x34 +; GFX11-FAKE16-NEXT: s_mov_b32 s10, -1 +; GFX11-FAKE16-NEXT: s_mov_b32 s11, 0x31016000 +; GFX11-FAKE16-NEXT: s_mov_b32 s2, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s3, s11 +; GFX11-FAKE16-NEXT: s_mov_b32 s14, s10 +; GFX11-FAKE16-NEXT: s_mov_b32 s15, s11 +; GFX11-FAKE16-NEXT: s_waitcnt lgkmcnt(0) +; GFX11-FAKE16-NEXT: s_mov_b32 s12, s6 +; GFX11-FAKE16-NEXT: s_mov_b32 s13, s7 +; GFX11-FAKE16-NEXT: buffer_load_b32 v0, off, s[0:3], 0 +; GFX11-FAKE16-NEXT: buffer_load_b32 v1, off, s[12:15], 0 +; GFX11-FAKE16-NEXT: s_mov_b32 s8, s4 +; GFX11-FAKE16-NEXT: s_mov_b32 s9, s5 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(1) +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e32 vcc_lo, 1.0, v0 +; GFX11-FAKE16-NEXT: s_waitcnt vmcnt(0) +; GFX11-FAKE16-NEXT: v_cmp_le_f32_e64 s0, 0, v1 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) | instskip(NEXT) | instid1(SALU_CYCLE_1) +; GFX11-FAKE16-NEXT: s_xor_b32 s0, s0, vcc_lo +; GFX11-FAKE16-NEXT: v_cndmask_b32_e64 v0, 0, 1.0, s0 +; GFX11-FAKE16-NEXT: s_delay_alu instid0(VALU_DEP_1) +; GFX11-FAKE16-NEXT: v_cvt_f16_f32_e32 v0, v0 +; GFX11-FAKE16-NEXT: buffer_store_b16 v0, off, s[8:11], 0 +; GFX11-FAKE16-NEXT: s_nop 0 +; GFX11-FAKE16-NEXT: s_sendmsg sendmsg(MSG_DEALLOC_VGPRS) +; GFX11-FAKE16-NEXT: s_endpgm %a = load float, ptr addrspace(1) %in0 %b = load float, ptr addrspace(1) %in1 %acmp = fcmp oge float %a, 0.000000e+00 -- GitLab From aad3a1630e385a6834f92a5f1d6045451ba21e4e Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 17 Oct 2024 00:27:21 +0800 Subject: [PATCH 153/329] [ValueTracking] Respect `samesign` flag in `isKnownInversion` (#112390) In https://github.com/llvm/llvm-project/pull/93591 we introduced `isKnownInversion`, which assumed that `X` being poison implies `Y` is poison because they share common operands. After introducing `samesign`, this assumption no longer holds if `X` is an icmp with the `samesign` flag. Alive2 link: https://alive2.llvm.org/ce/z/rj3EwQ (please run it locally with this patch and https://github.com/AliveToolkit/alive2/pull/1098). This is the most conservative way to address the problem: if `X` has the `samesign` flag, we check that `Y` also has the flag and that the constant RHS operands have the same sign. Fixes https://github.com/llvm/llvm-project/issues/112350. --- llvm/lib/Analysis/ValueTracking.cpp | 9 ++ .../test/Transforms/InstCombine/select-cmp.ll | 89 +++++++++++++++++++ 2 files changed, 98 insertions(+) diff --git a/llvm/lib/Analysis/ValueTracking.cpp b/llvm/lib/Analysis/ValueTracking.cpp index eb8d17044a17..e9ed8b3c862b 100644 --- a/llvm/lib/Analysis/ValueTracking.cpp +++ b/llvm/lib/Analysis/ValueTracking.cpp @@ -8522,6 +8522,10 @@ bool llvm::isKnownInversion(const Value *X, const Value *Y) { !match(Y, m_c_ICmp(Pred2, m_Specific(A), m_Value(C)))) return false; + // They must both have samesign flag or not. + if (cast<ICmpInst>(X)->hasSameSign() != cast<ICmpInst>(Y)->hasSameSign()) + return false; + if (B == C) return Pred1 == ICmpInst::getInversePredicate(Pred2); @@ -8530,6 +8534,11 @@ if (!match(B, m_APInt(RHSC1)) || !match(C, m_APInt(RHSC2))) return false; + // Sign bits of two RHSCs should match. 
+ if (cast<ICmpInst>(X)->hasSameSign() && + RHSC1->isNonNegative() != RHSC2->isNonNegative()) + return false; + const auto CR1 = ConstantRange::makeExactICmpRegion(Pred1, *RHSC1); const auto CR2 = ConstantRange::makeExactICmpRegion(Pred2, *RHSC2); diff --git a/llvm/test/Transforms/InstCombine/select-cmp.ll b/llvm/test/Transforms/InstCombine/select-cmp.ll index 234815949d77..f7505bd85f89 100644 --- a/llvm/test/Transforms/InstCombine/select-cmp.ll +++ b/llvm/test/Transforms/InstCombine/select-cmp.ll @@ -480,6 +480,95 @@ define i1 @test_select_inverse_nonconst4(i64 %x, i64 %y, i64 %z, i1 %cond) { ret i1 %sel } +define i1 @test_select_inverse_samesign_true_arm(i64 %x, i64 %y, i1 %cond) { +; CHECK-LABEL: @test_select_inverse_samesign_true_arm( +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ult i64 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp uge i64 [[X]], [[Y]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]] +; CHECK-NEXT: ret i1 [[SEL]] +; + %cmp1 = icmp samesign ult i64 %x, %y + %cmp2 = icmp uge i64 %x, %y + %sel = select i1 %cond, i1 %cmp1, i1 %cmp2 + ret i1 %sel +} + +define i1 @test_select_inverse_samesign_false_arm(i64 %x, i64 %y, i1 %cond) { +; CHECK-LABEL: @test_select_inverse_samesign_false_arm( +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign uge i64 [[X]], [[Y]] +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]] +; CHECK-NEXT: ret i1 [[SEL]] +; + %cmp1 = icmp ult i64 %x, %y + %cmp2 = icmp samesign uge i64 %x, %y + %sel = select i1 %cond, i1 %cmp1, i1 %cmp2 + ret i1 %sel +} + +define i1 @test_select_inverse_samesign_both(i64 %x, i64 %y, i1 %cond) { +; CHECK-LABEL: @test_select_inverse_samesign_both( +; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign uge i64 [[X:%.*]], [[Y:%.*]] +; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[COND:%.*]], [[CMP2]] +; CHECK-NEXT: ret i1 [[SEL]] +; + %cmp1 = icmp samesign ult i64 %x, %y + %cmp2 = icmp samesign uge i64 %x, %y + %sel = select i1 %cond, i1 %cmp1, i1 %cmp2 + ret i1 %sel +} + +define i1 @test_select_inverse_samesign_false_arm_rhsc_same_sign(i64 %x, i64 %y, i1 %cond) { +; CHECK-LABEL: @test_select_inverse_samesign_false_arm_rhsc_same_sign( +; CHECK-NEXT: [[CMP1:%.*]] = icmp ult i64 [[X:%.*]], 11 +; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign ugt i64 [[X]], 10 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]] +; CHECK-NEXT: ret i1 [[SEL]] +; + %cmp1 = icmp ult i64 %x, 11 + %cmp2 = icmp samesign ugt i64 %x, 10 + %sel = select i1 %cond, i1 %cmp1, i1 %cmp2 + ret i1 %sel +} + +define i1 @test_select_inverse_samesign_true_arm_rhsc_same_sign(i64 %x, i64 %y, i1 %cond) { +; CHECK-LABEL: @test_select_inverse_samesign_true_arm_rhsc_same_sign( +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ult i64 [[X:%.*]], 11 +; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i64 [[X]], 10 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]] +; CHECK-NEXT: ret i1 [[SEL]] +; + %cmp1 = icmp samesign ult i64 %x, 11 + %cmp2 = icmp ugt i64 %x, 10 + %sel = select i1 %cond, i1 %cmp1, i1 %cmp2 + ret i1 %sel +} + +define i1 @test_select_inverse_samesign_both_rhsc_same_sign(i64 %x, i64 %y, i1 %cond) { +; CHECK-LABEL: @test_select_inverse_samesign_both_rhsc_same_sign( +; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign ugt i64 [[X:%.*]], 10 +; CHECK-NEXT: [[SEL:%.*]] = xor i1 [[COND:%.*]], [[CMP2]] +; CHECK-NEXT: ret i1 [[SEL]] +; + %cmp1 = icmp samesign ult i64 %x, 11 + %cmp2 = icmp samesign ugt i64 %x, 10 + %sel = select i1 %cond, i1 %cmp1, i1 %cmp2 + ret 
i1 %sel +} + +define i1 @test_select_inverse_samesign_both_rhsc_diff_sign(i64 %x, i64 %y, i1 %cond) { +; CHECK-LABEL: @test_select_inverse_samesign_both_rhsc_diff_sign( +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign slt i64 [[X:%.*]], 0 +; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign sgt i64 [[X]], -1 +; CHECK-NEXT: [[SEL:%.*]] = select i1 [[COND:%.*]], i1 [[CMP1]], i1 [[CMP2]] +; CHECK-NEXT: ret i1 [[SEL]] +; + %cmp1 = icmp samesign slt i64 %x, 0 + %cmp2 = icmp samesign sgt i64 %x, -1 + %sel = select i1 %cond, i1 %cmp1, i1 %cmp2 + ret i1 %sel +} + define i1 @sel_icmp_two_cmp(i1 %c, i32 %a1, i32 %a2, i32 %a3, i32 %a4) { ; CHECK-LABEL: @sel_icmp_two_cmp( ; CHECK-NEXT: [[CMP1:%.*]] = icmp ule i32 [[A1:%.*]], [[A2:%.*]] -- GitLab From 1de15c15bc52b1e3bf97f90a72d79100dc3f5b8e Mon Sep 17 00:00:00 2001 From: Hiroshi Yamauchi <56735936+hjyamauchi@users.noreply.github.com> Date: Wed, 16 Oct 2024 09:35:05 -0700 Subject: [PATCH 154/329] Add arrangeCXXMethodCall to the CodeGenABITypes interface. (#111597) In MSVC, the calling conventions for free functions and C++ instance methods can differ, so it makes sense to have this variant there. --- clang/include/clang/CodeGen/CodeGenABITypes.h | 24 ++++++++++++---- clang/lib/CodeGen/CodeGenABITypes.cpp | 25 +++++++++++++------ 2 files changed, 36 insertions(+), 13 deletions(-) diff --git a/clang/include/clang/CodeGen/CodeGenABITypes.h b/clang/include/clang/CodeGen/CodeGenABITypes.h index 9cbc5a8a2a3f..836fdd75477c 100644 --- a/clang/include/clang/CodeGen/CodeGenABITypes.h +++ b/clang/include/clang/CodeGen/CodeGenABITypes.h @@ -75,11 +75,25 @@ const CGFunctionInfo &arrangeCXXMethodType(CodeGenModule &CGM, const FunctionProtoType *FTP, const CXXMethodDecl *MD); -const CGFunctionInfo &arrangeFreeFunctionCall(CodeGenModule &CGM, - CanQualType returnType, - ArrayRef<CanQualType> argTypes, - FunctionType::ExtInfo info, - RequiredArgs args); +const CGFunctionInfo & +arrangeCXXMethodCall(CodeGenModule &CGM, CanQualType returnType, + ArrayRef<CanQualType> argTypes, FunctionType::ExtInfo info, + ArrayRef<FunctionProtoType::ExtParameterInfo> paramInfos, + RequiredArgs args); + +const CGFunctionInfo &arrangeFreeFunctionCall( + CodeGenModule &CGM, CanQualType returnType, ArrayRef<CanQualType> argTypes, + FunctionType::ExtInfo info, + ArrayRef<FunctionProtoType::ExtParameterInfo> paramInfos, + RequiredArgs args); + +// An overload with an empty `paramInfos` +inline const CGFunctionInfo & +arrangeFreeFunctionCall(CodeGenModule &CGM, CanQualType returnType, + ArrayRef<CanQualType> argTypes, + FunctionType::ExtInfo info, RequiredArgs args) { + return arrangeFreeFunctionCall(CGM, returnType, argTypes, info, {}, args); +} /// Returns the implicit arguments to add to a complete, non-delegating C++ /// constructor call. 
diff --git a/clang/lib/CodeGen/CodeGenABITypes.cpp b/clang/lib/CodeGen/CodeGenABITypes.cpp index a6073e1188d6..3f10d68f8c5d 100644 --- a/clang/lib/CodeGen/CodeGenABITypes.cpp +++ b/clang/lib/CodeGen/CodeGenABITypes.cpp @@ -59,14 +59,23 @@ CodeGen::arrangeCXXMethodType(CodeGenModule &CGM, return CGM.getTypes().arrangeCXXMethodType(RD, FTP, MD); } -const CGFunctionInfo & -CodeGen::arrangeFreeFunctionCall(CodeGenModule &CGM, - CanQualType returnType, - ArrayRef<CanQualType> argTypes, - FunctionType::ExtInfo info, - RequiredArgs args) { - return CGM.getTypes().arrangeLLVMFunctionInfo(returnType, FnInfoOpts::None, - argTypes, info, {}, args); +const CGFunctionInfo &CodeGen::arrangeCXXMethodCall( + CodeGenModule &CGM, CanQualType returnType, ArrayRef<CanQualType> argTypes, + FunctionType::ExtInfo info, + ArrayRef<FunctionProtoType::ExtParameterInfo> paramInfos, + RequiredArgs args) { + return CGM.getTypes().arrangeLLVMFunctionInfo( + returnType, FnInfoOpts::IsInstanceMethod, argTypes, info, paramInfos, + args); +} + +const CGFunctionInfo &CodeGen::arrangeFreeFunctionCall( + CodeGenModule &CGM, CanQualType returnType, ArrayRef<CanQualType> argTypes, + FunctionType::ExtInfo info, + ArrayRef<FunctionProtoType::ExtParameterInfo> paramInfos, + RequiredArgs args) { + return CGM.getTypes().arrangeLLVMFunctionInfo( + returnType, FnInfoOpts::None, argTypes, info, paramInfos, args); } ImplicitCXXConstructorArgs -- GitLab From 889e6ad24b6df4f8d5232d4ecbd8eb492717f1b7 Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 16 Oct 2024 09:45:39 -0700 Subject: [PATCH 155/329] =?UTF-8?q?[lldb]=20Fix=20a=20crash=20when=20two?= =?UTF-8?q?=20diagnostics=20are=20on=20the=20same=20column=20or=20in=20?= =?UTF-8?q?=E2=80=A6=20(#112451)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit …reverse order The second inner loop (only) was missing the check for offset > column. This patch also sorts the diagnostics before printing them. --- lldb/source/Utility/DiagnosticsRendering.cpp | 32 ++++++++++++---- .../Utility/DiagnosticsRenderingTest.cpp | 37 ++++++++++++++++++- 2 files changed, 60 insertions(+), 9 deletions(-) diff --git a/lldb/source/Utility/DiagnosticsRendering.cpp b/lldb/source/Utility/DiagnosticsRendering.cpp index 96caf934cc23..dd059d6e98a6 100644 --- a/lldb/source/Utility/DiagnosticsRendering.cpp +++ b/lldb/source/Utility/DiagnosticsRendering.cpp @@ -77,11 +77,7 @@ void RenderDiagnosticDetails(Stream &stream, spacer = ""; } - // Print a line with caret indicator(s) below the lldb prompt + command. - const size_t padding = *offset_in_command; - stream << std::string(padding, ' '); - - size_t offset = 1; + // Partition the diagnostics. std::vector<DiagnosticDetail> remaining_details, other_details, hidden_details; for (const DiagnosticDetail &detail : details) { @@ -98,10 +94,31 @@ void RenderDiagnosticDetails(Stream &stream, continue; } - auto &loc = *detail.source_location; remaining_details.push_back(detail); + } + + // Sort the diagnostics. + auto sort = [](auto &ds) { + llvm::sort(ds.begin(), ds.end(), [](auto &d1, auto &d2) { + auto l1 = d1.source_location.value_or(DiagnosticDetail::SourceLocation{}); + auto l2 = d2.source_location.value_or(DiagnosticDetail::SourceLocation{}); + return std::pair(l1.line, l2.column) < std::pair(l1.line, l2.column); + }); + }; + sort(remaining_details); + sort(other_details); + sort(hidden_details); + + // Print a line with caret indicator(s) below the lldb prompt + command. 
+ const size_t padding = *offset_in_command; + stream << std::string(padding, ' '); + size_t offset = 1; for (const DiagnosticDetail &detail : remaining_details) { auto &loc = *detail.source_location; if (offset > loc.column) continue; + stream << std::string(loc.column - offset, ' ') << cursor; for (unsigned i = 0; i + 1 < loc.length; ++i) stream << underline; offset = loc.column + 1; @@ -121,7 +138,8 @@ for (auto &remaining_detail : llvm::ArrayRef(remaining_details).drop_back(1)) { uint16_t column = remaining_detail.source_location->column; - stream << std::string(column - offset, ' ') << vbar; + if (offset <= column) + stream << std::string(column - offset, ' ') << vbar; offset = column + 1; } diff --git a/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp b/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp index 2bd80796b807..3d37403331b4 100644 --- a/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp +++ b/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp @@ -16,12 +16,45 @@ std::string Render(std::vector<DiagnosticDetail> details) { } // namespace TEST_F(ErrorDisplayTest, RenderStatus) { - DiagnosticDetail::SourceLocation inline_loc; - inline_loc.in_user_input = true; + using SourceLocation = DiagnosticDetail::SourceLocation; { + SourceLocation inline_loc; + inline_loc.in_user_input = true; std::string result = Render({DiagnosticDetail{inline_loc, eSeverityError, "foo", ""}}); ASSERT_TRUE(StringRef(result).contains("error:")); ASSERT_TRUE(StringRef(result).contains("foo")); } + + { + // Test that diagnostics on the same column can be handled and all + // three errors are diagnosed. + SourceLocation loc1 = {FileSpec{"a.c"}, 13, 11, 0, false, true}; + SourceLocation loc2 = {FileSpec{"a.c"}, 13, 13, 0, false, true}; + std::string result = + Render({DiagnosticDetail{loc1, eSeverityError, "1", "1"}, + DiagnosticDetail{loc1, eSeverityError, "2", "2"}, + DiagnosticDetail{loc2, eSeverityError, "3", "3"}}); + ASSERT_TRUE(StringRef(result).contains("error: 1")); + ASSERT_TRUE(StringRef(result).contains("error: 2")); + ASSERT_TRUE(StringRef(result).contains("error: 3")); + } + { + // Test that diagnostics in reverse order are emitted correctly. + SourceLocation loc1 = {FileSpec{"a.c"}, 1, 20, 0, false, true}; + SourceLocation loc2 = {FileSpec{"a.c"}, 2, 10, 0, false, true}; + std::string result = + Render({DiagnosticDetail{loc2, eSeverityError, "X", "X"}, + DiagnosticDetail{loc1, eSeverityError, "Y", "Y"}}); + ASSERT_LT(StringRef(result).find("Y"), StringRef(result).find("X")); + } + { + // Test that diagnostics in reverse order are emitted correctly. 
+ SourceLocation loc1 = {FileSpec{"a.c"}, 2, 10, 0, false, true}; + SourceLocation loc2 = {FileSpec{"a.c"}, 1, 20, 0, false, true}; + std::string result = + Render({DiagnosticDetail{loc2, eSeverityError, "X", "X"}, + DiagnosticDetail{loc1, eSeverityError, "Y", "Y"}}); + ASSERT_LT(StringRef(result).find("Y"), StringRef(result).find("X")); + } } -- GitLab From 8046f15dfaaa8726b058a3483175890ca95832af Mon Sep 17 00:00:00 2001 From: Adrian Prantl Date: Wed, 16 Oct 2024 09:46:35 -0700 Subject: [PATCH 156/329] [lldb] Fix offset calculation when printing diagnostics in multiple ranges (#112466) depends on https://github.com/llvm/llvm-project/pull/112451 --- lldb/source/Utility/DiagnosticsRendering.cpp | 38 ++++++++++--------- .../Utility/DiagnosticsRenderingTest.cpp | 18 +++++++++ 2 files changed, 39 insertions(+), 17 deletions(-) diff --git a/lldb/source/Utility/DiagnosticsRendering.cpp b/lldb/source/Utility/DiagnosticsRendering.cpp index dd059d6e98a6..d28a9ab8958b 100644 --- a/lldb/source/Utility/DiagnosticsRendering.cpp +++ b/lldb/source/Utility/DiagnosticsRendering.cpp @@ -112,17 +112,21 @@ void RenderDiagnosticDetails(Stream &stream, // Print a line with caret indicator(s) below the lldb prompt + command. const size_t padding = *offset_in_command; stream << std::string(padding, ' '); - size_t offset = 1; - for (const DiagnosticDetail &detail : remaining_details) { - auto &loc = *detail.source_location; - - if (offset > loc.column) - continue; - - stream << std::string(loc.column - offset, ' ') << cursor; - for (unsigned i = 0; i + 1 < loc.length; ++i) - stream << underline; - offset = loc.column + 1; + { + size_t x_pos = 1; + for (const DiagnosticDetail &detail : remaining_details) { + auto &loc = *detail.source_location; + + if (x_pos > loc.column) + continue; + + stream << std::string(loc.column - x_pos, ' ') << cursor; + ++x_pos; + for (unsigned i = 0; i + 1 < loc.length; ++i) { + stream << underline; + ++x_pos; + } + } } stream << '\n'; @@ -134,19 +138,19 @@ void RenderDiagnosticDetails(Stream &stream, // Get the information to print this detail and remove it from the stack. // Print all the lines for all the other messages first. stream << std::string(padding, ' '); - size_t offset = 1; + size_t x_pos = 1; for (auto &remaining_detail : llvm::ArrayRef(remaining_details).drop_back(1)) { uint16_t column = remaining_detail.source_location->column; - if (offset <= column) - stream << std::string(column - offset, ' ') << vbar; - offset = column + 1; + if (x_pos <= column) + stream << std::string(column - x_pos, ' ') << vbar; + x_pos = column + 1; } // Print the line connecting the ^ with the error message. uint16_t column = detail->source_location->column; - if (offset <= column) - stream << std::string(column - offset, ' ') << joint << hbar << spacer; + if (x_pos <= column) + stream << std::string(column - x_pos, ' ') << joint << hbar << spacer; // Print a colorized string based on the message's severity type. PrintSeverity(stream, detail->severity); diff --git a/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp b/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp index 3d37403331b4..39d8b1d55842 100644 --- a/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp +++ b/lldb/unittests/Utility/DiagnosticsRenderingTest.cpp @@ -57,4 +57,22 @@ TEST_F(ErrorDisplayTest, RenderStatus) { DiagnosticDetail{loc1, eSeverityError, "Y", "Y"}}); ASSERT_LT(StringRef(result).find("Y"), StringRef(result).find("X")); } + { + // Test that range diagnostics are emitted correctly. 
+ SourceLocation loc1 = {FileSpec{"a.c"}, 1, 1, 3, false, true}; + SourceLocation loc2 = {FileSpec{"a.c"}, 1, 5, 3, false, true}; + std::string result = + Render({DiagnosticDetail{loc1, eSeverityError, "X", "X"}, + DiagnosticDetail{loc2, eSeverityError, "Y", "Y"}}); + auto lines = StringRef(result).split('\n'); + auto line1 = lines.first; + lines = lines.second.split('\n'); + auto line2 = lines.first; + lines = lines.second.split('\n'); + auto line3 = lines.first; + // 1234567 + ASSERT_EQ(line1, "^~~ ^~~"); + ASSERT_EQ(line2, "| error: Y"); + ASSERT_EQ(line3, "error: X"); + } } -- GitLab From 2c8ecb327249aee001594d6f4ad1eddc7330994f Mon Sep 17 00:00:00 2001 From: Steven Perron Date: Wed, 16 Oct 2024 12:46:45 -0400 Subject: [PATCH 157/329] [HLSL][SPIRV] Use Spirv target codegen (#112573) When the arch in the triple is "spirv", the default target codegen is currently used. We should be using the spir-v target codegen. This will be used to have SPIR-V specific lowering of the HLSL types. --- clang/lib/CodeGen/CodeGenModule.cpp | 1 + .../CodeGenHLSL/builtins/WaveReadLaneAt.hlsl | 20 +++++++++---------- .../wave_get_lane_index_do_while.hlsl | 4 ++-- .../builtins/wave_get_lane_index_simple.hlsl | 4 ++-- .../builtins/wave_get_lane_index_subcall.hlsl | 4 ++-- .../builtins/wave_is_first_lane.hlsl | 4 ++-- 6 files changed, 19 insertions(+), 18 deletions(-) diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index b05ab3606a69..b3e805a67768 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -295,6 +295,7 @@ createTargetCodeGenInfo(CodeGenModule &CGM) { return createCommonSPIRTargetCodeGenInfo(CGM); case llvm::Triple::spirv32: case llvm::Triple::spirv64: + case llvm::Triple::spirv: return createSPIRVTargetCodeGenInfo(CGM); case llvm::Triple::dxil: return createDirectXTargetCodeGenInfo(CGM); diff --git a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl index 03e149d0a9f2..093a199a32bd 100644 --- a/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl +++ b/clang/test/CodeGenHLSL/builtins/WaveReadLaneAt.hlsl @@ -10,27 +10,27 @@ // CHECK-LABEL: test_int int test_int(int expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok0:]] = call token @llvm.experimental.convergence.entry() 
// CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.i16([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); } // CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.i16([[TY]], i32) #[[#attr:]] -// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.i16([[TY]], i32) #[[#attr:]] +// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.i16([[TY]], i32) #[[#attr:]] #endif // Test basic lowering to runtime function call with array and float values. @@ -38,37 +38,37 @@ int16_t test_int16(int16_t expr, uint idx) { // CHECK-LABEL: test_half half test_half(half expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok2:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok2]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok2]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.f16([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); } // CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.f16([[TY]], i32) #[[#attr:]] -// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.f16([[TY]], i32) #[[#attr:]] +// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.f16([[TY]], i32) #[[#attr:]] // CHECK-LABEL: test_double double test_double(double expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok3:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET:.*]] = call [[TY:.*]] @llvm.spv.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok3]]) ] + // CHECK-SPIRV: %[[RET:.*]] = call spir_func [[TY:.*]] @llvm.spv.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok3]]) ] // CHECK-DXIL: %[[RET:.*]] = call [[TY:.*]] @llvm.dx.wave.readlane.f64([[TY]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY]] %[[RET]] return WaveReadLaneAt(expr, idx); } // CHECK-DXIL: declare [[TY]] @llvm.dx.wave.readlane.f64([[TY]], i32) #[[#attr:]] -// CHECK-SPIRV: declare [[TY]] @llvm.spv.wave.readlane.f64([[TY]], i32) #[[#attr:]] +// CHECK-SPIRV: declare spir_func [[TY]] @llvm.spv.wave.readlane.f64([[TY]], i32) #[[#attr:]] // CHECK-LABEL: test_floatv4 float4 test_floatv4(float4 expr, uint idx) { // CHECK-SPIRV: %[[#entry_tok4:]] = call token @llvm.experimental.convergence.entry() - // CHECK-SPIRV: %[[RET1:.*]] = call [[TY1:.*]] @llvm.spv.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok4]]) ] + // CHECK-SPIRV: %[[RET1:.*]] = call spir_func [[TY1:.*]] @llvm.spv.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) [ "convergencectrl"(token %[[#entry_tok4]]) ] // CHECK-DXIL: %[[RET1:.*]] = call [[TY1:.*]] @llvm.dx.wave.readlane.v4f32([[TY1]] %[[#]], i32 %[[#]]) // CHECK: ret [[TY1]] %[[RET1]] return WaveReadLaneAt(expr, idx); } // CHECK-DXIL: declare [[TY1]] @llvm.dx.wave.readlane.v4f32([[TY1]], i32) #[[#attr]] -// CHECK-SPIRV: declare [[TY1]] @llvm.spv.wave.readlane.v4f32([[TY1]], i32) #[[#attr]] +// CHECK-SPIRV: declare spir_func [[TY1]] @llvm.spv.wave.readlane.v4f32([[TY1]], i32) #[[#attr]] // CHECK: attributes #[[#attr]] = {{{.*}} convergent {{.*}}} diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl index 6b053dc6add1..3ab8048146ad 100644 --- 
a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_do_while.hlsl @@ -17,7 +17,7 @@ void main() { // CHECK: br i1 {{%.+}}, label %[[LABEL_IF_THEN:.+]], label %[[LABEL_IF_END:.+]] // CHECK: [[LABEL_IF_THEN]]: -// CHECK: call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CT_LOOP]]) ] +// CHECK: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CT_LOOP]]) ] // CHECK: br label %[[LABEL_WHILE_END:.+]] if (cond == 2) { uint index = WaveGetLaneIndex(); @@ -33,7 +33,7 @@ void main() { // CHECK: ret void } -// CHECK-DAG: declare i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] +// CHECK-DAG: declare spir_func i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] // CHECK-DAG: attributes [[A0]] = {{{.*}}convergent{{.*}}} // CHECK-DAG: attributes [[A1]] = {{{.*}}convergent{{.*}}} diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl index 06a2715b00e9..8e1f2d69e743 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_simple.hlsl @@ -9,13 +9,13 @@ // CHECK-SPIRV: define spir_func noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] { // CHECK-DXIL: define noundef i32 @{{.*test_1.*}}() [[A0:#[0-9]+]] { // CHECK-SPIRV: %[[CI:[0-9]+]] = call token @llvm.experimental.convergence.entry() -// CHECK-SPIRV: call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CI]]) ] +// CHECK-SPIRV: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[CI]]) ] // CHECK-DXIL: call i32 @llvm.dx.wave.getlaneindex() int test_1() { return WaveGetLaneIndex(); } -// CHECK-SPIRV: declare i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] +// CHECK-SPIRV: declare spir_func i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] // CHECK-DXIL: declare i32 @llvm.dx.wave.getlaneindex() [[A1:#[0-9]+]] // CHECK-DAG: attributes [[A0]] = { {{.*}}convergent{{.*}} } diff --git a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl index 6ea80d692cd2..12b120d0c067 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_get_lane_index_subcall.hlsl @@ -3,12 +3,12 @@ // CHECK: define spir_func noundef i32 @_Z6test_1v() [[A0:#[0-9]+]] { // CHECK: %[[C1:[0-9]+]] = call token @llvm.experimental.convergence.entry() -// CHECK: call i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[C1]]) ] +// CHECK: call spir_func i32 @__hlsl_wave_get_lane_index() [ "convergencectrl"(token %[[C1]]) ] uint test_1() { return WaveGetLaneIndex(); } -// CHECK-DAG: declare i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] +// CHECK-DAG: declare spir_func i32 @__hlsl_wave_get_lane_index() [[A1:#[0-9]+]] // CHECK: define spir_func noundef i32 @_Z6test_2v() [[A0]] { // CHECK: %[[C2:[0-9]+]] = call token @llvm.experimental.convergence.entry() diff --git a/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl index 18860c321eb9..2fb6defb896f 100644 --- a/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl +++ b/clang/test/CodeGenHLSL/builtins/wave_is_first_lane.hlsl @@ -13,7 +13,7 @@ void main() { while (true) { // CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane() -// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane() +// CHECK-SPIRV: %[[#]] = 
call spir_func i1 @llvm.spv.wave.is.first.lane() // CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#loop_tok]]) ] if (WaveIsFirstLane()) { break; @@ -21,7 +21,7 @@ void main() { } // CHECK-DXIL: %[[#]] = call i1 @llvm.dx.wave.is.first.lane() -// CHECK-SPIRV: %[[#]] = call i1 @llvm.spv.wave.is.first.lane() +// CHECK-SPIRV: %[[#]] = call spir_func i1 @llvm.spv.wave.is.first.lane() // CHECK-SPIRV-SAME: [ "convergencectrl"(token %[[#entry_tok]]) ] if (WaveIsFirstLane()) { return; -- GitLab From 0850e721ab1c198f08994f003873a4147ec05e25 Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Wed, 16 Oct 2024 09:47:06 -0700 Subject: [PATCH 158/329] [RISCV] Convert C_ADDI_NOP to C_NOP in the assembler. (#112314) Make it a pseudoinstruction so we can convert it to C_NOP. This makes the printing from the assembler consistent with what we get from llvm-objdump. I tried to do this with an InstAlias, but I don't think it can drop operands. --- llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp | 3 +++ llvm/lib/Target/RISCV/RISCVInstrInfoC.td | 14 +++++--------- llvm/test/MC/RISCV/rv32c-valid.s | 3 +-- 3 files changed, 9 insertions(+), 11 deletions(-) diff --git a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp index d77ad02ec47b..0bc35846627c 100644 --- a/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp +++ b/llvm/lib/Target/RISCV/AsmParser/RISCVAsmParser.cpp @@ -3693,6 +3693,9 @@ bool RISCVAsmParser::processInstruction(MCInst &Inst, SMLoc IDLoc, switch (Inst.getOpcode()) { default: break; + case RISCV::PseudoC_ADDI_NOP: + emitToStreamer(Out, MCInstBuilder(RISCV::C_NOP)); + return false; case RISCV::PseudoLLAImm: case RISCV::PseudoLAImm: case RISCV::PseudoLI: { diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td index e8c4860fd3e5..8a76dba23d42 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfoC.td +++ b/llvm/lib/Target/RISCV/RISCVInstrInfoC.td @@ -418,15 +418,11 @@ def C_ADDI : RVInst16CI<0b000, 0b01, (outs GPRNoX0:$rd_wb), let Inst{6-2} = imm{4-0}; } -let hasSideEffects = 0, mayLoad = 0, mayStore = 0 in -def C_ADDI_NOP : RVInst16CI<0b000, 0b01, (outs GPRX0:$rd_wb), - (ins GPRX0:$rd, immzero:$imm), - "c.addi", "$rd, $imm">, - Sched<[WriteIALU, ReadIALU]> { - let Constraints = "$rd = $rd_wb"; - let Inst{6-2} = 0; - let isAsmParserOnly = 1; -} +// Alternate syntax for c.nop. Converted to C_NOP by the assembler. 
+let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCodeGenOnly = 0, + isAsmParserOnly = 1 in +def PseudoC_ADDI_NOP : Pseudo<(outs GPRX0:$rd), (ins GPRX0:$rs1, immzero:$imm), + [], "c.addi", "$rd, $imm">; let hasSideEffects = 0, mayLoad = 0, mayStore = 0, isCall = 1, DecoderNamespace = "RISCV32Only_", Defs = [X1], diff --git a/llvm/test/MC/RISCV/rv32c-valid.s b/llvm/test/MC/RISCV/rv32c-valid.s index bcdf27a2ba78..9b0ca80a7adc 100644 --- a/llvm/test/MC/RISCV/rv32c-valid.s +++ b/llvm/test/MC/RISCV/rv32c-valid.s @@ -147,8 +147,7 @@ c.sub a4, a5 # CHECK-ASM: encoding: [0x01,0x00] # CHECK-NO-EXT: error: instruction requires the following: 'C' (Compressed Instructions) or 'Zca' (part of the C extension, excluding compressed floating point loads/stores){{$}} c.nop -# CHECK-ASM: c.addi zero, 0 -# CHECK-OBJ: c.nop +# CHECK-ASM-AND-OBJ: c.nop # CHECK-ASM: encoding: [0x01,0x00] # CHECK-NO-EXT: error: instruction requires the following: 'C' (Compressed Instructions) or 'Zca' (part of the C extension, excluding compressed floating point loads/stores){{$}} c.addi x0, 0 -- GitLab From ae778ae7ce72219270c30d5c8b3d88c9a4803f81 Mon Sep 17 00:00:00 2001 From: goldsteinn <35538541+goldsteinn@users.noreply.github.com> Date: Wed, 16 Oct 2024 12:53:21 -0400 Subject: [PATCH 159/329] [Inliner] Propagate more attributes to params when inlining (#91101) - **[Inliner] Add tests for propagating more parameter attributes; NFC** - **[Inliner] Propagate more attributes to params when inlining** Add support for propagating: - `dereferenceable` - `dereferenceable_or_null` - `align` - `nonnull` - `range` These are only propagated if the parameter to the to-be-inlined callsite matches the exact parameter used in the to-be-inlined function. --- .../test/CodeGen/attr-counted-by-pr88931.cpp | 2 +- clang/test/OpenMP/bug57757.cpp | 2 +- llvm/include/llvm/IR/Attributes.h | 7 + llvm/lib/IR/Attributes.cpp | 15 ++ llvm/lib/Transforms/Utils/InlineFunction.cpp | 90 ++++++++-- .../Inline/access-attributes-prop.ll | 164 +++++++++++++++++- .../Inline/assumptions-from-callsite-attrs.ll | 2 +- llvm/test/Transforms/Inline/byval.ll | 4 +- llvm/test/Transforms/PhaseOrdering/pr95152.ll | 2 +- 9 files changed, 259 insertions(+), 29 deletions(-) diff --git a/clang/test/CodeGen/attr-counted-by-pr88931.cpp b/clang/test/CodeGen/attr-counted-by-pr88931.cpp index 2a8cc1d07e50..6d0c46bbbe8f 100644 --- a/clang/test/CodeGen/attr-counted-by-pr88931.cpp +++ b/clang/test/CodeGen/attr-counted-by-pr88931.cpp @@ -13,7 +13,7 @@ void init(void * __attribute__((pass_dynamic_object_size(0)))); // CHECK-LABEL: define dso_local void @_ZN3foo3barC1Ev( // CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(1) [[THIS:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] align 2 { // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @_Z4initPvU25pass_dynamic_object_size0(ptr noundef nonnull [[THIS]], i64 noundef -1) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: tail call void @_Z4initPvU25pass_dynamic_object_size0(ptr noundef nonnull align 4 dereferenceable(1) [[THIS]], i64 noundef -1) #[[ATTR2:[0-9]+]] // CHECK-NEXT: ret void // foo::bar::bar() { diff --git a/clang/test/OpenMP/bug57757.cpp b/clang/test/OpenMP/bug57757.cpp index 240b22a30671..eabf233dde24 100644 --- a/clang/test/OpenMP/bug57757.cpp +++ b/clang/test/OpenMP/bug57757.cpp @@ -39,7 +39,7 @@ void foo() { // CHECK-NEXT: ] // CHECK: .untied.jmp..i: // CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA16]], !alias.scope [[META13]], !noalias [[META17]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull 
@[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias [[META13]] +// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr nonnull [[TMP1]]), !noalias [[META13]] // CHECK-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK: .untied.next..i: // CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 57db52e4879b..80490e3b7f49 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -947,6 +947,9 @@ public: /// arg. uint64_t getParamDereferenceableOrNullBytes(unsigned ArgNo) const; + /// Get range (or std::nullopt if unknown) of an arg. + std::optional getParamRange(unsigned ArgNo) const; + /// Get the disallowed floating-point classes of the return value. FPClassTest getRetNoFPClass() const; @@ -1123,6 +1126,10 @@ public: /// invalid if the Kind is not present in the builder. Attribute getAttribute(StringRef Kind) const; + /// Retrieve the range if the attribute exists (std::nullopt is returned + /// otherwise). + std::optional getRange() const; + /// Return raw (possibly packed/encoded) value of integer attribute or /// std::nullopt if not set. std::optional getRawIntAttr(Attribute::AttrKind Kind) const; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index c2fba49692c7..55851d499c60 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -1931,6 +1931,14 @@ AttributeList::getParamDereferenceableOrNullBytes(unsigned Index) const { return getParamAttrs(Index).getDereferenceableOrNullBytes(); } +std::optional +AttributeList::getParamRange(unsigned ArgNo) const { + auto RangeAttr = getParamAttrs(ArgNo).getAttribute(Attribute::Range); + if (RangeAttr.isValid()) + return RangeAttr.getRange(); + return std::nullopt; +} + FPClassTest AttributeList::getRetNoFPClass() const { return getRetAttrs().getNoFPClass(); } @@ -2277,6 +2285,13 @@ Attribute AttrBuilder::getAttribute(StringRef A) const { return {}; } +std::optional AttrBuilder::getRange() const { + const Attribute RangeAttr = getAttribute(Attribute::Range); + if (RangeAttr.isValid()) + return RangeAttr.getRange(); + return std::nullopt; +} + bool AttrBuilder::contains(Attribute::AttrKind A) const { return getAttribute(A).isValid(); } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 55ad2b6d6200..71ca527e5daa 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -34,6 +34,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Argument.h" #include "llvm/IR/AttributeMask.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" @@ -59,6 +60,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" @@ -1358,18 +1360,36 @@ static void AddParamAndFnBasicAttributes(const CallBase &CB, auto &Context = CalledFunction->getContext(); // Collect valid attributes for all params. - SmallVector ValidParamAttrs; + SmallVector ValidObjParamAttrs, ValidExactParamAttrs; bool HasAttrToPropagate = false; + // Attributes we can only propagate if the exact parameter is forwarded. + // We can propagate both poison generating and UB generating attributes + // without any extra checks. 
The only attribute that is tricky to propagate + // is `noundef` (skipped for now) as that can create new UB where previous + // behavior was just using a poison value. + static const Attribute::AttrKind ExactAttrsToPropagate[] = { + Attribute::Dereferenceable, Attribute::DereferenceableOrNull, + Attribute::NonNull, Attribute::Alignment, Attribute::Range}; + for (unsigned I = 0, E = CB.arg_size(); I < E; ++I) { - ValidParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); + ValidObjParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); + ValidExactParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); // Access attributes can be propagated to any param with the same underlying // object as the argument. if (CB.paramHasAttr(I, Attribute::ReadNone)) - ValidParamAttrs.back().addAttribute(Attribute::ReadNone); + ValidObjParamAttrs.back().addAttribute(Attribute::ReadNone); if (CB.paramHasAttr(I, Attribute::ReadOnly)) - ValidParamAttrs.back().addAttribute(Attribute::ReadOnly); - HasAttrToPropagate |= ValidParamAttrs.back().hasAttributes(); + ValidObjParamAttrs.back().addAttribute(Attribute::ReadOnly); + + for (Attribute::AttrKind AK : ExactAttrsToPropagate) { + Attribute Attr = CB.getParamAttr(I, AK); + if (Attr.isValid()) + ValidExactParamAttrs.back().addAttribute(Attr); + } + + HasAttrToPropagate |= ValidObjParamAttrs.back().hasAttributes(); + HasAttrToPropagate |= ValidExactParamAttrs.back().hasAttributes(); } // Won't be able to propagate anything. @@ -1391,22 +1411,60 @@ static void AddParamAndFnBasicAttributes(const CallBase &CB, AttributeList AL = NewInnerCB->getAttributes(); for (unsigned I = 0, E = InnerCB->arg_size(); I < E; ++I) { - // Check if the underlying value for the parameter is an argument. - const Value *UnderlyingV = - getUnderlyingObject(InnerCB->getArgOperand(I)); - const Argument *Arg = dyn_cast(UnderlyingV); - if (!Arg) + // It's unsound or requires special handling to propagate + // attributes to byval arguments. Even if CalledFunction + // doesn't e.g. write to the argument (readonly), the call to + // NewInnerCB may write to its by-value copy. + if (NewInnerCB->paramHasAttr(I, Attribute::ByVal)) continue; - if (NewInnerCB->paramHasAttr(I, Attribute::ByVal)) - // It's unsound to propagate memory attributes to byval arguments. - // Even if CalledFunction doesn't e.g. write to the argument, - // the call to NewInnerCB may write to its by-value copy. + // Don't bother propagating attrs to constants. + if (match(NewInnerCB->getArgOperand(I), + llvm::PatternMatch::m_ImmConstant())) continue; - unsigned ArgNo = Arg->getArgNo(); + // Check if the underlying value for the parameter is an argument. + const Argument *Arg = dyn_cast(InnerCB->getArgOperand(I)); + unsigned ArgNo; + if (Arg) { + ArgNo = Arg->getArgNo(); + // For dereferenceable, dereferenceable_or_null, align, etc... + // we don't want to propagate if the existing param has the same + // attribute with "better" constraints. So remove from the + // new AL if the region of the existing param is larger than + // what we can propagate. 
+ AttrBuilder NewAB{ + Context, AttributeSet::get(Context, ValidExactParamAttrs[ArgNo])}; + if (AL.getParamDereferenceableBytes(I) > + NewAB.getDereferenceableBytes()) + NewAB.removeAttribute(Attribute::Dereferenceable); + if (AL.getParamDereferenceableOrNullBytes(I) > + NewAB.getDereferenceableOrNullBytes()) + NewAB.removeAttribute(Attribute::DereferenceableOrNull); + if (AL.getParamAlignment(I).valueOrOne() > + NewAB.getAlignment().valueOrOne()) + NewAB.removeAttribute(Attribute::Alignment); + if (auto ExistingRange = AL.getParamRange(I)) { + if (auto NewRange = NewAB.getRange()) { + ConstantRange CombinedRange = + ExistingRange->intersectWith(*NewRange); + NewAB.removeAttribute(Attribute::Range); + NewAB.addRangeAttr(CombinedRange); + } + } + AL = AL.addParamAttributes(Context, I, NewAB); + } else { + // Check if the underlying value for the parameter is an argument. + const Value *UnderlyingV = + getUnderlyingObject(InnerCB->getArgOperand(I)); + Arg = dyn_cast(UnderlyingV); + if (!Arg) + continue; + ArgNo = Arg->getArgNo(); + } + // If so, propagate its access attributes. - AL = AL.addParamAttributes(Context, I, ValidParamAttrs[ArgNo]); + AL = AL.addParamAttributes(Context, I, ValidObjParamAttrs[ArgNo]); // We can have conflicting attributes from the inner callsite and // to-be-inlined callsite. In that case, choose the most // restrictive. diff --git a/llvm/test/Transforms/Inline/access-attributes-prop.ll b/llvm/test/Transforms/Inline/access-attributes-prop.ll index 5051c92345ec..5bf845d5ba94 100644 --- a/llvm/test/Transforms/Inline/access-attributes-prop.ll +++ b/llvm/test/Transforms/Inline/access-attributes-prop.ll @@ -47,7 +47,6 @@ define dso_local void @foo3_writable(ptr %p) { ret void } - define dso_local void @foo1_bar_aligned64_deref512(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@foo1_bar_aligned64_deref512 ; CHECK-SAME: (ptr [[P:%.*]]) { @@ -306,7 +305,7 @@ define void @prop_param_callbase_def_1x_partial_3(ptr %p, ptr %p2) { define void @prop_deref(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_deref ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr [[P]]) +; CHECK-NEXT: call void @bar1(ptr dereferenceable(16) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1(ptr dereferenceable(16) %p) @@ -316,7 +315,7 @@ define void @prop_deref(ptr %p) { define void @prop_deref_or_null(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_deref_or_null ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr [[P]]) +; CHECK-NEXT: call void @bar1(ptr dereferenceable_or_null(256) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1(ptr dereferenceable_or_null(256) %p) @@ -326,13 +325,23 @@ define void @prop_deref_or_null(ptr %p) { define void @prop_param_nonnull_and_align(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_nonnull_and_align ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr [[P]]) +; CHECK-NEXT: call void @bar1(ptr nonnull align 32 [[P]]) ; CHECK-NEXT: ret void ; call void @foo1(ptr nonnull align 32 %p) ret void } +define void @prop_param_nofree_and_align(ptr %p) { +; CHECK-LABEL: define {{[^@]+}}@prop_param_nofree_and_align +; CHECK-SAME: (ptr [[P:%.*]]) { +; CHECK-NEXT: call void @bar1(ptr align 32 [[P]]) +; CHECK-NEXT: ret void +; + call void @foo1(ptr nofree align 32 %p) + ret void +} + define void @prop_param_deref_align_no_update(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_deref_align_no_update ; CHECK-SAME: (ptr [[P:%.*]]) { @@ -346,7 +355,7 @@ define void @prop_param_deref_align_no_update(ptr %p) { define void 
@prop_param_deref_align_update(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_deref_align_update ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr align 64 dereferenceable(512) [[P]]) +; CHECK-NEXT: call void @bar1(ptr align 128 dereferenceable(1024) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1_bar_aligned64_deref512(ptr align 128 dereferenceable(1024) %p) @@ -356,7 +365,7 @@ define void @prop_param_deref_align_update(ptr %p) { define void @prop_param_deref_or_null_update(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_deref_or_null_update ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr align 512 dereferenceable_or_null(512) [[P]]) +; CHECK-NEXT: call void @bar1(ptr align 512 dereferenceable_or_null(1024) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1_bar_aligned512_deref_or_null512(ptr dereferenceable_or_null(1024) %p) @@ -539,7 +548,6 @@ define void @prop_no_conflict_writable(ptr %p) { ret void } - define void @prop_no_conflict_writable2(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_no_conflict_writable2 ; CHECK-SAME: (ptr [[P:%.*]]) { @@ -600,3 +608,145 @@ define void @prop_byval_readonly2(ptr %p) { call void @foo_byval_readonly2(ptr %p) ret void } + +declare void @bar5(i32) + +define dso_local void @foo4_range_0_10(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@foo4_range_0_10 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 10) [[V]]) +; CHECK-NEXT: ret void +; + call void @bar5(i32 range(i32 0, 10) %v) + ret void +} + +define dso_local void @foo4_range_10_40(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@foo4_range_10_40 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 10, 40) [[V]]) +; CHECK-NEXT: ret void +; + call void @bar5(i32 range(i32 10, 40) %v) + ret void +} + +define dso_local void @foo4_2_range_0_10(i32 range(i32 0, 10) %v) { +; CHECK-LABEL: define {{[^@]+}}@foo4_2_range_0_10 +; CHECK-SAME: (i32 range(i32 0, 10) [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 [[V]]) +; CHECK-NEXT: ret void +; + call void @bar5(i32 %v) + ret void +} + +define dso_local void @foo4(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@foo4 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 [[V]]) +; CHECK-NEXT: ret void +; + call void @bar5(i32 %v) + ret void +} + +define void @prop_range_empty_intersect(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_empty_intersect +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 0) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 11, 50) %v) + ret void +} + +define void @prop_range_empty(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_empty +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 1, 0) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4(i32 range(i32 1, 0) %v) + ret void +} + +define void @prop_range_empty_with_intersect(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_empty_with_intersect +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 1, 10) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 1, 0) %v) + ret void +} + +define void @prop_range_intersect1(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect1 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 9) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 0, 9) %v) + ret void +} + +define void @prop_range_intersect2(i32 %v) { +; CHECK-LABEL: define 
{{[^@]+}}@prop_range_intersect2 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 1, 9) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 1, 9) %v) + ret void +} + +define void @prop_range_intersect3(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect3 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 11) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_2_range_0_10(i32 range(i32 0, 11) %v) + ret void +} + +define void @prop_range_intersect4(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect4 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 5) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 40, 5) %v) + ret void +} + +define void @prop_range_intersect5(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect5 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 10, 40) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_10_40(i32 range(i32 30, 20) %v) + ret void +} + +define void @prop_range_keep(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_keep +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 10, 40) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_10_40(i32 %v) + ret void +} + +define void @prop_range_direct(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_direct +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 1, 11) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4(i32 range(i32 1, 11) %v) + ret void +} diff --git a/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll b/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll index 1a219a22019c..c0943f4aefb8 100644 --- a/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll +++ b/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll @@ -8,7 +8,7 @@ declare void @h(ptr %p, ptr %q, ptr %z) define void @f(ptr %p, ptr %q, ptr %z) { ; CHECK-LABEL: define void @f ; CHECK-SAME: (ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[Z:%.*]]) { -; CHECK-NEXT: call void @h(ptr [[P]], ptr [[Q]], ptr [[Z]]) +; CHECK-NEXT: call void @h(ptr nonnull [[P]], ptr [[Q]], ptr nonnull [[Z]]) ; CHECK-NEXT: ret void ; call void @g(ptr nonnull %p, ptr %q, ptr nonnull %z) diff --git a/llvm/test/Transforms/Inline/byval.ll b/llvm/test/Transforms/Inline/byval.ll index dd5be40b90a8..1a70da8472cb 100644 --- a/llvm/test/Transforms/Inline/byval.ll +++ b/llvm/test/Transforms/Inline/byval.ll @@ -106,7 +106,7 @@ define void @test3() nounwind { ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_SS]], align 1 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[S1]]) ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[S1]], ptr align 1 [[S]], i64 12, i1 false) -; CHECK-NEXT: call void @g3(ptr [[S1]]) #[[ATTR0]] +; CHECK-NEXT: call void @g3(ptr align 64 [[S1]]) #[[ATTR0]] ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[S1]]) ; CHECK-NEXT: ret void ; @@ -131,7 +131,7 @@ define i32 @test4() nounwind { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 64 -; CHECK-NEXT: call void @g3(ptr [[S]]) #[[ATTR0]] +; CHECK-NEXT: call void @g3(ptr align 64 [[S]]) #[[ATTR0]] ; CHECK-NEXT: ret i32 4 ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/pr95152.ll b/llvm/test/Transforms/PhaseOrdering/pr95152.ll index 16610c439f4c..fff94673a1a5 100644 --- a/llvm/test/Transforms/PhaseOrdering/pr95152.ll 
+++ b/llvm/test/Transforms/PhaseOrdering/pr95152.ll @@ -47,7 +47,7 @@ define void @f(ptr dead_on_unwind noalias %p) { ; CHECK-LABEL: define void @f( ; CHECK-SAME: ptr dead_on_unwind noalias [[P:%.*]]) local_unnamed_addr { ; CHECK-NEXT: store i64 3, ptr [[P]], align 4 -; CHECK-NEXT: tail call void @j(ptr nonnull [[P]]) +; CHECK-NEXT: tail call void @j(ptr nonnull align 8 dereferenceable(8) [[P]]) ; CHECK-NEXT: store i64 43, ptr [[P]], align 4 ; CHECK-NEXT: ret void ; -- GitLab From fc362521a3a5d67e3059ca02b504d87c32ede02b Mon Sep 17 00:00:00 2001 From: Alex Voicu Date: Wed, 16 Oct 2024 18:48:17 +0100 Subject: [PATCH 160/329] [clang][OpenCL][NFC] Switch two tests to being generated (#112554) Turns out these tests are a bit unwieldy to hand-update, so switch them over to being generated, as requested in #112442. --- .../CodeGenOpenCL/addr-space-struct-arg.cl | 1252 +++++++++++++++-- .../amdgcn-automatic-variable.cl | 122 +- 2 files changed, 1252 insertions(+), 122 deletions(-) diff --git a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl index bab0e21067ee..7377b5bcbc34 100644 --- a/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl +++ b/clang/test/CodeGenOpenCL/addr-space-struct-arg.cl @@ -1,9 +1,10 @@ -// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -ffake-address-space-map -triple i686-pc-darwin | FileCheck -enable-var-scope -check-prefixes=ALL,X86 %s -// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -triple amdgcn | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN %s -// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN,AMDGCN20 %s -// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL1.2 -O0 -triple spir-unknown-unknown-unknown | FileCheck -enable-var-scope -check-prefixes=SPIR %s -// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn -cl-ext=+__opencl_c_program_scope_global_variables | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN,AMDGCN20 %s -// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn | FileCheck -enable-var-scope -check-prefixes=ALL,AMDGCN %s +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -ffake-address-space-map -triple i686-pc-darwin | FileCheck -check-prefixes=X86 %s +// RUN: %clang_cc1 %s -emit-llvm -o - -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN %s +// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL2.0 -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN20 %s +// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL1.2 -O0 -triple spir-unknown-unknown-unknown | FileCheck -check-prefixes=SPIR %s +// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn -cl-ext=+__opencl_c_program_scope_global_variables | FileCheck -check-prefixes=AMDGCN30-GVAR %s +// RUN: %clang_cc1 %s -emit-llvm -o - -cl-std=CL3.0 -O0 -triple amdgcn | FileCheck -check-prefixes=AMDGCN30 %s typedef int int2 __attribute__((ext_vector_type(2))); @@ -45,147 +46,1236 @@ struct LargeStructTwoMember { struct LargeStructOneMember g_s; #endif -// X86-LABEL: define{{.*}} void @foo(ptr dead_on_unwind noalias writable sret(%struct.Mat4X4) align 4 %agg.result, ptr noundef byval(%struct.Mat3X3) align 4 %in) -// AMDGCN-LABEL: define{{.*}} %struct.Mat4X4 @foo([9 x i32] %in.coerce) +// +// X86-LABEL: define void @foo( +// X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef 
byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 +// X86-NEXT: ret void +// +// AMDGCN-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// AMDGCN20-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN20-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN20-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN20-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN20-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// SPIR-LABEL: define dso_local spir_func void @foo( +// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT4X4:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT3X3:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0:[0-9]+]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: ret void +// +// AMDGCN30-GVAR-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN30-GVAR-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN30-GVAR-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// +// AMDGCN30-LABEL: define dso_local %struct.Mat4X4 @foo( +// AMDGCN30-SAME: [9 x i32] [[IN_COERCE:%.*]]) #[[ATTR0:[0-9]+]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[RETVAL:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(5) [[IN]], i32 0, i32 0 +// AMDGCN30-NEXT: store [9 x i32] [[IN_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 4 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load [[STRUCT_MAT4X4]], ptr addrspace(5) [[RETVAL]], align 4 +// AMDGCN30-NEXT: ret [[STRUCT_MAT4X4]] [[TMP0]] +// Mat4X4 __attribute__((noinline)) foo(Mat3X3 in) { Mat4X4 out; return out; } -// ALL-LABEL: define {{.*}} void @ker -// Expect two mem copies: one for the argument "in", and one for -// the 
return value. -// X86: call void @llvm.memcpy.p0.p1.i32(ptr -// X86: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) - -// AMDGCN: load [9 x i32], ptr addrspace(1) -// AMDGCN: call %struct.Mat4X4 @foo([9 x i32] -// AMDGCN: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) +// +// X86-LABEL: define spir_kernel void @ker( +// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 +// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4 +// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i32 0 +// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1 +// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false) +// X86-NEXT: call void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3:[0-9]+]] +// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false) +// X86-NEXT: ret void +// +// AMDGCN-LABEL: define dso_local amdgpu_kernel void @ker( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 +// AMDGCN-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] +// AMDGCN-NEXT: [[TMP4:%.*]] = 
getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 +// AMDGCN-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +// AMDGCN-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN-NEXT: ret void +// +// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker( +// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 +// AMDGCN20-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] +// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 +// AMDGCN20-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +// AMDGCN20-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 +// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN20-NEXT: ret void +// +// SPIR-LABEL: define dso_local spir_kernel void @ker( +// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META3:![0-9]+]] !kernel_arg_access_qual [[META4:![0-9]+]] !kernel_arg_type [[META5:![0-9]+]] !kernel_arg_base_type [[META5]] !kernel_arg_type_qual [[META6:![0-9]+]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4 +// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT3X3:%.*]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i32 0 +// SPIR-NEXT: [[TMP1:%.*]] = load ptr 
addrspace(1), ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3]], ptr addrspace(1) [[TMP1]], i32 1 +// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 36, i1 false) +// SPIR-NEXT: call spir_func void @foo(ptr dead_on_unwind writable sret([[STRUCT_MAT4X4]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT3X3]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3:[0-9]+]] +// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 64, i1 false) +// SPIR-NEXT: ret void +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker( +// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 +// AMDGCN30-GVAR-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] +// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +// AMDGCN30-GVAR-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN30-GVAR-NEXT: ret void +// +// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker( +// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1:[0-9]+]] !kernel_arg_addr_space [[META4:![0-9]+]] !kernel_arg_access_qual [[META5:![0-9]+]] !kernel_arg_type [[META6:![0-9]+]] !kernel_arg_base_type [[META6]] !kernel_arg_type_qual [[META7:![0-9]+]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT4X4:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) 
[[IN_ADDR]], align 8 +// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT4X4]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT3X3:%.*]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT3X3]], ptr addrspace(1) [[ARRAYIDX1]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP3:%.*]] = load [9 x i32], ptr addrspace(1) [[TMP2]], align 4 +// AMDGCN30-NEXT: [[CALL:%.*]] = call [[STRUCT_MAT4X4]] @[[FOO:[a-zA-Z0-9_$\"\\.-]*[a-zA-Z_$\"\\.-][a-zA-Z0-9_$\"\\.-]*]]([9 x i32] [[TMP3]]) #[[ATTR3:[0-9]+]] +// AMDGCN30-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_MAT4X4]], ptr addrspace(5) [[TMP]], i32 0, i32 0 +// AMDGCN30-NEXT: [[TMP5:%.*]] = extractvalue [[STRUCT_MAT4X4]] [[CALL]], 0 +// AMDGCN30-NEXT: store [16 x i32] [[TMP5]], ptr addrspace(5) [[TMP4]], align 4 +// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 64, i1 false) +// AMDGCN30-NEXT: ret void +// kernel void ker(global Mat3X3 *in, global Mat4X4 *out) { out[0] = foo(in[1]); } -// X86-LABEL: define{{.*}} void @foo_large(ptr dead_on_unwind noalias writable sret(%struct.Mat64X64) align 4 %agg.result, ptr noundef byval(%struct.Mat32X32) align 4 %in) -// AMDGCN-LABEL: define{{.*}} void @foo_large(ptr addrspace(5) dead_on_unwind noalias writable sret(%struct.Mat64X64) align 4 %agg.result, ptr addrspace(5) noundef byref(%struct.Mat32X32) align 4 %{{.*}} -// AMDGCN: %in = alloca %struct.Mat32X32, align 4, addrspace(5) -// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 %in, ptr addrspace(5) align 4 %{{.*}}, i64 4096, i1 false) +// +// X86-LABEL: define void @foo_large( +// X86-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[RESULT_PTR:%.*]] = alloca ptr, align 4 +// X86-NEXT: store ptr [[AGG_RESULT]], ptr [[RESULT_PTR]], align 4 +// X86-NEXT: ret void +// +// AMDGCN-LABEL: define dso_local void @foo_large( +// AMDGCN-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) +// AMDGCN-NEXT: ret void +// +// AMDGCN20-LABEL: define dso_local void @foo_large( +// AMDGCN20-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) +// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) +// AMDGCN20-NEXT: ret void +// +// SPIR-LABEL: define dso_local 
spir_func void @foo_large( +// SPIR-SAME: ptr dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr noundef byval([[STRUCT_MAT32X32:%.*]]) align 4 [[IN:%.*]]) #[[ATTR0]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: ret void +// +// AMDGCN30-GVAR-LABEL: define dso_local void @foo_large( +// AMDGCN30-GVAR-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) +// AMDGCN30-GVAR-NEXT: ret void +// +// AMDGCN30-LABEL: define dso_local void @foo_large( +// AMDGCN30-SAME: ptr addrspace(5) dead_on_unwind noalias writable sret([[STRUCT_MAT64X64:%.*]]) align 4 [[AGG_RESULT:%.*]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN:%.*]] = alloca [[STRUCT_MAT32X32]], align 4, addrspace(5) +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 4 [[IN]], ptr addrspace(5) align 4 [[TMP0]], i64 4096, i1 false) +// AMDGCN30-NEXT: ret void +// Mat64X64 __attribute__((noinline)) foo_large(Mat32X32 in) { Mat64X64 out; return out; } -// ALL-LABEL: define {{.*}} void @ker_large -// Expect two mem copies: one for the argument "in", and one for -// the return value. -// X86: call void @llvm.memcpy.p0.p1.i32(ptr -// X86: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) -// AMDGCN: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) -// AMDGCN: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) +// +// X86-LABEL: define spir_kernel void @ker_large( +// X86-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// X86-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 +// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4 +// X86-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 +// X86-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i32 0 +// X86-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// X86-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1 +// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false) +// X86-NEXT: call void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// X86-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false) +// X86-NEXT: ret void +// +// AMDGCN-LABEL: define dso_local amdgpu_kernel void 
@ker_large( +// AMDGCN-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +// AMDGCN-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN-NEXT: ret void +// +// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @ker_large( +// AMDGCN20-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN20-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN20-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN20-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN20-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN20-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +// AMDGCN20-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN20-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr 
addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN20-NEXT: ret void +// +// SPIR-LABEL: define dso_local spir_kernel void @ker_large( +// SPIR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META3]] !kernel_arg_access_qual [[META4]] !kernel_arg_type [[META7:![0-9]+]] !kernel_arg_base_type [[META7]] !kernel_arg_type_qual [[META6]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 4 +// SPIR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4 +// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[IN]], ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: store ptr addrspace(1) [[OUT]], ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[OUT_ADDR]], align 4 +// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i32 0 +// SPIR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr [[IN_ADDR]], align 4 +// SPIR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i32 1 +// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i32 4096, i1 false) +// SPIR-NEXT: call spir_func void @foo_large(ptr dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr noundef byval([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// SPIR-NEXT: call void @llvm.memcpy.p1.p0.i32(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr align 4 [[TMP]], i32 16384, i1 false) +// SPIR-NEXT: ret void +// +// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @ker_large( +// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-GVAR-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +// AMDGCN30-GVAR-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) 
#[[ATTR3]] +// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN30-GVAR-NEXT: ret void +// +// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @ker_large( +// AMDGCN30-SAME: ptr addrspace(1) noundef align 4 [[IN:%.*]], ptr addrspace(1) noundef align 4 [[OUT:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META4]] !kernel_arg_access_qual [[META5]] !kernel_arg_type [[META8:![0-9]+]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META7]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[IN_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[OUT_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5) +// AMDGCN30-NEXT: [[TMP:%.*]] = alloca [[STRUCT_MAT64X64:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_MAT32X32:%.*]], align 4, addrspace(5) +// AMDGCN30-NEXT: store ptr addrspace(1) [[IN]], ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: store ptr addrspace(1) [[OUT]], ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[OUT_ADDR]], align 8 +// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [[STRUCT_MAT64X64]], ptr addrspace(1) [[TMP0]], i64 0 +// AMDGCN30-NEXT: [[TMP1:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[IN_ADDR]], align 8 +// AMDGCN30-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [[STRUCT_MAT32X32]], ptr addrspace(1) [[TMP1]], i64 1 +// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 4 [[ARRAYIDX1]], i64 4096, i1 false) +// AMDGCN30-NEXT: call void @foo_large(ptr addrspace(5) dead_on_unwind writable sret([[STRUCT_MAT64X64]]) align 4 [[TMP]], ptr addrspace(5) noundef byref([[STRUCT_MAT32X32]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]] +// AMDGCN30-NEXT: call void @llvm.memcpy.p1.p5.i64(ptr addrspace(1) align 4 [[ARRAYIDX]], ptr addrspace(5) align 4 [[TMP]], i64 16384, i1 false) +// AMDGCN30-NEXT: ret void +// kernel void ker_large(global Mat32X32 *in, global Mat64X64 *out) { out[0] = foo_large(in[1]); } -// AMDGCN-LABEL: define{{.*}} void @FuncOneMember(<2 x i32> %u.coerce) +// +// X86-LABEL: define void @FuncOneMember( +// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] { +// X86-NEXT: [[ENTRY:.*:]] +// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER]], align 8 +// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 8, i1 false) +// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 +// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[X]], align 8 +// X86-NEXT: ret void +// +// AMDGCN-LABEL: define dso_local void @FuncOneMember( +// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { +// AMDGCN-NEXT: [[ENTRY:.*:]] +// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], 
align 8 +// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 +// AMDGCN-NEXT: ret void +// +// AMDGCN20-LABEL: define dso_local void @FuncOneMember( +// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { +// AMDGCN20-NEXT: [[ENTRY:.*:]] +// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN20-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 +// AMDGCN20-NEXT: ret void +// +// SPIR-LABEL: define dso_local spir_func void @FuncOneMember( +// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] { +// SPIR-NEXT: [[ENTRY:.*:]] +// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8 +// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8 +// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0 +// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[X]], align 8 +// SPIR-NEXT: ret void +// +// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneMember( +// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { +// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]] +// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8 +// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8 +// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8 +// AMDGCN30-GVAR-NEXT: ret void +// +// AMDGCN30-LABEL: define dso_local void @FuncOneMember( +// AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR0]] { +// AMDGCN30-NEXT: [[ENTRY:.*:]] +// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5) +// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5) +// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0 +// 
AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: store <2 x i32> [[TMP0]], ptr addrspace(5) [[X]], align 8
+// AMDGCN30-NEXT: ret void
+//
 void FuncOneMember(struct StructOneMember u) {
   u.x = (int2)(0, 0);
 }
-// AMDGCN-LABEL: define{{.*}} void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %{{.*}}
-// AMDGCN: %u = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 %u, ptr addrspace(5) align 8 %{{.*}}, i64 800, i1 false)
-// AMDGCN-NOT: addrspacecast
-// AMDGCN: store <2 x i32> %{{.*}}, ptr addrspace(5)
+//
+// X86-LABEL: define void @FuncOneLargeMember(
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8
+// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 800, i1 false)
+// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0
+// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @FuncOneLargeMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr [[U]], i32 0, i32 0
+// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr [[X]], i32 0, i32 0
+// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @FuncOneLargeMember(
+// AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 800, i1 false)
+// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[X:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x <2 x i32>], ptr addrspace(5) [[X]], i64 0, i64 0
+// AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN30-NEXT: ret void
+//
 void FuncOneLargeMember(struct LargeStructOneMember u) {
   u.x[0] = (int2)(0, 0);
 }
-// AMDGCN20-LABEL: define{{.*}} void @test_indirect_arg_globl()
-// AMDGCN20: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN20: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 %[[byval_temp]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false)
-// AMDGCN20: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[byval_temp]])
 #if (__OPENCL_C_VERSION__ == 200) || (__OPENCL_C_VERSION__ >= 300 && defined(__opencl_c_program_scope_global_variables))
+// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_globl(
+// AMDGCN20-SAME: ) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false)
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_globl(
+// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p1.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 @g_s, i64 800, i1 false)
+// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
 void test_indirect_arg_globl(void) {
   FuncOneLargeMember(g_s);
 }
 #endif
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @test_indirect_arg_local()
-// AMDGCN: %[[byval_temp:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 %[[byval_temp]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
-// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[byval_temp]])
+//
+// X86-LABEL: define spir_kernel void @test_indirect_arg_local(
+// X86-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN20-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @test_indirect_arg_local(
+// SPIR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META8:![0-9]+]] !kernel_arg_access_qual [[META8]] !kernel_arg_type [[META8]] !kernel_arg_base_type [[META8]] !kernel_arg_type_qual [[META8]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
+// SPIR-NEXT: call void @llvm.memcpy.p0.p3.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i32 800, i1 false)
+// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN30-GVAR-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @test_indirect_arg_local(
+// AMDGCN30-SAME: ) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META9]] !kernel_arg_type [[META9]] !kernel_arg_base_type [[META9]] !kernel_arg_type_qual [[META9]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p3.i64(ptr addrspace(5) align 8 [[BYVAL_TEMP]], ptr addrspace(3) align 8 @test_indirect_arg_local.l_s, i64 800, i1 false)
+// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
 kernel void test_indirect_arg_local(void) {
   local struct LargeStructOneMember l_s;
   FuncOneLargeMember(l_s);
 }
-// AMDGCN-LABEL: define{{.*}} void @test_indirect_arg_private()
-// AMDGCN: %[[p_s:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN-NOT: @llvm.memcpy
-// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[p_s]])
+//
+// X86-LABEL: define void @test_indirect_arg_private(
+// X86-SAME: ) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[P_S]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN-SAME: ) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN20-SAME: ) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @test_indirect_arg_private(
+// SPIR-SAME: ) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8
+// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN30-GVAR-SAME: ) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @test_indirect_arg_private(
+// AMDGCN30-SAME: ) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[P_S:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[P_S]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
 void test_indirect_arg_private(void) {
   struct LargeStructOneMember p_s;
   FuncOneLargeMember(p_s);
 }
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelOneMember
-// AMDGCN-SAME: (<2 x i32> %[[u_coerce:.*]])
-// AMDGCN: %[[u:.*]] = alloca %struct.StructOneMember, align 8, addrspace(5)
-// AMDGCN: %[[coerce_dive:.*]] = getelementptr inbounds nuw %struct.StructOneMember, ptr addrspace(5) %[[u]], i32 0, i32 0
-// AMDGCN: store <2 x i32> %[[u_coerce]], ptr addrspace(5) %[[coerce_dive]]
-// AMDGCN: call void @FuncOneMember(<2 x i32>
+//
+// X86-LABEL: define spir_kernel void @KernelOneMember(
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN20-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9:![0-9]+]] !kernel_arg_access_qual [[META10:![0-9]+]] !kernel_arg_type [[META11:![0-9]+]] !kernel_arg_base_type [[META11]] !kernel_arg_type_qual [[META12:![0-9]+]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMember(
+// AMDGCN30-SAME: <2 x i32> [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10:![0-9]+]] !kernel_arg_access_qual [[META11:![0-9]+]] !kernel_arg_type [[META12:![0-9]+]] !kernel_arg_base_type [[META12]] !kernel_arg_type_qual [[META13:![0-9]+]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE]], ptr addrspace(5) [[COERCE_DIVE]], align 8
+// AMDGCN30-NEXT: [[COERCE_DIVE1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr addrspace(5) [[COERCE_DIVE1]], align 8
+// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP0]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
 kernel void KernelOneMember(struct StructOneMember u) {
   FuncOneMember(u);
 }
-// SPIR: call void @llvm.memcpy.p0.p1.i32
-// SPIR-NOT: addrspacecast
+//
+// X86-LABEL: define spir_kernel void @KernelOneMemberSpir(
+// X86-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// X86-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 4
+// X86-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
+// X86-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
+// X86-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 4 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false)
+// X86-NEXT: call void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 4 [[BYVAL_TEMP]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN20-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN20-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN20-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN20-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN20-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelOneMemberSpir(
+// SPIR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META13:![0-9]+]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META14:![0-9]+]] !kernel_arg_base_type [[META14]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 4
+// SPIR-NEXT: [[BYVAL_TEMP:%.*]] = alloca [[STRUCT_STRUCTONEMEMBER:%.*]], align 8
+// SPIR-NEXT: store ptr addrspace(1) [[U]], ptr [[U_ADDR]], align 4
+// SPIR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr [[U_ADDR]], align 4
+// SPIR-NEXT: call void @llvm.memcpy.p0.p1.i32(ptr align 8 [[BYVAL_TEMP]], ptr addrspace(1) align 8 [[TMP0]], i32 8, i1 false)
+// SPIR-NEXT: call spir_func void @FuncOneMember(ptr noundef byval([[STRUCT_STRUCTONEMEMBER]]) align 8 [[BYVAL_TEMP]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN30-GVAR-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN30-GVAR-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelOneMemberSpir(
+// AMDGCN30-SAME: ptr addrspace(1) noundef align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META14:![0-9]+]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U_ADDR:%.*]] = alloca ptr addrspace(1), align 8, addrspace(5)
+// AMDGCN30-NEXT: store ptr addrspace(1) [[U]], ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN30-NEXT: [[TMP0:%.*]] = load ptr addrspace(1), ptr addrspace(5) [[U_ADDR]], align 8
+// AMDGCN30-NEXT: [[COERCE_DIVE:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTONEMEMBER:%.*]], ptr addrspace(1) [[TMP0]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(1) [[COERCE_DIVE]], align 8
+// AMDGCN30-NEXT: call void @FuncOneMember(<2 x i32> [[TMP1]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
 kernel void KernelOneMemberSpir(global struct StructOneMember* u) {
   FuncOneMember(*u);
 }
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeOneMember(
-// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructOneMember, align 8, addrspace(5)
-// AMDGCN: %[[U_ELEM:.*]] = getelementptr inbounds nuw %struct.LargeStructOneMember, ptr addrspace(5) %[[U]], i32 0, i32 0
-// AMDGCN: %[[EXTRACT:.*]] = extractvalue %struct.LargeStructOneMember %u.coerce, 0
-// AMDGCN: store [100 x <2 x i32>] %[[EXTRACT]], ptr addrspace(5) %[[U_ELEM]], align 8
-// AMDGCN: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref(%struct.LargeStructOneMember) align 8 %[[U]])
+//
+// X86-LABEL: define spir_kernel void @KernelLargeOneMember(
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: call void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN20-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeOneMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META15:![0-9]+]] !kernel_arg_base_type [[META15]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncOneLargeMember(ptr noundef byval([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-GVAR-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeOneMember(
+// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTONEMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTONEMEMBER]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTONEMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTONEMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-NEXT: store [100 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-NEXT: call void @FuncOneLargeMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTONEMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
 kernel void KernelLargeOneMember(struct LargeStructOneMember u) {
   FuncOneLargeMember(u);
 }
-// AMDGCN-LABEL: define{{.*}} void @FuncTwoMember(<2 x i32> %u.coerce0, <2 x i32> %u.coerce1)
+//
+// X86-LABEL: define void @FuncTwoMember(
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8
+// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 16, i1 false)
+// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[Y]], align 8
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @FuncTwoMember(
+// AMDGCN-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @FuncTwoMember(
+// AMDGCN20-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @FuncTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[Y]], align 8
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @FuncTwoMember(
+// AMDGCN30-GVAR-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @FuncTwoMember(
+// AMDGCN30-SAME: <2 x i32> [[U_COERCE0:%.*]], <2 x i32> [[U_COERCE1:%.*]]) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER:%.*]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE0]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT: store <2 x i32> [[U_COERCE1]], ptr addrspace(5) [[TMP1]], align 8
+// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[TMP2:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT: store <2 x i32> [[TMP2]], ptr addrspace(5) [[Y]], align 8
+// AMDGCN30-NEXT: ret void
+//
 void FuncTwoMember(struct StructTwoMember u) {
   u.y = (int2)(0, 0);
 }
-// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember
-// AMDGCN-SAME: (ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]])
-// AMDGCN: %[[U:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5)
-// AMDGCN: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 %[[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+//
+// X86-LABEL: define void @FuncLargeTwoMember(
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 4 [[TMP0:%.*]]) #[[ATTR0]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8
+// X86-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// X86-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 8 [[U]], ptr align 4 [[TMP0]], i32 480, i1 false)
+// X86-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// X86-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// X86-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0
+// X86-NEXT: store <2 x i32> [[TMP1]], ptr [[ARRAYIDX]], align 8
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local void @FuncLargeTwoMember(
+// AMDGCN-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local void @FuncLargeTwoMember(
+// AMDGCN20-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN20-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN20-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN20-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
+// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_func void @FuncLargeTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR0]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8
+// SPIR-NEXT: store <2 x i32> zeroinitializer, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[TMP0:%.*]] = load <2 x i32>, ptr [[DOTCOMPOUNDLITERAL]], align 8
+// SPIR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr [[U]], i32 0, i32 1
+// SPIR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr [[Y]], i32 0, i32 0
+// SPIR-NEXT: store <2 x i32> [[TMP0]], ptr [[ARRAYIDX]], align 8
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local void @FuncLargeTwoMember(
+// AMDGCN30-GVAR-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN30-GVAR-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-GVAR-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local void @FuncLargeTwoMember(
+// AMDGCN30-SAME: ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[TMP0:%.*]]) #[[ATTR0]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca <2 x i32>, align 8, addrspace(5)
+// AMDGCN30-NEXT: call void @llvm.memcpy.p5.p5.i64(ptr addrspace(5) align 8 [[U]], ptr addrspace(5) align 8 [[TMP0]], i64 480, i1 false)
+// AMDGCN30-NEXT: store <2 x i32> zeroinitializer, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[TMP1:%.*]] = load <2 x i32>, ptr addrspace(5) [[DOTCOMPOUNDLITERAL]], align 8
+// AMDGCN30-NEXT: [[Y:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [20 x <2 x i32>], ptr addrspace(5) [[Y]], i64 0, i64 0
+// AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[ARRAYIDX]], align 8
+// AMDGCN30-NEXT: ret void
+//
 void FuncLargeTwoMember(struct LargeStructTwoMember u) {
   u.y[0] = (int2)(0, 0);
 }
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelTwoMember
-// AMDGCN-SAME: (%struct.StructTwoMember %[[u_coerce:.*]])
-// AMDGCN: %[[u:.*]] = alloca %struct.StructTwoMember, align 8, addrspace(5)
-// AMDGCN: %[[LD0:.*]] = load <2 x i32>, ptr addrspace(5)
-// AMDGCN: %[[LD1:.*]] = load <2 x i32>, ptr addrspace(5)
-// AMDGCN: call void @FuncTwoMember(<2 x i32> %[[LD0]], <2 x i32> %[[LD1]])
+//
+// X86-LABEL: define spir_kernel void @KernelTwoMember(
+// X86-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: call void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
+// AMDGCN-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
+// AMDGCN-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
+// AMDGCN20-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN20-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN20-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN20-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN20-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
+// AMDGCN20-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_STRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META16:![0-9]+]] !kernel_arg_base_type [[META16]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncTwoMember(ptr noundef byval([[STRUCT_STRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
+// AMDGCN30-GVAR-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN30-GVAR-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelTwoMember(
+// AMDGCN30-SAME: [[STRUCT_STRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_STRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-NEXT: store <2 x i32> [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_STRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN30-NEXT: store <2 x i32> [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN30-NEXT: [[TMP4:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP5:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP4]], align 8
+// AMDGCN30-NEXT: [[TMP6:%.*]] = getelementptr inbounds nuw [[STRUCT_STRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT: [[TMP7:%.*]] = load <2 x i32>, ptr addrspace(5) [[TMP6]], align 8
+// AMDGCN30-NEXT: call void @FuncTwoMember(<2 x i32> [[TMP5]], <2 x i32> [[TMP7]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
 kernel void KernelTwoMember(struct StructTwoMember u) {
   FuncTwoMember(u);
 }
-// AMDGCN-LABEL: define{{.*}} amdgpu_kernel void @KernelLargeTwoMember
-// AMDGCN-SAME: (%struct.LargeStructTwoMember %[[u_coerce:.*]])
-// AMDGCN: %[[u:.*]] = alloca %struct.LargeStructTwoMember, align 8, addrspace(5)
-// AMDGCN: %[[U_PTR0:.*]] = getelementptr inbounds nuw %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 0
-// AMDGCN: %[[EXTRACT0:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 0
-// AMDGCN: store [40 x <2 x i32>] %[[EXTRACT0]], ptr addrspace(5) %[[U_PTR0]]
-// AMDGCN: %[[U_PTR1:.*]] = getelementptr inbounds nuw %struct.LargeStructTwoMember, ptr addrspace(5) %[[u]], i32 0, i32 1
-// AMDGCN: %[[EXTRACT1:.*]] = extractvalue %struct.LargeStructTwoMember %u.coerce, 1
-// AMDGCN: store [20 x <2 x i32>] %[[EXTRACT1]], ptr addrspace(5) %[[U_PTR1]]
-// AMDGCN: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref(%struct.LargeStructTwoMember) align 8 %[[u]])
+//
+// X86-LABEL: define spir_kernel void @KernelLargeTwoMember(
+// X86-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// X86-NEXT: [[ENTRY:.*:]]
+// X86-NEXT: call void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 4 [[U]]) #[[ATTR3]]
+// X86-NEXT: ret void
+//
+// AMDGCN-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
+// AMDGCN-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN-NEXT: [[ENTRY:.*:]]
+// AMDGCN-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN-NEXT: ret void
+//
+// AMDGCN20-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
+// AMDGCN20-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN20-NEXT: [[ENTRY:.*:]]
+// AMDGCN20-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN20-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN20-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN20-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN20-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN20-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN20-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN20-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN20-NEXT: ret void
+//
+// SPIR-LABEL: define dso_local spir_kernel void @KernelLargeTwoMember(
+// SPIR-SAME: ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER:%.*]]) align 8 [[U:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META9]] !kernel_arg_access_qual [[META10]] !kernel_arg_type [[META17:![0-9]+]] !kernel_arg_base_type [[META17]] !kernel_arg_type_qual [[META12]] {
+// SPIR-NEXT: [[ENTRY:.*:]]
+// SPIR-NEXT: call spir_func void @FuncLargeTwoMember(ptr noundef byval([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// SPIR-NEXT: ret void
+//
+// AMDGCN30-GVAR-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
+// AMDGCN30-GVAR-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-GVAR-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-GVAR-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN30-GVAR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-GVAR-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-GVAR-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-GVAR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-GVAR-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN30-GVAR-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN30-GVAR-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN30-GVAR-NEXT: ret void
+//
+// AMDGCN30-LABEL: define dso_local amdgpu_kernel void @KernelLargeTwoMember(
+// AMDGCN30-SAME: [[STRUCT_LARGESTRUCTTWOMEMBER:%.*]] [[U_COERCE:%.*]]) #[[ATTR1]] !kernel_arg_addr_space [[META10]] !kernel_arg_access_qual [[META11]] !kernel_arg_type [[META18:![0-9]+]] !kernel_arg_base_type [[META18]] !kernel_arg_type_qual [[META13]] {
+// AMDGCN30-NEXT: [[ENTRY:.*:]]
+// AMDGCN30-NEXT: [[U:%.*]] = alloca [[STRUCT_LARGESTRUCTTWOMEMBER]], align 8, addrspace(5)
+// AMDGCN30-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 0
+// AMDGCN30-NEXT: [[TMP1:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 0
+// AMDGCN30-NEXT: store [40 x <2 x i32>] [[TMP1]], ptr addrspace(5) [[TMP0]], align 8
+// AMDGCN30-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw [[STRUCT_LARGESTRUCTTWOMEMBER]], ptr addrspace(5) [[U]], i32 0, i32 1
+// AMDGCN30-NEXT: [[TMP3:%.*]] = extractvalue [[STRUCT_LARGESTRUCTTWOMEMBER]] [[U_COERCE]], 1
+// AMDGCN30-NEXT: store [20 x <2 x i32>] [[TMP3]], ptr addrspace(5) [[TMP2]], align 8
+// AMDGCN30-NEXT: call void @FuncLargeTwoMember(ptr addrspace(5) noundef byref([[STRUCT_LARGESTRUCTTWOMEMBER]]) align 8 [[U]]) #[[ATTR3]]
+// AMDGCN30-NEXT: ret void
+//
 kernel void KernelLargeTwoMember(struct LargeStructTwoMember u) {
   FuncLargeTwoMember(u);
 }
+//.
+// X86: [[META4]] = !{i32 1, i32 1}
+// X86: [[META5]] = !{!"none", !"none"}
+// X86: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
+// X86: [[META7]] = !{!"", !""}
+// X86: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
+// X86: [[META9]] = !{}
+// X86: [[META10]] = !{i32 0}
+// X86: [[META11]] = !{!"none"}
+// X86: [[META12]] = !{!"struct StructOneMember"}
+// X86: [[META13]] = !{!""}
+// X86: [[META14]] = !{i32 1}
+// X86: [[META15]] = !{!"struct StructOneMember*"}
+// X86: [[META16]] = !{!"struct LargeStructOneMember"}
+// X86: [[META17]] = !{!"struct StructTwoMember"}
+// X86: [[META18]] = !{!"struct LargeStructTwoMember"}
+//.
+// AMDGCN: [[META4]] = !{i32 1, i32 1}
+// AMDGCN: [[META5]] = !{!"none", !"none"}
+// AMDGCN: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
+// AMDGCN: [[META7]] = !{!"", !""}
+// AMDGCN: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
+// AMDGCN: [[META9]] = !{}
+// AMDGCN: [[META10]] = !{i32 0}
+// AMDGCN: [[META11]] = !{!"none"}
+// AMDGCN: [[META12]] = !{!"struct StructOneMember"}
+// AMDGCN: [[META13]] = !{!""}
+// AMDGCN: [[META14]] = !{i32 1}
+// AMDGCN: [[META15]] = !{!"struct StructOneMember*"}
+// AMDGCN: [[META16]] = !{!"struct LargeStructOneMember"}
+// AMDGCN: [[META17]] = !{!"struct StructTwoMember"}
+// AMDGCN: [[META18]] = !{!"struct LargeStructTwoMember"}
+//.
+// AMDGCN20: [[META4]] = !{i32 1, i32 1}
+// AMDGCN20: [[META5]] = !{!"none", !"none"}
+// AMDGCN20: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"}
+// AMDGCN20: [[META7]] = !{!"", !""}
+// AMDGCN20: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"}
+// AMDGCN20: [[META9]] = !{}
+// AMDGCN20: [[META10]] = !{i32 0}
+// AMDGCN20: [[META11]] = !{!"none"}
+// AMDGCN20: [[META12]] = !{!"struct StructOneMember"}
+// AMDGCN20: [[META13]] = !{!""}
+// AMDGCN20: [[META14]] = !{i32 1}
+// AMDGCN20: [[META15]] = !{!"struct StructOneMember*"}
+// AMDGCN20: [[META16]] = !{!"struct LargeStructOneMember"}
+// AMDGCN20: [[META17]] = !{!"struct StructTwoMember"}
+// AMDGCN20: [[META18]] = !{!"struct LargeStructTwoMember"}
+//.
+// SPIR: [[META3]] = !{i32 1, i32 1} +// SPIR: [[META4]] = !{!"none", !"none"} +// SPIR: [[META5]] = !{!"Mat3X3*", !"Mat4X4*"} +// SPIR: [[META6]] = !{!"", !""} +// SPIR: [[META7]] = !{!"Mat32X32*", !"Mat64X64*"} +// SPIR: [[META8]] = !{} +// SPIR: [[META9]] = !{i32 0} +// SPIR: [[META10]] = !{!"none"} +// SPIR: [[META11]] = !{!"struct StructOneMember"} +// SPIR: [[META12]] = !{!""} +// SPIR: [[META13]] = !{i32 1} +// SPIR: [[META14]] = !{!"struct StructOneMember*"} +// SPIR: [[META15]] = !{!"struct LargeStructOneMember"} +// SPIR: [[META16]] = !{!"struct StructTwoMember"} +// SPIR: [[META17]] = !{!"struct LargeStructTwoMember"} +//. +// AMDGCN30-GVAR: [[META4]] = !{i32 1, i32 1} +// AMDGCN30-GVAR: [[META5]] = !{!"none", !"none"} +// AMDGCN30-GVAR: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"} +// AMDGCN30-GVAR: [[META7]] = !{!"", !""} +// AMDGCN30-GVAR: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"} +// AMDGCN30-GVAR: [[META9]] = !{} +// AMDGCN30-GVAR: [[META10]] = !{i32 0} +// AMDGCN30-GVAR: [[META11]] = !{!"none"} +// AMDGCN30-GVAR: [[META12]] = !{!"struct StructOneMember"} +// AMDGCN30-GVAR: [[META13]] = !{!""} +// AMDGCN30-GVAR: [[META14]] = !{i32 1} +// AMDGCN30-GVAR: [[META15]] = !{!"struct StructOneMember*"} +// AMDGCN30-GVAR: [[META16]] = !{!"struct LargeStructOneMember"} +// AMDGCN30-GVAR: [[META17]] = !{!"struct StructTwoMember"} +// AMDGCN30-GVAR: [[META18]] = !{!"struct LargeStructTwoMember"} +//. +// AMDGCN30: [[META4]] = !{i32 1, i32 1} +// AMDGCN30: [[META5]] = !{!"none", !"none"} +// AMDGCN30: [[META6]] = !{!"Mat3X3*", !"Mat4X4*"} +// AMDGCN30: [[META7]] = !{!"", !""} +// AMDGCN30: [[META8]] = !{!"Mat32X32*", !"Mat64X64*"} +// AMDGCN30: [[META9]] = !{} +// AMDGCN30: [[META10]] = !{i32 0} +// AMDGCN30: [[META11]] = !{!"none"} +// AMDGCN30: [[META12]] = !{!"struct StructOneMember"} +// AMDGCN30: [[META13]] = !{!""} +// AMDGCN30: [[META14]] = !{i32 1} +// AMDGCN30: [[META15]] = !{!"struct StructOneMember*"} +// AMDGCN30: [[META16]] = !{!"struct LargeStructOneMember"} +// AMDGCN30: [[META17]] = !{!"struct StructTwoMember"} +// AMDGCN30: [[META18]] = !{!"struct LargeStructTwoMember"} +//. 
diff --git a/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl b/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl index f26495bc44aa..c847f5850b22 100644 --- a/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl +++ b/clang/test/CodeGenOpenCL/amdgcn-automatic-variable.cl @@ -1,67 +1,107 @@ -// RUN: %clang_cc1 -O0 -cl-std=CL1.2 -triple amdgcn---amdgizcl -emit-llvm %s -o - | FileCheck -check-prefixes=CHECK,CL12 %s -// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn---amdgizcl -emit-llvm %s -o - | FileCheck -check-prefixes=CHECK,CL20 %s +// NOTE: Assertions have been autogenerated by utils/update_cc_test_checks.py UTC_ARGS: --version 5 +// RUN: %clang_cc1 -O0 -cl-std=CL1.2 -triple amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck -check-prefixes=CL12 %s +// RUN: %clang_cc1 -O0 -cl-std=CL2.0 -triple amdgcn-amd-amdhsa -emit-llvm %s -o - | FileCheck -check-prefixes=CL20 %s -// CL12-LABEL: define{{.*}} void @func1(ptr addrspace(5) noundef %x) -// CL20-LABEL: define{{.*}} void @func1(ptr noundef %x) +// CL12-LABEL: define dso_local void @func1( +// CL12-SAME: ptr addrspace(5) noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] { +// CL12-NEXT: [[ENTRY:.*:]] +// CL12-NEXT: [[X_ADDR:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) +// CL12-NEXT: store ptr addrspace(5) [[X]], ptr addrspace(5) [[X_ADDR]], align 4 +// CL12-NEXT: [[TMP0:%.*]] = load ptr addrspace(5), ptr addrspace(5) [[X_ADDR]], align 4 +// CL12-NEXT: store i32 1, ptr addrspace(5) [[TMP0]], align 4 +// CL12-NEXT: ret void +// +// CL20-LABEL: define dso_local void @func1( +// CL20-SAME: ptr noundef [[X:%.*]]) #[[ATTR0:[0-9]+]] { +// CL20-NEXT: [[ENTRY:.*:]] +// CL20-NEXT: [[X_ADDR:%.*]] = alloca ptr, align 8, addrspace(5) +// CL20-NEXT: store ptr [[X]], ptr addrspace(5) [[X_ADDR]], align 8 +// CL20-NEXT: [[TMP0:%.*]] = load ptr, ptr addrspace(5) [[X_ADDR]], align 8 +// CL20-NEXT: store i32 1, ptr [[TMP0]], align 4 +// CL20-NEXT: ret void +// void func1(int *x) { - // CL12: %[[x_addr:.*]] = alloca ptr addrspace(5){{.*}}addrspace(5) - // CL12: store ptr addrspace(5) %x, ptr addrspace(5) %[[x_addr]] - // CL12: %[[r0:.*]] = load ptr addrspace(5), ptr addrspace(5) %[[x_addr]] - // CL12: store i32 1, ptr addrspace(5) %[[r0]] - // CL20: %[[x_addr:.*]] = alloca ptr{{.*}}addrspace(5) - // CL20: store ptr %x, ptr addrspace(5) %[[x_addr]] - // CL20: %[[r0:.*]] = load ptr, ptr addrspace(5) %[[x_addr]] - // CL20: store i32 1, ptr %[[r0]] *x = 1; } -// CHECK-LABEL: define{{.*}} void @func2() +// CL12-LABEL: define dso_local void @func2( +// CL12-SAME: ) #[[ATTR0]] { +// CL12-NEXT: [[ENTRY:.*:]] +// CL12-NEXT: [[LV1:%.*]] = alloca i32, align 4, addrspace(5) +// CL12-NEXT: [[LV2:%.*]] = alloca i32, align 4, addrspace(5) +// CL12-NEXT: [[LA:%.*]] = alloca [100 x i32], align 4, addrspace(5) +// CL12-NEXT: [[LP1:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) +// CL12-NEXT: [[LP2:%.*]] = alloca ptr addrspace(5), align 4, addrspace(5) +// CL12-NEXT: [[LVC:%.*]] = alloca i32, align 4, addrspace(5) +// CL12-NEXT: store i32 1, ptr addrspace(5) [[LV1]], align 4 +// CL12-NEXT: store i32 2, ptr addrspace(5) [[LV2]], align 4 +// CL12-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0 +// CL12-NEXT: store i32 3, ptr addrspace(5) [[ARRAYIDX]], align 4 +// CL12-NEXT: store ptr addrspace(5) [[LV1]], ptr addrspace(5) [[LP1]], align 4 +// CL12-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0 +// CL12-NEXT: store ptr addrspace(5) [[ARRAYDECAY]], ptr addrspace(5) 
[[LP2]], align 4 +// CL12-NEXT: call void @func1(ptr addrspace(5) noundef [[LV1]]) #[[ATTR2:[0-9]+]] +// CL12-NEXT: store i32 4, ptr addrspace(5) [[LVC]], align 4 +// CL12-NEXT: store i32 4, ptr addrspace(5) [[LV1]], align 4 +// CL12-NEXT: ret void +// +// CL20-LABEL: define dso_local void @func2( +// CL20-SAME: ) #[[ATTR0]] { +// CL20-NEXT: [[ENTRY:.*:]] +// CL20-NEXT: [[LV1:%.*]] = alloca i32, align 4, addrspace(5) +// CL20-NEXT: [[LV2:%.*]] = alloca i32, align 4, addrspace(5) +// CL20-NEXT: [[LA:%.*]] = alloca [100 x i32], align 4, addrspace(5) +// CL20-NEXT: [[LP1:%.*]] = alloca ptr, align 8, addrspace(5) +// CL20-NEXT: [[LP2:%.*]] = alloca ptr, align 8, addrspace(5) +// CL20-NEXT: [[LVC:%.*]] = alloca i32, align 4, addrspace(5) +// CL20-NEXT: store i32 1, ptr addrspace(5) [[LV1]], align 4 +// CL20-NEXT: store i32 2, ptr addrspace(5) [[LV2]], align 4 +// CL20-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0 +// CL20-NEXT: store i32 3, ptr addrspace(5) [[ARRAYIDX]], align 4 +// CL20-NEXT: [[LV1_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[LV1]] to ptr +// CL20-NEXT: store ptr [[LV1_ASCAST]], ptr addrspace(5) [[LP1]], align 8 +// CL20-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) [[LA]], i64 0, i64 0 +// CL20-NEXT: [[ARRAYDECAY_ASCAST:%.*]] = addrspacecast ptr addrspace(5) [[ARRAYDECAY]] to ptr +// CL20-NEXT: store ptr [[ARRAYDECAY_ASCAST]], ptr addrspace(5) [[LP2]], align 8 +// CL20-NEXT: [[LV1_ASCAST1:%.*]] = addrspacecast ptr addrspace(5) [[LV1]] to ptr +// CL20-NEXT: call void @func1(ptr noundef [[LV1_ASCAST1]]) #[[ATTR2:[0-9]+]] +// CL20-NEXT: store i32 4, ptr addrspace(5) [[LVC]], align 4 +// CL20-NEXT: store i32 4, ptr addrspace(5) [[LV1]], align 4 +// CL20-NEXT: ret void +// void func2(void) { - // CHECK: %lv1 = alloca i32, align 4, addrspace(5) - // CHECK: %lv2 = alloca i32, align 4, addrspace(5) - // CHECK: %la = alloca [100 x i32], align 4, addrspace(5) - // CL12: %lp1 = alloca ptr addrspace(5), align 4, addrspace(5) - // CL12: %lp2 = alloca ptr addrspace(5), align 4, addrspace(5) - // CL20: %lp1 = alloca ptr, align 8, addrspace(5) - // CL20: %lp2 = alloca ptr, align 8, addrspace(5) - // CHECK: %lvc = alloca i32, align 4, addrspace(5) - - // CHECK: store i32 1, ptr addrspace(5) %lv1 int lv1; lv1 = 1; - // CHECK: store i32 2, ptr addrspace(5) %lv2 + int lv2 = 2; - // CHECK: %[[arrayidx:.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) %la, i64 0, i64 0 - // CHECK: store i32 3, ptr addrspace(5) %[[arrayidx]], align 4 int la[100]; la[0] = 3; - // CL12: store ptr addrspace(5) %lv1, ptr addrspace(5) %lp1, align 4 - // CL20: %[[r0:.*]] = addrspacecast ptr addrspace(5) %lv1 to ptr - // CL20: store ptr %[[r0]], ptr addrspace(5) %lp1, align 8 int *lp1 = &lv1; - // CHECK: %[[arraydecay:.*]] = getelementptr inbounds [100 x i32], ptr addrspace(5) %la, i64 0, i64 0 - // CL12: store ptr addrspace(5) %[[arraydecay]], ptr addrspace(5) %lp2, align 4 - // CL20: %[[r1:.*]] = addrspacecast ptr addrspace(5) %[[arraydecay]] to ptr - // CL20: store ptr %[[r1]], ptr addrspace(5) %lp2, align 8 int *lp2 = la; - // CL12: call void @func1(ptr addrspace(5) noundef %lv1) - // CL20: %[[r2:.*]] = addrspacecast ptr addrspace(5) %lv1 to ptr - // CL20: call void @func1(ptr noundef %[[r2]]) func1(&lv1); - // CHECK: store i32 4, ptr addrspace(5) %lvc - // CHECK: store i32 4, ptr addrspace(5) %lv1 const int lvc = 4; lv1 = lvc; } -// CHECK-LABEL: define{{.*}} void @func3() -// CHECK: %a = alloca [16 x [1 x float]], 
align 4, addrspace(5) -// CHECK: call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 %a, i8 0, i64 64, i1 false) +// CL12-LABEL: define dso_local void @func3( +// CL12-SAME: ) #[[ATTR0]] { +// CL12-NEXT: [[ENTRY:.*:]] +// CL12-NEXT: [[A:%.*]] = alloca [16 x [1 x float]], align 4, addrspace(5) +// CL12-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[A]], i8 0, i64 64, i1 false) +// CL12-NEXT: ret void +// +// CL20-LABEL: define dso_local void @func3( +// CL20-SAME: ) #[[ATTR0]] { +// CL20-NEXT: [[ENTRY:.*:]] +// CL20-NEXT: [[A:%.*]] = alloca [16 x [1 x float]], align 4, addrspace(5) +// CL20-NEXT: call void @llvm.memset.p5.i64(ptr addrspace(5) align 4 [[A]], i8 0, i64 64, i1 false) +// CL20-NEXT: ret void +// void func3(void) { float a[16][1] = {{0.}}; } -- GitLab From b238c2b199a4ed5b0d76e412c7310bea054ee6a2 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Wed, 16 Oct 2024 19:05:02 +0100 Subject: [PATCH 161/329] [X86] Regenerate test checks with vpternlog comments --- llvm/test/CodeGen/X86/combine-sdiv.ll | 2 +- llvm/test/CodeGen/X86/masked_store_trunc.ll | 2 +- llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll | 2 +- llvm/test/CodeGen/X86/masked_store_trunc_usat.ll | 2 +- llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll | 4 ++-- 5 files changed, 6 insertions(+), 6 deletions(-) diff --git a/llvm/test/CodeGen/X86/combine-sdiv.ll b/llvm/test/CodeGen/X86/combine-sdiv.ll index 8e424664363b..2b392e69297f 100644 --- a/llvm/test/CodeGen/X86/combine-sdiv.ll +++ b/llvm/test/CodeGen/X86/combine-sdiv.ll @@ -974,7 +974,7 @@ define <32 x i16> @combine_vec_sdiv_by_pow2b_v32i16(<32 x i16> %x) { ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 ; AVX512F-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535,0,65535,65535,65535,65535,65535,65535,65535] ; AVX512F-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; AVX512F-NEXT: vpternlogq $216, %zmm2, %zmm1, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 ^ (zmm2 & (zmm0 ^ zmm1)) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: combine_vec_sdiv_by_pow2b_v32i16: diff --git a/llvm/test/CodeGen/X86/masked_store_trunc.ll b/llvm/test/CodeGen/X86/masked_store_trunc.ll index f4a0207dafde..1e56f346030c 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc.ll @@ -6403,7 +6403,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpshufb {{.*#+}} xmm0 = xmm0[0,2,4,6,8,10,12,14,u,u,u,u,u,u,u,u] diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll index 487f7298f442..da4432bd88e7 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_ssat.ll @@ -7298,7 +7298,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: 
vpacksswb %xmm0, %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll index 498f250f11c6..1597e13f0271 100644 --- a/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll +++ b/llvm/test/CodeGen/X86/masked_store_trunc_usat.ll @@ -7148,7 +7148,7 @@ define void @truncstore_v8i16_v8i8(<8 x i16> %x, ptr %p, <8 x i16> %mask) { ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm1, %xmm1 -; AVX512F-NEXT: vpternlogq $15, %zmm1, %zmm1, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = ~zmm1 ; AVX512F-NEXT: vpmovsxwq %xmm1, %zmm1 ; AVX512F-NEXT: vptestmq %zmm1, %zmm1, %k0 ; AVX512F-NEXT: vpminuw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm0, %xmm0 diff --git a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll index 220c2e5012ea..a2bcadd104a7 100644 --- a/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll +++ b/llvm/test/CodeGen/X86/srem-seteq-vec-nonsplat.ll @@ -2465,7 +2465,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm2 # [197,0,27,0,1,0,1,0,223,0,205,0,161,0,171,0,171,0,183,0,61,0,127,0,9,0,41,0,1,0,161,0] ; CHECK-AVX512VL-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm0, %ymm3 # [0,205,0,241,0,1,0,163,0,223,0,183,0,1,0,239,0,103,0,171,0,1,0,183,0,0,0,183,0,1,0,221] ; CHECK-AVX512VL-NEXT: vpsllw $8, %ymm3, %ymm3 -; CHECK-AVX512VL-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm3 +; CHECK-AVX512VL-NEXT: vpternlogd {{.*#+}} ymm3 = ymm3 | (ymm2 & mem) ; CHECK-AVX512VL-NEXT: vpaddb {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm2 ; CHECK-AVX512VL-NEXT: vpunpckhbw {{.*#+}} ymm3 = ymm2[8,8,9,9,10,10,11,11,12,12,13,13,14,14,15,15,24,24,25,25,26,26,27,27,28,28,29,29,30,30,31,31] ; CHECK-AVX512VL-NEXT: vpmullw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 # [128,128,1,1,1,128,1,64,128,1,128,1,128,32,1,1] @@ -2483,7 +2483,7 @@ define <32 x i1> @pr51133(<32 x i8> %x, <32 x i8> %y) { ; CHECK-AVX512VL-NEXT: vpcmpgtb %ymm4, %ymm0, %ymm0 ; CHECK-AVX512VL-NEXT: vpandn %ymm0, %ymm3, %ymm3 ; CHECK-AVX512VL-NEXT: vpcmpeqb %ymm4, %ymm1, %ymm0 -; CHECK-AVX512VL-NEXT: vpternlogq $14, %ymm3, %ymm2, %ymm0 +; CHECK-AVX512VL-NEXT: vpternlogq {{.*#+}} ymm0 = ~ymm0 & (ymm2 | ymm3) ; CHECK-AVX512VL-NEXT: retq %rem = srem <32 x i8> %x, %cmp = icmp ne <32 x i8> %rem, zeroinitializer -- GitLab From 569ad7cf346dd56ea95bfd98767f2f8deb73be4f Mon Sep 17 00:00:00 2001 From: David Green Date: Wed, 16 Oct 2024 19:26:57 +0100 Subject: [PATCH 162/329] [AArch64][GlobalISel] Move UseOutlineAtomics to a bool check. NFC Similar to #111287, this moves the UseOutlineAtomics legalization rules to a boolean predicate as opposed to needing them to be nested functions. There appeared to be a pair of redundant customIfs for s128 sizes (assuming only scalars are supported).
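For context, a minimal sketch of the before/after builder pattern (condensed from the AArch64LegalizerInfo.cpp hunk below; surrounding rules are elided, so this is illustrative rather than the complete rule set):

    // Before: the subtarget query was wrapped in a LegalityPredicate lambda
    // that was re-evaluated on every legality query.
    LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) {
      return ST.outlineAtomics() && !ST.hasLSE();
    };
    getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
        .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0),
                     predNot(UseOutlineAtomics)));

    // After: the subtarget property is computed once, and the bool-taking
    // overloads (legalFor/customFor/libcallFor) either install the rule or
    // leave the rule set unchanged.
    bool UseOutlineAtomics = ST.outlineAtomics() && !ST.hasLSE();
    getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG)
        .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}})
        .libcallFor(UseOutlineAtomics,
                    {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}, {s128, p0}});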
--- .../llvm/CodeGen/GlobalISel/LegalizerInfo.h | 6 ++++ .../AArch64/GISel/AArch64LegalizerInfo.cpp | 24 ++++++---------- .../GlobalISel/legalizer-info-validation.mir | 28 +++++++++---------- 3 files changed, 28 insertions(+), 30 deletions(-) diff --git a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h index 4e5a6cf92b76..bcd44abb2088 100644 --- a/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h +++ b/llvm/include/llvm/CodeGen/GlobalISel/LegalizerInfo.h @@ -760,6 +760,12 @@ public: return actionFor(LegalizeAction::Libcall, Types); } LegalizeRuleSet & + libcallFor(bool Pred, std::initializer_list<std::pair<LLT, LLT>> Types) { + if (!Pred) + return *this; + return actionFor(LegalizeAction::Libcall, Types); + } + LegalizeRuleSet & libcallForCartesianProduct(std::initializer_list<LLT> Types) { return actionForCartesianProduct(LegalizeAction::Libcall, Types); } diff --git a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp index 773f5c0923e9..e9d01602c298 100644 --- a/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp +++ b/llvm/lib/Target/AArch64/GISel/AArch64LegalizerInfo.cpp @@ -848,29 +848,21 @@ AArch64LegalizerInfo::AArch64LegalizerInfo(const AArch64Subtarget &ST) .lowerIf( all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(2, p0))); - LegalityPredicate UseOutlineAtomics = [&ST](const LegalityQuery &Query) { - return ST.outlineAtomics() && !ST.hasLSE(); - }; + bool UseOutlineAtomics = ST.outlineAtomics() && !ST.hasLSE(); getActionDefinitionsBuilder(G_ATOMIC_CMPXCHG) - .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0), - predNot(UseOutlineAtomics))) - .customIf(all(typeIs(0, s128), predNot(UseOutlineAtomics))) - .customIf([UseOutlineAtomics](const LegalityQuery &Query) { - return Query.Types[0].getSizeInBits() == 128 && - !UseOutlineAtomics(Query); - }) - .libcallIf(all(typeInSet(0, {s8, s16, s32, s64, s128}), typeIs(1, p0), - UseOutlineAtomics)) + .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}}) + .customFor(!UseOutlineAtomics, {{s128, p0}}) + .libcallFor(UseOutlineAtomics, + {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}, {s128, p0}}) .clampScalar(0, s32, s64); getActionDefinitionsBuilder({G_ATOMICRMW_XCHG, G_ATOMICRMW_ADD, G_ATOMICRMW_SUB, G_ATOMICRMW_AND, G_ATOMICRMW_OR, G_ATOMICRMW_XOR}) - .legalIf(all(typeInSet(0, {s32, s64}), typeIs(1, p0), - predNot(UseOutlineAtomics))) - .libcallIf(all(typeInSet(0, {s8, s16, s32, s64}), typeIs(1, p0), - UseOutlineAtomics)) + .legalFor(!UseOutlineAtomics, {{s32, p0}, {s64, p0}}) + .libcallFor(UseOutlineAtomics, + {{s8, p0}, {s16, p0}, {s32, p0}, {s64, p0}}) + .clampScalar(0, s32, s64); // Do not outline these atomics operations, as per comment in diff --git a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir index 073c3cafa062..146d1177f469 100644 --- a/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir +++ b/llvm/test/CodeGen/AArch64/GlobalISel/legalizer-info-validation.mir @@ -205,34 +205,34 @@ # DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected # DEBUG-NEXT: G_ATOMIC_CMPXCHG (opcode {{[0-9]+}}): 2 type indices, 0 imm indices -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: ..
the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_ATOMICRMW_XCHG (opcode {{[0-9]+}}): 2 type indices, 0 imm indices -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_ATOMICRMW_ADD (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_ATOMICRMW_SUB (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_ATOMICRMW_AND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_ATOMICRMW_NAND (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. type index coverage check SKIPPED: no rules defined # DEBUG-NEXT: .. imm index coverage check SKIPPED: no rules defined # DEBUG-NEXT: G_ATOMICRMW_OR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_ATOMICRMW_XOR (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} -# DEBUG-NEXT: .. type index coverage check SKIPPED: user-defined predicate detected -# DEBUG-NEXT: .. imm index coverage check SKIPPED: user-defined predicate detected +# DEBUG-NEXT: .. the first uncovered type index: 2, OK +# DEBUG-NEXT: .. the first uncovered imm index: 0, OK # DEBUG-NEXT: G_ATOMICRMW_MAX (opcode {{[0-9]+}}): 2 type indices, 0 imm indices # DEBUG-NEXT: .. opcode {{[0-9]+}} is aliased to {{[0-9]+}} # DEBUG-NEXT: .. 
type index coverage check SKIPPED: user-defined predicate detected -- GitLab From 835feaaf35306d1c18c01f2f2792ce01357c7a09 Mon Sep 17 00:00:00 2001 From: Finn Plummer <50529406+inbelic@users.noreply.github.com> Date: Wed, 16 Oct 2024 11:31:28 -0700 Subject: [PATCH 163/329] [DXIL] Add scalarization support for WaveReadLaneAt (#112570) - Implement trivial scalarization for the `WaveReadLaneAt` DXIL intrinsic - Add test case to demonstrate the lowering path Resolves #70104 --- .../DirectX/DirectXTargetTransformInfo.cpp | 3 ++ .../CodeGen/DirectX/WaveReadLaneAt-vec.ll | 35 +++++++++++++++++++ 2 files changed, 38 insertions(+) create mode 100644 llvm/test/CodeGen/DirectX/WaveReadLaneAt-vec.ll diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp index be714b5c8789..8ea31401121b 100644 --- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp +++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp @@ -18,6 +18,8 @@ using namespace llvm; bool DirectXTTIImpl::isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, unsigned ScalarOpdIdx) { switch (ID) { + case Intrinsic::dx_wave_readlane: + return ScalarOpdIdx == 1; default: return false; } @@ -28,6 +30,7 @@ bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( switch (ID) { case Intrinsic::dx_frac: case Intrinsic::dx_rsqrt: + case Intrinsic::dx_wave_readlane: return true; default: return false; diff --git a/llvm/test/CodeGen/DirectX/WaveReadLaneAt-vec.ll b/llvm/test/CodeGen/DirectX/WaveReadLaneAt-vec.ll new file mode 100644 index 000000000000..8c2a11a3557a --- /dev/null +++ b/llvm/test/CodeGen/DirectX/WaveReadLaneAt-vec.ll @@ -0,0 +1,35 @@ +; RUN: opt -S -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-compute %s | FileCheck %s + +; Test that for vector values, WaveReadLaneAt scalarizes and maps down to the +; DirectX op + +define noundef <2 x half> @wave_read_lane_v2half(<2 x half> noundef %expr, i32 %idx) { +entry: +; CHECK: call half @dx.op.waveReadLaneAt.f16(i32 117, half %expr.i0, i32 %idx) +; CHECK: call half @dx.op.waveReadLaneAt.f16(i32 117, half %expr.i1, i32 %idx) + %ret = call <2 x half> @llvm.dx.wave.readlane.f16(<2 x half> %expr, i32 %idx) + ret <2 x half> %ret +} + +define noundef <3 x i32> @wave_read_lane_v3i32(<3 x i32> noundef %expr, i32 %idx) { +entry: +; CHECK: call i32 @dx.op.waveReadLaneAt.i32(i32 117, i32 %expr.i0, i32 %idx) +; CHECK: call i32 @dx.op.waveReadLaneAt.i32(i32 117, i32 %expr.i1, i32 %idx) +; CHECK: call i32 @dx.op.waveReadLaneAt.i32(i32 117, i32 %expr.i2, i32 %idx) + %ret = call <3 x i32> @llvm.dx.wave.readlane(<3 x i32> %expr, i32 %idx) + ret <3 x i32> %ret +} + +define noundef <4 x double> @wave_read_lane_v4f64(<4 x double> noundef %expr, i32 %idx) { +entry: +; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr.i0, i32 %idx) +; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr.i1, i32 %idx) +; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr.i2, i32 %idx) +; CHECK: call double @dx.op.waveReadLaneAt.f64(i32 117, double %expr.i3, i32 %idx) + %ret = call <4 x double> @llvm.dx.wave.readlane(<4 x double> %expr, i32 %idx) + ret <4 x double> %ret +} + +declare <2 x half> @llvm.dx.wave.readlane.v2f16(<2 x half>, i32) +declare <3 x i32> @llvm.dx.wave.readlane.v3i32(<3 x i32>, i32) +declare <4 x double> @llvm.dx.wave.readlane.v4f64(<4 x double>, i32) -- GitLab From 875afa939df0bd3ede101447618e6d3bfc4692b3 Mon Sep 17 00:00:00 2001 From: Tex Riddell Date: Wed, 16 Oct 2024 11:43:17 
-0700 Subject: [PATCH 164/329] [X86][CodeGen] Add base atan2 intrinsic lowering (p4) (#110760) This change is part of this proposal: https://discourse.llvm.org/t/rfc-all-the-math-intrinsics/78294 Based on example PR #96222 and fix PR #101268, with some differences due to the 2-arg intrinsic and an intermediate refactor (RuntimeLibCalls.cpp). - Add llvm.experimental.constrained.atan2 - Intrinsics.td, ConstrainedOps.def, LangRef.rst - Add to ISDOpcodes.h and TargetSelectionDAG.td, connect to the intrinsic in BasicTTIImpl.h, and to LibFunc_ in SelectionDAGBuilder.cpp - Update LegalizeDAG.cpp, LegalizeFloatTypes.cpp, LegalizeVectorOps.cpp, and LegalizeVectorTypes.cpp - Update isKnownNeverNaN in SelectionDAG.cpp - Update SelectionDAGDumper.cpp - Update libcalls - RuntimeLibcalls.def, RuntimeLibcalls.cpp - TargetLoweringBase.cpp - Expand for vectors, promote f16 - X86ISelLowering.cpp - Expand f80, promote f32 to f64 for MSVC Part 4 of "Implement the atan2 HLSL Function" (#70096). --- llvm/include/llvm/CodeGen/BasicTTIImpl.h | 3 + llvm/include/llvm/CodeGen/ISDOpcodes.h | 3 + llvm/include/llvm/IR/ConstrainedOps.def | 1 + llvm/include/llvm/IR/Intrinsics.td | 5 + llvm/include/llvm/IR/RuntimeLibcalls.def | 5 + .../include/llvm/Target/TargetSelectionDAG.td | 6 + llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp | 7 + .../SelectionDAG/LegalizeFloatTypes.cpp | 22 ++ llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h | 2 + .../SelectionDAG/LegalizeVectorOps.cpp | 1 + .../SelectionDAG/LegalizeVectorTypes.cpp | 3 + .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 1 + .../SelectionDAG/SelectionDAGBuilder.cpp | 12 + .../SelectionDAG/SelectionDAGDumper.cpp | 2 + llvm/lib/CodeGen/TargetLoweringBase.cpp | 7 +- llvm/lib/IR/RuntimeLibcalls.cpp | 1 + llvm/lib/Target/X86/X86ISelLowering.cpp | 2 + llvm/test/Assembler/fp-intrinsics-attr.ll | 8 + llvm/test/CodeGen/X86/fp-intrinsics.ll | 59 ++++ .../test/CodeGen/X86/fp128-libcalls-strict.ll | 45 +++ llvm/test/CodeGen/X86/fp80-strict-libcalls.ll | 30 ++ llvm/test/CodeGen/X86/llvm.atan2.ll | 80 ++++++ .../X86/vector-constrained-fp-intrinsics.ll | 258 ++++++++++++++++++ llvm/test/Feature/fp-intrinsics.ll | 15 +- 24 files changed, 574 insertions(+), 4 deletions(-) create mode 100644 llvm/test/CodeGen/X86/llvm.atan2.ll diff --git a/llvm/include/llvm/CodeGen/BasicTTIImpl.h b/llvm/include/llvm/CodeGen/BasicTTIImpl.h index 57d1fa33c848..db3b5cddd7c1 100644 --- a/llvm/include/llvm/CodeGen/BasicTTIImpl.h +++ b/llvm/include/llvm/CodeGen/BasicTTIImpl.h @@ -1998,6 +1998,9 @@ public: case Intrinsic::atan: ISD = ISD::FATAN; break; + case Intrinsic::atan2: + ISD = ISD::FATAN2; + break; case Intrinsic::sinh: ISD = ISD::FSINH; break; diff --git a/llvm/include/llvm/CodeGen/ISDOpcodes.h b/llvm/include/llvm/CodeGen/ISDOpcodes.h index da43f5be10ff..0b6d155b6d16 100644 --- a/llvm/include/llvm/CodeGen/ISDOpcodes.h +++ b/llvm/include/llvm/CodeGen/ISDOpcodes.h @@ -425,6 +425,7 @@ enum NodeType { STRICT_FASIN, STRICT_FACOS, STRICT_FATAN, + STRICT_FATAN2, STRICT_FSINH, STRICT_FCOSH, STRICT_FTANH, @@ -994,6 +995,8 @@ enum NodeType { FPOWI, /// FLDEXP - ldexp, inspired by libm (op0 * 2**op1). FLDEXP, + /// FATAN2 - atan2, inspired by libm. + FATAN2, /// FFREXP - frexp, extract fractional and exponent component of a /// floating-point value.
Returns the two components as separate return diff --git a/llvm/include/llvm/IR/ConstrainedOps.def b/llvm/include/llvm/IR/ConstrainedOps.def index 56304c377b83..30a82bf633d5 100644 --- a/llvm/include/llvm/IR/ConstrainedOps.def +++ b/llvm/include/llvm/IR/ConstrainedOps.def @@ -72,6 +72,7 @@ CMP_INSTRUCTION(FCmp, 2, 0, experimental_constrained_fcmps, FSETCCS DAG_FUNCTION(acos, 1, 1, experimental_constrained_acos, FACOS) DAG_FUNCTION(asin, 1, 1, experimental_constrained_asin, FASIN) DAG_FUNCTION(atan, 1, 1, experimental_constrained_atan, FATAN) +DAG_FUNCTION(atan2, 2, 1, experimental_constrained_atan2, FATAN2) DAG_FUNCTION(ceil, 1, 0, experimental_constrained_ceil, FCEIL) DAG_FUNCTION(cos, 1, 1, experimental_constrained_cos, FCOS) DAG_FUNCTION(cosh, 1, 1, experimental_constrained_cosh, FCOSH) diff --git a/llvm/include/llvm/IR/Intrinsics.td b/llvm/include/llvm/IR/Intrinsics.td index 8a0721cf23f5..94e53f372127 100644 --- a/llvm/include/llvm/IR/Intrinsics.td +++ b/llvm/include/llvm/IR/Intrinsics.td @@ -1235,6 +1235,11 @@ let IntrProperties = [IntrInaccessibleMemOnly, IntrWillReturn, IntrStrictFP] in [ LLVMMatchType<0>, llvm_metadata_ty, llvm_metadata_ty ]>; + def int_experimental_constrained_atan2 : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], + [ LLVMMatchType<0>, + LLVMMatchType<0>, + llvm_metadata_ty, + llvm_metadata_ty ]>; def int_experimental_constrained_sin : DefaultAttrsIntrinsic<[ llvm_anyfloat_ty ], [ LLVMMatchType<0>, llvm_metadata_ty, diff --git a/llvm/include/llvm/IR/RuntimeLibcalls.def b/llvm/include/llvm/IR/RuntimeLibcalls.def index 69cf43140ad4..4aab658a8669 100644 --- a/llvm/include/llvm/IR/RuntimeLibcalls.def +++ b/llvm/include/llvm/IR/RuntimeLibcalls.def @@ -232,6 +232,11 @@ HANDLE_LIBCALL(ATAN_F64, "atan") HANDLE_LIBCALL(ATAN_F80, "atanl") HANDLE_LIBCALL(ATAN_F128,"atanl") HANDLE_LIBCALL(ATAN_PPCF128, "atanl") +HANDLE_LIBCALL(ATAN2_F32, "atan2f") +HANDLE_LIBCALL(ATAN2_F64, "atan2") +HANDLE_LIBCALL(ATAN2_F80, "atan2l") +HANDLE_LIBCALL(ATAN2_F128,"atan2l") +HANDLE_LIBCALL(ATAN2_PPCF128, "atan2l") HANDLE_LIBCALL(SINCOS_F32, nullptr) HANDLE_LIBCALL(SINCOS_F64, nullptr) HANDLE_LIBCALL(SINCOS_F80, nullptr) diff --git a/llvm/include/llvm/Target/TargetSelectionDAG.td b/llvm/include/llvm/Target/TargetSelectionDAG.td index adf8a75f6202..fa516fc9b101 100644 --- a/llvm/include/llvm/Target/TargetSelectionDAG.td +++ b/llvm/include/llvm/Target/TargetSelectionDAG.td @@ -534,6 +534,7 @@ def ftan : SDNode<"ISD::FTAN" , SDTFPUnaryOp>; def fasin : SDNode<"ISD::FASIN" , SDTFPUnaryOp>; def facos : SDNode<"ISD::FACOS" , SDTFPUnaryOp>; def fatan : SDNode<"ISD::FATAN" , SDTFPUnaryOp>; +def fatan2 : SDNode<"ISD::FATAN2" , SDTFPBinOp>; def fsinh : SDNode<"ISD::FSINH" , SDTFPUnaryOp>; def fcosh : SDNode<"ISD::FCOSH" , SDTFPUnaryOp>; def ftanh : SDNode<"ISD::FTANH" , SDTFPUnaryOp>; @@ -602,6 +603,8 @@ def strict_facos : SDNode<"ISD::STRICT_FACOS", SDTFPUnaryOp, [SDNPHasChain]>; def strict_fatan : SDNode<"ISD::STRICT_FATAN", SDTFPUnaryOp, [SDNPHasChain]>; +def strict_fatan2 : SDNode<"ISD::STRICT_FATAN2", + SDTFPBinOp, [SDNPHasChain]>; def strict_fsinh : SDNode<"ISD::STRICT_FSINH", SDTFPUnaryOp, [SDNPHasChain]>; def strict_fcosh : SDNode<"ISD::STRICT_FCOSH", @@ -1588,6 +1591,9 @@ def any_facos : PatFrags<(ops node:$src), def any_fatan : PatFrags<(ops node:$src), [(strict_fatan node:$src), (fatan node:$src)]>; +def any_fatan2 : PatFrags<(ops node:$src1, node:$src2), + [(strict_fatan2 node:$src1, node:$src2), + (fatan2 node:$src1, node:$src2)]>; def any_fsinh : PatFrags<(ops node:$src), [(strict_fsinh 
node:$src), (fsinh node:$src)]>; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp index ea22b4670d6f..e0a03383358b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeDAG.cpp @@ -4600,6 +4600,11 @@ void SelectionDAGLegalize::ConvertNodeToLibcall(SDNode *Node) { ExpandFPLibCall(Node, RTLIB::ATAN_F32, RTLIB::ATAN_F64, RTLIB::ATAN_F80, RTLIB::ATAN_F128, RTLIB::ATAN_PPCF128, Results); break; + case ISD::FATAN2: + case ISD::STRICT_FATAN2: + ExpandFPLibCall(Node, RTLIB::ATAN2_F32, RTLIB::ATAN2_F64, RTLIB::ATAN2_F80, + RTLIB::ATAN2_F128, RTLIB::ATAN2_PPCF128, Results); + break; case ISD::FSINH: case ISD::STRICT_FSINH: ExpandFPLibCall(Node, RTLIB::SINH_F32, RTLIB::SINH_F64, RTLIB::SINH_F80, @@ -5486,6 +5491,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::FMINIMUMNUM: case ISD::FMAXIMUMNUM: case ISD::FPOW: + case ISD::FATAN2: Tmp1 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(0)); Tmp2 = DAG.getNode(ISD::FP_EXTEND, dl, NVT, Node->getOperand(1)); Tmp3 = DAG.getNode(Node->getOpcode(), dl, NVT, Tmp1, Tmp2, @@ -5502,6 +5508,7 @@ void SelectionDAGLegalize::PromoteNode(SDNode *Node) { case ISD::STRICT_FMAXNUM: case ISD::STRICT_FREM: case ISD::STRICT_FPOW: + case ISD::STRICT_FATAN2: Tmp1 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other}, {Node->getOperand(0), Node->getOperand(1)}); Tmp2 = DAG.getNode(ISD::STRICT_FP_EXTEND, dl, {NVT, MVT::Other}, diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp index 2c81c829e75c..73c258f0f6f1 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeFloatTypes.cpp @@ -84,6 +84,8 @@ void DAGTypeLegalizer::SoftenFloatResult(SDNode *N, unsigned ResNo) { case ISD::FASIN: R = SoftenFloatRes_FASIN(N); break; case ISD::STRICT_FATAN: case ISD::FATAN: R = SoftenFloatRes_FATAN(N); break; + case ISD::STRICT_FATAN2: + case ISD::FATAN2: R = SoftenFloatRes_FATAN2(N); break; case ISD::FCBRT: R = SoftenFloatRes_FCBRT(N); break; case ISD::STRICT_FCEIL: case ISD::FCEIL: R = SoftenFloatRes_FCEIL(N); break; @@ -366,6 +368,13 @@ SDValue DAGTypeLegalizer::SoftenFloatRes_FATAN(SDNode *N) { RTLIB::ATAN_F80, RTLIB::ATAN_F128, RTLIB::ATAN_PPCF128)); } +SDValue DAGTypeLegalizer::SoftenFloatRes_FATAN2(SDNode *N) { + return SoftenFloatRes_Binary( + N, + GetFPLibCall(N->getValueType(0), RTLIB::ATAN2_F32, RTLIB::ATAN2_F64, + RTLIB::ATAN2_F80, RTLIB::ATAN2_F128, RTLIB::ATAN2_PPCF128)); +} + SDValue DAGTypeLegalizer::SoftenFloatRes_FCBRT(SDNode *N) { return SoftenFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), RTLIB::CBRT_F32, @@ -1430,6 +1439,8 @@ void DAGTypeLegalizer::ExpandFloatResult(SDNode *N, unsigned ResNo) { case ISD::FASIN: ExpandFloatRes_FASIN(N, Lo, Hi); break; case ISD::STRICT_FATAN: case ISD::FATAN: ExpandFloatRes_FATAN(N, Lo, Hi); break; + case ISD::STRICT_FATAN2: + case ISD::FATAN2: ExpandFloatRes_FATAN2(N, Lo, Hi); break; case ISD::FCBRT: ExpandFloatRes_FCBRT(N, Lo, Hi); break; case ISD::STRICT_FCEIL: case ISD::FCEIL: ExpandFloatRes_FCEIL(N, Lo, Hi); break; @@ -1631,6 +1642,15 @@ void DAGTypeLegalizer::ExpandFloatRes_FATAN(SDNode *N, SDValue &Lo, Lo, Hi); } +void DAGTypeLegalizer::ExpandFloatRes_FATAN2(SDNode *N, SDValue &Lo, + SDValue &Hi) { + ExpandFloatRes_Binary(N, + GetFPLibCall(N->getValueType(0), RTLIB::ATAN2_F32, + RTLIB::ATAN2_F64, RTLIB::ATAN2_F80, + RTLIB::ATAN2_F128, RTLIB::ATAN2_PPCF128), + Lo, Hi); +} 
+ void DAGTypeLegalizer::ExpandFloatRes_FCBRT(SDNode *N, SDValue &Lo, SDValue &Hi) { ExpandFloatRes_Unary(N, GetFPLibCall(N->getValueType(0), RTLIB::CBRT_F32, @@ -2673,6 +2693,7 @@ void DAGTypeLegalizer::PromoteFloatResult(SDNode *N, unsigned ResNo) { case ISD::FMINNUM_IEEE: case ISD::FMUL: case ISD::FPOW: + case ISD::FATAN2: case ISD::FREM: case ISD::FSUB: R = PromoteFloatRes_BinOp(N); break; @@ -3115,6 +3136,7 @@ void DAGTypeLegalizer::SoftPromoteHalfResult(SDNode *N, unsigned ResNo) { case ISD::FMINNUM: case ISD::FMUL: case ISD::FPOW: + case ISD::FATAN2: case ISD::FREM: case ISD::FSUB: R = SoftPromoteHalfRes_BinOp(N); break; diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h index d14516ef3e2f..868da25ca8cb 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeTypes.h @@ -567,6 +567,7 @@ private: SDValue SoftenFloatRes_FACOS(SDNode *N); SDValue SoftenFloatRes_FASIN(SDNode *N); SDValue SoftenFloatRes_FATAN(SDNode *N); + SDValue SoftenFloatRes_FATAN2(SDNode *N); SDValue SoftenFloatRes_FMINNUM(SDNode *N); SDValue SoftenFloatRes_FMAXNUM(SDNode *N); SDValue SoftenFloatRes_FMINIMUMNUM(SDNode *N); @@ -661,6 +662,7 @@ private: void ExpandFloatRes_FACOS (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FASIN (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FATAN (SDNode *N, SDValue &Lo, SDValue &Hi); + void ExpandFloatRes_FATAN2 (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMINNUM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMAXNUM (SDNode *N, SDValue &Lo, SDValue &Hi); void ExpandFloatRes_FMINIMUMNUM(SDNode *N, SDValue &Lo, SDValue &Hi); diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp index ffecca78a225..a8042fc3e7a6 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorOps.cpp @@ -410,6 +410,7 @@ SDValue VectorLegalizer::LegalizeOp(SDValue Op) { case ISD::FASIN: case ISD::FACOS: case ISD::FATAN: + case ISD::FATAN2: case ISD::FSINH: case ISD::FCOSH: case ISD::FTANH: diff --git a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp index e0b47e1045b9..50e2a923699c 100644 --- a/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/LegalizeVectorTypes.cpp @@ -164,6 +164,7 @@ void DAGTypeLegalizer::ScalarizeVectorResult(SDNode *N, unsigned ResNo) { case ISD::USHLSAT: case ISD::FPOW: + case ISD::FATAN2: case ISD::FREM: case ISD::FSUB: case ISD::MUL: @@ -1293,6 +1294,7 @@ void DAGTypeLegalizer::SplitVectorResult(SDNode *N, unsigned ResNo) { case ISD::UDIV: case ISD::VP_UDIV: case ISD::FDIV: case ISD::VP_FDIV: case ISD::FPOW: + case ISD::FATAN2: case ISD::AND: case ISD::VP_AND: case ISD::OR: case ISD::VP_OR: case ISD::XOR: case ISD::VP_XOR: @@ -4581,6 +4583,7 @@ void DAGTypeLegalizer::WidenVectorResult(SDNode *N, unsigned ResNo) { break; case ISD::FPOW: + case ISD::FATAN2: case ISD::FREM: if (unrollExpandedOp()) break; diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index ff4b2f409d7c..d63ed7ecf023 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -5471,6 +5471,7 @@ bool SelectionDAG::isKnownNeverNaN(SDValue Op, bool SNaN, unsigned Depth) const case ISD::FASIN: case ISD::FACOS: case ISD::FATAN: + case 
ISD::FATAN2: case ISD::FSINH: case ISD::FCOSH: case ISD::FTANH: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 805b8ecf0095..9d8224749967 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -6861,6 +6861,12 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, getValue(I.getArgOperand(0)), Flags)); return; } + case Intrinsic::atan2: + setValue(&I, DAG.getNode(ISD::FATAN2, sdl, + getValue(I.getArgOperand(0)).getValueType(), + getValue(I.getArgOperand(0)), + getValue(I.getArgOperand(1)), Flags)); + return; case Intrinsic::lround: case Intrinsic::llround: case Intrinsic::lrint: @@ -9353,6 +9359,12 @@ void SelectionDAGBuilder::visitCall(const CallInst &I) { if (visitUnaryFloatCall(I, ISD::FATAN)) return; break; + case LibFunc_atan2: + case LibFunc_atan2f: + case LibFunc_atan2l: + if (visitBinaryFloatCall(I, ISD::FATAN2)) + return; + break; case LibFunc_sinh: case LibFunc_sinhf: case LibFunc_sinhl: diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp index 56fc538172f9..703efb700897 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGDumper.cpp @@ -227,6 +227,8 @@ std::string SDNode::getOperationName(const SelectionDAG *G) const { case ISD::STRICT_FACOS: return "strict_facos"; case ISD::FATAN: return "fatan"; case ISD::STRICT_FATAN: return "strict_fatan"; + case ISD::FATAN2: return "fatan2"; + case ISD::STRICT_FATAN2: return "strict_fatan2"; case ISD::FSINH: return "fsinh"; case ISD::STRICT_FSINH: return "strict_fsinh"; case ISD::FCOSH: return "fcosh"; diff --git a/llvm/lib/CodeGen/TargetLoweringBase.cpp b/llvm/lib/CodeGen/TargetLoweringBase.cpp index 1f49d60c9705..7a28f7892cbf 100644 --- a/llvm/lib/CodeGen/TargetLoweringBase.cpp +++ b/llvm/lib/CodeGen/TargetLoweringBase.cpp @@ -783,7 +783,7 @@ void TargetLoweringBase::initActions() { ISD::SIGN_EXTEND_VECTOR_INREG, ISD::ZERO_EXTEND_VECTOR_INREG, ISD::SPLAT_VECTOR, ISD::LRINT, ISD::LLRINT, ISD::LROUND, ISD::LLROUND, ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN, - ISD::FCOSH, ISD::FSINH, ISD::FTANH}, + ISD::FCOSH, ISD::FSINH, ISD::FTANH, ISD::FATAN2}, VT, Expand); // Constrained floating-point operations default to expand. @@ -842,7 +842,8 @@ void TargetLoweringBase::initActions() { ISD::FEXP, ISD::FEXP2, ISD::FEXP10, ISD::FFLOOR, ISD::FNEARBYINT, ISD::FCEIL, ISD::FRINT, ISD::FTRUNC, ISD::FROUNDEVEN, ISD::FTAN, ISD::FACOS, ISD::FASIN, - ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH}, + ISD::FATAN, ISD::FCOSH, ISD::FSINH, ISD::FTANH, + ISD::FATAN2}, {MVT::f32, MVT::f64, MVT::f128}, Expand); // FIXME: Query RuntimeLibCalls to make the decision. @@ -850,7 +851,7 @@ void TargetLoweringBase::initActions() { {MVT::f32, MVT::f64, MVT::f128}, LibCall); setOperationAction({ISD::FTAN, ISD::FACOS, ISD::FASIN, ISD::FATAN, ISD::FCOSH, - ISD::FSINH, ISD::FTANH}, + ISD::FSINH, ISD::FTANH, ISD::FATAN2}, MVT::f16, Promote); // Default ISD::TRAP to expand (which turns it into abort). 
setOperationAction(ISD::TRAP, MVT::Other, Expand); diff --git a/llvm/lib/IR/RuntimeLibcalls.cpp b/llvm/lib/IR/RuntimeLibcalls.cpp index d806f8093459..06167559a776 100644 --- a/llvm/lib/IR/RuntimeLibcalls.cpp +++ b/llvm/lib/IR/RuntimeLibcalls.cpp @@ -49,6 +49,7 @@ void RuntimeLibcallsInfo::initLibcalls(const Triple &TT) { setLibcallName(RTLIB::ASIN_F128, "asinf128"); setLibcallName(RTLIB::ACOS_F128, "acosf128"); setLibcallName(RTLIB::ATAN_F128, "atanf128"); + setLibcallName(RTLIB::ATAN2_F128, "atan2f128"); setLibcallName(RTLIB::SINH_F128, "sinhf128"); setLibcallName(RTLIB::COSH_F128, "coshf128"); setLibcallName(RTLIB::TANH_F128, "tanhf128"); diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 5b4b27c88895..0155409dfda0 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -858,6 +858,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, setOperationAction(ISD::FASIN , MVT::f80, Expand); setOperationAction(ISD::FACOS , MVT::f80, Expand); setOperationAction(ISD::FATAN , MVT::f80, Expand); + setOperationAction(ISD::FATAN2 , MVT::f80, Expand); setOperationAction(ISD::FSINH , MVT::f80, Expand); setOperationAction(ISD::FCOSH , MVT::f80, Expand); setOperationAction(ISD::FTANH , MVT::f80, Expand); @@ -2562,6 +2563,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, {ISD::FACOS, ISD::STRICT_FACOS, ISD::FASIN, ISD::STRICT_FASIN, ISD::FATAN, ISD::STRICT_FATAN, + ISD::FATAN2, ISD::STRICT_FATAN2, ISD::FCEIL, ISD::STRICT_FCEIL, ISD::FCOS, ISD::STRICT_FCOS, ISD::FCOSH, ISD::STRICT_FCOSH, diff --git a/llvm/test/Assembler/fp-intrinsics-attr.ll b/llvm/test/Assembler/fp-intrinsics-attr.ll index da6507f05176..5b9a44710763 100644 --- a/llvm/test/Assembler/fp-intrinsics-attr.ll +++ b/llvm/test/Assembler/fp-intrinsics-attr.ll @@ -105,6 +105,11 @@ define void @func(double %a, double %b, double %c, i32 %i) strictfp { metadata !"round.dynamic", metadata !"fpexcept.strict") + %atan2 = call double @llvm.experimental.constrained.atan2.f64( + double %a, double %b, + metadata !"round.dynamic", + metadata !"fpexcept.strict") + %cosh = call double @llvm.experimental.constrained.cosh.f64( double %a, metadata !"round.dynamic", @@ -291,6 +296,9 @@ declare double @llvm.experimental.constrained.acos.f64(double, metadata, metadat declare double @llvm.experimental.constrained.atan.f64(double, metadata, metadata) ; CHECK: @llvm.experimental.constrained.atan.f64({{.*}}) #[[ATTR1]] +declare double @llvm.experimental.constrained.atan2.f64(double, double, metadata, metadata) +; CHECK: @llvm.experimental.constrained.atan2.f64({{.*}}) #[[ATTR1]] + declare double @llvm.experimental.constrained.sinh.f64(double, metadata, metadata) ; CHECK: @llvm.experimental.constrained.sinh.f64({{.*}}) #[[ATTR1]] diff --git a/llvm/test/CodeGen/X86/fp-intrinsics.ll b/llvm/test/CodeGen/X86/fp-intrinsics.ll index bb87252e0b9b..3577f252f50d 100644 --- a/llvm/test/CodeGen/X86/fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/fp-intrinsics.ll @@ -2962,6 +2962,64 @@ entry: ret double %result } +; Verify that atan2(42.1, 3.0) isn't simplified when the rounding mode is unknown. 
+define double @fatan2() #0 { +; X87-LABEL: fatan2: +; X87: # %bb.0: # %entry +; X87-NEXT: subl $28, %esp +; X87-NEXT: .cfi_def_cfa_offset 32 +; X87-NEXT: flds {{\.?LCPI[0-9]+_[0-9]+}} +; X87-NEXT: fstpl {{[0-9]+}}(%esp) +; X87-NEXT: fldl {{\.?LCPI[0-9]+_[0-9]+}} +; X87-NEXT: fstpl (%esp) +; X87-NEXT: wait +; X87-NEXT: calll atan2 +; X87-NEXT: addl $28, %esp +; X87-NEXT: .cfi_def_cfa_offset 4 +; X87-NEXT: retl +; +; X86-SSE-LABEL: fatan2: +; X86-SSE: # %bb.0: # %entry +; X86-SSE-NEXT: subl $28, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 32 +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [3.0E+0,0.0E+0] +; X86-SSE-NEXT: movsd %xmm0, {{[0-9]+}}(%esp) +; X86-SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; X86-SSE-NEXT: movsd %xmm0, (%esp) +; X86-SSE-NEXT: calll atan2 +; X86-SSE-NEXT: addl $28, %esp +; X86-SSE-NEXT: .cfi_def_cfa_offset 4 +; X86-SSE-NEXT: retl +; +; SSE-LABEL: fatan2: +; SSE: # %bb.0: # %entry +; SSE-NEXT: pushq %rax +; SSE-NEXT: .cfi_def_cfa_offset 16 +; SSE-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; SSE-NEXT: movsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] +; SSE-NEXT: callq atan2@PLT +; SSE-NEXT: popq %rax +; SSE-NEXT: .cfi_def_cfa_offset 8 +; SSE-NEXT: retq +; +; AVX-LABEL: fatan2: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [3.0E+0,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: popq %rax +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %result = call double @llvm.experimental.constrained.atan2.f64(double 42.1, + double 3.0, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret double %result +} + ; Verify that cosh(42.0) isn't simplified when the rounding mode is unknown. 
define double @fcosh() #0 { ; X87-LABEL: fcosh: @@ -3132,6 +3190,7 @@ declare double @llvm.experimental.constrained.tan.f64(double, metadata, metadata declare double @llvm.experimental.constrained.asin.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.acos.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.atan.f64(double, metadata, metadata) +declare double @llvm.experimental.constrained.atan2.f64(double, double, metadata, metadata) declare double @llvm.experimental.constrained.sinh.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.cosh.f64(double, metadata, metadata) declare double @llvm.experimental.constrained.tanh.f64(double, metadata, metadata) diff --git a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll index 84574e369184..ffaa9f6297ed 100644 --- a/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll +++ b/llvm/test/CodeGen/X86/fp128-libcalls-strict.ll @@ -1247,6 +1247,50 @@ entry: ret fp128 %atan } +define fp128 @atan2(fp128 %x, fp128 %y) nounwind strictfp { +; ANDROID-LABEL: atan2: +; ANDROID: # %bb.0: # %entry +; ANDROID-NEXT: pushq %rax +; ANDROID-NEXT: callq atan2l@PLT +; ANDROID-NEXT: popq %rax +; ANDROID-NEXT: retq +; +; GNU-LABEL: atan2: +; GNU: # %bb.0: # %entry +; GNU-NEXT: pushq %rax +; GNU-NEXT: callq atan2f128@PLT +; GNU-NEXT: popq %rax +; GNU-NEXT: retq +; +; X86-LABEL: atan2: +; X86: # %bb.0: # %entry +; X86-NEXT: pushl %esi +; X86-NEXT: subl $24, %esp +; X86-NEXT: movl {{[0-9]+}}(%esp), %esi +; X86-NEXT: subl $12, %esp +; X86-NEXT: leal {{[0-9]+}}(%esp), %eax +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl {{[0-9]+}}(%esp) +; X86-NEXT: pushl %eax +; X86-NEXT: calll atan2l +; X86-NEXT: addl $44, %esp +; X86-NEXT: movaps (%esp), %xmm0 +; X86-NEXT: movaps %xmm0, (%esi) +; X86-NEXT: movl %esi, %eax +; X86-NEXT: addl $24, %esp +; X86-NEXT: popl %esi +; X86-NEXT: retl $4 +entry: + %atan2 = call fp128 @llvm.experimental.constrained.atan2.f128(fp128 %x, fp128 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret fp128 %atan2 +} + define fp128 @tan(fp128 %x) nounwind strictfp { ; ANDROID-LABEL: tan: ; ANDROID: # %bb.0: # %entry @@ -1948,6 +1992,7 @@ declare fp128 @llvm.experimental.constrained.sin.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.sinh.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.sqrt.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.atan.f128(fp128, metadata, metadata) +declare fp128 @llvm.experimental.constrained.atan2.f128(fp128, fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.tan.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.tanh.f128(fp128, metadata, metadata) declare fp128 @llvm.experimental.constrained.trunc.f128(fp128, metadata) diff --git a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll index 293133b08e76..8bbc6247dbaf 100644 --- a/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll +++ b/llvm/test/CodeGen/X86/fp80-strict-libcalls.ll @@ -629,6 +629,35 @@ entry: ret x86_fp80 %atan } +define x86_fp80 @atan2(x86_fp80 %x, x86_fp80 %y) nounwind strictfp { +; X86-LABEL: atan2: +; X86: # %bb.0: # %entry +; X86-NEXT: subl 
$24, %esp +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fldt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt {{[0-9]+}}(%esp) +; X86-NEXT: fstpt (%esp) +; X86-NEXT: wait +; X86-NEXT: calll atan2l +; X86-NEXT: addl $24, %esp +; X86-NEXT: retl +; +; X64-LABEL: atan2: +; X64: # %bb.0: # %entry +; X64-NEXT: subq $40, %rsp +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fldt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt {{[0-9]+}}(%rsp) +; X64-NEXT: fstpt (%rsp) +; X64-NEXT: wait +; X64-NEXT: callq atan2l@PLT +; X64-NEXT: addq $40, %rsp +; X64-NEXT: retq +entry: + %atan2 = call x86_fp80 @llvm.experimental.constrained.atan2.f80(x86_fp80 %x, x86_fp80 %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret x86_fp80 %atan2 +} + define x86_fp80 @tan(x86_fp80 %x) nounwind strictfp { ; X86-LABEL: tan: ; X86: # %bb.0: # %entry @@ -830,6 +859,7 @@ declare x86_fp80 @llvm.experimental.constrained.asin.f80(x86_fp80, metadata, met declare x86_fp80 @llvm.experimental.constrained.sin.f80(x86_fp80, metadata, metadata) declare x86_fp80 @llvm.experimental.constrained.sinh.f80(x86_fp80, metadata, metadata) declare x86_fp80 @llvm.experimental.constrained.atan.f80(x86_fp80, metadata, metadata) +declare x86_fp80 @llvm.experimental.constrained.atan2.f80(x86_fp80, x86_fp80, metadata, metadata) declare x86_fp80 @llvm.experimental.constrained.tan.f80(x86_fp80, metadata, metadata) declare x86_fp80 @llvm.experimental.constrained.tanh.f80(x86_fp80, metadata, metadata) declare x86_fp80 @llvm.experimental.constrained.trunc.f80(x86_fp80, metadata) diff --git a/llvm/test/CodeGen/X86/llvm.atan2.ll b/llvm/test/CodeGen/X86/llvm.atan2.ll new file mode 100644 index 000000000000..ef2e4be36203 --- /dev/null +++ b/llvm/test/CodeGen/X86/llvm.atan2.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 2 +; RUN: llc < %s -mtriple=x86_64-unknown-linux-gnu | FileCheck %s + +define half @use_atan2f16(half %a, half %b) nounwind { +; CHECK-LABEL: use_atan2f16: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: movss %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 4-byte Spill +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: movss %xmm0, (%rsp) # 4-byte Spill +; CHECK-NEXT: movss {{[-0-9]+}}(%r{{[sb]}}p), %xmm0 # 4-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero,zero,zero +; CHECK-NEXT: callq __extendhfsf2@PLT +; CHECK-NEXT: movss (%rsp), %xmm1 # 4-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero,zero,zero +; CHECK-NEXT: callq atan2f@PLT +; CHECK-NEXT: callq __truncsfhf2@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %x = call half @llvm.atan2.f16(half %a, half %b) + ret half %x +} + +define float @use_atan2f32(float %a, float %b) nounwind { +; CHECK-LABEL: use_atan2f32: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp atan2f@PLT # TAILCALL + %x = call float @llvm.atan2.f32(float %a, float %b) + ret float %x +} + +define double @use_atan2f64(double %a, double %b) nounwind { +; CHECK-LABEL: use_atan2f64: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp atan2@PLT # TAILCALL + %x = call double @llvm.atan2.f64(double %a, double %b) + ret double %x +} + +define x86_fp80 @use_atan2f80(x86_fp80 %a, x86_fp80 %b) nounwind { +; CHECK-LABEL: use_atan2f80: +; CHECK: # %bb.0: +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fldt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt {{[0-9]+}}(%rsp) +; CHECK-NEXT: fstpt (%rsp) +; CHECK-NEXT: callq atan2l@PLT +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: retq + %x = call x86_fp80 @llvm.atan2.f80(x86_fp80 %a, x86_fp80 
%b) + ret x86_fp80 %x +} + +define fp128 @use_atan2fp128(fp128 %a, fp128 %b) nounwind { +; CHECK-LABEL: use_atan2fp128: +; CHECK: # %bb.0: +; CHECK-NEXT: jmp atan2f128@PLT # TAILCALL + %x = call fp128 @llvm.atan2.f128(fp128 %a, fp128 %b) + ret fp128 %x +} + +define ppc_fp128 @use_atan2ppc_fp128(ppc_fp128 %a, ppc_fp128 %b) nounwind { +; CHECK-LABEL: use_atan2ppc_fp128: +; CHECK: # %bb.0: +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: callq atan2l@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: retq + %x = call ppc_fp128 @llvm.atan2.ppcf128(ppc_fp128 %a, ppc_fp128 %b) + ret ppc_fp128 %x +} + +declare half @llvm.atan2.f16(half, half) +declare float @llvm.atan2.f32(float, float) +declare double @llvm.atan2.f64(double, double) +declare x86_fp80 @llvm.atan2.f80(x86_fp80, x86_fp80) +declare fp128 @llvm.atan2.f128(fp128, fp128) +declare ppc_fp128 @llvm.atan2.ppcf128(ppc_fp128, ppc_fp128) diff --git a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll index b48601467846..21dfdc3c2abe 100644 --- a/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll +++ b/llvm/test/CodeGen/X86/vector-constrained-fp-intrinsics.ll @@ -8672,6 +8672,263 @@ entry: ret <4 x double> %atan } +define <1 x float> @constrained_vector_atan2_v1f32() #0 { +; CHECK-LABEL: constrained_vector_atan2_v1f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: pushq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: callq atan2f@PLT +; CHECK-NEXT: popq %rax +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_atan2_v1f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: pushq %rax +; AVX-NEXT: .cfi_def_cfa_offset 16 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: callq atan2f@PLT +; AVX-NEXT: popq %rax +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %atan2 = call <1 x float> @llvm.experimental.constrained.atan2.v1f32( + <1 x float> <float 42.0>, + <1 x float> <float 23.0>, + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <1 x float> %atan2 +} + +define <2 x double> @constrained_vector_atan2_v2f64() #0 { +; CHECK-LABEL: constrained_vector_atan2_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_atan2_v2f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $24, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 32 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte
Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: addq $24, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %atan2 = call <2 x double> @llvm.experimental.constrained.atan2.v2f64( + <2 x double> , + <2 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <2 x double> %atan2 +} + +define <3 x float> @constrained_vector_atan2_v3f32() #0 { +; CHECK-LABEL: constrained_vector_atan2_v3f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.5E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: callq atan2f@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: callq atan2f@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: movss {{.*#+}} xmm1 = [2.4E+1,0.0E+0,0.0E+0,0.0E+0] +; CHECK-NEXT: callq atan2f@PLT +; CHECK-NEXT: movaps (%rsp), %xmm1 # 16-byte Reload +; CHECK-NEXT: unpcklps {{.*#+}} xmm1 = xmm1[0],xmm0[0],xmm1[1],xmm0[1] +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps %xmm1, %xmm0 +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_atan2_v3f32: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.4E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.5E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: callq atan2f@PLT +; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.2E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: callq atan2f@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovss {{.*#+}} xmm0 = [4.3E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: vmovss {{.*#+}} xmm1 = [2.4E+1,0.0E+0,0.0E+0,0.0E+0] +; AVX-NEXT: callq atan2f@PLT +; AVX-NEXT: vmovaps (%rsp), %xmm1 # 16-byte Reload +; AVX-NEXT: vinsertps {{.*#+}} xmm0 = xmm1[0],xmm0[0],xmm1[2,3] +; AVX-NEXT: vinsertps $32, {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0,1],mem[0],xmm0[3] +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %atan2 = call <3 x float> @llvm.experimental.constrained.atan2.v3f32( + <3 x float> , + <3 x float> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x float> %atan2 +} + +define <3 x double> @constrained_vector_atan2_v3f64() #0 { +; CHECK-LABEL: constrained_vector_atan2_v3f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 32 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movsd %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 8-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movsd %xmm0, (%rsp) # 8-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = 
[2.3199999999999999E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movsd %xmm0, {{[0-9]+}}(%rsp) +; CHECK-NEXT: fldl {{[0-9]+}}(%rsp) +; CHECK-NEXT: wait +; CHECK-NEXT: movsd (%rsp), %xmm0 # 8-byte Reload +; CHECK-NEXT: # xmm0 = mem[0],zero +; CHECK-NEXT: movsd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 8-byte Reload +; CHECK-NEXT: # xmm1 = mem[0],zero +; CHECK-NEXT: addq $24, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_atan2_v3f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload +; AVX-NEXT: # xmm0 = xmm0[0],mem[0] +; AVX-NEXT: vmovups %ymm0, (%rsp) # 32-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3199999999999999E+1,0.0E+0] +; AVX-NEXT: vzeroupper +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vmovups (%rsp), %ymm1 # 32-byte Reload +; AVX-NEXT: vinsertf128 $1, %xmm0, %ymm1, %ymm0 +; AVX-NEXT: addq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 8 +; AVX-NEXT: retq +entry: + %atan2 = call <3 x double> @llvm.experimental.constrained.atan2.v3f64( + <3 x double> , + <3 x double> , + metadata !"round.dynamic", + metadata !"fpexcept.strict") #0 + ret <3 x double> %atan2 +} + +define <4 x double> @constrained_vector_atan2_v4f64() #0 { +; CHECK-LABEL: constrained_vector_atan2_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: subq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 48 +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: unpcklpd (%rsp), %xmm0 # 16-byte Folded Reload +; CHECK-NEXT: # xmm0 = xmm0[0],mem[0] +; CHECK-NEXT: movaps %xmm0, (%rsp) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3300000000000001E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill +; CHECK-NEXT: movsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; CHECK-NEXT: movsd {{.*#+}} xmm1 = [2.3199999999999999E+1,0.0E+0] +; CHECK-NEXT: callq atan2@PLT +; CHECK-NEXT: movaps %xmm0, %xmm1 +; CHECK-NEXT: unpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm1 # 16-byte Folded Reload +; CHECK-NEXT: # xmm1 = xmm1[0],mem[0] +; CHECK-NEXT: movaps (%rsp), %xmm0 # 16-byte Reload +; CHECK-NEXT: addq $40, %rsp +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: retq +; +; AVX-LABEL: constrained_vector_atan2_v4f64: +; AVX: # %bb.0: # %entry +; AVX-NEXT: subq $40, %rsp +; AVX-NEXT: .cfi_def_cfa_offset 48 +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2299999999999997E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3300000000000001E+1,0.0E+0] +; AVX-NEXT: callq atan2@PLT +; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill +; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2200000000000003E+1,0.0E+0] +; AVX-NEXT: vmovsd {{.*#+}} xmm1 = 
[2.3199999999999999E+1,0.0E+0]
+; AVX-NEXT: callq atan2@PLT
+; AVX-NEXT: vunpcklpd (%rsp), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vmovaps %xmm0, (%rsp) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2100000000000001E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3100000000000001E+1,0.0E+0]
+; AVX-NEXT: callq atan2@PLT
+; AVX-NEXT: vmovaps %xmm0, {{[-0-9]+}}(%r{{[sb]}}p) # 16-byte Spill
+; AVX-NEXT: vmovsd {{.*#+}} xmm0 = [4.2E+1,0.0E+0]
+; AVX-NEXT: vmovsd {{.*#+}} xmm1 = [2.3E+1,0.0E+0]
+; AVX-NEXT: callq atan2@PLT
+; AVX-NEXT: vunpcklpd {{[-0-9]+}}(%r{{[sb]}}p), %xmm0, %xmm0 # 16-byte Folded Reload
+; AVX-NEXT: # xmm0 = xmm0[0],mem[0]
+; AVX-NEXT: vinsertf128 $1, (%rsp), %ymm0, %ymm0 # 16-byte Folded Reload
+; AVX-NEXT: addq $40, %rsp
+; AVX-NEXT: .cfi_def_cfa_offset 8
+; AVX-NEXT: retq
+entry:
+ %atan2 = call <4 x double> @llvm.experimental.constrained.atan2.v4f64(
+ <4 x double> <double 42.0, double 42.1, double 42.2, double 42.3>,
+ <4 x double> <double 23.0, double 23.1, double 23.2, double 23.3>,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret <4 x double> %atan2
+}
+
 define <1 x float> @constrained_vector_cosh_v1f32() #0 {
 ; CHECK-LABEL: constrained_vector_cosh_v1f32:
 ; CHECK: # %bb.0: # %entry
@@ -9546,6 +9803,7 @@ declare <4 x double> @llvm.experimental.constrained.tan.v4f64(<4 x double>, meta
 declare <4 x double> @llvm.experimental.constrained.asin.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.acos.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.atan.v4f64(<4 x double>, metadata, metadata)
+declare <4 x double> @llvm.experimental.constrained.atan2.v4f64(<4 x double>, <4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.sinh.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.cosh.v4f64(<4 x double>, metadata, metadata)
 declare <4 x double> @llvm.experimental.constrained.tanh.v4f64(<4 x double>, metadata, metadata)
diff --git a/llvm/test/Feature/fp-intrinsics.ll b/llvm/test/Feature/fp-intrinsics.ll
index 80f8b15abfaa..ada22c39abc9 100644
--- a/llvm/test/Feature/fp-intrinsics.ll
+++ b/llvm/test/Feature/fp-intrinsics.ll
@@ -184,7 +184,7 @@ entry:
 ret double %result
}

 ; Verify that atan(42.0) isn't simplified when the rounding mode is unknown.
 ; CHECK-LABEL: fatan
 ; CHECK: call double @llvm.experimental.constrained.atan
 define double @fatan() #0 {
@@ -195,6 +195,19 @@ entry:
 ret double %result
}

+; Verify that atan2(42.0, 23.0) isn't simplified when the rounding mode is unknown.
+; CHECK-LABEL: fatan2
+; CHECK: call double @llvm.experimental.constrained.atan2
+define double @fatan2() #0 {
+entry:
+ %result = call double @llvm.experimental.constrained.atan2.f64(
+ double 42.0,
+ double 23.0,
+ metadata !"round.dynamic",
+ metadata !"fpexcept.strict") #0
+ ret double %result
+}
+
 ; Verify that cosh(42.0) isn't simplified when the rounding mode is unknown.
; CHECK-LABEL: fcosh ; CHECK: call double @llvm.experimental.constrained.cosh -- GitLab From e768b076e3b7ed38485a29244a0b989076e4b131 Mon Sep 17 00:00:00 2001 From: Rahul Joshi Date: Wed, 16 Oct 2024 11:46:38 -0700 Subject: [PATCH 165/329] [MLIR][TableGen] Use const pointers for various `Init` objects (#112562) This reverts commit 0eed3055511381436ee69d1caf64a4af47f8d65c and applies additional fixes in `verifyArgument` in OmpOpGen.cpp for gcc-7 bot failures --- mlir/include/mlir/TableGen/AttrOrTypeDef.h | 2 +- mlir/include/mlir/TableGen/Dialect.h | 2 +- mlir/include/mlir/TableGen/Operator.h | 15 ++++---- mlir/lib/TableGen/AttrOrTypeDef.cpp | 12 +++--- mlir/lib/TableGen/Attribute.cpp | 2 +- mlir/lib/TableGen/Dialect.cpp | 2 +- mlir/lib/TableGen/Interfaces.cpp | 6 +-- mlir/lib/TableGen/Operator.cpp | 21 +++++----- mlir/lib/TableGen/Pattern.cpp | 2 +- mlir/lib/TableGen/Type.cpp | 2 +- mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp | 16 ++++---- mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp | 38 ++++++++++--------- mlir/tools/mlir-tblgen/DialectGen.cpp | 9 +++-- mlir/tools/mlir-tblgen/OmpOpGen.cpp | 17 +++++---- 14 files changed, 78 insertions(+), 68 deletions(-) diff --git a/mlir/include/mlir/TableGen/AttrOrTypeDef.h b/mlir/include/mlir/TableGen/AttrOrTypeDef.h index 36744c85bc70..c3d730e42ef7 100644 --- a/mlir/include/mlir/TableGen/AttrOrTypeDef.h +++ b/mlir/include/mlir/TableGen/AttrOrTypeDef.h @@ -105,7 +105,7 @@ public: std::optional getDefaultValue() const; /// Return the underlying def of this parameter. - llvm::Init *getDef() const; + const llvm::Init *getDef() const; /// The parameter is pointer-comparable. bool operator==(const AttrOrTypeParameter &other) const { diff --git a/mlir/include/mlir/TableGen/Dialect.h b/mlir/include/mlir/TableGen/Dialect.h index 3530d240c976..ea8f40555e44 100644 --- a/mlir/include/mlir/TableGen/Dialect.h +++ b/mlir/include/mlir/TableGen/Dialect.h @@ -92,7 +92,7 @@ public: /// dialect. bool usePropertiesForAttributes() const; - llvm::DagInit *getDiscardableAttributes() const; + const llvm::DagInit *getDiscardableAttributes() const; const llvm::Record *getDef() const { return def; } diff --git a/mlir/include/mlir/TableGen/Operator.h b/mlir/include/mlir/TableGen/Operator.h index 768291a3a726..9e570373d9cd 100644 --- a/mlir/include/mlir/TableGen/Operator.h +++ b/mlir/include/mlir/TableGen/Operator.h @@ -119,14 +119,15 @@ public: /// A utility iterator over a list of variable decorators. struct VariableDecoratorIterator - : public llvm::mapped_iterator { + : public llvm::mapped_iterator { /// Initializes the iterator to the specified iterator. 
- VariableDecoratorIterator(llvm::Init *const *it) - : llvm::mapped_iterator(it, - &unwrap) {} - static VariableDecorator unwrap(llvm::Init *init); + VariableDecoratorIterator(const llvm::Init *const *it) + : llvm::mapped_iterator( + it, &unwrap) {} + static VariableDecorator unwrap(const llvm::Init *init); }; using var_decorator_iterator = VariableDecoratorIterator; using var_decorator_range = llvm::iterator_range; diff --git a/mlir/lib/TableGen/AttrOrTypeDef.cpp b/mlir/lib/TableGen/AttrOrTypeDef.cpp index 9b9d9fd2317d..e72ca155bcf7 100644 --- a/mlir/lib/TableGen/AttrOrTypeDef.cpp +++ b/mlir/lib/TableGen/AttrOrTypeDef.cpp @@ -40,7 +40,7 @@ AttrOrTypeDef::AttrOrTypeDef(const llvm::Record *def) : def(def) { auto *builderList = dyn_cast_or_null(def->getValueInit("builders")); if (builderList && !builderList->empty()) { - for (llvm::Init *init : builderList->getValues()) { + for (const llvm::Init *init : builderList->getValues()) { AttrOrTypeBuilder builder(cast(init)->getDef(), def->getLoc()); @@ -58,8 +58,8 @@ AttrOrTypeDef::AttrOrTypeDef(const llvm::Record *def) : def(def) { if (auto *traitList = def->getValueAsListInit("traits")) { SmallPtrSet traitSet; traits.reserve(traitSet.size()); - llvm::unique_function processTraitList = - [&](llvm::ListInit *traitList) { + llvm::unique_function processTraitList = + [&](const llvm::ListInit *traitList) { for (auto *traitInit : *traitList) { if (!traitSet.insert(traitInit).second) continue; @@ -335,7 +335,9 @@ std::optional AttrOrTypeParameter::getDefaultValue() const { return result && !result->empty() ? result : std::nullopt; } -llvm::Init *AttrOrTypeParameter::getDef() const { return def->getArg(index); } +const llvm::Init *AttrOrTypeParameter::getDef() const { + return def->getArg(index); +} std::optional AttrOrTypeParameter::getConstraint() const { if (auto *param = dyn_cast(getDef())) @@ -349,7 +351,7 @@ std::optional AttrOrTypeParameter::getConstraint() const { //===----------------------------------------------------------------------===// bool AttributeSelfTypeParameter::classof(const AttrOrTypeParameter *param) { - llvm::Init *paramDef = param->getDef(); + const llvm::Init *paramDef = param->getDef(); if (auto *paramDefInit = dyn_cast(paramDef)) return paramDefInit->getDef()->isSubClassOf("AttributeSelfTypeParameter"); return false; diff --git a/mlir/lib/TableGen/Attribute.cpp b/mlir/lib/TableGen/Attribute.cpp index de930cb40070..887553bca661 100644 --- a/mlir/lib/TableGen/Attribute.cpp +++ b/mlir/lib/TableGen/Attribute.cpp @@ -126,7 +126,7 @@ StringRef Attribute::getDerivedCodeBody() const { Dialect Attribute::getDialect() const { const llvm::RecordVal *record = def->getValue("dialect"); if (record && record->getValue()) { - if (DefInit *init = dyn_cast(record->getValue())) + if (const DefInit *init = dyn_cast(record->getValue())) return Dialect(init->getDef()); } return Dialect(nullptr); diff --git a/mlir/lib/TableGen/Dialect.cpp b/mlir/lib/TableGen/Dialect.cpp index 081f6e56f9de..ef39818e439b 100644 --- a/mlir/lib/TableGen/Dialect.cpp +++ b/mlir/lib/TableGen/Dialect.cpp @@ -106,7 +106,7 @@ bool Dialect::usePropertiesForAttributes() const { return def->getValueAsBit("usePropertiesForAttributes"); } -llvm::DagInit *Dialect::getDiscardableAttributes() const { +const llvm::DagInit *Dialect::getDiscardableAttributes() const { return def->getValueAsDag("discardableAttrs"); } diff --git a/mlir/lib/TableGen/Interfaces.cpp b/mlir/lib/TableGen/Interfaces.cpp index a209b003b0f3..4a6709a43d0a 100644 --- a/mlir/lib/TableGen/Interfaces.cpp +++ 
b/mlir/lib/TableGen/Interfaces.cpp @@ -22,7 +22,7 @@ using namespace mlir::tblgen; //===----------------------------------------------------------------------===// InterfaceMethod::InterfaceMethod(const llvm::Record *def) : def(def) { - llvm::DagInit *args = def->getValueAsDag("arguments"); + const llvm::DagInit *args = def->getValueAsDag("arguments"); for (unsigned i = 0, e = args->getNumArgs(); i != e; ++i) { arguments.push_back( {llvm::cast(args->getArg(i))->getValue(), @@ -78,7 +78,7 @@ Interface::Interface(const llvm::Record *def) : def(def) { // Initialize the interface methods. auto *listInit = dyn_cast(def->getValueInit("methods")); - for (llvm::Init *init : listInit->getValues()) + for (const llvm::Init *init : listInit->getValues()) methods.emplace_back(cast(init)->getDef()); // Initialize the interface base classes. @@ -98,7 +98,7 @@ Interface::Interface(const llvm::Record *def) : def(def) { baseInterfaces.push_back(std::make_unique(baseInterface)); basesAdded.insert(baseInterface.getName()); }; - for (llvm::Init *init : basesInit->getValues()) + for (const llvm::Init *init : basesInit->getValues()) addBaseInterfaceFn(Interface(cast(init)->getDef())); } diff --git a/mlir/lib/TableGen/Operator.cpp b/mlir/lib/TableGen/Operator.cpp index 6a33ff5ecd67..86670e9f8712 100644 --- a/mlir/lib/TableGen/Operator.cpp +++ b/mlir/lib/TableGen/Operator.cpp @@ -161,7 +161,7 @@ std::string Operator::getQualCppClassName() const { StringRef Operator::getCppNamespace() const { return cppNamespace; } int Operator::getNumResults() const { - DagInit *results = def.getValueAsDag("results"); + const DagInit *results = def.getValueAsDag("results"); return results->getNumArgs(); } @@ -198,12 +198,12 @@ auto Operator::getResults() const -> const_value_range { } TypeConstraint Operator::getResultTypeConstraint(int index) const { - DagInit *results = def.getValueAsDag("results"); + const DagInit *results = def.getValueAsDag("results"); return TypeConstraint(cast(results->getArg(index))); } StringRef Operator::getResultName(int index) const { - DagInit *results = def.getValueAsDag("results"); + const DagInit *results = def.getValueAsDag("results"); return results->getArgNameStr(index); } @@ -241,7 +241,7 @@ Operator::arg_range Operator::getArgs() const { } StringRef Operator::getArgName(int index) const { - DagInit *argumentValues = def.getValueAsDag("arguments"); + const DagInit *argumentValues = def.getValueAsDag("arguments"); return argumentValues->getArgNameStr(index); } @@ -557,7 +557,7 @@ void Operator::populateOpStructure() { auto *opVarClass = recordKeeper.getClass("OpVariable"); numNativeAttributes = 0; - DagInit *argumentValues = def.getValueAsDag("arguments"); + const DagInit *argumentValues = def.getValueAsDag("arguments"); unsigned numArgs = argumentValues->getNumArgs(); // Mapping from name of to argument or result index. 
Arguments are indexed @@ -721,8 +721,8 @@ void Operator::populateOpStructure() { " to precede it in traits list"); }; - std::function insert; - insert = [&](llvm::ListInit *traitList) { + std::function insert; + insert = [&](const llvm::ListInit *traitList) { for (auto *traitInit : *traitList) { auto *def = cast(traitInit)->getDef(); if (def->isSubClassOf("TraitList")) { @@ -780,7 +780,7 @@ void Operator::populateOpStructure() { auto *builderList = dyn_cast_or_null(def.getValueInit("builders")); if (builderList && !builderList->empty()) { - for (llvm::Init *init : builderList->getValues()) + for (const llvm::Init *init : builderList->getValues()) builders.emplace_back(cast(init)->getDef(), def.getLoc()); } else if (skipDefaultBuilders()) { PrintFatalError( @@ -818,7 +818,8 @@ bool Operator::hasAssemblyFormat() const { } StringRef Operator::getAssemblyFormat() const { - return TypeSwitch(def.getValueInit("assemblyFormat")) + return TypeSwitch( + def.getValueInit("assemblyFormat")) .Case([&](auto *init) { return init->getValue(); }); } @@ -832,7 +833,7 @@ void Operator::print(llvm::raw_ostream &os) const { } } -auto Operator::VariableDecoratorIterator::unwrap(llvm::Init *init) +auto Operator::VariableDecoratorIterator::unwrap(const llvm::Init *init) -> VariableDecorator { return VariableDecorator(cast(init)->getDef()); } diff --git a/mlir/lib/TableGen/Pattern.cpp b/mlir/lib/TableGen/Pattern.cpp index 6437839ef208..bee20354387f 100644 --- a/mlir/lib/TableGen/Pattern.cpp +++ b/mlir/lib/TableGen/Pattern.cpp @@ -700,7 +700,7 @@ int Pattern::getBenefit() const { // The initial benefit value is a heuristic with number of ops in the source // pattern. int initBenefit = getSourcePattern().getNumOps(); - llvm::DagInit *delta = def.getValueAsDag("benefitDelta"); + const llvm::DagInit *delta = def.getValueAsDag("benefitDelta"); if (delta->getNumArgs() != 1 || !isa(delta->getArg(0))) { PrintFatalError(&def, "The 'addBenefit' takes and only takes one integer value"); diff --git a/mlir/lib/TableGen/Type.cpp b/mlir/lib/TableGen/Type.cpp index cda752297988..c3b813ec598d 100644 --- a/mlir/lib/TableGen/Type.cpp +++ b/mlir/lib/TableGen/Type.cpp @@ -50,7 +50,7 @@ std::optional TypeConstraint::getBuilderCall() const { const llvm::RecordVal *builderCall = baseType->getValue("builderCall"); if (!builderCall || !builderCall->getValue()) return std::nullopt; - return TypeSwitch>( + return TypeSwitch>( builderCall->getValue()) .Case([&](auto *init) { StringRef value = init->getValue(); diff --git a/mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp b/mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp index 7119324dd125..20ad4292a548 100644 --- a/mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp +++ b/mlir/lib/Tools/mlir-tblgen/MlirTblgenMain.cpp @@ -30,8 +30,8 @@ enum DeprecatedAction { None, Warn, Error }; static DeprecatedAction actionOnDeprecatedValue; // Returns if there is a use of `deprecatedInit` in `field`. 
-static bool findUse(Init *field, Init *deprecatedInit, - llvm::DenseMap &known) { +static bool findUse(const Init *field, const Init *deprecatedInit, + llvm::DenseMap &known) { if (field == deprecatedInit) return true; @@ -64,13 +64,13 @@ static bool findUse(Init *field, Init *deprecatedInit, if (findUse(dagInit->getOperator(), deprecatedInit, known)) return memoize(true); - return memoize(llvm::any_of(dagInit->getArgs(), [&](Init *arg) { + return memoize(llvm::any_of(dagInit->getArgs(), [&](const Init *arg) { return findUse(arg, deprecatedInit, known); })); } - if (ListInit *li = dyn_cast(field)) { - return memoize(llvm::any_of(li->getValues(), [&](Init *jt) { + if (const ListInit *li = dyn_cast(field)) { + return memoize(llvm::any_of(li->getValues(), [&](const Init *jt) { return findUse(jt, deprecatedInit, known); })); } @@ -83,8 +83,8 @@ static bool findUse(Init *field, Init *deprecatedInit, } // Returns if there is a use of `deprecatedInit` in `record`. -static bool findUse(Record &record, Init *deprecatedInit, - llvm::DenseMap &known) { +static bool findUse(Record &record, const Init *deprecatedInit, + llvm::DenseMap &known) { return llvm::any_of(record.getValues(), [&](const RecordVal &val) { return findUse(val.getValue(), deprecatedInit, known); }); @@ -100,7 +100,7 @@ static void warnOfDeprecatedUses(const RecordKeeper &records) { if (!r || !r->getValue()) continue; - llvm::DenseMap hasUse; + llvm::DenseMap hasUse; if (auto *si = dyn_cast(r->getValue())) { for (auto &jt : records.getDefs()) { // Skip anonymous defs. diff --git a/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp b/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp index 86ebaf2cf27d..6a3d5a25e28c 100644 --- a/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp +++ b/mlir/tools/mlir-tblgen/BytecodeDialectGen.cpp @@ -46,8 +46,9 @@ public: private: /// Emits parse calls to construct given kind. void emitParseHelper(StringRef kind, StringRef returnType, StringRef builder, - ArrayRef args, ArrayRef argNames, - StringRef failure, mlir::raw_indented_ostream &ios); + ArrayRef args, + ArrayRef argNames, StringRef failure, + mlir::raw_indented_ostream &ios); /// Emits print instructions. void emitPrintHelper(const Record *memberRec, StringRef kind, @@ -135,10 +136,12 @@ void Generator::emitParse(StringRef kind, const Record &x) { R"(static {0} read{1}(MLIRContext* context, DialectBytecodeReader &reader) )"; mlir::raw_indented_ostream os(output); std::string returnType = getCType(&x); - os << formatv(head, kind == "attribute" ? "::mlir::Attribute" : "::mlir::Type", x.getName()); - DagInit *members = x.getValueAsDag("members"); - SmallVector argNames = - llvm::to_vector(map_range(members->getArgNames(), [](StringInit *init) { + os << formatv(head, + kind == "attribute" ? 
"::mlir::Attribute" : "::mlir::Type", + x.getName()); + const DagInit *members = x.getValueAsDag("members"); + SmallVector argNames = llvm::to_vector( + map_range(members->getArgNames(), [](const StringInit *init) { return init->getAsUnquotedString(); })); StringRef builder = x.getValueAsString("cBuilder").trim(); @@ -148,7 +151,7 @@ void Generator::emitParse(StringRef kind, const Record &x) { } void printParseConditional(mlir::raw_indented_ostream &ios, - ArrayRef args, + ArrayRef args, ArrayRef argNames) { ios << "if "; auto parenScope = ios.scope("(", ") {"); @@ -159,7 +162,7 @@ void printParseConditional(mlir::raw_indented_ostream &ios, }; auto parsedArgs = - llvm::to_vector(make_filter_range(args, [](Init *const attr) { + llvm::to_vector(make_filter_range(args, [](const Init *const attr) { const Record *def = cast(attr)->getDef(); if (def->isSubClassOf("Array")) return true; @@ -168,7 +171,7 @@ void printParseConditional(mlir::raw_indented_ostream &ios, interleave( zip(parsedArgs, argNames), - [&](std::tuple it) { + [&](std::tuple it) { const Record *attr = cast(std::get<0>(it))->getDef(); std::string parser; if (auto optParser = attr->getValueAsOptionalString("cParser")) { @@ -196,7 +199,7 @@ void printParseConditional(mlir::raw_indented_ostream &ios, } void Generator::emitParseHelper(StringRef kind, StringRef returnType, - StringRef builder, ArrayRef args, + StringRef builder, ArrayRef args, ArrayRef argNames, StringRef failure, mlir::raw_indented_ostream &ios) { @@ -210,7 +213,7 @@ void Generator::emitParseHelper(StringRef kind, StringRef returnType, // Print decls. std::string lastCType = ""; for (auto [arg, name] : zip(args, argNames)) { - DefInit *first = dyn_cast(arg); + const DefInit *first = dyn_cast(arg); if (!first) PrintFatalError("Unexpected type for " + name); const Record *def = first->getDef(); @@ -251,13 +254,14 @@ void Generator::emitParseHelper(StringRef kind, StringRef returnType, std::string returnType = getCType(def); ios << "auto " << listHelperName(name) << " = [&]() -> FailureOr<" << returnType << "> "; - SmallVector args; + SmallVector args; SmallVector argNames; if (def->isSubClassOf("CompositeBytecode")) { - DagInit *members = def->getValueAsDag("members"); - args = llvm::to_vector(members->getArgs()); + const DagInit *members = def->getValueAsDag("members"); + args = llvm::to_vector(map_range( + members->getArgs(), [](Init *init) { return (const Init *)init; })); argNames = llvm::to_vector( - map_range(members->getArgNames(), [](StringInit *init) { + map_range(members->getArgNames(), [](const StringInit *init) { return init->getAsUnquotedString(); })); } else { @@ -332,7 +336,7 @@ void Generator::emitPrint(StringRef kind, StringRef type, auto *members = rec->getValueAsDag("members"); for (auto [arg, name] : llvm::zip(members->getArgs(), members->getArgNames())) { - DefInit *def = dyn_cast(arg); + const DefInit *def = dyn_cast(arg); assert(def); const Record *memberRec = def->getDef(); emitPrintHelper(memberRec, kind, kind, name->getAsUnquotedString(), os); @@ -385,7 +389,7 @@ void Generator::emitPrintHelper(const Record *memberRec, StringRef kind, auto *members = memberRec->getValueAsDag("members"); for (auto [arg, argName] : zip(members->getArgs(), members->getArgNames())) { - DefInit *def = dyn_cast(arg); + const DefInit *def = dyn_cast(arg); assert(def); emitPrintHelper(def->getDef(), kind, parent, argName->getAsUnquotedString(), ios); diff --git a/mlir/tools/mlir-tblgen/DialectGen.cpp b/mlir/tools/mlir-tblgen/DialectGen.cpp index 
55c3d9da2590..414cad5e1dcc 100644 --- a/mlir/tools/mlir-tblgen/DialectGen.cpp +++ b/mlir/tools/mlir-tblgen/DialectGen.cpp @@ -46,10 +46,10 @@ using DialectFilterIterator = } // namespace static void populateDiscardableAttributes( - Dialect &dialect, llvm::DagInit *discardableAttrDag, + Dialect &dialect, const llvm::DagInit *discardableAttrDag, SmallVector> &discardableAttributes) { for (int i : llvm::seq(0, discardableAttrDag->getNumArgs())) { - llvm::Init *arg = discardableAttrDag->getArg(i); + const llvm::Init *arg = discardableAttrDag->getArg(i); StringRef givenName = discardableAttrDag->getArgNameStr(i); if (givenName.empty()) @@ -271,7 +271,8 @@ static void emitDialectDecl(Dialect &dialect, raw_ostream &os) { if (dialect.hasOperationInterfaceFallback()) os << operationInterfaceFallbackDecl; - llvm::DagInit *discardableAttrDag = dialect.getDiscardableAttributes(); + const llvm::DagInit *discardableAttrDag = + dialect.getDiscardableAttributes(); SmallVector> discardableAttributes; populateDiscardableAttributes(dialect, discardableAttrDag, discardableAttributes); @@ -370,7 +371,7 @@ static void emitDialectDef(Dialect &dialect, const RecordKeeper &records, StringRef superClassName = dialect.isExtensible() ? "ExtensibleDialect" : "Dialect"; - llvm::DagInit *discardableAttrDag = dialect.getDiscardableAttributes(); + const llvm::DagInit *discardableAttrDag = dialect.getDiscardableAttributes(); SmallVector> discardableAttributes; populateDiscardableAttributes(dialect, discardableAttrDag, discardableAttributes); diff --git a/mlir/tools/mlir-tblgen/OmpOpGen.cpp b/mlir/tools/mlir-tblgen/OmpOpGen.cpp index 1c20a6a9bcf4..04f81a4a2dce 100644 --- a/mlir/tools/mlir-tblgen/OmpOpGen.cpp +++ b/mlir/tools/mlir-tblgen/OmpOpGen.cpp @@ -102,11 +102,11 @@ static StringRef extractOmpClauseName(const Record *clause) { /// Check that the given argument, identified by its name and initialization /// value, is present in the \c arguments `dag`. -static bool verifyArgument(DagInit *arguments, StringRef argName, - Init *argInit) { +static bool verifyArgument(const DagInit *arguments, StringRef argName, + const Init *argInit) { auto range = zip_equal(arguments->getArgNames(), arguments->getArgs()); return llvm::any_of( - range, [&](std::tuple v) { + range, [&](std::tuple v) { return std::get<0>(v)->getAsUnquotedString() == argName && std::get<1>(v) == argInit; }); @@ -141,8 +141,8 @@ static void verifyClause(const Record *op, const Record *clause) { StringRef clauseClassName = extractOmpClauseName(clause); if (!clause->getValueAsBit("ignoreArgs")) { - DagInit *opArguments = op->getValueAsDag("arguments"); - DagInit *arguments = clause->getValueAsDag("arguments"); + const DagInit *opArguments = op->getValueAsDag("arguments"); + const DagInit *arguments = clause->getValueAsDag("arguments"); for (auto [name, arg] : zip(arguments->getArgNames(), arguments->getArgs())) { @@ -208,8 +208,9 @@ static void verifyClause(const Record *op, const Record *clause) { /// /// \return the name of the base type to represent elements of the argument /// type. 
-static StringRef translateArgumentType(ArrayRef loc, StringInit *name, - Init *init, int &nest, int &rank) { +static StringRef translateArgumentType(ArrayRef loc, + const StringInit *name, const Init *init, + int &nest, int &rank) { const Record *def = cast(init)->getDef(); llvm::StringSet<> superClasses; @@ -282,7 +283,7 @@ static void genClauseOpsStruct(const Record *clause, raw_ostream &os) { StringRef clauseName = extractOmpClauseName(clause); os << "struct " << clauseName << "ClauseOps {\n"; - DagInit *arguments = clause->getValueAsDag("arguments"); + const DagInit *arguments = clause->getValueAsDag("arguments"); for (auto [name, arg] : zip_equal(arguments->getArgNames(), arguments->getArgs())) { int nest = 0, rank = 1; -- GitLab From ae68d532f810e217c747b10b26aeea3bb84c3844 Mon Sep 17 00:00:00 2001 From: Michael Maitland Date: Wed, 16 Oct 2024 14:58:00 -0400 Subject: [PATCH 166/329] [RISCV][VLOPT] Allow propagation even when VL isn't VLMAX (#112228) The original goal of this pass was to focus on vector operations with VLMAX. However, users often utilize only part of the result, and such usage may come from the vectorizer. We found that relaxing this constraint can capture more optimization opportunities, such as non-power-of-2 code generation and vector operation sequences with different VLs. --------- Co-authored-by: Kito Cheng --- llvm/lib/Target/RISCV/RISCVInstrInfo.cpp | 14 ++++ llvm/lib/Target/RISCV/RISCVInstrInfo.h | 3 + llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 84 ++++++++++++++----- llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp | 22 +---- llvm/test/CodeGen/RISCV/rvv/vl-opt.ll | 50 +++++++---- 5 files changed, 118 insertions(+), 55 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp index b8539a5d1add..3989a966edfd 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.cpp @@ -4102,3 +4102,17 @@ unsigned RISCV::getDestLog2EEW(const MCInstrDesc &Desc, unsigned Log2SEW) { assert(Scaled >= 3 && Scaled <= 6); return Scaled; } + +/// Given two VL operands, do we know that LHS <= RHS? +bool RISCV::isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) { + if (LHS.isReg() && RHS.isReg() && LHS.getReg().isVirtual() && + LHS.getReg() == RHS.getReg()) + return true; + if (RHS.isImm() && RHS.getImm() == RISCV::VLMaxSentinel) + return true; + if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel) + return false; + if (!LHS.isImm() || !RHS.isImm()) + return false; + return LHS.getImm() <= RHS.getImm(); +} diff --git a/llvm/lib/Target/RISCV/RISCVInstrInfo.h b/llvm/lib/Target/RISCV/RISCVInstrInfo.h index 457db9b9860d..c3aa36748662 100644 --- a/llvm/lib/Target/RISCV/RISCVInstrInfo.h +++ b/llvm/lib/Target/RISCV/RISCVInstrInfo.h @@ -346,6 +346,9 @@ unsigned getDestLog2EEW(const MCInstrDesc &Desc, unsigned Log2SEW); // Special immediate for AVL operand of V pseudo instructions to indicate VLMax. static constexpr int64_t VLMaxSentinel = -1LL; +/// Given two VL operands, do we know that LHS <= RHS? 
+bool isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS);
+
 // Mask assignments for floating-point
 static constexpr unsigned FPMASK_Negative_Infinity = 0x001;
 static constexpr unsigned FPMASK_Negative_Normal = 0x002;
diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
index 6053899987db..ee494c468151 100644
--- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
+++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp
@@ -51,7 +51,7 @@ public:
 StringRef getPassName() const override { return PASS_NAME; }

private:
- bool checkUsers(std::optional<Register> &CommonVL, MachineInstr &MI);
+ bool checkUsers(const MachineOperand *&CommonVL, MachineInstr &MI);
 bool tryReduceVL(MachineInstr &MI);
 bool isCandidate(const MachineInstr &MI) const;
};
@@ -658,10 +658,34 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
 if (MI.getNumDefs() != 1)
 return false;

+ // If we're not using VLMAX, then we need to be careful whether we are using
+ // TA/TU when there is a non-undef Passthru. But when we are using VLMAX, it
+ // does not matter whether we are using TA/TU with a non-undef Passthru, since
+ // there are no tail elements to be preserved.
 unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
 const MachineOperand &VLOp = MI.getOperand(VLOpNum);
- if (!VLOp.isImm() || VLOp.getImm() != RISCV::VLMaxSentinel)
+ if (VLOp.isReg() || VLOp.getImm() != RISCV::VLMaxSentinel) {
+ // If MI has a non-undef passthru, we will not try to optimize it since
+ // that requires us to preserve tail elements according to TA/TU.
+ // Otherwise, the MI has an undef Passthru, so it doesn't matter whether we
+ // are using TA/TU.
+ bool HasPassthru = RISCVII::isFirstDefTiedToFirstUse(Desc);
+ unsigned PassthruOpIdx = MI.getNumExplicitDefs();
+ if (HasPassthru &&
+ MI.getOperand(PassthruOpIdx).getReg() != RISCV::NoRegister) {
+ LLVM_DEBUG(
+ dbgs() << " Not a candidate because it uses non-undef passthru"
+ " with non-VLMAX VL\n");
+ return false;
+ }
+ }
+
+ // If the VL is 1, then there is no need to reduce it. This is an
+ // optimization, not needed to preserve correctness.
+ if (VLOp.isImm() && VLOp.getImm() == 1) {
+ LLVM_DEBUG(dbgs() << " Not a candidate because VL is already 1\n");
 return false;
+ }

 // Some instructions that produce vectors have semantics that make it more
 // difficult to determine whether the VL can be reduced. For example, some
@@ -684,7 +708,7 @@ bool RISCVVLOptimizer::isCandidate(const MachineInstr &MI) const {
 return true;
}

-bool RISCVVLOptimizer::checkUsers(std::optional<Register> &CommonVL,
+bool RISCVVLOptimizer::checkUsers(const MachineOperand *&CommonVL,
 MachineInstr &MI) {
 // FIXME: Avoid visiting each user for each time we visit something on the
 // worklist, combined with an extra visit from the outer loop. Restructure
@@ -730,16 +754,17 @@ bool RISCVVLOptimizer::checkUsers(std::optional<Register> &CommonVL,
 unsigned VLOpNum = RISCVII::getVLOpNum(Desc);
 const MachineOperand &VLOp = UserMI.getOperand(VLOpNum);
- // Looking for a register VL that isn't X0.
- if (!VLOp.isReg() || VLOp.getReg() == RISCV::X0) {
- LLVM_DEBUG(dbgs() << " Abort due to user uses X0 as VL.\n");
- CanReduceVL = false;
- break;
- }
+
+ // Looking for an immediate or a register VL that isn't X0.
+ assert(!VLOp.isReg() || + VLOp.getReg() != RISCV::X0 && "Did not expect X0 VL"); if (!CommonVL) { - CommonVL = VLOp.getReg(); - } else if (*CommonVL != VLOp.getReg()) { + CommonVL = &VLOp; + LLVM_DEBUG(dbgs() << " User VL is: " << VLOp << "\n"); + } else if (!CommonVL->isIdenticalTo(VLOp)) { + // FIXME: This check requires all users to have the same VL. We can relax + // this and get the largest VL amongst all users. LLVM_DEBUG(dbgs() << " Abort because users have different VL\n"); CanReduceVL = false; break; @@ -776,7 +801,7 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) { MachineInstr &MI = *Worklist.pop_back_val(); LLVM_DEBUG(dbgs() << "Trying to reduce VL for " << MI << "\n"); - std::optional CommonVL; + const MachineOperand *CommonVL = nullptr; bool CanReduceVL = true; if (isVectorRegClass(MI.getOperand(0).getReg(), MRI)) CanReduceVL = checkUsers(CommonVL, MI); @@ -784,21 +809,34 @@ bool RISCVVLOptimizer::tryReduceVL(MachineInstr &OrigMI) { if (!CanReduceVL || !CommonVL) continue; - if (!CommonVL->isVirtual()) { - LLVM_DEBUG( - dbgs() << " Abort due to new VL is not virtual register.\n"); + assert((CommonVL->isImm() || CommonVL->getReg().isVirtual()) && + "Expected VL to be an Imm or virtual Reg"); + + unsigned VLOpNum = RISCVII::getVLOpNum(MI.getDesc()); + MachineOperand &VLOp = MI.getOperand(VLOpNum); + + if (!RISCV::isVLKnownLE(*CommonVL, VLOp)) { + LLVM_DEBUG(dbgs() << " Abort due to CommonVL not <= VLOp.\n"); continue; } - const MachineInstr *VLMI = MRI->getVRegDef(*CommonVL); - if (!MDT->dominates(VLMI, &MI)) - continue; + if (CommonVL->isImm()) { + LLVM_DEBUG(dbgs() << " Reduce VL from " << VLOp << " to " + << CommonVL->getImm() << " for " << MI << "\n"); + VLOp.ChangeToImmediate(CommonVL->getImm()); + } else { + const MachineInstr *VLMI = MRI->getVRegDef(CommonVL->getReg()); + if (!MDT->dominates(VLMI, &MI)) + continue; + LLVM_DEBUG( + dbgs() << " Reduce VL from " << VLOp << " to " + << printReg(CommonVL->getReg(), MRI->getTargetRegisterInfo()) + << " for " << MI << "\n"); + + // All our checks passed. We can reduce VL. + VLOp.ChangeToRegister(CommonVL->getReg(), false); + } - // All our checks passed. We can reduce VL. - LLVM_DEBUG(dbgs() << " Reducing VL for: " << MI << "\n"); - unsigned VLOpNum = RISCVII::getVLOpNum(MI.getDesc()); - MachineOperand &VLOp = MI.getOperand(VLOpNum); - VLOp.ChangeToRegister(*CommonVL, false); MadeChange = true; // Now add all inputs to this instruction to the worklist. diff --git a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp index b883c50beadc..a57bc5a3007d 100644 --- a/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp +++ b/llvm/lib/Target/RISCV/RISCVVectorPeephole.cpp @@ -86,20 +86,6 @@ char RISCVVectorPeephole::ID = 0; INITIALIZE_PASS(RISCVVectorPeephole, DEBUG_TYPE, "RISC-V Fold Masks", false, false) -/// Given two VL operands, do we know that LHS <= RHS? -static bool isVLKnownLE(const MachineOperand &LHS, const MachineOperand &RHS) { - if (LHS.isReg() && RHS.isReg() && LHS.getReg().isVirtual() && - LHS.getReg() == RHS.getReg()) - return true; - if (RHS.isImm() && RHS.getImm() == RISCV::VLMaxSentinel) - return true; - if (LHS.isImm() && LHS.getImm() == RISCV::VLMaxSentinel) - return false; - if (!LHS.isImm() || !RHS.isImm()) - return false; - return LHS.getImm() <= RHS.getImm(); -} - /// Given \p User that has an input operand with EEW=SEW, which uses the dest /// operand of \p Src with an unknown EEW, return true if their EEWs match. 
bool RISCVVectorPeephole::hasSameEEW(const MachineInstr &User, @@ -191,7 +177,7 @@ bool RISCVVectorPeephole::tryToReduceVL(MachineInstr &MI) const { return false; MachineOperand &SrcVL = Src->getOperand(RISCVII::getVLOpNum(Src->getDesc())); - if (VL.isIdenticalTo(SrcVL) || !isVLKnownLE(VL, SrcVL)) + if (VL.isIdenticalTo(SrcVL) || !RISCV::isVLKnownLE(VL, SrcVL)) return false; if (!ensureDominates(VL, *Src)) @@ -580,7 +566,7 @@ bool RISCVVectorPeephole::foldUndefPassthruVMV_V_V(MachineInstr &MI) { MachineOperand &SrcPolicy = Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())); - if (isVLKnownLE(MIVL, SrcVL)) + if (RISCV::isVLKnownLE(MIVL, SrcVL)) SrcPolicy.setImm(SrcPolicy.getImm() | RISCVII::TAIL_AGNOSTIC); } @@ -631,7 +617,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { // so we don't need to handle a smaller source VL here. However, the // user's VL may be larger MachineOperand &SrcVL = Src->getOperand(RISCVII::getVLOpNum(Src->getDesc())); - if (!isVLKnownLE(SrcVL, MI.getOperand(3))) + if (!RISCV::isVLKnownLE(SrcVL, MI.getOperand(3))) return false; // If the new passthru doesn't dominate Src, try to move Src so it does. @@ -650,7 +636,7 @@ bool RISCVVectorPeephole::foldVMV_V_V(MachineInstr &MI) { // If MI was tail agnostic and the VL didn't increase, preserve it. int64_t Policy = RISCVII::TAIL_UNDISTURBED_MASK_UNDISTURBED; if ((MI.getOperand(5).getImm() & RISCVII::TAIL_AGNOSTIC) && - isVLKnownLE(MI.getOperand(3), SrcVL)) + RISCV::isVLKnownLE(MI.getOperand(3), SrcVL)) Policy |= RISCVII::TAIL_AGNOSTIC; Src->getOperand(RISCVII::getVecPolicyOpNum(Src->getDesc())).setImm(Policy); diff --git a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll index 0b3e67ec8955..1a1472fcfc66 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vl-opt.ll @@ -11,19 +11,46 @@ declare @llvm.riscv.vadd.nxv4i32.nxv4i32(, , , iXLen) define @different_imm_vl_with_ta( %passthru, %a, %b, iXLen %vl1, iXLen %vl2) { -; CHECK-LABEL: different_imm_vl_with_ta: -; CHECK: # %bb.0: -; CHECK-NEXT: vsetivli zero, 5, e32, m2, ta, ma -; CHECK-NEXT: vadd.vv v8, v10, v12 -; CHECK-NEXT: vsetivli zero, 4, e32, m2, ta, ma -; CHECK-NEXT: vadd.vv v8, v8, v10 -; CHECK-NEXT: ret +; NOVLOPT-LABEL: different_imm_vl_with_ta: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetivli zero, 5, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v12 +; NOVLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: different_imm_vl_with_ta: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v10, v12 +; VLOPT-NEXT: vadd.vv v8, v8, v10 +; VLOPT-NEXT: ret %v = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %a, %b, iXLen 5) %w = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %v, %a, iXLen 4) ret %w } -; No benificial to propagate VL since VL is larger in the use side. 
+define @vlmax_and_imm_vl_with_ta( %passthru, %a, %b, iXLen %vl1, iXLen %vl2) { +; NOVLOPT-LABEL: vlmax_and_imm_vl_with_ta: +; NOVLOPT: # %bb.0: +; NOVLOPT-NEXT: vsetvli a0, zero, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v10, v12 +; NOVLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; NOVLOPT-NEXT: vadd.vv v8, v8, v10 +; NOVLOPT-NEXT: ret +; +; VLOPT-LABEL: vlmax_and_imm_vl_with_ta: +; VLOPT: # %bb.0: +; VLOPT-NEXT: vsetivli zero, 4, e32, m2, ta, ma +; VLOPT-NEXT: vadd.vv v8, v10, v12 +; VLOPT-NEXT: vadd.vv v8, v8, v10 +; VLOPT-NEXT: ret + %v = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %a, %b, iXLen -1) + %w = call @llvm.riscv.vadd.nxv4i32.nxv4i32( poison, %v, %a, iXLen 4) + ret %w +} + +; Not beneficial to propagate VL since VL is larger in the use side. define @different_imm_vl_with_ta_larger_vl( %passthru, %a, %b, iXLen %vl1, iXLen %vl2) { ; CHECK-LABEL: different_imm_vl_with_ta_larger_vl: ; CHECK: # %bb.0: @@ -50,8 +77,7 @@ define @different_imm_reg_vl_with_ta( %pass ret %w } - -; No benificial to propagate VL since VL is already one. +; Not beneficial to propagate VL since VL is already one. define @different_imm_vl_with_ta_1( %passthru, %a, %b, iXLen %vl1, iXLen %vl2) { ; CHECK-LABEL: different_imm_vl_with_ta_1: ; CHECK: # %bb.0: @@ -110,7 +136,3 @@ define @different_imm_vl_with_tu( %passthru %w = call @llvm.riscv.vadd.nxv4i32.nxv4i32( %passthru, %v, %a,iXLen 4) ret %w } - -;; NOTE: These prefixes are unused and the list is autogenerated. Do not add tests below this line: -; NOVLOPT: {{.*}} -; VLOPT: {{.*}} -- GitLab From 36d936a2d057ddbd7822614edf01e39a0c21d654 Mon Sep 17 00:00:00 2001 From: Matthias Springer Date: Wed, 16 Oct 2024 21:04:11 +0200 Subject: [PATCH 167/329] [mlir][IR] Improve error message when return type could not be inferred (#112336) Print an error such as the following one before terminating program execution. ``` mlir/test/Dialect/SparseTensor/convert_dense2sparse.mlir:26:8: remark: location of op %0 = sparse_tensor.convert %arg0 : tensor to tensor ^ LLVM ERROR: Failed to infer result type(s): "sparse_tensor.positions"(...) {} : (index) -> ( ??? ) (stack trace follows) ``` --- .../include/mlir/Interfaces/InferTypeOpInterface.h | 4 ++++ mlir/lib/Interfaces/InferTypeOpInterface.cpp | 14 ++++++++++++++ mlir/test/mlir-tblgen/op-decl-and-defs.td | 5 +++++ mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp | 3 ++- 4 files changed, 25 insertions(+), 1 deletion(-) diff --git a/mlir/include/mlir/Interfaces/InferTypeOpInterface.h b/mlir/include/mlir/Interfaces/InferTypeOpInterface.h index 47bcfc9bbd4f..4fcbeff9df56 100644 --- a/mlir/include/mlir/Interfaces/InferTypeOpInterface.h +++ b/mlir/include/mlir/Interfaces/InferTypeOpInterface.h @@ -244,6 +244,10 @@ inferReturnTensorTypes(ArrayRef retComponents, /// Verifies that the inferred result types match the actual result types for /// the op. Precondition: op implements InferTypeOpInterface. LogicalResult verifyInferredResultTypes(Operation *op); + +/// Report a fatal error indicating that the result types could not be +/// inferred. 
+void reportFatalInferReturnTypesError(OperationState &state);
} // namespace detail

 namespace OpTrait {
diff --git a/mlir/lib/Interfaces/InferTypeOpInterface.cpp b/mlir/lib/Interfaces/InferTypeOpInterface.cpp
index e52d0e17cda2..8cc4206dae6e 100644
--- a/mlir/lib/Interfaces/InferTypeOpInterface.cpp
+++ b/mlir/lib/Interfaces/InferTypeOpInterface.cpp
@@ -247,3 +247,17 @@ LogicalResult mlir::detail::verifyInferredResultTypes(Operation *op) {
 return result;
}
+
+void mlir::detail::reportFatalInferReturnTypesError(OperationState &state) {
+ std::string buffer;
+ llvm::raw_string_ostream os(buffer);
+ os << "Failed to infer result type(s):\n";
+ os << "\"" << state.name << "\"(...) ";
+ os << state.attributes.getDictionary(state.location.getContext());
+ os << " : (";
+ llvm::interleaveComma(state.operands, os,
+ [&](Value val) { os << val.getType(); });
+ os << ") -> ( ??? )";
+ emitRemark(state.location, "location of op");
+ llvm::report_fatal_error(llvm::StringRef(buffer));
+}
diff --git a/mlir/test/mlir-tblgen/op-decl-and-defs.td b/mlir/test/mlir-tblgen/op-decl-and-defs.td
index 31dd53725c59..a03d0b40d465 100644
--- a/mlir/test/mlir-tblgen/op-decl-and-defs.td
+++ b/mlir/test/mlir-tblgen/op-decl-and-defs.td
@@ -208,6 +208,11 @@ def NS_FOp : NS_Op<"op_with_all_types_constraint",
// CHECK-LABEL: class FOp :
// CHECK: static ::llvm::LogicalResult inferReturnTypes
+// DEFS: void FOp::build(::mlir::OpBuilder &odsBuilder, ::mlir::OperationState &odsState, ::mlir::Value a) {
+// DEFS: if (::mlir::succeeded(FOp::inferReturnTypes(odsBuilder.getContext(),
+// DEFS: else
+// DEFS: ::mlir::detail::reportFatalInferReturnTypesError(odsState);
+
def NS_GOp : NS_Op<"op_with_fixed_return_type", []> {
 let arguments = (ins AnyType:$a);
 let results = (outs I32:$b);
diff --git a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
index ce2b6ed94c39..71fa5011a476 100644
--- a/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
+++ b/mlir/tools/mlir-tblgen/OpDefinitionsGen.cpp
@@ -2503,7 +2503,8 @@ void OpEmitter::genSeparateArgParamBuilder() {
 {1}.regions, inferredReturnTypes)))
 {1}.addTypes(inferredReturnTypes);
 else
- ::llvm::report_fatal_error("Failed to infer result type(s).");)",
+ ::mlir::detail::reportFatalInferReturnTypesError({1});
+ )",
 opClass.getClassName(), builderOpState);
 return;
 }
--
GitLab

From ad5e2bf6e934abd9fef39d3b88f40d4f3c1a7d60 Mon Sep 17 00:00:00 2001
From: Brooks Davis
Date: Wed, 16 Oct 2024 12:41:38 -0700
Subject: [PATCH 168/329] [lldb] Finish Turn lldb_private::Status into a value
 type. (#10616) (#112420)

Fix a few bare Status() invocations that were missed in the conversion.

This is sufficient to build lldb on FreeBSD/aarch64.
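For example (a sketch distilled from the diff below, not an additional
change), the pattern being applied is:

    // Before: relied on Status's removed printf-style constructor.
    return Status("no lldb regnum for %s", reg_info->name);

    // After: an explicit factory method on the value type.
    return Status::FromErrorStringWithFormat("no lldb regnum for %s",
                                             reg_info->name);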
Fixes: 0642cd768b80
---
 .../NativeRegisterContextFreeBSD_arm64.cpp | 20 +++++++++----------
 1 file changed, 9 insertions(+), 11 deletions(-)

diff --git a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp
index 1a6defbff354..7adc00622ec2 100644
--- a/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp
+++ b/lldb/source/Plugins/Process/FreeBSD/NativeRegisterContextFreeBSD_arm64.cpp
@@ -119,17 +119,15 @@ NativeRegisterContextFreeBSD_arm64::ReadRegister(const RegisterInfo *reg_info,
 RegisterValue &reg_value) {
 Status error;

- if (!reg_info) {
- error = Status::FromErrorString("reg_info NULL");
- return error;
- }
+ if (!reg_info)
+ return Status::FromErrorString("reg_info NULL");

 const uint32_t reg = reg_info->kinds[lldb::eRegisterKindLLDB];

 if (reg == LLDB_INVALID_REGNUM)
- return Status("no lldb regnum for %s", reg_info && reg_info->name
- ? reg_info->name
- : "");
+ return Status::FromErrorStringWithFormat(
+ "no lldb regnum for %s",
+ reg_info && reg_info->name ? reg_info->name : "");

 uint32_t set = GetRegisterInfo().GetRegisterSetFromRegisterIndex(reg);
 error = ReadRegisterSet(set);
@@ -147,14 +145,14 @@ Status NativeRegisterContextFreeBSD_arm64::WriteRegister(
 Status error;

 if (!reg_info)
- return Status("reg_info NULL");
+ return Status::FromErrorString("reg_info NULL");

 const uint32_t reg = reg_info->kinds[lldb::eRegisterKindLLDB];

 if (reg == LLDB_INVALID_REGNUM)
- return Status("no lldb regnum for %s", reg_info && reg_info->name
- ? reg_info->name
- : "");
+ return Status::FromErrorStringWithFormat(
+ "no lldb regnum for %s",
+ reg_info && reg_info->name ? reg_info->name : "");

 uint32_t set = GetRegisterInfo().GetRegisterSetFromRegisterIndex(reg);
 error = ReadRegisterSet(set);
--
GitLab

From 5000c688bf9dad3ed5ec98cf427b3c5160e6e74c Mon Sep 17 00:00:00 2001
From: Artem Dergachev
Date: Wed, 16 Oct 2024 12:42:49 -0700
Subject: [PATCH 169/329] [-Wunsafe-buffer-usage] Add user documentation.
 (#111624)

Originally: https://reviews.llvm.org/D136811
---
 clang/docs/SafeBuffers.rst | 585 +++++++++++++++++++++++++++++++++++++
 clang/docs/index.rst | 1 +
 2 files changed, 586 insertions(+)
 create mode 100644 clang/docs/SafeBuffers.rst

diff --git a/clang/docs/SafeBuffers.rst b/clang/docs/SafeBuffers.rst
new file mode 100644
index 000000000000..144c3a76a583
--- /dev/null
+++ b/clang/docs/SafeBuffers.rst
@@ -0,0 +1,585 @@
+================
+C++ Safe Buffers
+================
+
+.. contents::
+   :local:
+
+
+Introduction
+============
+
+Clang can be used to harden your C++ code against buffer overflows, an otherwise
+common security issue with C-based languages.
+
+The solution described in this document is an integrated programming model as
+it combines:
+
+- a family of opt-in Clang warnings (``-Wunsafe-buffer-usage``) emitted
+  during compilation to help you update your code to encapsulate and propagate
+  the bounds information associated with pointers;
+- runtime assertions implemented as part of
+  (`libc++ hardening modes <https://libcxx.llvm.org/Hardening.html>`_)
+  that eliminate undefined behavior as long as the coding convention
+  is followed and the bounds information is therefore available and correct.
+
+The goal of this work is to enable development of bounds-safe C++ code. It is
+not a "push-button" solution; depending on your codebase's existing
+coding style, significant (even if largely mechanical) changes to your code
+may be necessary. However, it allows you to achieve valuable safety guarantees
+on security-critical parts of your codebase.
+
+This solution is under active development. It is already useful for its purpose
+but more work is being done to improve ergonomics and safety guarantees
+and reduce adoption costs.
+
+The solution aligns in spirit with the "Ranges" safety profile
+that was `proposed `_
+by Bjarne Stroustrup for standardization alongside other C++ safety features.
+
+
+Pre-Requisites
+==============
+
+In order to achieve bounds safety, your codebase needs to have access to
+well-encapsulated bounds-safe container, view, and iterator types.
+If your project uses libc++, standard container and view types such as
+``std::vector`` and ``std::span`` can be made bounds-safe by enabling
+the "fast" `hardening mode <https://libcxx.llvm.org/Hardening.html>`_
+(passing ``-D_LIBCPP_HARDENING_MODE=_LIBCPP_HARDENING_MODE_FAST`` to your
+compiler) or any of the stricter hardening modes.
+
+In order to harden iterators, you'll need to also obtain a libc++ binary
+built with ``_LIBCPP_ABI_BOUNDED_ITERATORS`` -- which is a libc++ ABI setting
+that needs to be set for your entire target platform if you need to maintain
+binary compatibility with the rest of the platform.
+
+A relatively fresh version of C++ is recommended. In particular, the very useful
+standard view class ``std::span`` requires C++20.
+
+Other implementations of the C++ standard library may provide different
+flags to enable such hardening.
+
+If you're using custom containers and views, they will need to be hardened
+this way as well, but you don't necessarily need to do this ahead of time.
+
+This approach can theoretically be applied to plain C codebases,
+assuming that safe primitives are developed to encapsulate all buffer accesses,
+acting as "hardened custom containers" to replace raw pointers.
+However, such an approach would be very unergonomic in C, and safety guarantees
+will be lower due to lack of good encapsulation technology. A better approach
+to bounds safety for non-C++ programs,
+`-fbounds-safety <https://clang.llvm.org/docs/BoundsSafety.html>`_,
+is currently in development.
+
+Technically, safety guarantees cannot be provided without hardening
+the entire technology stack, including all of your dependencies.
+However, applying such hardening technology to even a small portion
+of your code may be significantly better than nothing.
+
+
+The Programming Model for C++
+=============================
+
+Assuming that hardened container, view, and iterator classes are available,
+what remains is to make sure they are used consistently in your code.
+Below we define the specific coding convention that needs to be followed
+in order to guarantee safety and how the compiler technology
+around ``-Wunsafe-buffer-usage`` assists with that.
+
+
+Buffer operations should never be performed over raw pointers
+-------------------------------------------------------------
+
+Every time a memory access is made, a bounds-safe program must guarantee
+that the range of accessed memory addresses falls into the boundaries
+of the memory allocated for the object that's being accessed.
+In order to establish such a guarantee, the information about such valid range
+of addresses -- the **bounds information** associated with the accessed address
+-- must be formally available every time a memory access is performed.
+
+A raw pointer does not naturally carry any bounds information.
+The bounds information for the pointer may be available *somewhere*, but +it is not associated with the pointer in a formal manner, so a memory access +performed through a raw pointer cannot be automatically verified to be +bounds-safe by the compiler. + +That said, the Safe Buffers programming model does **not** try to eliminate +**all** pointer usage. Instead it assumes that most pointers point to +individual objects, not buffers, and therefore they typically aren't +associated with buffer overflow risks. For that reason, in order to identify +the code that requires manual intervention, it is desirable to initially shift +the focus away from the pointers themselves, and instead focus on their +**usage patterns**. + +The compiler warning ``-Wunsafe-buffer-usage`` is built to assist you +with this step of the process. A ``-Wunsafe-buffer-usage`` warning is +emitted whenever one of the following **buffer operations** are performed +on a raw pointer: + +- array indexing with ``[]``, +- pointer arithmetic, +- bounds-unsafe standard C functions such as ``std::memcpy()``, +- C++ smart pointer operations such as ``std::unique_ptr::operator[]()``, + which unfortunately cannot be made fully safe within the rules of + the C++ standard (as of C++23). + +This is sufficient for identifying each raw buffer pointer in the program at +**at least one point** during its lifetime across your software stack. + +For example, both of the following functions are flagged by +``-Wunsafe-buffer-usage`` because ``pointer`` gets identified as an unsafe +buffer pointer. Even though the second function does not directly access +the buffer, the pointer arithmetic operation inside it may easily be +the only formal "hint" in the program that the pointer does indeed point +to a buffer of multiple objects:: + + int get_last_element(int *pointer, size_t size) { + return ptr[sz - 1]; // warning: unsafe buffer access + } + + int *get_last_element_ptr(int *pointer, size_t size) { + return ptr + (size - 1); // warning: unsafe pointer arithmetic + } + + +All buffers need to be encapsulated into safe container and view types +---------------------------------------------------------------------- + +It immediately follows from the previous requirement that once an unsafe pointer +is identified at any point during its lifetime, it should be immediately wrapped +into a safe container type (if the allocation site is "nearby") or a safe +view type (if the allocation site is "far away"). Not only memory accesses, +but also non-access operations such as pointer arithmetic need to be covered +this way in order to benefit from the respective runtime bounds checks. + +If a **container** type (``std::array``, ``std::vector``, ``std::string``) +is used for allocating the buffer, this is the best-case scenario because +the container naturally has access to the correct bounds information for the +buffer, and the runtime bounds checks immediately kick in. Additionally, +the container type may provide automatic lifetime management for the buffer +(which may or may not be desirable). + +If a **view** type is used (``std::span``, ``std::string_view``), this typically +means that the bounds information for the "adopted" pointer needs to be passed +to the view's constructor manually. This makes runtime checks immediately +kick in with respect to the provided bounds information, which is an immediate +improvement over the raw pointer. 
+
+
+All buffers need to be encapsulated into safe container and view types
+----------------------------------------------------------------------
+
+It immediately follows from the previous requirement that once an unsafe
+pointer is identified at any point during its lifetime, it should be
+immediately wrapped into a safe container type (if the allocation site is
+"nearby") or a safe view type (if the allocation site is "far away").
+Not only memory accesses, but also non-access operations such as pointer
+arithmetic need to be covered this way in order to benefit from the
+respective runtime bounds checks.
+
+If a **container** type (``std::array``, ``std::vector``, ``std::string``)
+is used for allocating the buffer, this is the best-case scenario because
+the container naturally has access to the correct bounds information for the
+buffer, and the runtime bounds checks immediately kick in. Additionally,
+the container type may provide automatic lifetime management for the buffer
+(which may or may not be desirable).
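+
+For instance, when the allocation site is nearby, the buffer can simply live
+in a container for its entire lifetime, and every access through the
+container is checked. A minimal illustrative sketch (the function is
+hypothetical)::
+
+    #include <cstddef>
+    #include <vector>
+
+    int sum_of_squares(size_t count) {
+      std::vector<int> squares(count); // allocation site: bounds are known
+      int sum = 0;
+      for (size_t i = 0; i < squares.size(); ++i) {
+        squares[i] = static_cast<int>(i * i); // checked by hardened libc++
+        sum += squares[i];
+      }
+      return sum;
+    }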
+
+If a **view** type is used (``std::span``, ``std::string_view``), this
+typically means that the bounds information for the "adopted" pointer needs
+to be passed to the view's constructor manually. This makes runtime checks
+immediately kick in with respect to the provided bounds information, which is
+an immediate improvement over the raw pointer. However, this situation is
+still fundamentally insufficient for security purposes, because **bounds
+information provided this way cannot be guaranteed to be correct**.
+
+For example, the function ``get_last_element()`` we've seen in the previous
+section can be made **slightly** safer this way::
+
+    int get_last_element(int *pointer, size_t size) {
+      std::span<int> sp(pointer, size);
+      return sp[size - 1]; // warning addressed
+    }
+
+Here ``std::span`` eliminates the potential concern that the operation
+``size - 1`` may overflow when ``size`` is equal to ``0``, leading to a
+buffer "underrun". However, such a program does not provide a guarantee that
+the variable ``size`` correctly represents the **actual** size of the buffer
+pointed to by ``pointer``. The ``std::span`` constructed this way may be
+ill-formed. It may fail to protect you from overrunning the original buffer.
+
+The following example demonstrates one of the most dangerous anti-patterns
+of this nature::
+
+    void convert_data(int *source_buf, size_t source_size,
+                      int *target_buf, size_t target_size) {
+      // Terrible: mismatched pointer / size.
+      std::span<int> target_span(target_buf, source_size);
+      // ...
+    }
+
+The second parameter of ``std::span`` should never be the **desired** size
+of the buffer. It should always be the **actual** size of the buffer.
+Such code often indicates that the original code has already contained
+a vulnerability -- and the use of a safe view class failed to prevent it.
+
+If ``target_span`` actually needs to be of size ``source_size``, a
+significantly safer way to produce such a span would be to build it with the
+correct size first, and then shrink it to the desired size by calling
+``.first()``::
+
+    void convert_data(int *source_buf, size_t source_size,
+                      int *target_buf, size_t target_size) {
+      // Safer.
+      std::span<int> target_span =
+          std::span<int>(target_buf, target_size).first(source_size);
+      // ...
+    }
+
+However, these are still half-measures. This code still accepts the
+bounds information from the caller in an **informal** manner, and such
+bounds information cannot be guaranteed to be correct.
+
+In order to mitigate problems of this nature in their entirety,
+the third guideline is imposed.
+
+
+Encapsulation of bounds information must be respected continuously
+------------------------------------------------------------------
+
+The allocation site of the object is the only reliable source of bounds
+information for that object. For objects with long lifespans across
+multiple functions or even libraries in the software stack, it is essential
+to formally preserve the original bounds information as it's being passed
+from one piece of code to another.
+
+Standard container and view classes are designed to preserve bounds
+information correctly **by construction**. However, they offer a number of
+ways to "break" encapsulation, which may cause you to temporarily lose track
+of the correct bounds information:
+
+- The two-parameter constructor ``std::span(ptr, size)`` allows you to
+  assemble an ill-formed ``std::span``;
+- Conversely, you can unwrap a container or a view object into a raw pointer
+  and a raw size by calling its ``.data()`` and ``.size()`` methods;
+- The overloaded ``operator&()`` found on container and iterator classes
+  acts similarly to ``.data()`` in this regard; operations such as
+  ``&span[0]`` and ``&*span.begin()`` are effectively unsafe.
+
+Additional ``-Wunsafe-buffer-usage`` warnings are emitted when encapsulation
+of **standard** containers is broken in this manner. If you're using
+non-standard containers, you can achieve a similar effect with facilities
+described in the next section: :ref:`customization`.
+
+For example, our previous attempt to address the warning in
+``get_last_element()`` has actually introduced a new warning along the way,
+which notifies you about the potentially incorrect bounds information
+passed into the two-parameter constructor of ``std::span``::
+
+    int get_last_element(int *pointer, size_t size) {
+      std::span<int> sp(pointer, size); // warning: unsafe constructor
+      return sp[size - 1];
+    }
+
+In order to address this warning, you need to make the function receive
+the bounds information from the allocation site in a formal manner.
+The function doesn't necessarily need to know where the allocation site is;
+it simply needs to be able to accept bounds information **when** it's
+available. You can achieve this by refactoring the function to accept
+a ``std::span`` as a parameter::
+
+    int get_last_element(std::span<int> sp) {
+      return sp[sp.size() - 1];
+    }
+
+This solution puts the responsibility for making sure the span is
+well-formed on the **caller**. The caller should do the same, so that
+eventually the responsibility is placed on the allocation site!
+
+Such a definition is also very ergonomic as it naturally accepts arbitrary
+standard containers without any additional code at the call site::
+
+    void use_last_element() {
+      std::vector<int> vec { 1, 2, 3 };
+      int x = get_last_element(vec); // x = 3
+    }
+
+Such code is naturally bounds-safe because bounds information is passed down
+from the allocation site to the buffer access site. Only safe operations
+are performed on container types. The containers are never "unforged" into
+raw pointer-size pairs and never "reforged" again. This is what ideal
+bounds-safe C++ code looks like.
+
+
+.. _customization:
+
+Backwards Compatibility, Interoperation with Unsafe Code, Customization
+=======================================================================
+
+Some of the code changes described above can be somewhat intrusive.
+For example, changing a function that previously accepted a pointer and a
+size separately, to accept a ``std::span`` instead, may require you to
+update every call site of the function. This is often undesirable and
+sometimes completely unacceptable when backwards compatibility is required.
+
+In order to facilitate **incremental adoption** of the coding convention
+described above, as well as to handle various unusual situations, the
+compiler provides two additional facilities to give the user more control
+over ``-Wunsafe-buffer-usage`` diagnostics:
+
+- ``#pragma clang unsafe_buffer_usage`` to mark code as unsafe and **suppress**
+  ``-Wunsafe-buffer-usage`` warnings in that code.
+- ``[[clang::unsafe_buffer_usage]]`` to annotate potential sources of
+  discontinuity of bounds information -- thus introducing
+  **additional** ``-Wunsafe-buffer-usage`` warnings.
+
+In this section we describe these facilities in detail and show how they can
+help you with various unusual situations.
+
+Suppress unwanted warnings with ``#pragma clang unsafe_buffer_usage``
+---------------------------------------------------------------------
+
+If you really need to write unsafe code, you can always suppress all
+``-Wunsafe-buffer-usage`` warnings in a section of code by surrounding
+that code with the ``unsafe_buffer_usage`` pragma.
+For example, if you don't want to address the warning in our example
+function ``get_last_element()``, here is how you can suppress it::
+
+    int get_last_element(int *pointer, size_t size) {
+      #pragma clang unsafe_buffer_usage begin
+      return pointer[size - 1]; // warning suppressed
+      #pragma clang unsafe_buffer_usage end
+    }
+
+This behavior is analogous to ``#pragma clang diagnostic`` (`documentation
+`_).
+However, ``#pragma clang unsafe_buffer_usage`` is specialized and recommended
+over ``#pragma clang diagnostic`` for a number of technical and non-technical
+reasons. Most importantly, ``#pragma clang unsafe_buffer_usage`` is more
+suitable for security audits because it is significantly simpler and
+describes unsafe code in a more formal manner. On the contrary,
+``#pragma clang diagnostic`` comes with a push/pop syntax (as opposed to
+the begin/end syntax) and it offers ways to suppress warnings without
+mentioning them by name (such as ``-Weverything``), which can make it
+difficult to determine at a glance whether the warning is suppressed
+on any given line of code.
+
+There are a few natural reasons to use this pragma:
+
+- In implementations of safe custom containers. You need this because
+  ultimately ``-Wunsafe-buffer-usage`` cannot help you verify that your
+  custom container is safe. It will naturally remind you to audit your
+  container's implementation to make sure it has all the necessary runtime
+  checks, but ultimately you'll need to suppress it once the audit is
+  complete (see the sketch right after this list).
+- In performance-critical code where bounds-safety-related runtime checks
+  cause an unacceptable performance regression. The compiler can
+  theoretically optimize them away (e.g., replace a repeated bounds check
+  in a loop with a single check before the loop) but it is not guaranteed
+  to do that.
+- For incremental adoption purposes. If you want to adopt the coding
+  convention gradually, you can always surround an entire file with the
+  ``unsafe_buffer_usage`` pragma and then "make holes" in it whenever
+  you address warnings on specific portions of the code.
+- In code that interoperates with unsafe code. This may be code that
+  will never follow the programming model (such as plain C code that will
+  never be converted to C++) or code that simply hasn't been converted yet.
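+
+To illustrate the first item of this list, here is a minimal sketch of what
+an audited custom container could look like. The class and its checking
+policy are entirely illustrative, not a real library::
+
+    #include <cstddef>
+    #include <cstdlib>
+
+    template <typename T>
+    class MyBuffer {
+      T *data_ = nullptr;
+      size_t size_ = 0;
+
+    public:
+      T &operator[](size_t index) {
+        if (index >= size_)
+          std::abort(); // the runtime check that a security audit looks for
+        #pragma clang unsafe_buffer_usage begin
+        return data_[index]; // audited: guarded by the check above
+        #pragma clang unsafe_buffer_usage end
+      }
+    };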
+
+Interoperation with unsafe code may require a lot of suppressions.
+You are encouraged to introduce "unsafe wrapper functions" for various
+unsafe operations that you need to perform regularly.
+
+For example, if you regularly receive pointer/size pairs from unsafe code,
+you may want to introduce a wrapper function for the unsafe span
+constructor::
+
+    #pragma clang unsafe_buffer_usage begin
+
+    template <typename T>
+    std::span<T> unsafe_forge_span(T *pointer, size_t size) {
+      return std::span<T>(pointer, size);
+    }
+
+    #pragma clang unsafe_buffer_usage end
+
+Such a wrapper function can be used to suppress warnings about unsafe span
+constructor usage in a more ergonomic manner::
+
+    void use_unsafe_c_struct(unsafe_c_struct *s) {
+      // No warning here.
+      std::span<int> sp = unsafe_forge_span(s->pointer, s->size);
+      // ...
+    }
+
+The code remains unsafe but it also continues to be nicely readable, and it
+proves that ``-Wunsafe-buffer-usage`` has done its best to notify you about
+the potential unsafety. A security auditor will need to keep an eye on such
+unsafe wrappers. **It is still up to you to confirm that the bounds
+information passed into the wrapper is correct.**
+
+
+Flag bounds information discontinuities with ``[[clang::unsafe_buffer_usage]]``
+-------------------------------------------------------------------------------
+
+The clang attribute ``[[clang::unsafe_buffer_usage]]``
+(`attribute documentation
+`_)
+allows the user to annotate various objects, such as functions or member
+variables, as incompatible with the Safe Buffers programming model.
+You are encouraged to do that for arbitrary reasons, but typically the main
+reason to do that is when an unsafe function needs to be provided for
+backwards compatibility.
+
+For example, in the previous section we've seen how the example function
+``get_last_element()`` needed to have its parameter types changed in order
+to preserve the continuity of bounds information when receiving a buffer
+pointer from the caller. However, such a change breaks both API and ABI
+compatibility. The code that previously used this function will no longer
+compile, nor link, until every call site of that function is updated. You
+can reclaim the backwards compatibility -- in terms of both API and ABI --
+by adding a "compatibility overload"::
+
+    int get_last_element(std::span<int> sp) {
+      return sp[sp.size() - 1];
+    }
+
+    [[clang::unsafe_buffer_usage]] // Please use the new function.
+    int get_last_element(int *pointer, size_t size) {
+      // Avoid code duplication - simply invoke the safe function!
+      // The pragma suppresses the unsafe constructor warning.
+      #pragma clang unsafe_buffer_usage begin
+      return get_last_element(std::span<int>(pointer, size));
+      #pragma clang unsafe_buffer_usage end
+    }
+
+Such an overload allows the surrounding code to continue to work.
+It is both source-compatible and binary-compatible. It is also strictly
+safer than the original function because the unsafe buffer access through a
+raw pointer is replaced with a safe ``std::span`` access no matter how it's
+called. However, because it requires the caller to pass the pointer and the
+size separately, it violates our "bounds information continuity" principle.
+This means that the callers who care about bounds safety need to be
+encouraged to use the ``std::span``-based overload instead. Luckily, the
+attribute ``[[clang::unsafe_buffer_usage]]`` causes a
+``-Wunsafe-buffer-usage`` warning to be displayed at every call site of the
+compatibility overload in order to remind the callers to update their
+code::
+
+    void use_last_element() {
+      std::vector<int> vec { 1, 2, 3 };
+
+      // no warning
+      int x = get_last_element(vec);
+
+      // warning: this overload introduces unsafe buffer manipulation
+      int y = get_last_element(vec.data(), vec.size());
+    }
+
+The compatibility overload can be further simplified with the help of the
+``unsafe_forge_span()`` wrapper as described in the previous section --
+and it even makes the pragmas unnecessary::
+
+    [[clang::unsafe_buffer_usage]] // Please use the new function.
+    int get_last_element(int *pointer, size_t size) {
+      // Avoid code duplication - simply invoke the safe function!
+      return get_last_element(unsafe_forge_span(pointer, size));
+    }
+
+Notice how the attribute ``[[clang::unsafe_buffer_usage]]`` does **not**
+suppress the warnings within the function on its own. Similarly, functions
+whose entire definitions are covered by ``#pragma clang unsafe_buffer_usage``
+do **not** become automatically annotated with the attribute
+``[[clang::unsafe_buffer_usage]]``. They serve two different purposes:
+
+- The pragma says that the function isn't safely **written**;
+- The attribute says that the function isn't safe to **use**.
+
+Also notice how we've made an **unsafe** wrapper for a **safe** function.
+This is significantly better than making a **safe** wrapper for an
+**unsafe** function. In other words, the following solution is significantly
+more unsafe and undesirable than the previous solution::
+
+    int get_last_element(std::span<int> sp) {
+      // You've just added that attribute, and now you need to
+      // immediately suppress the warning that comes with it?
+      #pragma clang unsafe_buffer_usage begin
+      return get_last_element(sp.data(), sp.size());
+      #pragma clang unsafe_buffer_usage end
+    }
+
+    [[clang::unsafe_buffer_usage]]
+    int get_last_element(int *pointer, size_t size) {
+      // This access is still completely unchecked. What's the point of
+      // having perfect bounds information if you aren't performing
+      // runtime checks?
+      #pragma clang unsafe_buffer_usage begin
+      return pointer[size - 1];
+      #pragma clang unsafe_buffer_usage end
+    }
+
+**Structs and classes**, unlike functions, cannot be overloaded. If a struct
+contains an unsafe buffer (in the form of a nested array or a pointer/size
+pair) then it is typically impossible to replace it with a safe container
+(such as ``std::array`` or ``std::span`` respectively) without breaking the
+layout of the struct and introducing both source and binary
+incompatibilities with the surrounding client code.
+
+Additionally, member variables of a class cannot be naturally "hidden" from
+client code. If a class needs to be used by clients who haven't updated to
+C++20 yet, you cannot use the C++20-specific ``std::span`` as a member
+variable type. If the definition of a struct is shared with plain C code
+that manipulates member variables directly, you cannot use any C++-specific
+types for these member variables.
+
+In such cases there's usually no backwards-compatible way to use safe types
+directly. The best option is usually to discourage the clients from using
+member variables directly by annotating the member variables with the
+attribute ``[[clang::unsafe_buffer_usage]]``, and then to change the
+interface of the class to provide safe "accessors" to the unsafe data.
+
+For example, let's assume the worst-case scenario: ``struct foo`` is an
+unsafe struct type fully defined in a header shared between plain C code
+and C++ code::
+
+    struct foo {
+      int *pointer;
+      size_t size;
+    };
+
+In this case you can achieve safety in C++ code by annotating the member
+variables as unsafe and encapsulating them into safe accessor methods::
+
+    struct foo {
+      [[clang::unsafe_buffer_usage]]
+      int *pointer;
+      [[clang::unsafe_buffer_usage]]
+      size_t size;
+
+    // Avoid showing this code to clients who are unable to digest it.
+    #if __cplusplus >= 202002L
+      std::span<int> get_pointer_as_span() {
+      #pragma clang unsafe_buffer_usage begin
+        return std::span<int>(pointer, size);
+      #pragma clang unsafe_buffer_usage end
+      }
+
+      void set_pointer_from_span(std::span<int> sp) {
+      #pragma clang unsafe_buffer_usage begin
+        pointer = sp.data();
+        size = sp.size();
+      #pragma clang unsafe_buffer_usage end
+      }
+
+      // Potentially more utility functions.
+    #endif
+    };
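+
+From this point on, C++ clients can stay within the programming model by
+going through the accessors. A brief illustrative sketch, assuming (as
+described above) that direct uses of the annotated members are diagnosed::
+
+    int sum_foo(struct foo *f) {
+      std::span<int> sp = f->get_pointer_as_span(); // no warning
+      int total = 0;
+      for (int value : sp) // bounds-safe iteration over the span
+        total += value;
+      return total;
+    }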
+
+
+Future Work
+===========
+
+The ``-Wunsafe-buffer-usage`` technology is in active development. The
+warning is largely ready for everyday use but it is continuously improved
+to reduce unnecessary noise as well as to cover some of the trickier unsafe
+operations.
+
+Fix-It Hints for ``-Wunsafe-buffer-usage``
+------------------------------------------
+
+A code transformation tool is in development that can semi-automatically
+transform large bodies of code to follow the C++ Safe Buffers programming
+model. It can currently be accessed by passing the experimental flag
+``-fsafe-buffer-usage-suggestions`` in addition to ``-Wunsafe-buffer-usage``.
+
+Fix-its produced this way currently assume the default approach described
+in this document, as they suggest standard containers and views (most
+notably ``std::span`` and ``std::array``) as replacements for raw buffer
+pointers. This additionally requires libc++ hardening in order to make the
+runtime bounds checks actually happen.
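+
+For example, a typical trial run on a single file could look like this;
+the file name is illustrative, and both flags are the ones described
+above::
+
+    clang++ -std=c++20 -Wunsafe-buffer-usage \
+        -fsafe-buffer-usage-suggestions my_file.cpp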
+
+Static Analysis to Identify Suspicious Sources of Bounds Information
+--------------------------------------------------------------------
+
+The unsafe constructor ``span(pointer, size)`` is often a necessary evil
+when it comes to interoperation with unsafe code. However, passing the
+correct bounds information to such a constructor is often difficult.
+In order to detect those ``span(target_pointer, source_size)`` anti-patterns,
+the path-sensitive analysis performed by `the clang static analyzer
+`_ can be taught to identify situations
+when the pointer and the size are coming from "suspiciously different"
+sources.
+
+Such analysis will be able to identify the source of information with
+significantly higher precision than that of the compiler, making it much
+better at identifying incorrect bounds information in your code while
+producing significantly fewer warnings. It will also need to bypass
+``#pragma clang unsafe_buffer_usage`` suppressions and "see through"
+unsafe wrappers such as ``unsafe_forge_span`` -- something that
+the static analyzer is naturally capable of doing.
diff --git a/clang/docs/index.rst b/clang/docs/index.rst index f4fdc93290a0..0f6fb36c4d33 100644 --- a/clang/docs/index.rst +++ b/clang/docs/index.rst @@ -25,6 +25,7 @@ Using Clang as a Compiler CrossCompilation ClangStaticAnalyzer ThreadSafetyAnalysis + SafeBuffers DataFlowAnalysisIntro AddressSanitizer ThreadSanitizer -- GitLab From 6fcea431eed78f75e8ddb48e074c0078b93c109f Mon Sep 17 00:00:00 2001 From: Ramkumar Ramachandra Date: Wed, 16 Oct 2024 21:01:28 +0100 Subject: [PATCH 170/329] LVer: improve a test, regen with UTC (NFC) (#112544) --- ...wrapping-pointer-non-integral-addrspace.ll | 140 +++++++++++++----- 1 file changed, 107 insertions(+), 33 deletions(-) diff --git a/llvm/test/Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll b/llvm/test/Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll index 430baa1cb4f8..5abdde9e0564 100644 --- a/llvm/test/Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll +++ b/llvm/test/Transforms/LoopVersioning/wrapping-pointer-non-integral-addrspace.ll @@ -1,4 +1,5 @@ -; RUN: opt -passes=loop-versioning -S < %s | FileCheck %s -check-prefix=LV +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -passes=loop-versioning -S < %s | FileCheck %s ; NB: addrspaces 10-13 are non-integral target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13" @@ -12,40 +13,113 @@ target datalayout = "e-m:e-i64:64-f80:128-n8:16:32:64-S128-ni:10:11:12:13" declare i64 @julia_steprange_last_4949() -define void @"japi1_align!_9477"(ptr %arg) { -; LV-LAVEL: L26.lver.check -; LV: [[OFMul:%[^ ]*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[Step:%[^ ]*]]) -; LV-NEXT: [[OFMulResult:%[^ ]*]] = extractvalue { i64, i1 } [[OFMul]], 0 -; LV-NEXT: [[OFMulOverflow:%[^ ]*]] = extractvalue { i64, i1 } [[OFMul]], 1 -; LV: [[OFNegMulResult:%[^ ]*]] = sub i64 0, [[OFMulResult]] -; LV-NEXT: [[NegGEP:%[^ ]*]] = getelementptr i8, ptr addrspace(13) [[Base:%[^ ]*]], i64 [[OFNegMulResult]] -; LV-NEXT: icmp ugt ptr addrspace(13) [[NegGEP]], [[Base]] -; LV-NOT: inttoptr -; LV-NOT: ptrtoint +define void @wrapping_ptr_nonint_addrspace(ptr %arg) { +; CHECK-LABEL: define void @wrapping_ptr_nonint_addrspace( +; CHECK-SAME: ptr [[ARG:%.*]]) { +; CHECK-NEXT: [[LOOP_LVER_CHECK:.*:]] +; CHECK-NEXT: [[LOAD0:%.*]] = load ptr addrspace(10), ptr [[ARG]], align 8 +; CHECK-NEXT: [[LOAD1:%.*]] = load i32, ptr inttoptr (i64 12 to ptr), align 4 +; CHECK-NEXT: [[SUB:%.*]] = sub i32 0, [[LOAD1]] +; CHECK-NEXT: [[CALL:%.*]] = call i64 @julia_steprange_last_4949() +; CHECK-NEXT: [[CAST0:%.*]] = addrspacecast ptr addrspace(10) [[LOAD0]] to ptr addrspace(11) +; CHECK-NEXT: [[LOAD2:%.*]] = load ptr addrspace(10), ptr addrspace(11) [[CAST0]], align 8 +; CHECK-NEXT: [[CAST1:%.*]] = addrspacecast ptr addrspace(10) [[LOAD2]] to ptr addrspace(11) +; CHECK-NEXT: [[LOAD3:%.*]] = load ptr addrspace(13), ptr addrspace(11) [[CAST1]], align 8 +; CHECK-NEXT: [[SEXT:%.*]] = sext i32 [[SUB]] to i64 +; CHECK-NEXT: [[TMP0:%.*]] = shl i64 [[CALL]], 2 +; CHECK-NEXT: [[TMP1:%.*]] = shl nsw i64 [[SEXT]], 2 +; CHECK-NEXT: [[TMP2:%.*]] = add i64 [[TMP0]], [[TMP1]] +; CHECK-NEXT: [[TMP3:%.*]] = add i64 [[TMP2]], -4 +; CHECK-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 [[TMP3]] +; CHECK-NEXT: [[SCEVGEP1:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 [[TMP1]] +; CHECK-NEXT: [[TMP4:%.*]] = add i64 [[TMP0]], -4 +; CHECK-NEXT: [[SCEVGEP2:%.*]] = getelementptr i8, ptr 
addrspace(13) [[LOAD3]], i64 [[TMP4]] +; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr addrspace(13) [[SCEVGEP]], [[LOAD3]] +; CHECK-NEXT: [[BOUND1:%.*]] = icmp ult ptr addrspace(13) [[SCEVGEP2]], [[SCEVGEP1]] +; CHECK-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] +; CHECK-NEXT: [[TMP5:%.*]] = sub i64 0, [[CALL]] +; CHECK-NEXT: [[TMP6:%.*]] = shl nsw i64 [[SEXT]], 2 +; CHECK-NEXT: [[TMP7:%.*]] = add nsw i64 [[TMP6]], -4 +; CHECK-NEXT: [[SCEVGEP3:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 [[TMP7]] +; CHECK-NEXT: [[MUL:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP5]]) +; CHECK-NEXT: [[MUL_RESULT:%.*]] = extractvalue { i64, i1 } [[MUL]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW:%.*]] = extractvalue { i64, i1 } [[MUL]], 1 +; CHECK-NEXT: [[TMP8:%.*]] = sub i64 0, [[MUL_RESULT]] +; CHECK-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr addrspace(13) [[SCEVGEP3]], i64 [[TMP8]] +; CHECK-NEXT: [[TMP10:%.*]] = icmp ugt ptr addrspace(13) [[TMP9]], [[SCEVGEP3]] +; CHECK-NEXT: [[TMP11:%.*]] = or i1 [[TMP10]], [[MUL_OVERFLOW]] +; CHECK-NEXT: [[SCEVGEP4:%.*]] = getelementptr i8, ptr addrspace(13) [[LOAD3]], i64 -4 +; CHECK-NEXT: [[MUL5:%.*]] = call { i64, i1 } @llvm.umul.with.overflow.i64(i64 4, i64 [[TMP5]]) +; CHECK-NEXT: [[MUL_RESULT6:%.*]] = extractvalue { i64, i1 } [[MUL5]], 0 +; CHECK-NEXT: [[MUL_OVERFLOW7:%.*]] = extractvalue { i64, i1 } [[MUL5]], 1 +; CHECK-NEXT: [[TMP12:%.*]] = sub i64 0, [[MUL_RESULT6]] +; CHECK-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr addrspace(13) [[SCEVGEP4]], i64 [[TMP12]] +; CHECK-NEXT: [[TMP14:%.*]] = icmp ugt ptr addrspace(13) [[TMP13]], [[SCEVGEP4]] +; CHECK-NEXT: [[TMP15:%.*]] = or i1 [[TMP14]], [[MUL_OVERFLOW7]] +; CHECK-NEXT: [[TMP16:%.*]] = or i1 [[TMP11]], [[TMP15]] +; CHECK-NEXT: [[LVER_SAFE:%.*]] = or i1 [[FOUND_CONFLICT]], [[TMP16]] +; CHECK-NEXT: br i1 [[LVER_SAFE]], label %[[LOOP_PH_LVER_ORIG:.*]], label %[[LOOP_PH:.*]] +; CHECK: [[LOOP_PH_LVER_ORIG]]: +; CHECK-NEXT: br label %[[LOOP_LVER_ORIG:.*]] +; CHECK: [[LOOP_LVER_ORIG]]: +; CHECK-NEXT: [[VALUE_PHI3_LVER_ORIG:%.*]] = phi i64 [ 0, %[[LOOP_PH_LVER_ORIG]] ], [ [[ADD0_LVER_ORIG:%.*]], %[[LOOP_LVER_ORIG]] ] +; CHECK-NEXT: [[ADD0_LVER_ORIG]] = add i64 [[VALUE_PHI3_LVER_ORIG]], -1 +; CHECK-NEXT: [[GEP0_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr addrspace(13) [[LOAD3]], i64 [[ADD0_LVER_ORIG]] +; CHECK-NEXT: [[LOAD4_LVER_ORIG:%.*]] = load i32, ptr addrspace(13) [[GEP0_LVER_ORIG]], align 4 +; CHECK-NEXT: [[ADD1_LVER_ORIG:%.*]] = add i64 [[ADD0_LVER_ORIG]], [[SEXT]] +; CHECK-NEXT: [[GEP1_LVER_ORIG:%.*]] = getelementptr inbounds i32, ptr addrspace(13) [[LOAD3]], i64 [[ADD1_LVER_ORIG]] +; CHECK-NEXT: store i32 [[LOAD4_LVER_ORIG]], ptr addrspace(13) [[GEP1_LVER_ORIG]], align 4 +; CHECK-NEXT: [[CMP_LVER_ORIG:%.*]] = icmp eq i64 [[VALUE_PHI3_LVER_ORIG]], [[CALL]] +; CHECK-NEXT: br i1 [[CMP_LVER_ORIG]], label %[[EXIT_LOOPEXIT:.*]], label %[[LOOP_LVER_ORIG]] +; CHECK: [[LOOP_PH]]: +; CHECK-NEXT: br label %[[LOOP:.*]] +; CHECK: [[LOOP]]: +; CHECK-NEXT: [[VALUE_PHI3:%.*]] = phi i64 [ 0, %[[LOOP_PH]] ], [ [[ADD0:%.*]], %[[LOOP]] ] +; CHECK-NEXT: [[ADD0]] = add i64 [[VALUE_PHI3]], -1 +; CHECK-NEXT: [[GEP0:%.*]] = getelementptr inbounds i32, ptr addrspace(13) [[LOAD3]], i64 [[ADD0]] +; CHECK-NEXT: [[LOAD4:%.*]] = load i32, ptr addrspace(13) [[GEP0]], align 4, !alias.scope [[META0:![0-9]+]] +; CHECK-NEXT: [[ADD1:%.*]] = add i64 [[ADD0]], [[SEXT]] +; CHECK-NEXT: [[GEP1:%.*]] = getelementptr inbounds i32, ptr addrspace(13) [[LOAD3]], i64 [[ADD1]] +; CHECK-NEXT: store i32 [[LOAD4]], 
ptr addrspace(13) [[GEP1]], align 4, !alias.scope [[META3:![0-9]+]], !noalias [[META0]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i64 [[VALUE_PHI3]], [[CALL]] +; CHECK-NEXT: br i1 [[CMP]], label %[[EXIT_LOOPEXIT8:.*]], label %[[LOOP]] +; CHECK: [[EXIT_LOOPEXIT]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[EXIT_LOOPEXIT8]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: ret void +; top: - %tmp = load ptr addrspace(10), ptr %arg, align 8 - %tmp1 = load i32, ptr inttoptr (i64 12 to ptr), align 4 - %tmp2 = sub i32 0, %tmp1 - %tmp3 = call i64 @julia_steprange_last_4949() - %tmp4 = addrspacecast ptr addrspace(10) %tmp to ptr addrspace(11) - %tmp6 = load ptr addrspace(10), ptr addrspace(11) %tmp4, align 8 - %tmp7 = addrspacecast ptr addrspace(10) %tmp6 to ptr addrspace(11) - %tmp9 = load ptr addrspace(13), ptr addrspace(11) %tmp7, align 8 - %tmp10 = sext i32 %tmp2 to i64 - br label %L26 + %load0 = load ptr addrspace(10), ptr %arg, align 8 + %load1 = load i32, ptr inttoptr (i64 12 to ptr), align 4 + %sub = sub i32 0, %load1 + %call = call i64 @julia_steprange_last_4949() + %cast0 = addrspacecast ptr addrspace(10) %load0 to ptr addrspace(11) + %load2 = load ptr addrspace(10), ptr addrspace(11) %cast0, align 8 + %cast1 = addrspacecast ptr addrspace(10) %load2 to ptr addrspace(11) + %load3 = load ptr addrspace(13), ptr addrspace(11) %cast1, align 8 + %sext = sext i32 %sub to i64 + br label %loop -L26: - %value_phi3 = phi i64 [ 0, %top ], [ %tmp11, %L26 ] - %tmp11 = add i64 %value_phi3, -1 - %tmp12 = getelementptr inbounds i32, ptr addrspace(13) %tmp9, i64 %tmp11 - %tmp13 = load i32, ptr addrspace(13) %tmp12, align 4 - %tmp14 = add i64 %tmp11, %tmp10 - %tmp15 = getelementptr inbounds i32, ptr addrspace(13) %tmp9, i64 %tmp14 - store i32 %tmp13, ptr addrspace(13) %tmp15, align 4 - %tmp16 = icmp eq i64 %value_phi3, %tmp3 - br i1 %tmp16, label %L45, label %L26 +loop: + %value_phi3 = phi i64 [ 0, %top ], [ %add0, %loop ] + %add0 = add i64 %value_phi3, -1 + %gep0 = getelementptr inbounds i32, ptr addrspace(13) %load3, i64 %add0 + %load4 = load i32, ptr addrspace(13) %gep0, align 4 + %add1 = add i64 %add0, %sext + %gep1 = getelementptr inbounds i32, ptr addrspace(13) %load3, i64 %add1 + store i32 %load4, ptr addrspace(13) %gep1, align 4 + %cmp = icmp eq i64 %value_phi3, %call + br i1 %cmp, label %exit, label %loop -L45: +exit: ret void } - +;. +; CHECK: [[META0]] = !{[[META1:![0-9]+]]} +; CHECK: [[META1]] = distinct !{[[META1]], [[META2:![0-9]+]]} +; CHECK: [[META2]] = distinct !{[[META2]], !"LVerDomain"} +; CHECK: [[META3]] = !{[[META4:![0-9]+]]} +; CHECK: [[META4]] = distinct !{[[META4]], [[META2]]} +;. -- GitLab From e88bcc12042265964da9a0d274665439dca53595 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 16 Oct 2024 21:40:18 +0100 Subject: [PATCH 171/329] [RISCV] Lower vector_splice on zvfhmin/zvfbfmin (#112579) Similar to other permutation ops, we can just reuse the existing lowering. 
--- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 2 +- llvm/test/Analysis/CostModel/RISCV/splice.ll | 26 +- llvm/test/CodeGen/RISCV/rvv/vector-splice.ll | 333 ++++++++++++++++++- 3 files changed, 356 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index bf333b7b7901..076ed173f64e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -1076,7 +1076,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction({ISD::INSERT_VECTOR_ELT, ISD::CONCAT_VECTORS, ISD::INSERT_SUBVECTOR, ISD::EXTRACT_SUBVECTOR, ISD::VECTOR_DEINTERLEAVE, ISD::VECTOR_INTERLEAVE, - ISD::VECTOR_REVERSE}, + ISD::VECTOR_REVERSE, ISD::VECTOR_SPLICE}, VT, Custom); MVT EltVT = VT.getVectorElementType(); if (isTypeLegal(EltVT)) diff --git a/llvm/test/Analysis/CostModel/RISCV/splice.ll b/llvm/test/Analysis/CostModel/RISCV/splice.ll index 8d7d1576a532..ddfaa8c13d42 100644 --- a/llvm/test/Analysis/CostModel/RISCV/splice.ll +++ b/llvm/test/Analysis/CostModel/RISCV/splice.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_analyze_test_checks.py -; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s -; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh,+zvfbfmin | FileCheck %s +; RUN: opt < %s -passes="print" 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin,+zvfbfmin | FileCheck %s ; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfh | FileCheck %s --check-prefix=SIZE ; RUN: opt < %s -passes="print" -cost-kind=code-size 2>&1 -disable-output -S -mtriple=riscv64 -mattr=+v,+f,+d,+zfh,+zvfhmin | FileCheck %s --check-prefix=SIZE @@ -34,6 +34,13 @@ define void @vector_splice() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv16i64 = call @llvm.vector.splice.nxv16i64( zeroinitializer, zeroinitializer, i32 -1) ; CHECK-NEXT: Cost Model: Found an estimated cost of 64 for instruction: %splice.nxv32i64 = call @llvm.vector.splice.nxv32i64( zeroinitializer, zeroinitializer, i32 -1) ; CHECK-NEXT: Cost Model: Found an estimated cost of 128 for instruction: %splice.nxv64i64 = call @llvm.vector.splice.nxv64i64( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1bf16 = call @llvm.vector.splice.nxv1bf16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2bf16 = call @llvm.vector.splice.nxv2bf16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4bf16 = call @llvm.vector.splice.nxv4bf16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv8bf16 = call @llvm.vector.splice.nxv8bf16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv16bf16 = call @llvm.vector.splice.nxv16bf16( zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv32bf16 = call @llvm.vector.splice.nxv32bf16( 
zeroinitializer, zeroinitializer, i32 -1) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %splice.nxv64bf16 = call @llvm.vector.splice.nxv64bf16( zeroinitializer, zeroinitializer, i32 -1) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call @llvm.vector.splice.nxv1f16( zeroinitializer, zeroinitializer, i32 -1) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call @llvm.vector.splice.nxv2f16( zeroinitializer, zeroinitializer, i32 -1) ; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call @llvm.vector.splice.nxv4f16( zeroinitializer, zeroinitializer, i32 -1) @@ -86,6 +93,13 @@ define void @vector_splice() { ; SIZE-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %splice.nxv16i64 = call @llvm.vector.splice.nxv16i64( zeroinitializer, zeroinitializer, i32 -1) ; SIZE-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %splice.nxv32i64 = call @llvm.vector.splice.nxv32i64( zeroinitializer, zeroinitializer, i32 -1) ; SIZE-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %splice.nxv64i64 = call @llvm.vector.splice.nxv64i64( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv1bf16 = call @llvm.vector.splice.nxv1bf16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv2bf16 = call @llvm.vector.splice.nxv2bf16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv4bf16 = call @llvm.vector.splice.nxv4bf16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv8bf16 = call @llvm.vector.splice.nxv8bf16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv16bf16 = call @llvm.vector.splice.nxv16bf16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv32bf16 = call @llvm.vector.splice.nxv32bf16( zeroinitializer, zeroinitializer, i32 -1) +; SIZE-NEXT: Cost Model: Invalid cost for instruction: %splice.nxv64bf16 = call @llvm.vector.splice.nxv64bf16( zeroinitializer, zeroinitializer, i32 -1) ; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv1f16 = call @llvm.vector.splice.nxv1f16( zeroinitializer, zeroinitializer, i32 -1) ; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv2f16 = call @llvm.vector.splice.nxv2f16( zeroinitializer, zeroinitializer, i32 -1) ; SIZE-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %splice.nxv4f16 = call @llvm.vector.splice.nxv4f16( zeroinitializer, zeroinitializer, i32 -1) @@ -141,6 +155,14 @@ define void @vector_splice() { %splice.nxv32i64 = call @llvm.vector.splice.nxv32i64( zeroinitializer, zeroinitializer, i32 -1) %splice.nxv64i64 = call @llvm.vector.splice.nxv64i64( zeroinitializer, zeroinitializer, i32 -1) + %splice.nxv1bf16 = call @llvm.vector.splice.nxv1bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice.nxv2bf16 = call @llvm.vector.splice.nxv2bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice.nxv4bf16 = call @llvm.vector.splice.nxv4bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice.nxv8bf16 = call @llvm.vector.splice.nxv8bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice.nxv16bf16 = call @llvm.vector.splice.nxv16bf16( zeroinitializer, 
zeroinitializer, i32 -1) + %splice.nxv32bf16 = call @llvm.vector.splice.nxv32bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice.nxv64bf16 = call @llvm.vector.splice.nxv64bf16( zeroinitializer, zeroinitializer, i32 -1) + %splice.nxv1f16 = call @llvm.vector.splice.nxv1f16( zeroinitializer, zeroinitializer, i32 -1) %splice.nxv2f16 = call @llvm.vector.splice.nxv2f16( zeroinitializer, zeroinitializer, i32 -1) %splice.nxv4f16 = call @llvm.vector.splice.nxv4f16( zeroinitializer, zeroinitializer, i32 -1) diff --git a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll index 8cb6fed2f588..5460caea196c 100644 --- a/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll +++ b/llvm/test/CodeGen/RISCV/rvv/vector-splice.ll @@ -1,6 +1,8 @@ ; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py -; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zfh,+zvfh < %s | FileCheck %s -; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zfh,+zvfh < %s | FileCheck %s +; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s +; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfhmin,+zvfbfmin < %s | FileCheck %s +; RUN: llc -mtriple riscv32 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s +; RUN: llc -mtriple riscv64 -mattr=+m,+f,+d,+v,+zvfh,+zvfbfmin < %s | FileCheck %s ; Tests assume VLEN=128 or vscale_range_min=2. @@ -1533,6 +1535,333 @@ define @splice_nxv8i64_offset_max( %a, %res } +declare @llvm.vector.splice.nxv1bf16(, , i32) + +define @splice_nxv1bf16_offset_zero( %a, %b) #0 { +; CHECK-LABEL: splice_nxv1bf16_offset_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv1bf16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv1bf16_offset_negone( %a, %b) #0 { +; CHECK-LABEL: splice_nxv1bf16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv1bf16( %a, %b, i32 -1) + ret %res +} + +define @splice_nxv1bf16_offset_min( %a, %b) #0 { +; CHECK-LABEL: splice_nxv1bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -2 +; CHECK-NEXT: vsetivli zero, 2, e16, mf4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 2 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv1bf16( %a, %b, i32 -2) + ret %res +} + +define @splice_nxv1bf16_offset_max( %a, %b) #0 { +; CHECK-LABEL: splice_nxv1bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 3 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli zero, a0, e16, mf4, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 1 +; CHECK-NEXT: vsetvli a1, zero, e16, mf4, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv1bf16( %a, %b, i32 1) + ret %res +} + +declare @llvm.vector.splice.nxv2bf16(, , i32) + +define @splice_nxv2bf16_offset_zero( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2bf16_offset_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2bf16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv2bf16_offset_negone( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2bf16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; 
CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2bf16( %a, %b, i32 -1) + ret %res +} + +define @splice_nxv2bf16_offset_min( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -4 +; CHECK-NEXT: vsetivli zero, 4, e16, mf2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 4 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2bf16( %a, %b, i32 -4) + ret %res +} + +define @splice_nxv2bf16_offset_max( %a, %b) #0 { +; CHECK-LABEL: splice_nxv2bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -3 +; CHECK-NEXT: vsetvli zero, a0, e16, mf2, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 3 +; CHECK-NEXT: vsetvli a1, zero, e16, mf2, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv2bf16( %a, %b, i32 3) + ret %res +} + +declare @llvm.vector.splice.nxv4bf16(, , i32) + +define @splice_nxv4bf16_offset_zero( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4bf16_offset_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv4bf16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv4bf16_offset_negone( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4bf16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vslideup.vi v8, v9, 1 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv4bf16( %a, %b, i32 -1) + ret %res +} + +define @splice_nxv4bf16_offset_min( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -8 +; CHECK-NEXT: vsetivli zero, 8, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vi v8, v9, 8 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv4bf16( %a, %b, i32 -8) + ret %res +} + +define @splice_nxv4bf16_offset_max( %a, %b) #0 { +; CHECK-LABEL: splice_nxv4bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: srli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -7 +; CHECK-NEXT: vsetvli zero, a0, e16, m1, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 7 +; CHECK-NEXT: vsetvli a1, zero, e16, m1, ta, ma +; CHECK-NEXT: vslideup.vx v8, v9, a0 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv4bf16( %a, %b, i32 7) + ret %res +} + +declare @llvm.vector.splice.nxv8bf16(, , i32) + +define @splice_nxv8bf16_offset_zero( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8bf16_offset_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv8bf16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv8bf16_offset_negone( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8bf16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetivli zero, 1, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 1 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv8bf16( %a, %b, i32 -1) + ret %res +} + +define 
@splice_nxv8bf16_offset_min( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: addi a0, a0, -16 +; CHECK-NEXT: vsetivli zero, 16, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vi v8, v10, 16 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv8bf16( %a, %b, i32 -16) + ret %res +} + +define @splice_nxv8bf16_offset_max( %a, %b) #0 { +; CHECK-LABEL: splice_nxv8bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: addi a0, a0, -15 +; CHECK-NEXT: vsetvli zero, a0, e16, m2, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 15 +; CHECK-NEXT: vsetvli a1, zero, e16, m2, ta, ma +; CHECK-NEXT: vslideup.vx v8, v10, a0 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv8bf16( %a, %b, i32 15) + ret %res +} + +declare @llvm.vector.splice.nxv16bf16(, , i32) + +define @splice_nxv16bf16_offset_zero( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16bf16_offset_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv16bf16_offset_negone( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16bf16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetivli zero, 1, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vslideup.vi v8, v12, 1 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 -1) + ret %res +} + +define @splice_nxv16bf16_offset_min( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -32 +; CHECK-NEXT: li a1, 32 +; CHECK-NEXT: vsetvli zero, a1, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m4, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a1 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 -32) + ret %res +} + +define @splice_nxv16bf16_offset_max( %a, %b) #0 { +; CHECK-LABEL: splice_nxv16bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 1 +; CHECK-NEXT: addi a0, a0, -31 +; CHECK-NEXT: vsetvli zero, a0, e16, m4, ta, ma +; CHECK-NEXT: vslidedown.vi v8, v8, 31 +; CHECK-NEXT: vsetvli a1, zero, e16, m4, ta, ma +; CHECK-NEXT: vslideup.vx v8, v12, a0 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv16bf16( %a, %b, i32 31) + ret %res +} + +declare @llvm.vector.splice.nxv32bf16(, , i32) + +define @splice_nxv32bf16_offset_zero( %a, %b) #0 { +; CHECK-LABEL: splice_nxv32bf16_offset_zero: +; CHECK: # %bb.0: +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 0) + ret %res +} + +define @splice_nxv32bf16_offset_negone( %a, %b) #0 { +; CHECK-LABEL: splice_nxv32bf16_offset_negone: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -1 +; CHECK-NEXT: vsetivli zero, 1, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vslideup.vi v8, v16, 1 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 -1) + ret %res +} + +define @splice_nxv32bf16_offset_min( %a, %b) #0 { +; CHECK-LABEL: splice_nxv32bf16_offset_min: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; 
CHECK-NEXT: addi a0, a0, -64 +; CHECK-NEXT: li a1, 64 +; CHECK-NEXT: vsetvli zero, a1, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a0 +; CHECK-NEXT: vsetvli a0, zero, e16, m8, ta, ma +; CHECK-NEXT: vslideup.vx v8, v16, a1 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 -64) + ret %res +} + +define @splice_nxv32bf16_offset_max( %a, %b) #0 { +; CHECK-LABEL: splice_nxv32bf16_offset_max: +; CHECK: # %bb.0: +; CHECK-NEXT: csrr a0, vlenb +; CHECK-NEXT: slli a0, a0, 2 +; CHECK-NEXT: addi a0, a0, -63 +; CHECK-NEXT: li a1, 63 +; CHECK-NEXT: vsetvli zero, a0, e16, m8, ta, ma +; CHECK-NEXT: vslidedown.vx v8, v8, a1 +; CHECK-NEXT: vsetvli a1, zero, e16, m8, ta, ma +; CHECK-NEXT: vslideup.vx v8, v16, a0 +; CHECK-NEXT: ret + %res = call @llvm.vector.splice.nxv32bf16( %a, %b, i32 63) + ret %res +} + declare @llvm.vector.splice.nxv1f16(, , i32) define @splice_nxv1f16_offset_zero( %a, %b) #0 { -- GitLab From 2b6b7f664d9c93877ce6678820255fc006d56b54 Mon Sep 17 00:00:00 2001 From: Luke Lau Date: Wed, 16 Oct 2024 21:40:37 +0100 Subject: [PATCH 172/329] [RISCV] Mark math functions as expanded for zvfhmin/zvfbfmin (#112508) For regular floating point types we mark these as expanded on scalable vectors so they're not legal in the cost model, so this does the same for f16 w/ zvfhmin and bf16. --- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 19 ++- .../test/Analysis/CostModel/RISCV/arith-fp.ll | 56 +++----- .../Analysis/CostModel/RISCV/fp-sqrt-pow.ll | 20 +-- .../CostModel/RISCV/fp-trig-log-exp.ll | 126 +++++++++--------- 4 files changed, 101 insertions(+), 120 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 076ed173f64e..14249e34921e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -717,6 +717,10 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, ISD::VECREDUCE_FADD, ISD::VECREDUCE_SEQ_FADD, ISD::VECREDUCE_FMIN, ISD::VECREDUCE_FMAX, ISD::VECREDUCE_FMINIMUM, ISD::VECREDUCE_FMAXIMUM}; + static const unsigned FloatingPointLibCallOps[] = { + ISD::FREM, ISD::FPOW, ISD::FCOS, ISD::FSIN, ISD::FSINCOS, ISD::FEXP, + ISD::FEXP2, ISD::FEXP10, ISD::FLOG, ISD::FLOG2, ISD::FLOG10}; + if (!Subtarget.is64Bit()) { // We must custom-lower certain vXi64 operations on RV32 due to the vector // element type being illegal. @@ -1002,17 +1006,7 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(FloatingPointVecReduceOps, VT, Custom); // Expand FP operations that need libcalls. - setOperationAction(ISD::FREM, VT, Expand); - setOperationAction(ISD::FPOW, VT, Expand); - setOperationAction(ISD::FCOS, VT, Expand); - setOperationAction(ISD::FSIN, VT, Expand); - setOperationAction(ISD::FSINCOS, VT, Expand); - setOperationAction(ISD::FEXP, VT, Expand); - setOperationAction(ISD::FEXP2, VT, Expand); - setOperationAction(ISD::FEXP10, VT, Expand); - setOperationAction(ISD::FLOG, VT, Expand); - setOperationAction(ISD::FLOG2, VT, Expand); - setOperationAction(ISD::FLOG10, VT, Expand); + setOperationAction(FloatingPointLibCallOps, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Legal); @@ -1097,6 +1091,9 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM, setOperationAction(ISD::FABS, VT, Expand); setOperationAction(ISD::FCOPYSIGN, VT, Expand); + // Expand FP operations that need libcalls. 
+ setOperationAction(FloatingPointLibCallOps, VT, Expand); + // Custom split nxv32[b]f16 since nxv32[b]f32 is not legal. if (getLMUL(VT) == RISCVII::VLMUL::LMUL_8) { setOperationAction(ZvfhminZvfbfminPromoteOps, VT, Custom); diff --git a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll index 20d47001739e..b6b49982a732 100644 --- a/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/arith-fp.ll @@ -526,11 +526,11 @@ define void @frem() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4BF16 = frem <4 x bfloat> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8BF16 = frem <8 x bfloat> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16BF16 = frem <16 x bfloat> undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1BF16 = frem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2BF16 = frem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4BF16 = frem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8BF16 = frem undef, undef -; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV16BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8BF16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16BF16 = frem undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F32 = frem <1 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F32 = frem <2 x float> undef, undef ; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F32 = frem <4 x float> undef, undef @@ -593,37 +593,21 @@ define void @frem() { } define void @frem_f16() { -; ZVFH-LABEL: 'frem_f16' -; ZVFH-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef -; ZVFH-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> undef, undef -; ZVFH-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> undef, undef -; ZVFH-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> undef, undef -; ZVFH-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> undef, undef -; ZVFH-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> undef, undef -; ZVFH-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> undef, undef -; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = frem undef, undef -; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = frem undef, undef -; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = frem undef, undef -; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = frem undef, undef -; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = frem undef, undef -; ZVFH-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = frem undef, undef -; ZVFH-NEXT: Cost 
Model: Found an estimated cost of 0 for instruction: ret void -; -; ZVFHMIN-LABEL: 'frem_f16' -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV1F16 = frem undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV2F16 = frem undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV4F16 = frem undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV8F16 = frem undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV16F16 = frem undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %NXV32F16 = frem undef, undef -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; CHECK-LABEL: 'frem_f16' +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %F16 = frem half undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %V1F16 = frem <1 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %V2F16 = frem <2 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 12 for instruction: %V4F16 = frem <4 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 24 for instruction: %V8F16 = frem <8 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 63 for instruction: %V16F16 = frem <16 x half> undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 127 for instruction: %V32F16 = frem <32 x half> undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV1F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV2F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV4F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV8F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV16F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %NXV32F16 = frem undef, undef +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; %F16 = frem half undef, undef diff --git a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll index 1768222b8a92..efe17f2b76a7 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fp-sqrt-pow.ll @@ -97,11 +97,11 @@ define void @pow() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.pow.v4bf16(<4 x bfloat> undef, <4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for 
instruction: %4 = call <8 x bfloat> @llvm.pow.v8bf16(<8 x bfloat> undef, <8 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.pow.v16bf16(<16 x bfloat> undef, <16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.pow.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.pow.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.pow.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.pow.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.pow.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.pow.nxv1bf16(<vscale x 1 x bfloat> undef, <vscale x 1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.pow.nxv2bf16(<vscale x 2 x bfloat> undef, <vscale x 2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.pow.nxv4bf16(<vscale x 4 x bfloat> undef, <vscale x 4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.pow.nxv8bf16(<vscale x 8 x bfloat> undef, <vscale x 8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.pow.nxv16bf16(<vscale x 16 x bfloat> undef, <vscale x 16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.pow.f32(float undef, float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.pow.v2f32(<2 x float> undef, <2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.pow.v4f32(<4 x float> undef, <4 x float> undef) @@ -175,11 +175,11 @@ define void @pow_f16() { ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.pow.v4f16(<4 x half> undef, <4 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.pow.v8f16(<8 x half> undef, <8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.pow.v16f16(<16 x half> undef, <16 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.pow.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.pow.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.pow.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.pow.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x half> @llvm.pow.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.pow.nxv1f16(<vscale x 1 x half> undef, <vscale x 1 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.pow.nxv2f16(<vscale x 2 x half> undef, <vscale x 2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.pow.nxv4f16(<vscale x 4 x half> undef, <vscale x 4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.pow.nxv8f16(<vscale x 8 x half> undef, <vscale x 8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x half> @llvm.pow.nxv16f16(<vscale x 16 x half> undef, <vscale x 16 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call half @llvm.pow.f16(half undef, half undef) diff --git
a/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll b/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll index d65fa43b8995..34d6c93f4577 100644 --- a/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll +++ b/llvm/test/Analysis/CostModel/RISCV/fp-trig-log-exp.ll @@ -9,11 +9,11 @@ define void @sin() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.sin.v4bf16(<4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.sin.v8bf16(<8 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.sin.v16bf16(<16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.sin.nxv1bf16(<vscale x 1 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.sin.nxv2bf16(<vscale x 2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.sin.nxv4bf16(<vscale x 4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.sin.nxv8bf16(<vscale x 8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.sin.nxv16bf16(<vscale x 16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.sin.nxv1bf16(<vscale x 1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.sin.nxv2bf16(<vscale x 2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.sin.nxv4bf16(<vscale x 4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.sin.nxv8bf16(<vscale x 8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.sin.nxv16bf16(<vscale x 16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.sin.f32(float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.sin.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.sin.v4f32(<4 x float> undef) @@ -86,10 +86,10 @@ define void @sin_f16() { ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.sin.v4f16(<4 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.sin.v8f16(<8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.sin.v16f16(<16 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.sin.nxv1f16(<vscale x 1 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.sin.nxv2f16(<vscale x 2 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.sin.nxv4f16(<vscale x 4 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.sin.nxv8f16(<vscale x 8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.sin.nxv1f16(<vscale x 1 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.sin.nxv2f16(<vscale x 2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.sin.nxv4f16(<vscale x 4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.sin.nxv8f16(<vscale x 8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call half @llvm.sin.f16(half undef) @@ -111,11 +111,11 @@ define void @cos() { ;
CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.cos.v4bf16(<4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.cos.v8bf16(<8 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.cos.v16bf16(<16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.cos.nxv1bf16(<vscale x 1 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.cos.nxv2bf16(<vscale x 2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.cos.nxv4bf16(<vscale x 4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.cos.nxv8bf16(<vscale x 8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.cos.nxv16bf16(<vscale x 16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.cos.nxv1bf16(<vscale x 1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.cos.nxv2bf16(<vscale x 2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.cos.nxv4bf16(<vscale x 4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.cos.nxv8bf16(<vscale x 8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.cos.nxv16bf16(<vscale x 16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.cos.f32(float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.cos.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.cos.v4f32(<4 x float> undef) @@ -188,10 +188,10 @@ define void @cos_f16() { ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.cos.v4f16(<4 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.cos.v8f16(<8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.cos.v16f16(<16 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.cos.nxv1f16(<vscale x 1 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.cos.nxv2f16(<vscale x 2 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.cos.nxv4f16(<vscale x 4 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.cos.nxv8f16(<vscale x 8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.cos.nxv1f16(<vscale x 1 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.cos.nxv2f16(<vscale x 2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.cos.nxv4f16(<vscale x 4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.cos.nxv8f16(<vscale x 8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call half @llvm.cos.f16(half undef) @@ -213,11 +213,11 @@ define void @exp() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.exp.v4bf16(<4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.exp.v8bf16(<8 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of
191 for instruction: %5 = call <16 x bfloat> @llvm.exp.v16bf16(<16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.exp.nxv1bf16(<vscale x 1 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.exp.nxv2bf16(<vscale x 2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.exp.nxv4bf16(<vscale x 4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.exp.nxv8bf16(<vscale x 8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.exp.nxv16bf16(<vscale x 16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.exp.nxv1bf16(<vscale x 1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.exp.nxv2bf16(<vscale x 2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.exp.nxv4bf16(<vscale x 4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.exp.nxv8bf16(<vscale x 8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.exp.nxv16bf16(<vscale x 16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.exp.f32(float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.exp.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.exp.v4f32(<4 x float> undef) @@ -290,10 +290,10 @@ define void @exp_f16() { ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.exp.v4f16(<4 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.exp.v8f16(<8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.exp.v16f16(<16 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.exp.nxv1f16(<vscale x 1 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.exp.nxv2f16(<vscale x 2 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.exp.nxv4f16(<vscale x 4 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.exp.nxv8f16(<vscale x 8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.exp.nxv1f16(<vscale x 1 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.exp.nxv2f16(<vscale x 2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.exp.nxv4f16(<vscale x 4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.exp.nxv8f16(<vscale x 8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call half @llvm.exp.f16(half undef) @@ -315,11 +315,11 @@ define void @exp2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.exp2.v4bf16(<4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.exp2.v8bf16(<8 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.exp2.v16bf16(<16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.exp2.nxv1bf16(<vscale x 1 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.exp2.nxv2bf16(<vscale x 2 x bfloat> undef) -;
CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.exp2.nxv4bf16(<vscale x 4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.exp2.nxv8bf16(<vscale x 8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.exp2.nxv16bf16(<vscale x 16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.exp2.nxv1bf16(<vscale x 1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.exp2.nxv2bf16(<vscale x 2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.exp2.nxv4bf16(<vscale x 4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.exp2.nxv8bf16(<vscale x 8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.exp2.nxv16bf16(<vscale x 16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.exp2.f32(float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.exp2.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.exp2.v4f32(<4 x float> undef) @@ -392,10 +392,10 @@ define void @exp2_f16() { ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.exp2.v4f16(<4 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.exp2.v8f16(<8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.exp2.v16f16(<16 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.exp2.nxv1f16(<vscale x 1 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.exp2.nxv2f16(<vscale x 2 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.exp2.nxv4f16(<vscale x 4 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.exp2.nxv8f16(<vscale x 8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.exp2.nxv1f16(<vscale x 1 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.exp2.nxv2f16(<vscale x 2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.exp2.nxv4f16(<vscale x 4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.exp2.nxv8f16(<vscale x 8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call half @llvm.exp2.f16(half undef) @@ -417,11 +417,11 @@ define void @log() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log.v4bf16(<4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log.v8bf16(<8 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log.v16bf16(<16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log.nxv1bf16(<vscale x 1 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log.nxv2bf16(<vscale x 2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log.nxv4bf16(<vscale x 4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log.nxv8bf16(<vscale x 8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 =
call <vscale x 16 x bfloat> @llvm.log.nxv16bf16(<vscale x 16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log.nxv1bf16(<vscale x 1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log.nxv2bf16(<vscale x 2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log.nxv4bf16(<vscale x 4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log.nxv8bf16(<vscale x 8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log.nxv16bf16(<vscale x 16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log.f32(float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log.v4f32(<4 x float> undef) @@ -494,10 +494,10 @@ define void @log_f16() { ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log.v4f16(<4 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log.v8f16(<8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log.v16f16(<16 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.log.nxv1f16(<vscale x 1 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.log.nxv2f16(<vscale x 2 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.log.nxv4f16(<vscale x 4 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.log.nxv8f16(<vscale x 8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log.nxv1f16(<vscale x 1 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log.nxv2f16(<vscale x 2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log.nxv4f16(<vscale x 4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log.nxv8f16(<vscale x 8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call half @llvm.log.f16(half undef) @@ -519,11 +519,11 @@ define void @log10() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log10.v4bf16(<4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log10.v8bf16(<8 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log10.v16bf16(<16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log10.nxv1bf16(<vscale x 1 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log10.nxv2bf16(<vscale x 2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log10.nxv4bf16(<vscale x 4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log10.nxv8bf16(<vscale x 8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log10.nxv16bf16(<vscale x 16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log10.nxv1bf16(<vscale x 1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log10.nxv2bf16(<vscale x 2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call
<vscale x 4 x bfloat> @llvm.log10.nxv4bf16(<vscale x 4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log10.nxv8bf16(<vscale x 8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log10.nxv16bf16(<vscale x 16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 10 for instruction: %11 = call float @llvm.log10.f32(float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log10.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log10.v4f32(<4 x float> undef) @@ -596,10 +596,10 @@ define void @log10_f16() { ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log10.v4f16(<4 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log10.v8f16(<8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log10.v16f16(<16 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.log10.nxv1f16(<vscale x 1 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.log10.nxv2f16(<vscale x 2 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.log10.nxv4f16(<vscale x 4 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.log10.nxv8f16(<vscale x 8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log10.nxv1f16(<vscale x 1 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log10.nxv2f16(<vscale x 2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log10.nxv4f16(<vscale x 4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log10.nxv8f16(<vscale x 8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call half @llvm.log10.f16(half undef) @@ -621,11 +621,11 @@ define void @log2() { ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x bfloat> @llvm.log2.v4bf16(<4 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x bfloat> @llvm.log2.v8bf16(<8 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x bfloat> @llvm.log2.v16bf16(<16 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log2.nxv1bf16(<vscale x 1 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log2.nxv2bf16(<vscale x 2 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log2.nxv4bf16(<vscale x 4 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log2.nxv8bf16(<vscale x 8 x bfloat> undef) -; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log2.nxv16bf16(<vscale x 16 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x bfloat> @llvm.log2.nxv1bf16(<vscale x 1 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x bfloat> @llvm.log2.nxv2bf16(<vscale x 2 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x bfloat> @llvm.log2.nxv4bf16(<vscale x 4 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x bfloat> @llvm.log2.nxv8bf16(<vscale x 8 x bfloat> undef) +; CHECK-NEXT: Cost Model: Invalid cost for instruction: %10 = call <vscale x 16 x bfloat> @llvm.log2.nxv16bf16(<vscale x 16 x bfloat> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost
of 10 for instruction: %11 = call float @llvm.log2.f32(float undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 22 for instruction: %12 = call <2 x float> @llvm.log2.v2f32(<2 x float> undef) ; CHECK-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %13 = call <4 x float> @llvm.log2.v4f32(<4 x float> undef) @@ -698,10 +698,10 @@ define void @log2_f16() { ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 44 for instruction: %3 = call <4 x half> @llvm.log2.v4f16(<4 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 88 for instruction: %4 = call <8 x half> @llvm.log2.v8f16(<8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 191 for instruction: %5 = call <16 x half> @llvm.log2.v16f16(<16 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %6 = call <vscale x 1 x half> @llvm.log2.nxv1f16(<vscale x 1 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %7 = call <vscale x 2 x half> @llvm.log2.nxv2f16(<vscale x 2 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %8 = call <vscale x 4 x half> @llvm.log2.nxv4f16(<vscale x 4 x half> undef) -; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %9 = call <vscale x 8 x half> @llvm.log2.nxv8f16(<vscale x 8 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %6 = call <vscale x 1 x half> @llvm.log2.nxv1f16(<vscale x 1 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %7 = call <vscale x 2 x half> @llvm.log2.nxv2f16(<vscale x 2 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %8 = call <vscale x 4 x half> @llvm.log2.nxv4f16(<vscale x 4 x half> undef) +; ZVFHMIN-NEXT: Cost Model: Invalid cost for instruction: %9 = call <vscale x 8 x half> @llvm.log2.nxv8f16(<vscale x 8 x half> undef) ; ZVFHMIN-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void ; call half @llvm.log2.f16(half undef) -- GitLab From 9e6d24f61f3a6730465f3427463dd958cdcd8b9a Mon Sep 17 00:00:00 2001 From: Arthur Eubanks Date: Wed, 16 Oct 2024 21:17:45 +0000 Subject: [PATCH 173/329] Revert "[Inliner] Propagate more attributes to params when inlining (#91101)" This reverts commit ae778ae7ce72219270c30d5c8b3d88c9a4803f81. Creates broken IR, see comments in #91101.
--- .../test/CodeGen/attr-counted-by-pr88931.cpp | 2 +- clang/test/OpenMP/bug57757.cpp | 2 +- llvm/include/llvm/IR/Attributes.h | 7 - llvm/lib/IR/Attributes.cpp | 15 -- llvm/lib/Transforms/Utils/InlineFunction.cpp | 90 ++-------- .../Inline/access-attributes-prop.ll | 164 +----------------- .../Inline/assumptions-from-callsite-attrs.ll | 2 +- llvm/test/Transforms/Inline/byval.ll | 4 +- llvm/test/Transforms/PhaseOrdering/pr95152.ll | 2 +- 9 files changed, 29 insertions(+), 259 deletions(-) diff --git a/clang/test/CodeGen/attr-counted-by-pr88931.cpp b/clang/test/CodeGen/attr-counted-by-pr88931.cpp index 6d0c46bbbe8f..2a8cc1d07e50 100644 --- a/clang/test/CodeGen/attr-counted-by-pr88931.cpp +++ b/clang/test/CodeGen/attr-counted-by-pr88931.cpp @@ -13,7 +13,7 @@ void init(void * __attribute__((pass_dynamic_object_size(0)))); // CHECK-LABEL: define dso_local void @_ZN3foo3barC1Ev( // CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(1) [[THIS:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] align 2 { // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @_Z4initPvU25pass_dynamic_object_size0(ptr noundef nonnull align 4 dereferenceable(1) [[THIS]], i64 noundef -1) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: tail call void @_Z4initPvU25pass_dynamic_object_size0(ptr noundef nonnull [[THIS]], i64 noundef -1) #[[ATTR2:[0-9]+]] // CHECK-NEXT: ret void // foo::bar::bar() { diff --git a/clang/test/OpenMP/bug57757.cpp b/clang/test/OpenMP/bug57757.cpp index eabf233dde24..240b22a30671 100644 --- a/clang/test/OpenMP/bug57757.cpp +++ b/clang/test/OpenMP/bug57757.cpp @@ -39,7 +39,7 @@ void foo() { // CHECK-NEXT: ] // CHECK: .untied.jmp..i: // CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA16]], !alias.scope [[META13]], !noalias [[META17]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr nonnull [[TMP1]]), !noalias [[META13]] +// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias [[META13]] // CHECK-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK: .untied.next..i: // CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 80490e3b7f49..57db52e4879b 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -947,9 +947,6 @@ public: /// arg. uint64_t getParamDereferenceableOrNullBytes(unsigned ArgNo) const; - /// Get range (or std::nullopt if unknown) of an arg. - std::optional<ConstantRange> getParamRange(unsigned ArgNo) const; - /// Get the disallowed floating-point classes of the return value. FPClassTest getRetNoFPClass() const; @@ -1126,10 +1123,6 @@ public: /// invalid if the Kind is not present in the builder. Attribute getAttribute(StringRef Kind) const; - /// Retrieve the range if the attribute exists (std::nullopt is returned - /// otherwise). - std::optional<ConstantRange> getRange() const; - /// Return raw (possibly packed/encoded) value of integer attribute or /// std::nullopt if not set.
std::optional<uint64_t> getRawIntAttr(Attribute::AttrKind Kind) const; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index 55851d499c60..c2fba49692c7 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -1931,14 +1931,6 @@ AttributeList::getParamDereferenceableOrNullBytes(unsigned Index) const { return getParamAttrs(Index).getDereferenceableOrNullBytes(); } -std::optional<ConstantRange> -AttributeList::getParamRange(unsigned ArgNo) const { - auto RangeAttr = getParamAttrs(ArgNo).getAttribute(Attribute::Range); - if (RangeAttr.isValid()) - return RangeAttr.getRange(); - return std::nullopt; -} - FPClassTest AttributeList::getRetNoFPClass() const { return getRetAttrs().getNoFPClass(); } @@ -2285,13 +2277,6 @@ Attribute AttrBuilder::getAttribute(StringRef A) const { return {}; } -std::optional<ConstantRange> AttrBuilder::getRange() const { - const Attribute RangeAttr = getAttribute(Attribute::Range); - if (RangeAttr.isValid()) - return RangeAttr.getRange(); - return std::nullopt; -} - bool AttrBuilder::contains(Attribute::AttrKind A) const { return getAttribute(A).isValid(); } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 71ca527e5daa..55ad2b6d6200 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -34,7 +34,6 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Argument.h" #include "llvm/IR/AttributeMask.h" -#include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" @@ -60,7 +59,6 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" -#include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" @@ -1360,36 +1358,18 @@ static void AddParamAndFnBasicAttributes(const CallBase &CB, auto &Context = CalledFunction->getContext(); // Collect valid attributes for all params. - SmallVector<AttrBuilder> ValidObjParamAttrs, ValidExactParamAttrs; + SmallVector<AttrBuilder> ValidParamAttrs; bool HasAttrToPropagate = false; - // Attributes we can only propagate if the exact parameter is forwarded. - // We can propagate both poison generating and UB generating attributes - // without any extra checks. The only attribute that is tricky to propagate - // is `noundef` (skipped for now) as that can create new UB where previous - // behavior was just using a poison value. - static const Attribute::AttrKind ExactAttrsToPropagate[] = { - Attribute::Dereferenceable, Attribute::DereferenceableOrNull, - Attribute::NonNull, Attribute::Alignment, Attribute::Range}; - for (unsigned I = 0, E = CB.arg_size(); I < E; ++I) { - ValidObjParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); - ValidExactParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); + ValidParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); // Access attributes can be propagated to any param with the same underlying // object as the argument.
if (CB.paramHasAttr(I, Attribute::ReadNone)) - ValidObjParamAttrs.back().addAttribute(Attribute::ReadNone); + ValidParamAttrs.back().addAttribute(Attribute::ReadNone); if (CB.paramHasAttr(I, Attribute::ReadOnly)) - ValidObjParamAttrs.back().addAttribute(Attribute::ReadOnly); - - for (Attribute::AttrKind AK : ExactAttrsToPropagate) { - Attribute Attr = CB.getParamAttr(I, AK); - if (Attr.isValid()) - ValidExactParamAttrs.back().addAttribute(Attr); - } - - HasAttrToPropagate |= ValidObjParamAttrs.back().hasAttributes(); - HasAttrToPropagate |= ValidExactParamAttrs.back().hasAttributes(); + ValidParamAttrs.back().addAttribute(Attribute::ReadOnly); + HasAttrToPropagate |= ValidParamAttrs.back().hasAttributes(); } // Won't be able to propagate anything. @@ -1411,60 +1391,22 @@ static void AddParamAndFnBasicAttributes(const CallBase &CB, AttributeList AL = NewInnerCB->getAttributes(); for (unsigned I = 0, E = InnerCB->arg_size(); I < E; ++I) { - // It's unsound or requires special handling to propagate - // attributes to byval arguments. Even if CalledFunction - // doesn't e.g. write to the argument (readonly), the call to - // NewInnerCB may write to its by-value copy. - if (NewInnerCB->paramHasAttr(I, Attribute::ByVal)) + // Check if the underlying value for the parameter is an argument. + const Value *UnderlyingV = + getUnderlyingObject(InnerCB->getArgOperand(I)); + const Argument *Arg = dyn_cast<Argument>(UnderlyingV); + if (!Arg) continue; - // Don't bother propagating attrs to constants. - if (match(NewInnerCB->getArgOperand(I), - llvm::PatternMatch::m_ImmConstant())) + if (NewInnerCB->paramHasAttr(I, Attribute::ByVal)) + // It's unsound to propagate memory attributes to byval arguments. + // Even if CalledFunction doesn't e.g. write to the argument, + // the call to NewInnerCB may write to its by-value copy. continue; - // Check if the underlying value for the parameter is an argument. - const Argument *Arg = dyn_cast<Argument>(InnerCB->getArgOperand(I)); - unsigned ArgNo; - if (Arg) { - ArgNo = Arg->getArgNo(); - // For dereferenceable, dereferenceable_or_null, align, etc... - // we don't want to propagate if the existing param has the same - // attribute with "better" constraints. So remove from the - // new AL if the region of the existing param is larger than - // what we can propagate. - AttrBuilder NewAB{ - Context, AttributeSet::get(Context, ValidExactParamAttrs[ArgNo])}; - if (AL.getParamDereferenceableBytes(I) > - NewAB.getDereferenceableBytes()) - NewAB.removeAttribute(Attribute::Dereferenceable); - if (AL.getParamDereferenceableOrNullBytes(I) > - NewAB.getDereferenceableOrNullBytes()) - NewAB.removeAttribute(Attribute::DereferenceableOrNull); - if (AL.getParamAlignment(I).valueOrOne() > - NewAB.getAlignment().valueOrOne()) - NewAB.removeAttribute(Attribute::Alignment); - if (auto ExistingRange = AL.getParamRange(I)) { - if (auto NewRange = NewAB.getRange()) { - ConstantRange CombinedRange = - ExistingRange->intersectWith(*NewRange); - NewAB.removeAttribute(Attribute::Range); - NewAB.addRangeAttr(CombinedRange); - } - } - AL = AL.addParamAttributes(Context, I, NewAB); - } else { - // Check if the underlying value for the parameter is an argument. - const Value *UnderlyingV = - getUnderlyingObject(InnerCB->getArgOperand(I)); - Arg = dyn_cast<Argument>(UnderlyingV); - if (!Arg) - continue; - ArgNo = Arg->getArgNo(); - } - + unsigned ArgNo = Arg->getArgNo(); // If so, propagate its access attributes.
- AL = AL.addParamAttributes(Context, I, ValidObjParamAttrs[ArgNo]); + AL = AL.addParamAttributes(Context, I, ValidParamAttrs[ArgNo]); // We can have conflicting attributes from the inner callsite and // to-be-inlined callsite. In that case, choose the most // restrictive. diff --git a/llvm/test/Transforms/Inline/access-attributes-prop.ll b/llvm/test/Transforms/Inline/access-attributes-prop.ll index 5bf845d5ba94..5051c92345ec 100644 --- a/llvm/test/Transforms/Inline/access-attributes-prop.ll +++ b/llvm/test/Transforms/Inline/access-attributes-prop.ll @@ -47,6 +47,7 @@ define dso_local void @foo3_writable(ptr %p) { ret void } + define dso_local void @foo1_bar_aligned64_deref512(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@foo1_bar_aligned64_deref512 ; CHECK-SAME: (ptr [[P:%.*]]) { @@ -305,7 +306,7 @@ define void @prop_param_callbase_def_1x_partial_3(ptr %p, ptr %p2) { define void @prop_deref(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_deref ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr dereferenceable(16) [[P]]) +; CHECK-NEXT: call void @bar1(ptr [[P]]) ; CHECK-NEXT: ret void ; call void @foo1(ptr dereferenceable(16) %p) @@ -315,7 +316,7 @@ define void @prop_deref(ptr %p) { define void @prop_deref_or_null(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_deref_or_null ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr dereferenceable_or_null(256) [[P]]) +; CHECK-NEXT: call void @bar1(ptr [[P]]) ; CHECK-NEXT: ret void ; call void @foo1(ptr dereferenceable_or_null(256) %p) @@ -325,23 +326,13 @@ define void @prop_deref_or_null(ptr %p) { define void @prop_param_nonnull_and_align(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_nonnull_and_align ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr nonnull align 32 [[P]]) +; CHECK-NEXT: call void @bar1(ptr [[P]]) ; CHECK-NEXT: ret void ; call void @foo1(ptr nonnull align 32 %p) ret void } -define void @prop_param_nofree_and_align(ptr %p) { -; CHECK-LABEL: define {{[^@]+}}@prop_param_nofree_and_align -; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr align 32 [[P]]) -; CHECK-NEXT: ret void -; - call void @foo1(ptr nofree align 32 %p) - ret void -} - define void @prop_param_deref_align_no_update(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_deref_align_no_update ; CHECK-SAME: (ptr [[P:%.*]]) { @@ -355,7 +346,7 @@ define void @prop_param_deref_align_no_update(ptr %p) { define void @prop_param_deref_align_update(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_deref_align_update ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr align 128 dereferenceable(1024) [[P]]) +; CHECK-NEXT: call void @bar1(ptr align 64 dereferenceable(512) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1_bar_aligned64_deref512(ptr align 128 dereferenceable(1024) %p) @@ -365,7 +356,7 @@ define void @prop_param_deref_align_update(ptr %p) { define void @prop_param_deref_or_null_update(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_deref_or_null_update ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr align 512 dereferenceable_or_null(1024) [[P]]) +; CHECK-NEXT: call void @bar1(ptr align 512 dereferenceable_or_null(512) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1_bar_aligned512_deref_or_null512(ptr dereferenceable_or_null(1024) %p) @@ -548,6 +539,7 @@ define void @prop_no_conflict_writable(ptr %p) { ret void } + define void @prop_no_conflict_writable2(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_no_conflict_writable2 ; CHECK-SAME: (ptr [[P:%.*]]) { @@ 
-608,145 +600,3 @@ define void @prop_byval_readonly2(ptr %p) { call void @foo_byval_readonly2(ptr %p) ret void } - -declare void @bar5(i32) - -define dso_local void @foo4_range_0_10(i32 %v) { -; CHECK-LABEL: define {{[^@]+}}@foo4_range_0_10 -; CHECK-SAME: (i32 [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 range(i32 0, 10) [[V]]) -; CHECK-NEXT: ret void -; - call void @bar5(i32 range(i32 0, 10) %v) - ret void -} - -define dso_local void @foo4_range_10_40(i32 %v) { -; CHECK-LABEL: define {{[^@]+}}@foo4_range_10_40 -; CHECK-SAME: (i32 [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 range(i32 10, 40) [[V]]) -; CHECK-NEXT: ret void -; - call void @bar5(i32 range(i32 10, 40) %v) - ret void -} - -define dso_local void @foo4_2_range_0_10(i32 range(i32 0, 10) %v) { -; CHECK-LABEL: define {{[^@]+}}@foo4_2_range_0_10 -; CHECK-SAME: (i32 range(i32 0, 10) [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 [[V]]) -; CHECK-NEXT: ret void -; - call void @bar5(i32 %v) - ret void -} - -define dso_local void @foo4(i32 %v) { -; CHECK-LABEL: define {{[^@]+}}@foo4 -; CHECK-SAME: (i32 [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 [[V]]) -; CHECK-NEXT: ret void -; - call void @bar5(i32 %v) - ret void -} - -define void @prop_range_empty_intersect(i32 %v) { -; CHECK-LABEL: define {{[^@]+}}@prop_range_empty_intersect -; CHECK-SAME: (i32 [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 range(i32 0, 0) [[V]]) -; CHECK-NEXT: ret void -; - call void @foo4_range_0_10(i32 range(i32 11, 50) %v) - ret void -} - -define void @prop_range_empty(i32 %v) { -; CHECK-LABEL: define {{[^@]+}}@prop_range_empty -; CHECK-SAME: (i32 [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 range(i32 1, 0) [[V]]) -; CHECK-NEXT: ret void -; - call void @foo4(i32 range(i32 1, 0) %v) - ret void -} - -define void @prop_range_empty_with_intersect(i32 %v) { -; CHECK-LABEL: define {{[^@]+}}@prop_range_empty_with_intersect -; CHECK-SAME: (i32 [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 range(i32 1, 10) [[V]]) -; CHECK-NEXT: ret void -; - call void @foo4_range_0_10(i32 range(i32 1, 0) %v) - ret void -} - -define void @prop_range_intersect1(i32 %v) { -; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect1 -; CHECK-SAME: (i32 [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 range(i32 0, 9) [[V]]) -; CHECK-NEXT: ret void -; - call void @foo4_range_0_10(i32 range(i32 0, 9) %v) - ret void -} - -define void @prop_range_intersect2(i32 %v) { -; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect2 -; CHECK-SAME: (i32 [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 range(i32 1, 9) [[V]]) -; CHECK-NEXT: ret void -; - call void @foo4_range_0_10(i32 range(i32 1, 9) %v) - ret void -} - -define void @prop_range_intersect3(i32 %v) { -; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect3 -; CHECK-SAME: (i32 [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 range(i32 0, 11) [[V]]) -; CHECK-NEXT: ret void -; - call void @foo4_2_range_0_10(i32 range(i32 0, 11) %v) - ret void -} - -define void @prop_range_intersect4(i32 %v) { -; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect4 -; CHECK-SAME: (i32 [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 range(i32 0, 5) [[V]]) -; CHECK-NEXT: ret void -; - call void @foo4_range_0_10(i32 range(i32 40, 5) %v) - ret void -} - -define void @prop_range_intersect5(i32 %v) { -; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect5 -; CHECK-SAME: (i32 [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 range(i32 10, 40) [[V]]) -; CHECK-NEXT: ret void -; - call void @foo4_range_10_40(i32 range(i32 30, 20) %v) - ret void -} - -define void 
@prop_range_keep(i32 %v) { -; CHECK-LABEL: define {{[^@]+}}@prop_range_keep -; CHECK-SAME: (i32 [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 range(i32 10, 40) [[V]]) -; CHECK-NEXT: ret void -; - call void @foo4_range_10_40(i32 %v) - ret void -} - -define void @prop_range_direct(i32 %v) { -; CHECK-LABEL: define {{[^@]+}}@prop_range_direct -; CHECK-SAME: (i32 [[V:%.*]]) { -; CHECK-NEXT: call void @bar5(i32 range(i32 1, 11) [[V]]) -; CHECK-NEXT: ret void -; - call void @foo4(i32 range(i32 1, 11) %v) - ret void -} diff --git a/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll b/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll index c0943f4aefb8..1a219a22019c 100644 --- a/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll +++ b/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll @@ -8,7 +8,7 @@ declare void @h(ptr %p, ptr %q, ptr %z) define void @f(ptr %p, ptr %q, ptr %z) { ; CHECK-LABEL: define void @f ; CHECK-SAME: (ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[Z:%.*]]) { -; CHECK-NEXT: call void @h(ptr nonnull [[P]], ptr [[Q]], ptr nonnull [[Z]]) +; CHECK-NEXT: call void @h(ptr [[P]], ptr [[Q]], ptr [[Z]]) ; CHECK-NEXT: ret void ; call void @g(ptr nonnull %p, ptr %q, ptr nonnull %z) diff --git a/llvm/test/Transforms/Inline/byval.ll b/llvm/test/Transforms/Inline/byval.ll index 1a70da8472cb..dd5be40b90a8 100644 --- a/llvm/test/Transforms/Inline/byval.ll +++ b/llvm/test/Transforms/Inline/byval.ll @@ -106,7 +106,7 @@ define void @test3() nounwind { ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_SS]], align 1 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[S1]]) ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[S1]], ptr align 1 [[S]], i64 12, i1 false) -; CHECK-NEXT: call void @g3(ptr align 64 [[S1]]) #[[ATTR0]] +; CHECK-NEXT: call void @g3(ptr [[S1]]) #[[ATTR0]] ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[S1]]) ; CHECK-NEXT: ret void ; @@ -131,7 +131,7 @@ define i32 @test4() nounwind { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 64 -; CHECK-NEXT: call void @g3(ptr align 64 [[S]]) #[[ATTR0]] +; CHECK-NEXT: call void @g3(ptr [[S]]) #[[ATTR0]] ; CHECK-NEXT: ret i32 4 ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/pr95152.ll b/llvm/test/Transforms/PhaseOrdering/pr95152.ll index fff94673a1a5..16610c439f4c 100644 --- a/llvm/test/Transforms/PhaseOrdering/pr95152.ll +++ b/llvm/test/Transforms/PhaseOrdering/pr95152.ll @@ -47,7 +47,7 @@ define void @f(ptr dead_on_unwind noalias %p) { ; CHECK-LABEL: define void @f( ; CHECK-SAME: ptr dead_on_unwind noalias [[P:%.*]]) local_unnamed_addr { ; CHECK-NEXT: store i64 3, ptr [[P]], align 4 -; CHECK-NEXT: tail call void @j(ptr nonnull align 8 dereferenceable(8) [[P]]) +; CHECK-NEXT: tail call void @j(ptr nonnull [[P]]) ; CHECK-NEXT: store i64 43, ptr [[P]], align 4 ; CHECK-NEXT: ret void ; -- GitLab From 0a53f43c0c7e33cde07b24169e8f45db7eba2fea Mon Sep 17 00:00:00 2001 From: George Burgess IV Date: Wed, 16 Oct 2024 14:39:36 -0700 Subject: [PATCH 174/329] [utils] support "Reverts ${PR}" in commit messages (#112226) A bisection in ChromeOS ended at a reverted commit, which wasn't flagged by this revert checking script, since it used `Reverts ${PR}` rather than `This reverts commit ${SHA}`. `grep` says that somewhere around 400 reverts in the last year have used `Reverts ${PR}` syntax. Support it. 
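For illustration, here is a minimal, standalone sketch (hand-written for this note, with hypothetical commit texts; the regexes mirror the ones this patch adds to `revert_checker.py`) of the two message forms the checker now understands:

```python
import re

# Hypothetical commit messages showing the two revert styles.
sha_style = "This reverts commit 0123456789abcdef0123456789abcdef01234567."
pr_style = "Reverts llvm/llvm-project#112226"

# The same patterns the script uses.
shas = re.findall(r"This reverts commit ([a-f0-9]{40})\b", sha_style)
prs = [int(x) for x in re.findall(r"Reverts llvm/llvm-project#(\d+)", pr_style)]

assert shas == ["0123456789abcdef0123456789abcdef01234567"]
assert prs == [112226]
```

The PR form carries no SHA at all, which is why the script also needs a way to map PR numbers back to the commits that landed them (see `_load_pr_commit_mappings` in the diff below).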
Tested in part by running the command that was expected to catch this revert: ``` $ ./revert_checker.py -C ~/llvm/main/ \ 3b5e7c83a6e226d5bd7ed2e9b67449b64812074c origin/main \ | grep -q 4b0276d1c9cb558f3c20736dce802ceb26c0b958 $ echo $? 0 ``` --- llvm/utils/revert_checker.py | 139 ++++++++++++++++++++++++++---- llvm/utils/revert_checker_test.py | 23 +++++ 2 files changed, 147 insertions(+), 15 deletions(-) diff --git a/llvm/utils/revert_checker.py b/llvm/utils/revert_checker.py index da80bdff8685..b1c6e228e4d4 100755 --- a/llvm/utils/revert_checker.py +++ b/llvm/utils/revert_checker.py @@ -45,35 +45,78 @@ import logging import re import subprocess import sys -from typing import Generator, List, NamedTuple, Iterable +from typing import Dict, Generator, Iterable, List, NamedTuple, Optional, Tuple assert sys.version_info >= (3, 6), "Only Python 3.6+ is supported." # People are creative with their reverts, and heuristics are a bit difficult. -# Like 90% of of reverts have "This reverts commit ${full_sha}". -# Some lack that entirely, while others have many of them specified in ad-hoc -# ways, while others use short SHAs and whatever. +# At a glance, most reverts have "This reverts commit ${full_sha}". Many others +# have `Reverts llvm/llvm-project#${PR_NUMBER}`. # -# The 90% case is trivial to handle (and 100% free + automatic). The extra 10% -# starts involving human intervention, which is probably not worth it for now. +# By their powers combined, we should be able to automatically catch something +# like 80% of reverts with reasonable confidence. At some point, human +# intervention will always be required (e.g., I saw +# ``` +# This reverts commit ${commit_sha_1} and +# also ${commit_sha_2_shorthand} +# ``` +# during my sample) + +_CommitMessageReverts = NamedTuple( "_CommitMessageReverts", [ ("potential_shas", List[str]), ("potential_pr_numbers", List[int]), ], ) + +def _try_parse_reverts_from_commit_message( commit_message: str, ) -> _CommitMessageReverts: """Tries to parse revert SHAs and LLVM PR numbers from the commit message. -def _try_parse_reverts_from_commit_message(commit_message: str) -> List[str]: Returns: A namedtuple containing: - A list of potentially reverted SHAs - A list of potentially reverted LLVM PR numbers """ if not commit_message: - return [] + return _CommitMessageReverts([], []) - results = re.findall(r"This reverts commit ([a-f0-9]{40})\b", commit_message) + sha_reverts = re.findall( r"This reverts commit ([a-f0-9]{40})\b", commit_message, ) first_line = commit_message.splitlines()[0] initial_revert = re.match(r'Revert ([a-f0-9]{6,}) "', first_line) if initial_revert: - results.append(initial_revert.group(1)) - return results + sha_reverts.append(initial_revert.group(1)) + pr_numbers = [ int(x) for x in re.findall( r"Reverts llvm/llvm-project#(\d+)", commit_message, ) ] + return _CommitMessageReverts( potential_shas=sha_reverts, potential_pr_numbers=pr_numbers, ) -def _stream_stdout(command: List[str]) -> Generator[str, None, None]: + +def _stream_stdout( command: List[str], cwd: Optional[str] = None ) -> Generator[str, None, None]: with subprocess.Popen( - command, stdout=subprocess.PIPE, encoding="utf-8", errors="replace" + command, + cwd=cwd, + stdout=subprocess.PIPE, + encoding="utf-8", + errors="replace", ) as p: assert p.stdout is not None # for mypy's happiness.
yield from p.stdout @@ -175,10 +218,43 @@ def _find_common_parent_commit(git_dir: str, ref_a: str, ref_b: str) -> str: ).strip() -def find_reverts(git_dir: str, across_ref: str, root: str) -> List[Revert]: +def _load_pr_commit_mappings( + git_dir: str, root: str, min_ref: str +) -> Dict[int, List[str]]: + git_log = ["git", "log", "--format=%H %s", f"{min_ref}..{root}"] + results = collections.defaultdict(list) + pr_regex = re.compile(r"\s\(#(\d+)\)$") + for line in _stream_stdout(git_log, cwd=git_dir): + m = pr_regex.search(line) + if not m: + continue + + pr_number = int(m.group(1)) + sha = line.split(None, 1)[0] + # N.B., these are kept in log (read: reverse chronological) order, + # which is what's expected by `find_reverts`. + results[pr_number].append(sha) + return results + + +# N.B., max_pr_lookback's default of 20K commits is arbitrary, but should be +# enough for the 99% case of reverts: rarely should someone land a cleanish +# revert of a >6 month old change... +def find_reverts( + git_dir: str, across_ref: str, root: str, max_pr_lookback: int = 20000 +) -> List[Revert]: """Finds reverts across `across_ref` in `git_dir`, starting from `root`. These reverts are returned in order of oldest reverts first. + + Args: + git_dir: git directory to find reverts in. + across_ref: the ref to find reverts across. + root: the 'main' ref to look for reverts on. + max_pr_lookback: this function uses heuristics to map PR numbers to + SHAs. These heuristics require that commit history from `root` to + `some_parent_of_root` is loaded in memory. `max_pr_lookback` is how + many commits behind `across_ref` should be loaded in memory. """ across_sha = _rev_parse(git_dir, across_ref) root_sha = _rev_parse(git_dir, root) @@ -201,8 +277,41 @@ def find_reverts(git_dir: str, across_ref: str, root: str) -> List[Revert]: ) all_reverts = [] + # Lazily load PR <-> commit mappings, since it can be expensive. + pr_commit_mappings = None for sha, commit_message in _log_stream(git_dir, root_sha, across_sha): - reverts = _try_parse_reverts_from_commit_message(commit_message) + reverts, pr_reverts = _try_parse_reverts_from_commit_message( + commit_message, + ) + if pr_reverts: + if pr_commit_mappings is None: + logging.info( + "Loading PR <-> commit mappings. This may take a moment..." 
+ ) + pr_commit_mappings = _load_pr_commit_mappings( + git_dir, root_sha, f"{across_sha}~{max_pr_lookback}" + ) + logging.info( + "Loaded %d PR <-> commit mappings", len(pr_commit_mappings) + ) + + for reverted_pr_number in pr_reverts: + reverted_shas = pr_commit_mappings.get(reverted_pr_number) + if not reverted_shas: + logging.warning( + "No SHAs for reverted PR %d (commit %s)", + reverted_pr_number, + sha, + ) + continue + logging.debug( + "Inferred SHAs %s for reverted PR %d (commit %s)", + reverted_shas, + reverted_pr_number, + sha, + ) + reverts.extend(reverted_shas) + if not reverts: continue diff --git a/llvm/utils/revert_checker_test.py b/llvm/utils/revert_checker_test.py index 9d992663c5be..c149be8dc0dd 100755 --- a/llvm/utils/revert_checker_test.py +++ b/llvm/utils/revert_checker_test.py @@ -96,6 +96,7 @@ class Test(unittest.TestCase): git_dir=get_llvm_project_path(), across_ref="c9944df916e41b1014dff5f6f75d52297b48ecdc~", root="c9944df916e41b1014dff5f6f75d52297b48ecdc", + max_pr_lookback=50, ) self.assertEqual(reverts, []) @@ -113,6 +114,7 @@ class Test(unittest.TestCase): git_dir=get_llvm_project_path(), across_ref="c47f971694be0159ffddfee8a75ae515eba91439", root="9f981e9adf9c8d29bb80306daf08d2770263ade6", + max_pr_lookback=50, ) self.assertEqual( reverts, @@ -128,6 +130,27 @@ class Test(unittest.TestCase): ], ) + def test_pr_based_revert_works(self) -> None: + reverts = revert_checker.find_reverts( + git_dir=get_llvm_project_path(), + # This SHA is a direct child of the reverted SHA expected below. + across_ref="2d5f3b0a61fb171617012a2c3ba05fd31fb3bb1d", + # This SHA is a direct child of the revert SHA listed below. + root="2c01b278580212914ec037bb5dd9b73702dfe7f1", + max_pr_lookback=50, + ) + self.assertEqual( + reverts, + [ + revert_checker.Revert( + # This SHA is a `Reverts ${PR}` for #111004. + sha="50866e84d1da8462aeb96607bf6d9e5bbd5869c5", + # ...And this was the commit for #111004. + reverted_sha="67160c5ab5f5b7fd5fa7851abcfde367c8a9f91b", + ), + ], + ) + if __name__ == "__main__": unittest.main() -- GitLab From a24c468782010e17563f6aa93c5bb173c7f873b2 Mon Sep 17 00:00:00 2001 From: Alexander Pivovarov Date: Wed, 16 Oct 2024 15:22:29 -0700 Subject: [PATCH 175/329] [MLIR] Fix assert expressions (#112474) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit I noticed that several assertions in the MLIR codebase have issues with operator precedence. The issue comes from the way the logical operators are evaluated: the `&&` operator has higher precedence than the `||` operator, which means the assertion currently groups the conditions incorrectly, like this: ``` assert((resType.getNumDynamicDims() == dynOutDims.size()) || (dynOutDims.empty() && "Either none or all output dynamic dims must be specified!")); ``` We should add parentheses around the entire expression involving `dynOutDims.empty()` to ensure that the logical conditions are grouped correctly.
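As a standalone illustration (hand-written here, not part of the patch), the difference comes from `&&` binding tighter than `||` in C++:

```cpp
#include <cassert>

void demo(bool a, bool b) {
  // Parses as: a || (b && "message") -- the string literal is grouped with
  // `b` alone, which is what compilers warn about under -Wparentheses.
  assert(a || b && "message");

  // Parses as: (a || b) && "message" -- the message is attached to the whole
  // condition, matching the intent of the assertions fixed in this patch.
  assert((a || b) && "message");
}
```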
Here’s the corrected version: ``` assert(((resType.getNumDynamicDims() == dynOutDims.size()) || dynOutDims.empty()) && "Either none or all output dynamic dims must be specified!"); ``` --- mlir/lib/Analysis/FlatLinearValueConstraints.cpp | 4 ++-- mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp | 10 +++++----- mlir/lib/Dialect/Tensor/Utils/Utils.cpp | 6 +++--- 3 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp index e628fb152b52..0d6ff2fd908d 100644 --- a/mlir/lib/Analysis/FlatLinearValueConstraints.cpp +++ b/mlir/lib/Analysis/FlatLinearValueConstraints.cpp @@ -892,8 +892,8 @@ FlatLinearValueConstraints::FlatLinearValueConstraints(IntegerSet set, set.getNumDims() + set.getNumSymbols() + 1, set.getNumDims(), set.getNumSymbols(), /*numLocals=*/0) { - assert(operands.empty() || - set.getNumInputs() == operands.size() && "operand count mismatch"); + assert((operands.empty() || set.getNumInputs() == operands.size()) && + "operand count mismatch"); // Set the values for the non-local variables. for (unsigned i = 0, e = operands.size(); i < e; ++i) setValue(i, operands[i]); diff --git a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp index 09c6b2683b43..635273bcbc02 100644 --- a/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp +++ b/mlir/lib/Dialect/Linalg/Transforms/Vectorization.cpp @@ -840,11 +840,11 @@ enum VectorMemoryAccessKind { ScalarBroadcast, Contiguous, Gather }; /// TODO: Statically shaped loops + vector masking static uint64_t getTrailingNonUnitLoopDimIdx(LinalgOp linalgOp) { SmallVector<int64_t> loopRanges = linalgOp.getStaticLoopRanges(); - assert(linalgOp.hasDynamicShape() || - llvm::count_if(loopRanges, [](int64_t dim) { return dim != 1; }) == - 1 && - "For statically shaped Linalg Ops, only one " - "non-unit loop dim is expected"); + assert( + (linalgOp.hasDynamicShape() || + llvm::count_if(loopRanges, [](int64_t dim) { return dim != 1; }) == 1) && + "For statically shaped Linalg Ops, only one " + "non-unit loop dim is expected"); size_t idx = loopRanges.size() - 1; for (; idx >= 0; idx--) diff --git a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp index e0b91f323b0e..5c16e538ac24 100644 --- a/mlir/lib/Dialect/Tensor/Utils/Utils.cpp +++ b/mlir/lib/Dialect/Tensor/Utils/Utils.cpp @@ -27,9 +27,9 @@ PadOp mlir::tensor::createPadHighOp(RankedTensorType resType, Value source, OpBuilder &b, SmallVector<Value> dynOutDims) { - assert((resType.getNumDynamicDims() == dynOutDims.size()) || - dynOutDims.empty() && - "Either none or all output dynamic dims must be specified!"); + assert(((resType.getNumDynamicDims() == dynOutDims.size()) || + dynOutDims.empty()) && + "Either none or all output dynamic dims must be specified!"); // Init "low" and "high" padding values ("low" is kept as is, "high" is // computed below).
-- 
GitLab


From 5f9e6c811ba64e5d86e01f4df3995776c8090254 Mon Sep 17 00:00:00 2001
From: SahilPatidar
Date: Thu, 17 Oct 2024 04:12:01 +0530
Subject: [PATCH 176/329] [Orc][Runtime] Refactor `dlupdate` to remove the
 `mode` argument (#110491)

---
 compiler-rt/lib/orc/dlfcn_wrapper.cpp  |  8 ++++----
 compiler-rt/lib/orc/macho_platform.cpp | 14 +++++++-------
 compiler-rt/lib/orc/macho_platform.h   |  2 +-
 llvm/lib/ExecutionEngine/Orc/LLJIT.cpp |  5 ++---
 4 files changed, 14 insertions(+), 15 deletions(-)

diff --git a/compiler-rt/lib/orc/dlfcn_wrapper.cpp b/compiler-rt/lib/orc/dlfcn_wrapper.cpp
index bbbc79f607f2..dec8d1e5bbc3 100644
--- a/compiler-rt/lib/orc/dlfcn_wrapper.cpp
+++ b/compiler-rt/lib/orc/dlfcn_wrapper.cpp
@@ -20,7 +20,7 @@ using namespace orc_rt;
 
 extern "C" const char *__orc_rt_jit_dlerror();
 extern "C" void *__orc_rt_jit_dlopen(const char *path, int mode);
-extern "C" int __orc_rt_jit_dlupdate(void *dso_handle, int mode);
+extern "C" int __orc_rt_jit_dlupdate(void *dso_handle);
 extern "C" int __orc_rt_jit_dlclose(void *dso_handle);
 
 ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult
@@ -45,10 +45,10 @@ __orc_rt_jit_dlopen_wrapper(const char *ArgData, size_t ArgSize) {
 #ifdef __APPLE__
 ORC_RT_INTERFACE orc_rt_CWrapperFunctionResult
 __orc_rt_jit_dlupdate_wrapper(const char *ArgData, size_t ArgSize) {
-  return WrapperFunction<int32_t(SPSExecutorAddr, int32_t)>::handle(
+  return WrapperFunction<int32_t(SPSExecutorAddr)>::handle(
              ArgData, ArgSize,
-             [](ExecutorAddr &DSOHandle, int32_t mode) {
-               return __orc_rt_jit_dlupdate(DSOHandle.toPtr<void *>(), mode);
+             [](ExecutorAddr &DSOHandle) {
+               return __orc_rt_jit_dlupdate(DSOHandle.toPtr<void *>());
              })
       .release();
 }
diff --git a/compiler-rt/lib/orc/macho_platform.cpp b/compiler-rt/lib/orc/macho_platform.cpp
index afd90c791ae1..8ca68587aeb3 100644
--- a/compiler-rt/lib/orc/macho_platform.cpp
+++ b/compiler-rt/lib/orc/macho_platform.cpp
@@ -245,7 +245,7 @@ public:
 
   const char *dlerror();
   void *dlopen(std::string_view Name, int Mode);
-  int dlupdate(void *DSOHandle, int Mode);
+  int dlupdate(void *DSOHandle);
   int dlclose(void *DSOHandle);
   void *dlsym(void *DSOHandle, const char *Symbol);
 
@@ -295,7 +295,7 @@ private:
   Error dlopenInitialize(std::unique_lock<std::mutex> &JDStatesLock,
                          JITDylibState &JDS, MachOJITDylibDepInfoMap &DepInfo);
-  Error dlupdateImpl(void *DSOHandle, int Mode);
+  Error dlupdateImpl(void *DSOHandle);
   Error dlupdateFull(std::unique_lock<std::mutex> &JDStatesLock,
                      JITDylibState &JDS);
   Error dlupdateInitialize(std::unique_lock<std::mutex> &JDStatesLock,
@@ -710,13 +710,13 @@ void *MachOPlatformRuntimeState::dlopen(std::string_view Path, int Mode) {
   }
 }
 
-int MachOPlatformRuntimeState::dlupdate(void *DSOHandle, int Mode) {
+int MachOPlatformRuntimeState::dlupdate(void *DSOHandle) {
   ORC_RT_DEBUG({
     std::string S;
    printdbg("MachOPlatform::dlupdate(%p) (%s)\n", DSOHandle, S.c_str());
   });
   std::lock_guard<std::mutex> Lock(DyldAPIMutex);
-  if (auto Err = dlupdateImpl(DSOHandle, Mode)) {
+  if (auto Err = dlupdateImpl(DSOHandle)) {
     // FIXME: Make dlerror thread safe.
     DLFcnError = toString(std::move(Err));
     return -1;
@@ -1179,7 +1179,7 @@ Error MachOPlatformRuntimeState::dlopenInitialize(
   return Error::success();
 }
 
-Error MachOPlatformRuntimeState::dlupdateImpl(void *DSOHandle, int Mode) {
+Error MachOPlatformRuntimeState::dlupdateImpl(void *DSOHandle) {
   std::unique_lock<std::mutex> Lock(JDStatesMutex);
 
   // Try to find JITDylib state by DSOHandle.
@@ -1513,8 +1513,8 @@ void *__orc_rt_macho_jit_dlopen(const char *path, int mode) {
   return MachOPlatformRuntimeState::get().dlopen(path, mode);
 }
 
-int __orc_rt_macho_jit_dlupdate(void *dso_handle, int mode) {
-  return MachOPlatformRuntimeState::get().dlupdate(dso_handle, mode);
+int __orc_rt_macho_jit_dlupdate(void *dso_handle) {
+  return MachOPlatformRuntimeState::get().dlupdate(dso_handle);
 }
 
 int __orc_rt_macho_jit_dlclose(void *dso_handle) {
diff --git a/compiler-rt/lib/orc/macho_platform.h b/compiler-rt/lib/orc/macho_platform.h
index ad70c97809d2..aeab248f7f8a 100644
--- a/compiler-rt/lib/orc/macho_platform.h
+++ b/compiler-rt/lib/orc/macho_platform.h
@@ -24,7 +24,7 @@ ORC_RT_INTERFACE void __orc_rt_macho_cxa_finalize(void *dso_handle);
 // dlfcn functions.
 ORC_RT_INTERFACE const char *__orc_rt_macho_jit_dlerror();
 ORC_RT_INTERFACE void *__orc_rt_macho_jit_dlopen(const char *path, int mode);
-ORC_RT_INTERFACE int __orc_rt_macho_jit_dlupdate(void *dso_handle, int mode);
+ORC_RT_INTERFACE int __orc_rt_macho_jit_dlupdate(void *dso_handle);
 ORC_RT_INTERFACE int __orc_rt_macho_jit_dlclose(void *dso_handle);
 ORC_RT_INTERFACE void *__orc_rt_macho_jit_dlsym(void *dso_handle,
                                                 const char *symbol);
diff --git a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
index c56ec196772b..401ed525fd5c 100644
--- a/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
+++ b/llvm/lib/ExecutionEngine/Orc/LLJIT.cpp
@@ -608,7 +608,7 @@ Error ORCPlatformSupport::initialize(orc::JITDylib &JD) {
   using llvm::orc::shared::SPSExecutorAddr;
   using llvm::orc::shared::SPSString;
   using SPSDLOpenSig = SPSExecutorAddr(SPSString, int32_t);
-  using SPSDLUpdateSig = int32_t(SPSExecutorAddr, int32_t);
+  using SPSDLUpdateSig = int32_t(SPSExecutorAddr);
   enum dlopen_mode : int32_t {
     ORC_RT_RTLD_LAZY = 0x1,
     ORC_RT_RTLD_NOW = 0x2,
@@ -634,8 +634,7 @@ Error ORCPlatformSupport::initialize(orc::JITDylib &JD) {
     if (dlupdate) {
       int32_t result;
       auto E = ES.callSPSWrapper<SPSDLUpdateSig>(WrapperAddr->getAddress(),
-                                                 result, DSOHandles[&JD],
-                                                 int32_t(ORC_RT_RTLD_LAZY));
+                                                 result, DSOHandles[&JD]);
       if (E)
         return E;
       else if (result)
-- 
GitLab


From f796a0c7c9299ec16d459de70a92d8a675f47a42 Mon Sep 17 00:00:00 2001
From: David Blaikie
Date: Wed, 16 Oct 2024 15:53:52 -0700
Subject: [PATCH 177/329] [formatv] Leave format parameters unstripped (#112625)

This is consistent with std::formatv and allows formatters to support a
wider variety of use cases (like having a bare string in their formatter
if that's useful, etc).

Came up in the context of some Carbon diagnostic work here:
https://github.com/carbon-language/carbon-lang/pull/4411#discussion_r1803688859
---
 llvm/lib/Support/FormatVariadic.cpp           | 7 +++----
 llvm/unittests/Support/FormatVariadicTest.cpp | 2 +-
 2 files changed, 4 insertions(+), 5 deletions(-)

diff --git a/llvm/lib/Support/FormatVariadic.cpp b/llvm/lib/Support/FormatVariadic.cpp
index 7eb108879405..f3e8d0a7fe6f 100644
--- a/llvm/lib/Support/FormatVariadic.cpp
+++ b/llvm/lib/Support/FormatVariadic.cpp
@@ -64,11 +64,10 @@ static std::optional<ReplacementItem> parseReplacementItem(StringRef Spec) {
   AlignStyle Where = AlignStyle::Right;
   StringRef Options;
   unsigned Index = ~0U;
-  RepString = RepString.trim();
+  RepString = RepString.ltrim();
 
   // If index is not specified, keep it ~0U to indicate unresolved index.
RepString.consumeInteger(0, Index); - RepString = RepString.trim(); if (RepString.consume_front(",")) { if (!consumeFieldLayout(RepString, Where, Align, Pad)) { @@ -76,9 +75,9 @@ static std::optional parseReplacementItem(StringRef Spec) { return std::nullopt; } } - RepString = RepString.trim(); + RepString = RepString.ltrim(); if (RepString.consume_front(":")) { - Options = RepString.trim(); + Options = RepString; RepString = StringRef(); } RepString = RepString.trim(); diff --git a/llvm/unittests/Support/FormatVariadicTest.cpp b/llvm/unittests/Support/FormatVariadicTest.cpp index e745f99a5a6c..03102c96d466 100644 --- a/llvm/unittests/Support/FormatVariadicTest.cpp +++ b/llvm/unittests/Support/FormatVariadicTest.cpp @@ -150,7 +150,7 @@ TEST(FormatVariadicTest, ValidReplacementSequence) { EXPECT_EQ(0u, Replacements[0].Index); EXPECT_EQ(3u, Replacements[0].Width); EXPECT_EQ(AlignStyle::Left, Replacements[0].Where); - EXPECT_EQ("foo", Replacements[0].Options); + EXPECT_EQ(" foo ", Replacements[0].Options); // 8. Everything after the first option specifier is part of the style, even // if it contains another option specifier. -- GitLab From 2ce0a90d5c026ee4ec4e7e38e7939ca60236e127 Mon Sep 17 00:00:00 2001 From: Tim Gymnich Date: Thu, 17 Oct 2024 01:31:38 +0200 Subject: [PATCH 178/329] [NFC][HLSL] Fix test function names (#112602) Fix test names changing `int` to `uint`. https://github.com/llvm/llvm-project/pull/108396#discussion_r1803524539 @bob80905 --- clang/test/CodeGenHLSL/builtins/sign.hlsl | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/test/CodeGenHLSL/builtins/sign.hlsl b/clang/test/CodeGenHLSL/builtins/sign.hlsl index 0ed9a9468d86..1cdefa815b10 100644 --- a/clang/test/CodeGenHLSL/builtins/sign.hlsl +++ b/clang/test/CodeGenHLSL/builtins/sign.hlsl @@ -202,19 +202,19 @@ int4 test_sign_int64_t4(int64_t4 p0) { return sign(p0); } // CHECK: define [[FNATTRS]] i32 @ // CHECK: [[CMP:%.*]] = icmp eq i64 [[ARG:%.*]], 0 // CHECK: %hlsl.sign = select i1 [[CMP]], i32 0, i32 1 -int test_sign_int64_t(uint64_t p0) { return sign(p0); } +int test_sign_uint64_t(uint64_t p0) { return sign(p0); } // CHECK: define [[FNATTRS]] <2 x i32> @ // CHECK: [[CMP:%.*]] = icmp eq <2 x i64> [[ARG:%.*]], zeroinitializer // CHECK: %hlsl.sign = select <2 x i1> [[CMP]], <2 x i32> zeroinitializer, <2 x i32> -int2 test_sign_int64_t2(uint64_t2 p0) { return sign(p0); } +int2 test_sign_uint64_t2(uint64_t2 p0) { return sign(p0); } // CHECK: define [[FNATTRS]] <3 x i32> @ // CHECK: [[CMP:%.*]] = icmp eq <3 x i64> [[ARG:%.*]], zeroinitializer // CHECK: %hlsl.sign = select <3 x i1> [[CMP]], <3 x i32> zeroinitializer, <3 x i32> -int3 test_sign_int64_t3(uint64_t3 p0) { return sign(p0); } +int3 test_sign_uint64_t3(uint64_t3 p0) { return sign(p0); } // CHECK: define [[FNATTRS]] <4 x i32> @ // CHECK: [[CMP:%.*]] = icmp eq <4 x i64> [[ARG:%.*]], zeroinitializer // CHECK: %hlsl.sign = select <4 x i1> [[CMP]], <4 x i32> zeroinitializer, <4 x i32> -int4 test_sign_int64_t4(uint64_t4 p0) { return sign(p0); } +int4 test_sign_uint64_t4(uint64_t4 p0) { return sign(p0); } -- GitLab From 8f683b552d7a0cf1bdd93f220e2552f9ea0a6b8d Mon Sep 17 00:00:00 2001 From: Haowei Date: Wed, 16 Oct 2024 16:34:02 -0700 Subject: [PATCH 179/329] [lldb] Use system c++ lib for LLDB STL tests (#112598) This change partially reverts #112357 to avoid test failures on machines which do not have gcc installed. 
---
 lldb/test/API/lang/cpp/stl/Makefile | 6 ------
 1 file changed, 6 deletions(-)

diff --git a/lldb/test/API/lang/cpp/stl/Makefile b/lldb/test/API/lang/cpp/stl/Makefile
index 4408691f01b7..99998b20bcb0 100644
--- a/lldb/test/API/lang/cpp/stl/Makefile
+++ b/lldb/test/API/lang/cpp/stl/Makefile
@@ -1,9 +1,3 @@
 CXX_SOURCES := main.cpp
 
-ifneq ($(OS),Darwin)
-  USE_LIBSTDCPP := 1
-else
-  USE_SYSTEM_STDLIB := 1
-endif
-
 include Makefile.rules
-- 
GitLab


From afc6da43d5ae068d041728d96b6b6590f94afbb0 Mon Sep 17 00:00:00 2001
From: Benjamin Kramer
Date: Thu, 17 Oct 2024 01:35:54 +0200
Subject: [PATCH 180/329] [clang][dataflow] Silence unused variable warning.
 NFCI

---
 .../Analysis/FlowSensitive/CachedConstAccessorsLattice.h | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
index 3c3028eb9452..3402d105746e 100644
--- a/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
+++ b/clang/include/clang/Analysis/FlowSensitive/CachedConstAccessorsLattice.h
@@ -193,9 +193,8 @@ StorageLocation *
 CachedConstAccessorsLattice<Base>::getOrCreateConstMethodReturnStorageLocation(
     const RecordStorageLocation &RecordLoc, const CallExpr *CE,
     Environment &Env, llvm::function_ref<void(StorageLocation &)> Initialize) {
-  QualType Type = CE->getType();
-  assert(!Type.isNull());
-  assert(CE->isGLValue() || Type->isRecordType());
+  assert(!CE->getType().isNull());
+  assert(CE->isGLValue() || CE->getType()->isRecordType());
   auto &ObjMap = ConstMethodReturnStorageLocations[&RecordLoc];
   const FunctionDecl *DirectCallee = CE->getDirectCallee();
   if (DirectCallee == nullptr)
-- 
GitLab


From 1efa6625ef145624f7134bcb957f8ffa19c3c68e Mon Sep 17 00:00:00 2001
From: Chris Apple
Date: Wed, 16 Oct 2024 16:38:14 -0700
Subject: [PATCH 181/329] [rtsan] Introduce function-name-matches suppression
 (#112108)

Introduces a new type of suppression:

1. function-name-matches - allows users to disable checks on `malloc`,
`free`, `pthread_mutex_lock` or similar. This could be helpful if a user
thinks these are real-time safe on their OS. It also allows disabling
checks on any function marked [[blocking]].

This is useful as a **more performant "early out" compared to the
`call-stack-contains` suppression**. `call-stack-contains` is inherently
VERY costly, needing to inspect every frame of every stack for a matching
string. This new suppression has an early out before we unwind the stack.
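To make the intended usage concrete, here is a sketch (not from the patch; file and function names are illustrative) of how the new suppression type is driven, following the `RTSAN_OPTIONS`/suppressions flow exercised by the test below:
```
// my_suppressions.supp:
//   function-name-matches:free
//   function-name-matches:Block*
// Run with: RTSAN_OPTIONS=suppressions=my_suppressions.supp ./a.out
#include <cstdlib>

void BlockingHelper() [[clang::blocking]] {}

void Process() [[clang::nonblocking]] {
  void *p = malloc(16); // Still reported: nothing suppresses malloc.
  free(p);              // Suppressed by function-name-matches:free.
  BlockingHelper();     // Suppressed by the Block* pattern.
}
```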
---
 compiler-rt/lib/rtsan/rtsan_assertions.h      |  3 +++
 compiler-rt/lib/rtsan/rtsan_checks.inc        |  1 +
 compiler-rt/lib/rtsan/rtsan_suppressions.cpp  | 13 ++++++++++
 compiler-rt/lib/rtsan/rtsan_suppressions.h    |  1 +
 compiler-rt/test/rtsan/stack_suppressions.cpp | 25 +++++++++++++++----
 .../test/rtsan/stack_suppressions.cpp.supp    |  5 ++--
 6 files changed, 41 insertions(+), 7 deletions(-)

diff --git a/compiler-rt/lib/rtsan/rtsan_assertions.h b/compiler-rt/lib/rtsan/rtsan_assertions.h
index 8183a8202478..927b32e03cf0 100644
--- a/compiler-rt/lib/rtsan/rtsan_assertions.h
+++ b/compiler-rt/lib/rtsan/rtsan_assertions.h
@@ -28,6 +28,9 @@ void ExpectNotRealtime(Context &context, const DiagnosticsInfo &info,
   if (context.InRealtimeContext() && !context.IsBypassed()) {
     ScopedBypass sb{context};
 
+    if (IsFunctionSuppressed(info.func_name))
+      return;
+
     __sanitizer::BufferedStackTrace stack;
 
     // We use the unwind_on_fatal flag here because of precedent with other
diff --git a/compiler-rt/lib/rtsan/rtsan_checks.inc b/compiler-rt/lib/rtsan/rtsan_checks.inc
index f5f23e044bd5..676b6a579194 100644
--- a/compiler-rt/lib/rtsan/rtsan_checks.inc
+++ b/compiler-rt/lib/rtsan/rtsan_checks.inc
@@ -17,3 +17,4 @@
 // SummaryKind should be a string literal.
 
 RTSAN_CHECK(CallStackContains, "call-stack-contains")
+RTSAN_CHECK(FunctionNameMatches, "function-name-matches")
diff --git a/compiler-rt/lib/rtsan/rtsan_suppressions.cpp b/compiler-rt/lib/rtsan/rtsan_suppressions.cpp
index c5051dd19102..a7c3d42ac68a 100644
--- a/compiler-rt/lib/rtsan/rtsan_suppressions.cpp
+++ b/compiler-rt/lib/rtsan/rtsan_suppressions.cpp
@@ -92,3 +92,16 @@ bool __rtsan::IsStackTraceSuppressed(const StackTrace &stack) {
   }
   return false;
 }
+
+bool __rtsan::IsFunctionSuppressed(const char *function_name) {
+  if (suppression_ctx == nullptr)
+    return false;
+
+  const char *flag_name = ConvertTypeToFlagName(ErrorType::FunctionNameMatches);
+
+  if (!suppression_ctx->HasSuppressionType(flag_name))
+    return false;
+
+  Suppression *s;
+  return suppression_ctx->Match(function_name, flag_name, &s);
+}
diff --git a/compiler-rt/lib/rtsan/rtsan_suppressions.h b/compiler-rt/lib/rtsan/rtsan_suppressions.h
index 45545f8c0e0b..9990b99f3b52 100644
--- a/compiler-rt/lib/rtsan/rtsan_suppressions.h
+++ b/compiler-rt/lib/rtsan/rtsan_suppressions.h
@@ -18,5 +18,6 @@ namespace __rtsan {
 
 void InitializeSuppressions();
 bool IsStackTraceSuppressed(const __sanitizer::StackTrace &stack);
+bool IsFunctionSuppressed(const char *function_name);
 
 } // namespace __rtsan
diff --git a/compiler-rt/test/rtsan/stack_suppressions.cpp b/compiler-rt/test/rtsan/stack_suppressions.cpp
index 2aceedbb313b..b9b2d0957636 100644
--- a/compiler-rt/test/rtsan/stack_suppressions.cpp
+++ b/compiler-rt/test/rtsan/stack_suppressions.cpp
@@ -1,4 +1,5 @@
 // RUN: %clangxx -fsanitize=realtime %s -o %t
+// RUN: %env_rtsan_opts=halt_on_error=false %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-NOSUPPRESSIONS
 // RUN: %env_rtsan_opts=suppressions='%s.supp' not %run %t 2>&1 | FileCheck %s
 // UNSUPPORTED: ios
 
@@ -8,8 +9,11 @@
 #include <stdlib.h>
 #include <unistd.h>
 
+#include <atomic>
 #include <vector>
 
+std::atomic<int> cas_atomic{0};
+
 void *MallocViolation() { return malloc(10); }
 
 void VectorViolations() {
@@ -22,13 +26,18 @@ void VectorViolations() {
   v.reserve(10);
 }
 
-void BlockFunc() [[clang::blocking]] { usleep(1); }
+void BlockFunc() [[clang::blocking]] {
+  int expected = 0;
+  while (!cas_atomic.compare_exchange_weak(expected, 1)) {
+    expected = cas_atomic.load();
+  }
+}
 
 void *process() [[clang::nonblocking]] {
-  void *ptr = MallocViolation();
-  VectorViolations();
-  BlockFunc();
-  free(ptr);
+  void *ptr = MallocViolation(); // Suppressed call-stack-contains
+  VectorViolations();            // Suppressed call-stack-contains with regex
+  BlockFunc();                   // Suppressed function-name-matches
+  free(ptr);                     // Suppressed function-name-matches
 
   // This is the one that should abort the program
   // Everything else is suppressed
@@ -51,3 +60,9 @@ int main() {
 // CHECK-NOT: vector
 // CHECK-NOT: free
 // CHECK-NOT: BlockFunc
+
+// CHECK-NOSUPPRESSIONS: malloc
+// CHECK-NOSUPPRESSIONS: vector
+// CHECK-NOSUPPRESSIONS: free
+// CHECK-NOSUPPRESSIONS: BlockFunc
+// CHECK-NOSUPPRESSIONS: usleep
diff --git a/compiler-rt/test/rtsan/stack_suppressions.cpp.supp b/compiler-rt/test/rtsan/stack_suppressions.cpp.supp
index bec4db259a3e..9aaa5a5f0890 100644
--- a/compiler-rt/test/rtsan/stack_suppressions.cpp.supp
+++ b/compiler-rt/test/rtsan/stack_suppressions.cpp.supp
@@ -1,4 +1,5 @@
 call-stack-contains:MallocViolation
 call-stack-contains:std::*vector
-call-stack-contains:free
-call-stack-contains:BlockFunc
+
+function-name-matches:free
+function-name-matches:Block*
-- 
GitLab


From 1834660b4ca79ae75b2bf9fd157f3ca6957bae0c Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Wed, 16 Oct 2024 16:40:24 -0700
Subject: [PATCH 182/329] [nfc][lsan] Replace `suspended_threads.GetThreadID(i)`
 with local var (#112607)

---
 compiler-rt/lib/lsan/lsan_common.cpp | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp
index 8cdc6d1651f1..329e29477d07 100644
--- a/compiler-rt/lib/lsan/lsan_common.cpp
+++ b/compiler-rt/lib/lsan/lsan_common.cpp
@@ -405,7 +405,7 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads,
   InternalMmapVector<uptr> registers;
   InternalMmapVector<Range> extra_ranges;
   for (uptr i = 0; i < suspended_threads.ThreadCount(); i++) {
-    tid_t os_id = static_cast<tid_t>(suspended_threads.GetThreadID(i));
+    const tid_t os_id = static_cast<tid_t>(suspended_threads.GetThreadID(i));
     LOG_THREADS("Processing thread %llu.\n", os_id);
     uptr stack_begin, stack_end, tls_begin, tls_end, cache_begin, cache_end;
     DTLS *dtls;
@@ -429,9 +429,8 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads,
         continue;
       sp = stack_begin;
     }
-    if (suspended_threads.GetThreadID(i) == caller_tid) {
+    if (os_id == caller_tid)
       sp = caller_sp;
-    }
 
     if (flags()->use_registers && have_registers) {
       uptr registers_begin = reinterpret_cast<uptr>(registers.data());
-- 
GitLab


From 566012a64e8d91dd7abca6aee0814ae293f412d5 Mon Sep 17 00:00:00 2001
From: Elvis Wang
Date: Thu, 17 Oct 2024 07:47:43 +0800
Subject: [PATCH 183/329] [RISCV][TTI] Implement instruction cost for vp_merge.
 (#112327)

This patch implements the instruction cost for `vp_merge`, which
generates an instruction sequence similar to that of the `select`
instruction.
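For reference, here is a sketch of how this cost could be queried through the public TTI interface; the helper function and its surroundings are illustrative only and not part of the patch:
```
#include "llvm/Analysis/TargetTransformInfo.h"
#include "llvm/IR/DerivedTypes.h"
#include "llvm/IR/Intrinsics.h"
using namespace llvm;

InstructionCost vpMergeCost(const TargetTransformInfo &TTI, LLVMContext &Ctx,
                            VectorType *VecTy) {
  // vp.merge takes (mask, on_true, on_false, evl).
  auto *MaskTy =
      VectorType::get(Type::getInt1Ty(Ctx), VecTy->getElementCount());
  IntrinsicCostAttributes ICA(Intrinsic::vp_merge, VecTy,
                              {MaskTy, VecTy, VecTy, Type::getInt32Ty(Ctx)});
  // With this patch, the RISC-V implementation prices this like a vector
  // select over the same result/condition types.
  return TTI.getIntrinsicInstrCost(ICA,
                                   TargetTransformInfo::TCK_RecipThroughput);
}
```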
--- .../Target/RISCV/RISCVTargetTransformInfo.cpp | 6 +- .../Analysis/CostModel/RISCV/rvv-select.ll | 180 ++++++++++++++++++ 2 files changed, 185 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp index cba73abdd150..df5c6b522e67 100644 --- a/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp +++ b/llvm/lib/Target/RISCV/RISCVTargetTransformInfo.cpp @@ -1130,6 +1130,10 @@ RISCVTTIImpl::getIntrinsicInstrCost(const IntrinsicCostAttributes &ICA, return getCmpSelInstrCost(*FOp, ICA.getReturnType(), ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE, CostKind); } + case Intrinsic::vp_merge: + return getCmpSelInstrCost(Instruction::Select, ICA.getReturnType(), + ICA.getArgTypes()[0], CmpInst::BAD_ICMP_PREDICATE, + CostKind); } if (ST->hasVInstructions() && RetTy->isVectorTy()) { @@ -2429,4 +2433,4 @@ bool RISCVTTIImpl::isProfitableToSinkOperands( Ops.push_back(&OpIdx.value()); } return true; -} \ No newline at end of file +} diff --git a/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll b/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll index 2bf1e5d26e2d..ef17d8dc60c1 100644 --- a/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll +++ b/llvm/test/Analysis/CostModel/RISCV/rvv-select.ll @@ -414,4 +414,184 @@ define void @select_of_constants() { ret void } +define void @vp_merge() { +; CHECK-LABEL: 'vp_merge' +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %1 = call <1 x i1> @llvm.vp.merge.v1i1(<1 x i1> undef, <1 x i1> undef, <1 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %2 = call <2 x i1> @llvm.vp.merge.v2i1(<2 x i1> undef, <2 x i1> undef, <2 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %3 = call <4 x i1> @llvm.vp.merge.v4i1(<4 x i1> undef, <4 x i1> undef, <4 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %4 = call <8 x i1> @llvm.vp.merge.v8i1(<8 x i1> undef, <8 x i1> undef, <8 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %5 = call <16 x i1> @llvm.vp.merge.v16i1(<16 x i1> undef, <16 x i1> undef, <16 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 6 for instruction: %6 = call <32 x i1> @llvm.vp.merge.v32i1(<32 x i1> undef, <32 x i1> undef, <32 x i1> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %7 = call @llvm.vp.merge.nxv1i1( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %8 = call @llvm.vp.merge.nxv2i1( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %9 = call @llvm.vp.merge.nxv4i1( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %10 = call @llvm.vp.merge.nxv8i1( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %11 = call @llvm.vp.merge.nxv16i1( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 3 for instruction: %12 = call @llvm.vp.merge.nxv32i1( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %13 = call <1 x i8> @llvm.vp.merge.v1i8(<1 x i1> undef, <1 x i8> undef, <1 x i8> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %14 = call <2 x i8> 
@llvm.vp.merge.v2i8(<2 x i1> undef, <2 x i8> undef, <2 x i8> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %15 = call <4 x i8> @llvm.vp.merge.v4i8(<4 x i1> undef, <4 x i8> undef, <4 x i8> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %16 = call <8 x i8> @llvm.vp.merge.v8i8(<8 x i1> undef, <8 x i8> undef, <8 x i8> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %17 = call <16 x i8> @llvm.vp.merge.v16i8(<16 x i1> undef, <16 x i8> undef, <16 x i8> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %18 = call <32 x i8> @llvm.vp.merge.v32i8(<32 x i1> undef, <32 x i8> undef, <32 x i8> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %19 = call @llvm.vp.merge.nxv1i8( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %20 = call @llvm.vp.merge.nxv2i8( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %21 = call @llvm.vp.merge.nxv4i8( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %22 = call @llvm.vp.merge.nxv8i8( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %23 = call @llvm.vp.merge.nxv16i8( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %24 = call @llvm.vp.merge.nxv32i8( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %25 = call <1 x i16> @llvm.vp.merge.v1i16(<1 x i1> undef, <1 x i16> undef, <1 x i16> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %26 = call <2 x i16> @llvm.vp.merge.v2i16(<2 x i1> undef, <2 x i16> undef, <2 x i16> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %27 = call <4 x i16> @llvm.vp.merge.v4i16(<4 x i1> undef, <4 x i16> undef, <4 x i16> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %28 = call <8 x i16> @llvm.vp.merge.v8i16(<8 x i1> undef, <8 x i16> undef, <8 x i16> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %29 = call <16 x i16> @llvm.vp.merge.v16i16(<16 x i1> undef, <16 x i16> undef, <16 x i16> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %30 = call <32 x i16> @llvm.vp.merge.v32i16(<32 x i1> undef, <32 x i16> undef, <32 x i16> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %31 = call @llvm.vp.merge.nxv1i16( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %32 = call @llvm.vp.merge.nxv2i16( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %33 = call @llvm.vp.merge.nxv4i16( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %34 = call @llvm.vp.merge.nxv8i16( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %35 = call @llvm.vp.merge.nxv16i16( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %36 = call @llvm.vp.merge.nxv32i16( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found 
an estimated cost of 1 for instruction: %37 = call <1 x i32> @llvm.vp.merge.v1i32(<1 x i1> undef, <1 x i32> undef, <1 x i32> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %38 = call <2 x i32> @llvm.vp.merge.v2i32(<2 x i1> undef, <2 x i32> undef, <2 x i32> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %39 = call <4 x i32> @llvm.vp.merge.v4i32(<4 x i1> undef, <4 x i32> undef, <4 x i32> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %40 = call <8 x i32> @llvm.vp.merge.v8i32(<8 x i1> undef, <8 x i32> undef, <8 x i32> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %41 = call <16 x i32> @llvm.vp.merge.v16i32(<16 x i1> undef, <16 x i32> undef, <16 x i32> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %42 = call <32 x i32> @llvm.vp.merge.v32i32(<32 x i1> undef, <32 x i32> undef, <32 x i32> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %43 = call @llvm.vp.merge.nxv1i32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %44 = call @llvm.vp.merge.nxv2i32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %45 = call @llvm.vp.merge.nxv4i32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %46 = call @llvm.vp.merge.nxv8i32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %47 = call @llvm.vp.merge.nxv16i32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %48 = call @llvm.vp.merge.nxv32i32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %49 = call <1 x i64> @llvm.vp.merge.v1i64(<1 x i1> undef, <1 x i64> undef, <1 x i64> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %50 = call <2 x i64> @llvm.vp.merge.v2i64(<2 x i1> undef, <2 x i64> undef, <2 x i64> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %51 = call <4 x i64> @llvm.vp.merge.v4i64(<4 x i1> undef, <4 x i64> undef, <4 x i64> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %52 = call <8 x i64> @llvm.vp.merge.v8i64(<8 x i1> undef, <8 x i64> undef, <8 x i64> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %53 = call <16 x i64> @llvm.vp.merge.v16i64(<16 x i1> undef, <16 x i64> undef, <16 x i64> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %54 = call <32 x i64> @llvm.vp.merge.v32i64(<32 x i1> undef, <32 x i64> undef, <32 x i64> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %55 = call @llvm.vp.merge.nxv1i64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %56 = call @llvm.vp.merge.nxv2i64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %57 = call @llvm.vp.merge.nxv4i64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %58 = call @llvm.vp.merge.nxv8i64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost 
of 16 for instruction: %59 = call @llvm.vp.merge.nxv16i64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %60 = call @llvm.vp.merge.nxv32i64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %61 = call <1 x float> @llvm.vp.merge.v1f32(<1 x i1> undef, <1 x float> undef, <1 x float> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %62 = call <2 x float> @llvm.vp.merge.v2f32(<2 x i1> undef, <2 x float> undef, <2 x float> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %63 = call <4 x float> @llvm.vp.merge.v4f32(<4 x i1> undef, <4 x float> undef, <4 x float> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %64 = call <8 x float> @llvm.vp.merge.v8f32(<8 x i1> undef, <8 x float> undef, <8 x float> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %65 = call <16 x float> @llvm.vp.merge.v16f32(<16 x i1> undef, <16 x float> undef, <16 x float> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %66 = call <32 x float> @llvm.vp.merge.v32f32(<32 x i1> undef, <32 x float> undef, <32 x float> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %67 = call @llvm.vp.merge.nxv1f32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %68 = call @llvm.vp.merge.nxv2f32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %69 = call @llvm.vp.merge.nxv4f32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %70 = call @llvm.vp.merge.nxv8f32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %71 = call @llvm.vp.merge.nxv16f32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %72 = call @llvm.vp.merge.nxv32f32( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %73 = call <1 x double> @llvm.vp.merge.v1f64(<1 x i1> undef, <1 x double> undef, <1 x double> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %74 = call <2 x double> @llvm.vp.merge.v2f64(<2 x i1> undef, <2 x double> undef, <2 x double> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %75 = call <4 x double> @llvm.vp.merge.v4f64(<4 x i1> undef, <4 x double> undef, <4 x double> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %76 = call <8 x double> @llvm.vp.merge.v8f64(<8 x i1> undef, <8 x double> undef, <8 x double> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %77 = call <16 x double> @llvm.vp.merge.v16f64(<16 x i1> undef, <16 x double> undef, <16 x double> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %78 = call <32 x double> @llvm.vp.merge.v32f64(<32 x i1> undef, <32 x double> undef, <32 x double> undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 1 for instruction: %79 = call @llvm.vp.merge.nxv1f64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 2 for instruction: %80 = call @llvm.vp.merge.nxv2f64( undef, 
undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 4 for instruction: %81 = call @llvm.vp.merge.nxv4f64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 8 for instruction: %82 = call @llvm.vp.merge.nxv8f64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 16 for instruction: %83 = call @llvm.vp.merge.nxv16f64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 32 for instruction: %84 = call @llvm.vp.merge.nxv32f64( undef, undef, undef, i32 undef) +; CHECK-NEXT: Cost Model: Found an estimated cost of 0 for instruction: ret void +; + call <1 x i1> @llvm.vp.merge.v1i1(<1 x i1> undef, <1 x i1> undef, <1 x i1> undef, i32 undef) + call <2 x i1> @llvm.vp.merge.v2i1(<2 x i1> undef, <2 x i1> undef, <2 x i1> undef, i32 undef) + call <4 x i1> @llvm.vp.merge.v4i1(<4 x i1> undef, <4 x i1> undef, <4 x i1> undef, i32 undef) + call <8 x i1> @llvm.vp.merge.v8i1(<8 x i1> undef, <8 x i1> undef, <8 x i1> undef, i32 undef) + call <16 x i1> @llvm.vp.merge.v16i1(<16 x i1> undef, <16 x i1> undef, <16 x i1> undef, i32 undef) + call <32 x i1> @llvm.vp.merge.v32i1(<32 x i1> undef, <32 x i1> undef, <32 x i1> undef, i32 undef) + call @llvm.vp.merge.nxv1i1( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv2i1( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv4i1( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv8i1( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv16i1( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv32i1( undef, undef, undef, i32 undef) + + call <1 x i8> @llvm.vp.merge.v1i8(<1 x i1> undef, <1 x i8> undef, <1 x i8> undef, i32 undef) + call <2 x i8> @llvm.vp.merge.v2i8(<2 x i1> undef, <2 x i8> undef, <2 x i8> undef, i32 undef) + call <4 x i8> @llvm.vp.merge.v4i8(<4 x i1> undef, <4 x i8> undef, <4 x i8> undef, i32 undef) + call <8 x i8> @llvm.vp.merge.v8i8(<8 x i1> undef, <8 x i8> undef, <8 x i8> undef, i32 undef) + call <16 x i8> @llvm.vp.merge.v16i8(<16 x i1> undef, <16 x i8> undef, <16 x i8> undef, i32 undef) + call <32 x i8> @llvm.vp.merge.v32i8(<32 x i1> undef, <32 x i8> undef, <32 x i8> undef, i32 undef) + call @llvm.vp.merge.nxv1i8( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv2i8( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv4i8( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv8i8( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv16i8( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv32i8( undef, undef, undef, i32 undef) + + call <1 x i16> @llvm.vp.merge.v1i16(<1 x i1> undef, <1 x i16> undef, <1 x i16> undef, i32 undef) + call <2 x i16> @llvm.vp.merge.v2i16(<2 x i1> undef, <2 x i16> undef, <2 x i16> undef, i32 undef) + call <4 x i16> @llvm.vp.merge.v4i16(<4 x i1> undef, <4 x i16> undef, <4 x i16> undef, i32 undef) + call <8 x i16> @llvm.vp.merge.v8i16(<8 x i1> undef, <8 x i16> undef, <8 x i16> undef, i32 undef) + call <16 x i16> @llvm.vp.merge.v16i16(<16 x i1> undef, <16 x i16> undef, <16 x i16> undef, i32 undef) + call <32 x i16> @llvm.vp.merge.v32i16(<32 x i1> undef, <32 x i16> undef, <32 x i16> undef, i32 undef) + call @llvm.vp.merge.nxv1i16( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv2i16( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv4i16( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv8i16( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv16i16( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv32i16( undef, undef, 
undef, i32 undef) + + call <1 x i32> @llvm.vp.merge.v1i32(<1 x i1> undef, <1 x i32> undef, <1 x i32> undef, i32 undef) + call <2 x i32> @llvm.vp.merge.v2i32(<2 x i1> undef, <2 x i32> undef, <2 x i32> undef, i32 undef) + call <4 x i32> @llvm.vp.merge.v4i32(<4 x i1> undef, <4 x i32> undef, <4 x i32> undef, i32 undef) + call <8 x i32> @llvm.vp.merge.v8i32(<8 x i1> undef, <8 x i32> undef, <8 x i32> undef, i32 undef) + call <16 x i32> @llvm.vp.merge.v16i32(<16 x i1> undef, <16 x i32> undef, <16 x i32> undef, i32 undef) + call <32 x i32> @llvm.vp.merge.v32i32(<32 x i1> undef, <32 x i32> undef, <32 x i32> undef, i32 undef) + call @llvm.vp.merge.nxv1i32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv2i32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv4i32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv8i32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv16i32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv32i32( undef, undef, undef, i32 undef) + call <1 x i64> @llvm.vp.merge.v1i64(<1 x i1> undef, <1 x i64> undef, <1 x i64> undef, i32 undef) + call <2 x i64> @llvm.vp.merge.v2i64(<2 x i1> undef, <2 x i64> undef, <2 x i64> undef, i32 undef) + call <4 x i64> @llvm.vp.merge.v4i64(<4 x i1> undef, <4 x i64> undef, <4 x i64> undef, i32 undef) + call <8 x i64> @llvm.vp.merge.v8i64(<8 x i1> undef, <8 x i64> undef, <8 x i64> undef, i32 undef) + call <16 x i64> @llvm.vp.merge.v16i64(<16 x i1> undef, <16 x i64> undef, <16 x i64> undef, i32 undef) + call <32 x i64> @llvm.vp.merge.v32i64(<32 x i1> undef, <32 x i64> undef, <32 x i64> undef, i32 undef) + call @llvm.vp.merge.nxv1i64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv2i64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv4i64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv8i64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv16i64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv32i64( undef, undef, undef, i32 undef) + + call <1 x float> @llvm.vp.merge.v1f32(<1 x i1> undef, <1 x float> undef, <1 x float> undef, i32 undef) + call <2 x float> @llvm.vp.merge.v2f32(<2 x i1> undef, <2 x float> undef, <2 x float> undef, i32 undef) + call <4 x float> @llvm.vp.merge.v4f32(<4 x i1> undef, <4 x float> undef, <4 x float> undef, i32 undef) + call <8 x float> @llvm.vp.merge.v8f32(<8 x i1> undef, <8 x float> undef, <8 x float> undef, i32 undef) + call <16 x float> @llvm.vp.merge.v16f32(<16 x i1> undef, <16 x float> undef, <16 x float> undef, i32 undef) + call <32 x float> @llvm.vp.merge.v32f32(<32 x i1> undef, <32 x float> undef, <32 x float> undef, i32 undef) + call @llvm.vp.merge.nxv1f32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv2f32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv4f32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv8f32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv16f32( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv32f32( undef, undef, undef, i32 undef) + + call <1 x double> @llvm.vp.merge.v1f64(<1 x i1> undef, <1 x double> undef, <1 x double> undef, i32 undef) + call <2 x double> @llvm.vp.merge.v2f64(<2 x i1> undef, <2 x double> undef, <2 x double> undef, i32 undef) + call <4 x double> @llvm.vp.merge.v4f64(<4 x i1> undef, <4 x double> undef, <4 x double> undef, i32 undef) + call <8 x double> @llvm.vp.merge.v8f64(<8 x i1> undef, <8 x double> undef, <8 x double> undef, i32 undef) + call <16 x double> @llvm.vp.merge.v16f64(<16 x i1> undef, <16 x double> undef, <16 x double> undef, i32 
undef) + call <32 x double> @llvm.vp.merge.v32f64(<32 x i1> undef, <32 x double> undef, <32 x double> undef, i32 undef) + call @llvm.vp.merge.nxv1f64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv2f64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv4f64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv8f64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv16f64( undef, undef, undef, i32 undef) + call @llvm.vp.merge.nxv32f64( undef, undef, undef, i32 undef) + + ret void +} -- GitLab From 0bbdc76c865ad6875a3968c5e66a0dc277c0845a Mon Sep 17 00:00:00 2001 From: Jinsong Ji Date: Wed, 16 Oct 2024 20:02:44 -0400 Subject: [PATCH 184/329] [NVPTX] Allow MemTransferInst in adjustByValArgAlignment (#112462) Before b7b28e770c46, AreSupportedUsers will skip MemTransferInst, it may cause unexpected assertion. https://godbolt.org/z/z5d691fj1 In b7b28e770c46, we start to allow MemTransferInst, we should allow it in adjustByValArgAlignment too. --- llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp | 3 +++ llvm/test/CodeGen/NVPTX/lower-byval-args.ll | 19 ++++++++++++++++++- 2 files changed, 21 insertions(+), 1 deletion(-) diff --git a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp index 3041c16c7a76..bb76cfd6fdb7 100644 --- a/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXLowerArgs.cpp @@ -435,6 +435,9 @@ static void adjustByValArgAlignment(Argument *Arg, Value *ArgInParamAS, continue; } + if (isa(CurUser)) + continue; + // supported for grid_constant if (IsGridConstant && (isa(CurUser) || isa(CurUser) || diff --git a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll index a7dbc4c1620a..013694277039 100644 --- a/llvm/test/CodeGen/NVPTX/lower-byval-args.ll +++ b/llvm/test/CodeGen/NVPTX/lower-byval-args.ll @@ -219,6 +219,22 @@ entry: ret void } +; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) +define dso_local void @memcpy_from_param_noalign (ptr nocapture noundef writeonly %out, ptr nocapture noundef readonly byval(%struct.S) %s) local_unnamed_addr #0 { +; COMMON-LABEL: define dso_local void @memcpy_from_param_noalign( +; COMMON-SAME: ptr nocapture noundef writeonly [[OUT:%.*]], ptr nocapture noundef readonly byval([[STRUCT_S:%.*]]) align 4 [[S:%.*]]) local_unnamed_addr #[[ATTR0]] { +; COMMON-NEXT: [[ENTRY:.*:]] +; COMMON-NEXT: [[S3:%.*]] = addrspacecast ptr [[S]] to ptr addrspace(101) +; COMMON-NEXT: [[OUT1:%.*]] = addrspacecast ptr [[OUT]] to ptr addrspace(1) +; COMMON-NEXT: [[OUT2:%.*]] = addrspacecast ptr addrspace(1) [[OUT1]] to ptr +; COMMON-NEXT: call void @llvm.memcpy.p0.p101.i64(ptr [[OUT2]], ptr addrspace(101) [[S3]], i64 16, i1 true) +; COMMON-NEXT: ret void +; +entry: + tail call void @llvm.memcpy.p0.p0.i64(ptr %out, ptr %s, i64 16, i1 true) + ret void +} + ; Function Attrs: mustprogress nofree norecurse nosync nounwind willreturn memory(argmem: readwrite) define dso_local void @memcpy_to_param(ptr nocapture noundef readonly %in, ptr nocapture noundef readnone byval(%struct.S) align 4 %s) local_unnamed_addr #0 { ; COMMON-LABEL: define dso_local void @memcpy_to_param( @@ -426,7 +442,7 @@ attributes #1 = { nocallback nofree nounwind willreturn memory(argmem: readwrite attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } !llvm.module.flags = !{!0, !1, !2, !3} -!nvvm.annotations = !{!4, !5, !6, !7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19} +!nvvm.annotations = !{!4, !5, !6, 
!7, !8, !9, !10, !11, !12, !13, !14, !15, !16, !17, !18, !19, !23} !llvm.ident = !{!20, !21} !0 = !{i32 2, !"SDK Version", [2 x i32] [i32 11, i32 8]} @@ -451,3 +467,4 @@ attributes #2 = { nocallback nofree nounwind willreturn memory(argmem: write) } !19 = !{ptr @test_select_write, !"kernel", i32 1} !20 = !{!"clang version 20.0.0git"} !21 = !{!"clang version 3.8.0 (tags/RELEASE_380/final)"} +!23 = !{ptr @memcpy_from_param_noalign, !"kernel", i32 1} -- GitLab From ec24e23d8452e29c36518b64851a012e1d71f546 Mon Sep 17 00:00:00 2001 From: Sterling-Augustine <56981066+Sterling-Augustine@users.noreply.github.com> Date: Wed, 16 Oct 2024 17:06:23 -0700 Subject: [PATCH 185/329] [SandboxVectorizer][NFC] Make SeedContainer dump follow preferred approach (#112634) --- .../Vectorize/SandboxVectorizer/SeedCollector.h | 1 + .../Vectorize/SandboxVectorizer/SeedCollector.cpp | 12 +++++++----- 2 files changed, 8 insertions(+), 5 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h index 6bad38b637d2..a4512862136a 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h +++ b/llvm/include/llvm/Transforms/Vectorize/SandboxVectorizer/SeedCollector.h @@ -279,6 +279,7 @@ public: unsigned size() const { return Bundles.size(); } #ifndef NDEBUG + void print(raw_ostream &OS) const; LLVM_DUMP_METHOD void dump() const; #endif // NDEBUG }; diff --git a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp index 20df9e344b61..66fac080a7b7 100644 --- a/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp +++ b/llvm/lib/Transforms/Vectorize/SandboxVectorizer/SeedCollector.cpp @@ -112,21 +112,23 @@ template void SeedContainer::insert(LoadInst *); template void SeedContainer::insert(StoreInst *); #ifndef NDEBUG -void SeedContainer::dump() const { +void SeedContainer::print(raw_ostream &OS) const { for (const auto &Pair : Bundles) { auto [I, Ty, Opc] = Pair.first; const auto &SeedsVec = Pair.second; std::string RefType = dyn_cast(I) ? "Load" : dyn_cast(I) ? "Store" : "Other"; - dbgs() << "[Inst=" << *I << " Ty=" << Ty << " " << RefType << "]\n"; + OS << "[Inst=" << *I << " Ty=" << Ty << " " << RefType << "]\n"; for (const auto &SeedPtr : SeedsVec) { - SeedPtr->dump(dbgs()); - dbgs() << "\n"; + SeedPtr->dump(OS); + OS << "\n"; } } - dbgs() << "\n"; + OS << "\n"; } + +LLVM_DUMP_METHOD void SeedContainer::dump() const { print(dbgs()); } #endif // NDEBUG } // namespace llvm::sandboxir -- GitLab From c5c11f340436a88cfc2165f2dcd64e4d63285068 Mon Sep 17 00:00:00 2001 From: John Harrison Date: Wed, 16 Oct 2024 17:19:51 -0700 Subject: [PATCH 186/329] [lldb-dap] Creating an API for sending dap events from a script in lldb-dap. (#112384) Custom DAP events can be detected using https://code.visualstudio.com/api/references/vscode-api#debug.onDidReceiveDebugSessionCustomEvent. This API allows an lldb python script to send events to the DAP client to allow extensions to handle these custom events. 
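For readers unfamiliar with the "preferred approach" the subject refers to: LLVM's usual debug-printing idiom is a `print(raw_ostream &)` method that writes to a caller-chosen stream, plus a `dump()` wrapper that is only a debugger convenience. A minimal sketch with a hypothetical class:
```
#include "llvm/Support/Compiler.h"
#include "llvm/Support/Debug.h"
#include "llvm/Support/raw_ostream.h"

class Thing {
  int Value = 42;

public:
#ifndef NDEBUG
  // Callers pick the destination stream (errs(), a file, a string...).
  void print(llvm::raw_ostream &OS) const { OS << "Thing(" << Value << ")\n"; }
  // dump() always goes to dbgs() and exists for use from a debugger.
  LLVM_DUMP_METHOD void dump() const { print(llvm::dbgs()); }
#endif
};
```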
--- .../test/tools/lldb-dap/dap_server.py | 2 +- .../API/tools/lldb-dap/send-event/Makefile | 3 + .../lldb-dap/send-event/TestDAP_sendEvent.py | 67 +++++++++++++++++++ .../test/API/tools/lldb-dap/send-event/main.c | 6 ++ lldb/tools/lldb-dap/DAP.cpp | 62 +++++++++++++++++ lldb/tools/lldb-dap/DAP.h | 5 ++ lldb/tools/lldb-dap/README.md | 31 +++++++++ lldb/tools/lldb-dap/lldb-dap.cpp | 2 + 8 files changed, 177 insertions(+), 1 deletion(-) create mode 100644 lldb/test/API/tools/lldb-dap/send-event/Makefile create mode 100644 lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py create mode 100644 lldb/test/API/tools/lldb-dap/send-event/main.c diff --git a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py index 1d5e6e0d75c7..63748a71f112 100644 --- a/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py +++ b/lldb/packages/Python/lldbsuite/test/tools/lldb-dap/dap_server.py @@ -1267,7 +1267,7 @@ def run_vscode(dbg, args, options): def main(): parser = optparse.OptionParser( description=( - "A testing framework for the Visual Studio Code Debug " "Adaptor protocol" + "A testing framework for the Visual Studio Code Debug Adaptor protocol" ) ) diff --git a/lldb/test/API/tools/lldb-dap/send-event/Makefile b/lldb/test/API/tools/lldb-dap/send-event/Makefile new file mode 100644 index 000000000000..10495940055b --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/send-event/Makefile @@ -0,0 +1,3 @@ +C_SOURCES := main.c + +include Makefile.rules diff --git a/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py new file mode 100644 index 000000000000..de47651bb2fa --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/send-event/TestDAP_sendEvent.py @@ -0,0 +1,67 @@ +""" +Test lldb-dap send-event integration. +""" + +import json + +from lldbsuite.test.decorators import * +from lldbsuite.test.lldbtest import * +import lldbdap_testcase + + +class TestDAP_sendEvent(lldbdap_testcase.DAPTestCaseBase): + def test_send_event(self): + """ + Test sending a custom event. + """ + program = self.getBuildArtifact("a.out") + source = "main.c" + custom_event_body = { + "key": 321, + "arr": [True], + } + self.build_and_launch( + program, + stopCommands=[ + "lldb-dap send-event my-custom-event-no-body", + "lldb-dap send-event my-custom-event '{}'".format( + json.dumps(custom_event_body) + ), + ], + ) + + breakpoint_line = line_number(source, "// breakpoint") + + self.set_source_breakpoints(source, [breakpoint_line]) + self.continue_to_next_stop() + + custom_event = self.dap_server.wait_for_event( + filter=["my-custom-event-no-body"] + ) + self.assertEquals(custom_event["event"], "my-custom-event-no-body") + self.assertIsNone(custom_event.get("body", None)) + + custom_event = self.dap_server.wait_for_event(filter=["my-custom-event"]) + self.assertEquals(custom_event["event"], "my-custom-event") + self.assertEquals(custom_event["body"], custom_event_body) + + def test_send_internal_event(self): + """ + Test sending an internal event produces an error. 
+ """ + program = self.getBuildArtifact("a.out") + source = "main.c" + self.build_and_launch(program) + + breakpoint_line = line_number(source, "// breakpoint") + + self.set_source_breakpoints(source, [breakpoint_line]) + self.continue_to_next_stop() + + resp = self.dap_server.request_evaluate( + "`lldb-dap send-event stopped", context="repl" + ) + self.assertRegex( + resp["body"]["result"], + r"Invalid use of lldb-dap send-event, event \"stopped\" should be handled by lldb-dap internally.", + ) diff --git a/lldb/test/API/tools/lldb-dap/send-event/main.c b/lldb/test/API/tools/lldb-dap/send-event/main.c new file mode 100644 index 000000000000..27bc22b94794 --- /dev/null +++ b/lldb/test/API/tools/lldb-dap/send-event/main.c @@ -0,0 +1,6 @@ +#include + +int main(int argc, char const *argv[]) { + printf("example\n"); // breakpoint 1 + return 0; +} diff --git a/lldb/tools/lldb-dap/DAP.cpp b/lldb/tools/lldb-dap/DAP.cpp index 119779d7bfec..68559e382006 100644 --- a/lldb/tools/lldb-dap/DAP.cpp +++ b/lldb/tools/lldb-dap/DAP.cpp @@ -962,6 +962,68 @@ bool ReplModeRequestHandler::DoExecute(lldb::SBDebugger debugger, return true; } +// Sends a DAP event with an optional body. +// +// See +// https://code.visualstudio.com/api/references/vscode-api#debug.onDidReceiveDebugSessionCustomEvent +bool SendEventRequestHandler::DoExecute(lldb::SBDebugger debugger, + char **command, + lldb::SBCommandReturnObject &result) { + // Command format like: `send-event ?` + if (!command || !command[0] || llvm::StringRef(command[0]).empty()) { + result.SetError("Not enough arguments found, expected format " + "`lldb-dap send-event ?`."); + return false; + } + + llvm::StringRef name{command[0]}; + // Events that are stateful and should be handled by lldb-dap internally. + const std::array internal_events{"breakpoint", "capabilities", "continued", + "exited", "initialize", "loadedSource", + "module", "process", "stopped", + "terminated", "thread"}; + if (std::find(internal_events.begin(), internal_events.end(), name) != + std::end(internal_events)) { + std::string msg = + llvm::formatv("Invalid use of lldb-dap send-event, event \"{0}\" " + "should be handled by lldb-dap internally.", + name) + .str(); + result.SetError(msg.c_str()); + return false; + } + + llvm::json::Object event(CreateEventObject(name)); + + if (command[1] && !llvm::StringRef(command[1]).empty()) { + // See if we have unused arguments. 
+  if (command[2]) {
+    result.SetError(
+        "Additional arguments found, expected `lldb-dap send-event "
+        "<name> <body>?`.");
+    return false;
+  }
+
+  llvm::StringRef raw_body{command[1]};
+
+  llvm::Expected<llvm::json::Value> body = llvm::json::parse(raw_body);
+
+  if (!body) {
+    llvm::Error err = body.takeError();
+    std::string msg = "Failed to parse custom event body: " +
+                      llvm::toString(std::move(err));
+    result.SetError(msg.c_str());
+    return false;
+  }
+
+    event.try_emplace("body", std::move(*body));
+  }
+
+  g_dap.SendJSON(llvm::json::Value(std::move(event)));
+  result.SetStatus(lldb::eReturnStatusSuccessFinishNoResult);
+  return true;
+}
+
 void DAP::SetFrameFormat(llvm::StringRef format) {
   if (format.empty())
     return;
diff --git a/lldb/tools/lldb-dap/DAP.h b/lldb/tools/lldb-dap/DAP.h
index ba6d3d80410e..acc10ade75fd 100644
--- a/lldb/tools/lldb-dap/DAP.h
+++ b/lldb/tools/lldb-dap/DAP.h
@@ -144,6 +144,11 @@ struct ReplModeRequestHandler : public lldb::SBCommandPluginInterface {
                  lldb::SBCommandReturnObject &result) override;
 };
 
+struct SendEventRequestHandler : public lldb::SBCommandPluginInterface {
+  bool DoExecute(lldb::SBDebugger debugger, char **command,
+                 lldb::SBCommandReturnObject &result) override;
+};
+
 struct DAP {
   std::string debug_adaptor_path;
   InputStream input;
diff --git a/lldb/tools/lldb-dap/README.md b/lldb/tools/lldb-dap/README.md
index 11086eb222d2..42b5f501e32c 100644
--- a/lldb/tools/lldb-dap/README.md
+++ b/lldb/tools/lldb-dap/README.md
@@ -290,6 +290,37 @@ The initial repl-mode can be configured with the cli flag `--repl-mode=<mode>`
 and may also be adjusted at runtime using the lldb command
 `lldb-dap repl-mode <mode>`.
 
+#### `lldb-dap send-event`
+
+lldb-dap includes a command to trigger a Debug Adapter Protocol event
+from a script.
+
+The event may be a custom DAP event or a standard event, if the event is not
+handled internally by `lldb-dap`.
+
+This command has the format:
+
+```
+lldb-dap send-event <name> <body>?
+```
+
+For example you can use a launch configuration hook to trigger custom events like:
+
+```json
+{
+  "program": "exe",
+  "stopCommands": [
+    "lldb-dap send-event MyStopEvent",
+    "lldb-dap send-event MyStopEvent '{\"key\": 321}'",
+  ]
+}
+```
+
+[See the specification](https://microsoft.github.io/debug-adapter-protocol/specification#Base_Protocol_Event)
+for more details on Debug Adapter Protocol events and the VS Code
+[debug.onDidReceiveDebugSessionCustomEvent](https://code.visualstudio.com/api/references/vscode-api#debug.onDidReceiveDebugSessionCustomEvent)
+API for handling a custom event from an extension.
+
 ## Contributing
 
 `lldb-dap` and `lldb` are developed under the umbrella of the [LLVM project](https://llvm.org/).
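As a usage note (not part of this patch): because `send-event` is registered as a regular command, it can also be driven programmatically. A sketch via the C++ SB API, assuming the debugger instance is the one hosted by `lldb-dap`:
```
#include "lldb/API/SBCommandInterpreter.h"
#include "lldb/API/SBCommandReturnObject.h"
#include "lldb/API/SBDebugger.h"

// Sends a hypothetical custom event to the attached DAP client.
void SendCustomEvent(lldb::SBDebugger &debugger) {
  lldb::SBCommandReturnObject result;
  debugger.GetCommandInterpreter().HandleCommand(
      "lldb-dap send-event my-custom-event '{\"key\": 321}'", result);
}
```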
diff --git a/lldb/tools/lldb-dap/lldb-dap.cpp b/lldb/tools/lldb-dap/lldb-dap.cpp index 5e351ab11ab6..f70b0d3d4cbe 100644 --- a/lldb/tools/lldb-dap/lldb-dap.cpp +++ b/lldb/tools/lldb-dap/lldb-dap.cpp @@ -1896,6 +1896,8 @@ void request_initialize(const llvm::json::Object &request) { cmd.AddCommand( "repl-mode", new ReplModeRequestHandler(), "Get or set the repl behavior of lldb-dap evaluation requests."); + cmd.AddCommand("send-event", new SendEventRequestHandler(), + "Sends an DAP event to the client."); g_dap.progress_event_thread = std::thread(ProgressEventThreadFunction); -- GitLab From 90767bc41bd69fb4b9ac01a8420ef58bbbaeab7c Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Wed, 16 Oct 2024 17:33:12 -0700 Subject: [PATCH 187/329] [lldb] Remove more mentions of ASLLogCallback (#112639) --- lldb/tools/debugserver/source/RNBRemote.cpp | 16 +++------------- lldb/tools/debugserver/source/libdebugserver.cpp | 8 -------- 2 files changed, 3 insertions(+), 21 deletions(-) diff --git a/lldb/tools/debugserver/source/RNBRemote.cpp b/lldb/tools/debugserver/source/RNBRemote.cpp index f22d626c4af2..07211c6e9db4 100644 --- a/lldb/tools/debugserver/source/RNBRemote.cpp +++ b/lldb/tools/debugserver/source/RNBRemote.cpp @@ -176,9 +176,6 @@ void append_hexified_string(std::ostream &ostrm, const std::string &string) { } } -extern void ASLLogCallback(void *baton, uint32_t flags, const char *format, - va_list args); - // from System.framework/Versions/B/PrivateHeaders/sys/codesign.h extern "C" { #define CS_OPS_STATUS 0 /* return status */ @@ -1773,8 +1770,6 @@ static std::string get_value(std::string &line) { extern void FileLogCallback(void *baton, uint32_t flags, const char *format, va_list args); -extern void ASLLogCallback(void *baton, uint32_t flags, const char *format, - va_list args); rnb_err_t RNBRemote::HandlePacket_qRcmd(const char *p) { const char *c = p + strlen("qRcmd,"); @@ -1809,8 +1804,8 @@ rnb_err_t RNBRemote::HandlePacket_qRcmd(const char *p) { static_cast(strtoul(value.c_str(), &end, 0)); if (errno == 0 && end && *end == '\0') { DNBLogSetLogMask(logmask); - if (!DNBLogGetLogCallback()) - DNBLogSetLogCallback(ASLLogCallback, NULL); + if (auto log_callback = OsLogger::GetLogFunction()) + DNBLogSetLogCallback(log_callback, nullptr); return SendPacket("OK"); } errno = 0; @@ -2177,13 +2172,8 @@ rnb_err_t set_logging(const char *p) { // Enable DNB logging. // Use the existing log callback if one was already configured. if (!DNBLogGetLogCallback()) { - // Use the os_log()-based logger if available; otherwise, - // fallback to ASL. - auto log_callback = OsLogger::GetLogFunction(); - if (log_callback) + if (auto log_callback = OsLogger::GetLogFunction()) DNBLogSetLogCallback(log_callback, nullptr); - else - DNBLogSetLogCallback(ASLLogCallback, nullptr); } // Update logging to use the configured log channel bitmask. 
diff --git a/lldb/tools/debugserver/source/libdebugserver.cpp b/lldb/tools/debugserver/source/libdebugserver.cpp index 6da3708b4240..17a5c137c1af 100644 --- a/lldb/tools/debugserver/source/libdebugserver.cpp +++ b/lldb/tools/debugserver/source/libdebugserver.cpp @@ -311,13 +311,6 @@ RNBRunLoopMode RNBRunLoopInferiorExecuting(RNBRemoteSP &remote) { return mode; } -void ASLLogCallback(void *baton, uint32_t flags, const char *format, - va_list args) { -#if 0 - vprintf(format, args); -#endif -} - extern "C" int debug_server_main(int fd) { #if 1 g_isatty = 0; @@ -327,7 +320,6 @@ extern "C" int debug_server_main(int fd) { DNBLogSetDebug(1); DNBLogSetVerbose(1); DNBLogSetLogMask(-1); - DNBLogSetLogCallback(ASLLogCallback, NULL); #endif signal(SIGPIPE, signal_handler); -- GitLab From f5aee1f18bdbc5694330a5e86eb46cf60e653d0c Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Thu, 17 Oct 2024 09:08:24 +0800 Subject: [PATCH 188/329] [mlir][memref] Fix type conversion in emulate-wide-int and emulate-narrow-type (#112214) This PR follows with #112104, using `nullptr` to indicate that type conversion failed and no fallback conversion should be attempted. --- .../Arith/Transforms/EmulateNarrowType.cpp | 4 +-- .../MemRef/Transforms/EmulateNarrowType.cpp | 17 ++++----- .../MemRef/Transforms/EmulateWideInt.cpp | 2 +- .../Dialect/MemRef/emulate-narrow-type.mlir | 21 +++++------ .../test/Dialect/MemRef/emulate-wide-int.mlir | 35 ++++++++++++++++++- 5 files changed, 57 insertions(+), 22 deletions(-) diff --git a/mlir/lib/Dialect/Arith/Transforms/EmulateNarrowType.cpp b/mlir/lib/Dialect/Arith/Transforms/EmulateNarrowType.cpp index 4be0e06fe2a5..fddd7c51bfbc 100644 --- a/mlir/lib/Dialect/Arith/Transforms/EmulateNarrowType.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/EmulateNarrowType.cpp @@ -40,11 +40,11 @@ arith::NarrowTypeEmulationConverter::NarrowTypeEmulationConverter( addConversion([this](FunctionType ty) -> std::optional { SmallVector inputs; if (failed(convertTypes(ty.getInputs(), inputs))) - return std::nullopt; + return nullptr; SmallVector results; if (failed(convertTypes(ty.getResults(), results))) - return std::nullopt; + return nullptr; return FunctionType::get(ty.getContext(), inputs, results); }); diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp index 9efea066a03c..28f9061d9873 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateNarrowType.cpp @@ -169,8 +169,9 @@ struct ConvertMemRefAllocation final : OpConversionPattern { std::is_same(), "expected only memref::AllocOp or memref::AllocaOp"); auto currentType = cast(op.getMemref().getType()); - auto newResultType = dyn_cast( - this->getTypeConverter()->convertType(op.getType())); + auto newResultType = + this->getTypeConverter()->template convertType( + op.getType()); if (!newResultType) { return rewriter.notifyMatchFailure( op->getLoc(), @@ -378,7 +379,7 @@ struct ConvertMemRefReinterpretCast final matchAndRewrite(memref::ReinterpretCastOp op, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const override { MemRefType newTy = - dyn_cast(getTypeConverter()->convertType(op.getType())); + getTypeConverter()->convertType(op.getType()); if (!newTy) { return rewriter.notifyMatchFailure( op->getLoc(), @@ -466,8 +467,8 @@ struct ConvertMemRefSubview final : OpConversionPattern { LogicalResult matchAndRewrite(memref::SubViewOp subViewOp, OpAdaptor adaptor, ConversionPatternRewriter &rewriter) const 
override { - MemRefType newTy = dyn_cast( - getTypeConverter()->convertType(subViewOp.getType())); + MemRefType newTy = + getTypeConverter()->convertType(subViewOp.getType()); if (!newTy) { return rewriter.notifyMatchFailure( subViewOp->getLoc(), @@ -632,14 +633,14 @@ void memref::populateMemRefNarrowTypeEmulationConversions( SmallVector strides; int64_t offset; if (failed(getStridesAndOffset(ty, strides, offset))) - return std::nullopt; + return nullptr; if (!strides.empty() && strides.back() != 1) - return std::nullopt; + return nullptr; auto newElemTy = IntegerType::get(ty.getContext(), loadStoreWidth, intTy.getSignedness()); if (!newElemTy) - return std::nullopt; + return nullptr; StridedLayoutAttr layoutAttr; // If the offset is 0, we do not need a strided layout as the stride is diff --git a/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp b/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp index bc4535f97acf..49b71625291d 100644 --- a/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp +++ b/mlir/lib/Dialect/MemRef/Transforms/EmulateWideInt.cpp @@ -159,7 +159,7 @@ void memref::populateMemRefWideIntEmulationConversions( Type newElemTy = typeConverter.convertType(intTy); if (!newElemTy) - return std::nullopt; + return nullptr; return ty.cloneWith(std::nullopt, newElemTy); }); diff --git a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir index 540da239fced..1d6cbfa343ba 100644 --- a/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir +++ b/mlir/test/Dialect/MemRef/emulate-narrow-type.mlir @@ -203,7 +203,6 @@ func.func @memref_subview_dynamic_offset_i4(%idx : index) -> i4 { // ----- - func.func @negative_memref_subview_non_contiguous(%idx : index) -> i4 { %c0 = arith.constant 0 : index %arr = memref.alloc() : memref<40x40xi4> @@ -543,13 +542,15 @@ func.func @memref_copy_i4(%arg0: memref<32x128xi4, 1>, %arg1: memref<32x128xi4>) // ----- -!colMajor = memref<8x8xi4, strided<[1, 8]>> -func.func @copy_distinct_layouts(%idx : index) -> i4 { - %c0 = arith.constant 0 : index - %arr = memref.alloc() : memref<8x8xi4> - %arr2 = memref.alloc() : !colMajor - // expected-error @+1 {{failed to legalize operation 'memref.copy' that was explicitly marked illegal}} - memref.copy %arr, %arr2 : memref<8x8xi4> to !colMajor - %ld = memref.load %arr2[%c0, %c0] : !colMajor - return %ld : i4 +func.func @alloc_non_contiguous() { + // expected-error @+1 {{failed to legalize operation 'memref.alloc' that was explicitly marked illegal}} + %arr = memref.alloc() : memref<8x8xi4, strided<[1, 8]>> + return +} + +// ----- + +// expected-error @+1 {{failed to legalize operation 'func.func' that was explicitly marked illegal}} +func.func @argument_non_contiguous(%arg0 : memref<8x8xi4, strided<[1, 8]>>) { + return } diff --git a/mlir/test/Dialect/MemRef/emulate-wide-int.mlir b/mlir/test/Dialect/MemRef/emulate-wide-int.mlir index 65ac5beed0a1..994e400bd73c 100644 --- a/mlir/test/Dialect/MemRef/emulate-wide-int.mlir +++ b/mlir/test/Dialect/MemRef/emulate-wide-int.mlir @@ -1,4 +1,5 @@ -// RUN: mlir-opt --memref-emulate-wide-int="widest-int-supported=32" %s | FileCheck %s +// RUN: mlir-opt --memref-emulate-wide-int="widest-int-supported=32" %s \ +// RUN: --split-input-file --verify-diagnostics | FileCheck %s // Expect no conversions, i32 is supported. // CHECK-LABEL: func @memref_i32 @@ -15,6 +16,8 @@ func.func @memref_i32() { return } +// ----- + // Expect no conversions, f64 is not an integer type. 
// CHECK-LABEL: func @memref_f32 // CHECK: [[M:%.+]] = memref.alloc() : memref<4xf32, 1> @@ -30,6 +33,8 @@ func.func @memref_f32() { return } +// ----- + // CHECK-LABEL: func @alloc_load_store_i64 // CHECK: [[C1:%.+]] = arith.constant dense<[1, 0]> : vector<2xi32> // CHECK-NEXT: [[M:%.+]] = memref.alloc() : memref<4xvector<2xi32>, 1> @@ -45,6 +50,7 @@ func.func @alloc_load_store_i64() { return } +// ----- // CHECK-LABEL: func @alloc_load_store_i64_nontemporal // CHECK: [[C1:%.+]] = arith.constant dense<[1, 0]> : vector<2xi32> @@ -60,3 +66,30 @@ func.func @alloc_load_store_i64_nontemporal() { memref.store %c1, %m[%c0] {nontemporal = true} : memref<4xi64, 1> return } + +// ----- + +// Make sure we do not crash on unsupported types. +func.func @alloc_i128() { + // expected-error@+1 {{failed to legalize operation 'memref.alloc' that was explicitly marked illegal}} + %m = memref.alloc() : memref<4xi128, 1> + return +} + +// ----- + +func.func @load_i128(%m: memref<4xi128, 1>) { + %c0 = arith.constant 0 : index + // expected-error@+1 {{failed to legalize operation 'memref.load' that was explicitly marked illegal}} + %v = memref.load %m[%c0] : memref<4xi128, 1> + return +} + +// ----- + +func.func @store_i128(%c1: i128, %m: memref<4xi128, 1>) { + %c0 = arith.constant 0 : index + // expected-error@+1 {{failed to legalize operation 'memref.store' that was explicitly marked illegal}} + memref.store %c1, %m[%c0] : memref<4xi128, 1> + return +} -- GitLab From 9930a5a3338ba642c52292107e0f729328d79034 Mon Sep 17 00:00:00 2001 From: Longsheng Mou Date: Thu, 17 Oct 2024 09:08:51 +0800 Subject: [PATCH 189/329] [mlir][tosa] Update document of `tosa.rescale`(NFC) (#112531) This PR formats the `supported rescalings` using a table. The previous structure was disorganized, as seen in the documentation: https://mlir.llvm.org/docs/Dialects/TOSA/#tosarescale-mlirtosarescaleop. --- mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td | 32 +++++++++++--------- 1 file changed, 17 insertions(+), 15 deletions(-) diff --git a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td index 07402c8695b3..3bb5ceb0f469 100644 --- a/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td +++ b/mlir/include/mlir/Dialect/Tosa/IR/TosaOps.td @@ -1877,21 +1877,23 @@ def Tosa_RescaleOp: Tosa_Op<"rescale", [Pure, let description = [{ Rescale quantized values into a new domain. 
Supported rescalings are: - Mode Input Output - signed 8 to 8 int8 int8 - signed 8 to 16 int8 int16 - signed 8 to 32 int8 int32 - signed 16 to 8 int16 int8 - signed 16 to 16 int16 int16 - signed 16 to 32 int16 int32 - signed 32 to 8 int32 int8 - signed 32 to 16 int32 int16 - signed 32 to 32 int32 int32 - signed 48 to 8 int48 int8 - signed 48 to 16 int48 int16 - signed 48 to 32 int48 int32 - unsigned 8 to signed 8 uint8 int8 - signed 8 to unsigned 8 int8 uint8 + + | Mode | Input | Output | + |------------------------|-------|--------| + | signed 8 to 8 | int8 | int8 | + | signed 8 to 16 | int8 | int16 | + | signed 8 to 32 | int8 | int32 | + | signed 16 to 8 | int16 | int8 | + | signed 16 to 16 | int16 | int16 | + | signed 16 to 32 | int16 | int32 | + | signed 32 to 8 | int32 | int8 | + | signed 32 to 16 | int32 | int16 | + | signed 32 to 32 | int32 | int32 | + | signed 48 to 8 | int48 | int8 | + | signed 48 to 16 | int48 | int16 | + | signed 48 to 32 | int48 | int32 | + | unsigned 8 to signed 8 | uint8 | int8 | + | signed 8 to unsigned 8 | int8 | uint8 | }]; let arguments = (ins -- GitLab From 4c98a71993ddba09ab6e81c905d2a1cc08d8d76e Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Oct 2024 18:29:15 -0700 Subject: [PATCH 190/329] [nfc][sanitizer] Unmap memory in test (#112644) --- compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp | 2 ++ 1 file changed, 2 insertions(+) diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp index be577c336404..bed19d15a8ec 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp @@ -78,6 +78,8 @@ TEST(SanitizerCommon, IsAccessibleMemoryRange) { EXPECT_TRUE(IsAccessibleMemoryRange(mem + 2 * page_size, page_size)); EXPECT_FALSE(IsAccessibleMemoryRange(mem, 3 * page_size)); EXPECT_FALSE(IsAccessibleMemoryRange(0x0, 2)); + + munmap((void *)mem, 3 * page_size); } } // namespace __sanitizer -- GitLab From dd9a34fd7e6cb190d44d310a610e9f959e2e599f Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Oct 2024 18:30:20 -0700 Subject: [PATCH 191/329] [nfc][lsan] Move up vectors cleanup (#112608) --- compiler-rt/lib/lsan/lsan_common.cpp | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp index 329e29477d07..6510e0ac3bf6 100644 --- a/compiler-rt/lib/lsan/lsan_common.cpp +++ b/compiler-rt/lib/lsan/lsan_common.cpp @@ -405,6 +405,8 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, InternalMmapVector registers; InternalMmapVector extra_ranges; for (uptr i = 0; i < suspended_threads.ThreadCount(); i++) { + registers.clear(); + extra_ranges.clear(); const tid_t os_id = static_cast(suspended_threads.GetThreadID(i)); LOG_THREADS("Processing thread %llu.\n", os_id); uptr stack_begin, stack_end, tls_begin, tls_end, cache_begin, cache_end; @@ -463,7 +465,6 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, } ScanRangeForPointers(stack_begin, stack_end, frontier, "STACK", kReachable); - extra_ranges.clear(); GetThreadExtraStackRangesLocked(os_id, &extra_ranges); ScanExtraStackRanges(extra_ranges, frontier); } -- GitLab From 6ffd3bbcd7240f2a23cec99c11b7298cc28f54c5 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Wed, 16 Oct 2024 18:32:14 -0700 Subject: [PATCH 192/329] [nfc][lsan] Restructure loop in ProcessThreads (#112609) The goal is to 
move `SuspendedThreadsList` related code into the beginning of the loop, and prepare for extraction the rest of the loop body into a function. --- compiler-rt/lib/lsan/lsan_common.cpp | 28 +++++++++++++++++----------- 1 file changed, 17 insertions(+), 11 deletions(-) diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp index 6510e0ac3bf6..a1a15bf98a11 100644 --- a/compiler-rt/lib/lsan/lsan_common.cpp +++ b/compiler-rt/lib/lsan/lsan_common.cpp @@ -407,7 +407,20 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, for (uptr i = 0; i < suspended_threads.ThreadCount(); i++) { registers.clear(); extra_ranges.clear(); + const tid_t os_id = static_cast(suspended_threads.GetThreadID(i)); + uptr sp = 0; + PtraceRegistersStatus have_registers = + suspended_threads.GetRegistersAndSP(i, ®isters, &sp); + if (have_registers != REGISTERS_AVAILABLE) { + Report("Unable to get registers from thread %llu.\n", os_id); + // If unable to get SP, consider the entire stack to be reachable unless + // GetRegistersAndSP failed with ESRCH. + if (have_registers == REGISTERS_UNAVAILABLE_FATAL) + continue; + sp = 0; + } + LOG_THREADS("Processing thread %llu.\n", os_id); uptr stack_begin, stack_end, tls_begin, tls_end, cache_begin, cache_end; DTLS *dtls; @@ -420,20 +433,13 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, LOG_THREADS("Thread %llu not found in registry.\n", os_id); continue; } - uptr sp; - PtraceRegistersStatus have_registers = - suspended_threads.GetRegistersAndSP(i, ®isters, &sp); - if (have_registers != REGISTERS_AVAILABLE) { - Report("Unable to get registers from thread %llu.\n", os_id); - // If unable to get SP, consider the entire stack to be reachable unless - // GetRegistersAndSP failed with ESRCH. - if (have_registers == REGISTERS_UNAVAILABLE_FATAL) - continue; - sp = stack_begin; - } + if (os_id == caller_tid) sp = caller_sp; + if (!sp) + sp = stack_begin; + if (flags()->use_registers && have_registers) { uptr registers_begin = reinterpret_cast(registers.data()); uptr registers_end = -- GitLab From 5e9166e02ab65d42efba014f2adc59c42b097ddc Mon Sep 17 00:00:00 2001 From: Jim Lin Date: Wed, 16 Oct 2024 15:38:06 +0800 Subject: [PATCH 193/329] [SLP] Remove TTI parameter from vectorizeHorReduction and vectorizeRootInstruction. NFC. Since TTI is a member variable. --- .../llvm/Transforms/Vectorize/SLPVectorizer.h | 4 +--- .../Transforms/Vectorize/SLPVectorizer.cpp | 21 +++++++++---------- 2 files changed, 11 insertions(+), 14 deletions(-) diff --git a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h index 95531544a1c8..877c83291170 100644 --- a/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h +++ b/llvm/include/llvm/Transforms/Vectorize/SLPVectorizer.h @@ -122,15 +122,13 @@ private: /// or a horizontal reduction was not matched or not possible. bool vectorizeHorReduction(PHINode *P, Instruction *Root, BasicBlock *BB, slpvectorizer::BoUpSLP &R, - TargetTransformInfo *TTI, SmallVectorImpl &PostponedInsts); /// Make an attempt to vectorize reduction and then try to vectorize /// postponed binary operations. /// \returns true on any successfull vectorization. bool vectorizeRootInstruction(PHINode *P, Instruction *Root, BasicBlock *BB, - slpvectorizer::BoUpSLP &R, - TargetTransformInfo *TTI); + slpvectorizer::BoUpSLP &R); /// Try to vectorize trees that start at insertvalue instructions. 
bool vectorizeInsertValueInst(InsertValueInst *IVI, BasicBlock *BB, diff --git a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp index 336126cc1fbc..ba70ab1e5e14 100644 --- a/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp +++ b/llvm/lib/Transforms/Vectorize/SLPVectorizer.cpp @@ -19949,7 +19949,7 @@ static bool isReductionCandidate(Instruction *I) { } bool SLPVectorizerPass::vectorizeHorReduction( - PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, TargetTransformInfo *TTI, + PHINode *P, Instruction *Root, BasicBlock *BB, BoUpSLP &R, SmallVectorImpl &PostponedInsts) { if (!ShouldVectorizeHor) return false; @@ -19982,7 +19982,7 @@ bool SLPVectorizerPass::vectorizeHorReduction( Stack.emplace(SelectRoot(), 0); SmallPtrSet VisitedInstrs; bool Res = false; - auto &&TryToReduce = [this, TTI, &R](Instruction *Inst) -> Value * { + auto &&TryToReduce = [this, &R](Instruction *Inst) -> Value * { if (R.isAnalyzedReductionRoot(Inst)) return nullptr; if (!isReductionCandidate(Inst)) @@ -20049,10 +20049,9 @@ bool SLPVectorizerPass::vectorizeHorReduction( } bool SLPVectorizerPass::vectorizeRootInstruction(PHINode *P, Instruction *Root, - BasicBlock *BB, BoUpSLP &R, - TargetTransformInfo *TTI) { + BasicBlock *BB, BoUpSLP &R) { SmallVector PostponedInsts; - bool Res = vectorizeHorReduction(P, Root, BB, R, TTI, PostponedInsts); + bool Res = vectorizeHorReduction(P, Root, BB, R, PostponedInsts); Res |= tryToVectorize(PostponedInsts, R); return Res; } @@ -20317,7 +20316,7 @@ bool SLPVectorizerPass::vectorizeCmpInsts(iterator_range CmpInsts, continue; for (Value *Op : I->operands()) if (auto *RootOp = dyn_cast(Op)) - Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R, TTI); + Changed |= vectorizeRootInstruction(nullptr, RootOp, BB, R); } // Try to vectorize operands as vector bundles. for (CmpInst *I : CmpInsts) { @@ -20384,7 +20383,7 @@ bool SLPVectorizerPass::vectorizeInserts(InstSetVector &Instructions, // pass2 - try to vectorize reductions only if (R.isDeleted(I)) continue; - OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, TTI, PostponedInsts); + OpsChanged |= vectorizeHorReduction(nullptr, I, BB, R, PostponedInsts); if (R.isDeleted(I) || isa(I)) continue; // pass3 - try to match and vectorize a buildvector sequence. @@ -20644,7 +20643,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (P->getNumIncomingValues() == 2) { // Try to match and vectorize a horizontal reduction. Instruction *Root = getReductionInstr(DT, P, BB, LI); - if (Root && vectorizeRootInstruction(P, Root, BB, R, TTI)) { + if (Root && vectorizeRootInstruction(P, Root, BB, R)) { Changed = true; It = BB->begin(); E = BB->end(); @@ -20666,8 +20665,8 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { // vectorization. if (auto *PI = dyn_cast(P->getIncomingValue(I)); PI && !IsInPostProcessInstrs(PI)) { - bool Res = vectorizeRootInstruction(nullptr, PI, - P->getIncomingBlock(I), R, TTI); + bool Res = + vectorizeRootInstruction(nullptr, PI, P->getIncomingBlock(I), R); Changed |= Res; if (Res && R.isDeleted(P)) { It = BB->begin(); @@ -20701,7 +20700,7 @@ bool SLPVectorizerPass::vectorizeChainsInBlock(BasicBlock *BB, BoUpSLP &R) { if (auto *VI = dyn_cast(V); VI && !IsInPostProcessInstrs(VI)) // Try to match and vectorize a horizontal reduction. 
-          OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R, TTI);
+          OpsChanged |= vectorizeRootInstruction(nullptr, VI, BB, R);
       }
     }
     // Start vectorization of post-process list of instructions from the
-- 
GitLab

From d54953ef472bfd8d4b503aae7682aa76c49f8cc0 Mon Sep 17 00:00:00 2001
From: Wu Yingcong
Date: Thu, 17 Oct 2024 10:17:16 +0800
Subject: [PATCH 194/329] [fuzzer] fix clang-cl build fuzzer lit test failure
 (#112339)

`check-fuzzer` runs fine with a cl-built LLVM, but the following lit
tests fail with a clang-cl-built LLVM:

********************
Timed Out Tests (2):
  libFuzzer-x86_64-default-Windows :: fork-ubsan.test
  libFuzzer-x86_64-default-Windows :: fuzzer-oom.test

********************
Failed Tests (22):
  libFuzzer-x86_64-default-Windows :: acquire-crash-state.test
  libFuzzer-x86_64-default-Windows :: cross_over_copy.test
  libFuzzer-x86_64-default-Windows :: cross_over_insert.test
  libFuzzer-x86_64-default-Windows :: exit_on_src_pos.test
  libFuzzer-x86_64-default-Windows :: fuzzer-alignment-assumption.test
  libFuzzer-x86_64-default-Windows :: fuzzer-implicit-integer-sign-change.test
  libFuzzer-x86_64-default-Windows :: fuzzer-implicit-signed-integer-truncation-or-sign-change.test
  libFuzzer-x86_64-default-Windows :: fuzzer-implicit-signed-integer-truncation.test
  libFuzzer-x86_64-default-Windows :: fuzzer-implicit-unsigned-integer-truncation.test
  libFuzzer-x86_64-default-Windows :: fuzzer-printcovpcs.test
  libFuzzer-x86_64-default-Windows :: fuzzer-timeout.test
  libFuzzer-x86_64-default-Windows :: fuzzer-ubsan.test
  libFuzzer-x86_64-default-Windows :: minimize_crash.test
  libFuzzer-x86_64-default-Windows :: minimize_two_crashes.test
  libFuzzer-x86_64-default-Windows :: null-deref-on-empty.test
  libFuzzer-x86_64-default-Windows :: null-deref.test
  libFuzzer-x86_64-default-Windows :: print-func.test
  libFuzzer-x86_64-default-Windows :: stack-overflow-with-asan.test
  libFuzzer-x86_64-default-Windows :: trace-malloc-2.test
  libFuzzer-x86_64-default-Windows :: trace-malloc-unbalanced.test
  libFuzzer-x86_64-default-Windows :: trace-malloc.test

The related commits are
https://github.com/llvm/llvm-project/commit/53a81d4d26f0409de8a0655d7af90f2bea222a12
and
https://github.com/llvm/llvm-project/commit/e31efd8f6fbc27000a4933f889e0deb922411006.
Following the change in
https://github.com/llvm/llvm-project/commit/e31efd8f6fbc27000a4933f889e0deb922411006
fixes these failures.

As for the issue mentioned in the comment that alternatename support in
clang is not good enough (https://bugs.llvm.org/show_bug.cgi?id=40218):
I found that using `__builtin_function_start(func)` instead of using
`func` directly makes it work as intended.
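To make the linker-level mechanics above concrete, here is a small self-contained
sketch of the `/alternatename` pattern combined with `__builtin_function_start`.
It is not taken from the patch: `MyCallback`/`MyCallbackDef` are invented names,
and the example assumes clang-cl on x64 (32-bit x86 would need a `_` symbol
prefix in the pragma):

```cpp
// example.cpp -- build with: clang-cl example.cpp
#include <cstdio>

extern "C" {
// Default used when no real MyCallback definition is linked in.
void MyCallbackDef() { std::puts("default callback"); }
// Ask the linker to resolve MyCallback to MyCallbackDef if it is undefined.
#pragma comment(linker, "/alternatename:MyCallback=MyCallbackDef")
void MyCallback();
}

int main() {
  // Compare function-body addresses. __builtin_function_start avoids the
  // pitfalls of comparing &fn directly under clang-cl, where the address
  // may refer to a thunk rather than the function itself.
  void *Fn = __builtin_function_start(MyCallback);
  void *Def = __builtin_function_start(MyCallbackDef);
  std::printf("MyCallback is %s\n",
              Fn == Def ? "the default (not user-defined)" : "user-defined");
  return 0;
}
```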
--- .../lib/fuzzer/FuzzerExtFunctionsWindows.cpp | 28 +++++++++---------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/compiler-rt/lib/fuzzer/FuzzerExtFunctionsWindows.cpp b/compiler-rt/lib/fuzzer/FuzzerExtFunctionsWindows.cpp index 688bad1d51ca..dfc32ac9db29 100644 --- a/compiler-rt/lib/fuzzer/FuzzerExtFunctionsWindows.cpp +++ b/compiler-rt/lib/fuzzer/FuzzerExtFunctionsWindows.cpp @@ -22,6 +22,11 @@ using namespace fuzzer; #define STRINGIFY(A) STRINGIFY_(A) #if LIBFUZZER_MSVC +#define GET_FUNCTION_ADDRESS(fn) &fn +#else +#define GET_FUNCTION_ADDRESS(fn) __builtin_function_start(fn) +#endif // LIBFUZER_MSVC + // Copied from compiler-rt/lib/sanitizer_common/sanitizer_win_defs.h #if defined(_M_IX86) || defined(__i386__) #define WIN_SYM_PREFIX "_" @@ -31,17 +36,9 @@ using namespace fuzzer; // Declare external functions as having alternativenames, so that we can // determine if they are not defined. -#define EXTERNAL_FUNC(Name, Default) \ - __pragma(comment(linker, "/alternatename:" WIN_SYM_PREFIX STRINGIFY( \ +#define EXTERNAL_FUNC(Name, Default) \ + __pragma(comment(linker, "/alternatename:" WIN_SYM_PREFIX STRINGIFY( \ Name) "=" WIN_SYM_PREFIX STRINGIFY(Default))) -#else -// Declare external functions as weak to allow them to default to a specified -// function if not defined explicitly. We must use weak symbols because clang's -// support for alternatename is not 100%, see -// https://bugs.llvm.org/show_bug.cgi?id=40218 for more details. -#define EXTERNAL_FUNC(Name, Default) \ - __attribute__((weak, alias(STRINGIFY(Default)))) -#endif // LIBFUZZER_MSVC extern "C" { #define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN) \ @@ -57,20 +54,23 @@ extern "C" { } template -static T *GetFnPtr(T *Fun, T *FunDef, const char *FnName, bool WarnIfMissing) { +static T *GetFnPtr(void *Fun, void *FunDef, const char *FnName, + bool WarnIfMissing) { if (Fun == FunDef) { if (WarnIfMissing) Printf("WARNING: Failed to find function \"%s\".\n", FnName); return nullptr; } - return Fun; + return (T *)Fun; } namespace fuzzer { ExternalFunctions::ExternalFunctions() { -#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN) \ - this->NAME = GetFnPtr(::NAME, ::NAME##Def, #NAME, WARN); +#define EXT_FUNC(NAME, RETURN_TYPE, FUNC_SIG, WARN) \ + this->NAME = GetFnPtr(GET_FUNCTION_ADDRESS(::NAME), \ + GET_FUNCTION_ADDRESS(::NAME##Def), \ + #NAME, WARN); #include "FuzzerExtFunctions.def" -- GitLab From 1b4a173fa41e02eddec9f1cf41324aa4ea8a7fa5 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 17 Oct 2024 04:15:46 +0100 Subject: [PATCH 195/329] [LV] Remove unneeded LoopScalarBody member variable. (NFC) --- llvm/lib/Transforms/Vectorize/LoopVectorize.cpp | 15 +++++---------- 1 file changed, 5 insertions(+), 10 deletions(-) diff --git a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp index 8bf92f348062..3e8bc1451f62 100644 --- a/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp +++ b/llvm/lib/Transforms/Vectorize/LoopVectorize.cpp @@ -638,9 +638,6 @@ protected: /// there can be multiple exiting edges reaching this block. BasicBlock *LoopExitBlock; - /// The scalar loop body. - BasicBlock *LoopScalarBody; - /// A list of all bypass blocks. The first block is the entry of the loop. 
  SmallVector<BasicBlock *, 4> LoopBypassBlocks;
 
@@ -2530,7 +2527,6 @@ BasicBlock *InnerLoopVectorizer::emitMemRuntimeChecks(BasicBlock *Bypass) {
 }
 
 void InnerLoopVectorizer::createVectorLoopSkeleton(StringRef Prefix) {
-  LoopScalarBody = OrigLoop->getHeader();
   LoopVectorPreHeader = OrigLoop->getLoopPreheader();
   assert(LoopVectorPreHeader && "Invalid loop structure");
   LoopExitBlock = OrigLoop->getUniqueExitBlock(); // may be nullptr
@@ -2944,7 +2940,7 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
 
   // Set/update profile weights for the vector and remainder loops as original
   // loop iterations are now distributed among them. Note that original loop
-  // represented by LoopScalarBody becomes remainder loop after vectorization.
+  // becomes the scalar remainder loop after vectorization.
   //
   // For cases like foldTailByMasking() and requiresScalarEpiloque() we may
   // end up getting slightly roughened result but that should be OK since
@@ -2952,12 +2948,11 @@ void InnerLoopVectorizer::fixVectorizedLoop(VPTransformState &State,
   // vector code caused by legality checks is ignored, assigning all the weight
   // to the vector loop, optimistically.
   //
-  // For scalable vectorization we can't know at compile time how many iterations
-  // of the loop are handled in one vector iteration, so instead assume a pessimistic
-  // vscale of '1'.
-  Loop *ScalarLoop = LI->getLoopFor(LoopScalarBody);
+  // For scalable vectorization we can't know at compile time how many
+  // iterations of the loop are handled in one vector iteration, so instead
+  // assume a pessimistic vscale of '1'.
   Loop *VectorLoop = LI->getLoopFor(HeaderBB);
-  setProfileInfoAfterUnrolling(ScalarLoop, VectorLoop, ScalarLoop,
+  setProfileInfoAfterUnrolling(OrigLoop, VectorLoop, OrigLoop,
                                VF.getKnownMinValue() * UF);
 }
 
-- 
GitLab

From 4512bbe7467c1c0f884304e5654d1070df58d6f8 Mon Sep 17 00:00:00 2001
From: Helena Kotas
Date: Wed, 16 Oct 2024 21:24:13 -0700
Subject: [PATCH 196/329] [HLSL] Collect explicit resource binding information
 (#111203)

Scans each global variable declaration and its members and collects all
required resource bindings in a new `SemaHLSL` data member `Bindings`.

New fields are added to `HLSLResourceBindingAttr` for storing processed
binding information so that it can be used by CodeGen (`Bindings` or any
other Sema information is not accessible from CodeGen).
Adjusts the existing register binding attribute handling and diagnostics to:
- not create `HLSLResourceBindingAttr` if it is not valid
- diagnose only the simple/local errors when a register binding attribute
  is parsed
- perform the additional diagnostics of binding type mismatches later,
  using the new `Bindings` data

Fixes #110719
---
 clang/include/clang/Basic/Attr.td             |  25 ++
 clang/include/clang/Sema/SemaHLSL.h           |  57 +++
 clang/lib/Sema/SemaDecl.cpp                   |   3 +
 clang/lib/Sema/SemaHLSL.cpp                   | 376 ++++++++++++------
 .../resource_binding_attr_error_udt.hlsl      |   8 +-
 5 files changed, 352 insertions(+), 117 deletions(-)

diff --git a/clang/include/clang/Basic/Attr.td b/clang/include/clang/Basic/Attr.td
index ec3d6e0079f6..0259b6e40ca9 100644
--- a/clang/include/clang/Basic/Attr.td
+++ b/clang/include/clang/Basic/Attr.td
@@ -4593,6 +4593,31 @@ def HLSLResourceBinding: InheritableAttr {
   let LangOpts = [HLSL];
   let Args = [StringArgument<"Slot">, StringArgument<"Space", 1>];
   let Documentation = [HLSLResourceBindingDocs];
+  let AdditionalMembers = [{
+  public:
+      enum class RegisterType : unsigned { SRV, UAV, CBuffer, Sampler, C, I };
+
+  private:
+      RegisterType RegType;
+      unsigned SlotNumber;
+      unsigned SpaceNumber;
+
+  public:
+      void setBinding(RegisterType RT, unsigned SlotNum, unsigned SpaceNum) {
+        RegType = RT;
+        SlotNumber = SlotNum;
+        SpaceNumber = SpaceNum;
+      }
+      RegisterType getRegisterType() const {
+        return RegType;
+      }
+      unsigned getSlotNumber() const {
+        return SlotNumber;
+      }
+      unsigned getSpaceNumber() const {
+        return SpaceNumber;
+      }
+  }];
 }
 
 def HLSLPackOffset: HLSLAnnotationAttr {
diff --git a/clang/include/clang/Sema/SemaHLSL.h b/clang/include/clang/Sema/SemaHLSL.h
index fa957abc9791..4f1fc9a31404 100644
--- a/clang/include/clang/Sema/SemaHLSL.h
+++ b/clang/include/clang/Sema/SemaHLSL.h
@@ -28,6 +28,9 @@ class AttributeCommonInfo;
 class IdentifierInfo;
 class ParsedAttr;
 class Scope;
+class VarDecl;
+
+using llvm::dxil::ResourceClass;
 
 // FIXME: This can be hidden (as static function in SemaHLSL.cpp) once we no
 // longer need to create builtin buffer types in HLSLExternalSemaSource.
@@ -35,6 +38,50 @@ bool CreateHLSLAttributedResourceType(
     Sema &S, QualType Wrapped, ArrayRef<const Attr *> AttrList,
     QualType &ResType, HLSLAttributedResourceLocInfo *LocInfo = nullptr);
 
+enum class BindingType : uint8_t { NotAssigned, Explicit, Implicit };
+
+// DeclBindingInfo struct stores information about a required/assigned
+// resource binding on a declaration for a specific resource class.
+struct DeclBindingInfo {
+  const VarDecl *Decl;
+  ResourceClass ResClass;
+  const HLSLResourceBindingAttr *Attr;
+  BindingType BindType;
+
+  DeclBindingInfo(const VarDecl *Decl, ResourceClass ResClass,
+                  BindingType BindType = BindingType::NotAssigned,
+                  const HLSLResourceBindingAttr *Attr = nullptr)
+      : Decl(Decl), ResClass(ResClass), Attr(Attr), BindType(BindType) {}
+
+  void setBindingAttribute(HLSLResourceBindingAttr *A, BindingType BT) {
+    assert(Attr == nullptr && BindType == BindingType::NotAssigned &&
+           "binding attribute already assigned");
+    Attr = A;
+    BindType = BT;
+  }
+};
+
+// ResourceBindings class stores information about all resource bindings
+// in a shader. It is used for binding diagnostics and implicit binding
+// assignments.
+class ResourceBindings { +public: + DeclBindingInfo *addDeclBindingInfo(const VarDecl *VD, + ResourceClass ResClass); + DeclBindingInfo *getDeclBindingInfo(const VarDecl *VD, + ResourceClass ResClass); + bool hasBindingInfoForDecl(const VarDecl *VD) const; + +private: + // List of all resource bindings required by the shader. + // A global declaration can have multiple bindings for different + // resource classes. They are all stored sequentially in this list. + // The DeclToBindingListIndex hashtable maps a declaration to the + // index of the first binding info in the list. + llvm::SmallVector BindingsList; + llvm::DenseMap DeclToBindingListIndex; +}; + class SemaHLSL : public SemaBase { public: SemaHLSL(Sema &S); @@ -55,6 +102,7 @@ public: mergeParamModifierAttr(Decl *D, const AttributeCommonInfo &AL, HLSLParamModifierAttr::Spelling Spelling); void ActOnTopLevelFunction(FunctionDecl *FD); + void ActOnVariableDeclarator(VarDecl *VD); void CheckEntryPoint(FunctionDecl *FD); void CheckSemanticAnnotation(FunctionDecl *EntryPoint, const Decl *Param, const HLSLAnnotationAttr *AnnotationAttr); @@ -102,6 +150,15 @@ private: llvm::DenseMap LocsForHLSLAttributedResources; + + // List of all resource bindings + ResourceBindings Bindings; + +private: + void collectResourcesOnVarDecl(VarDecl *D); + void collectResourcesOnUserRecordDecl(const VarDecl *VD, + const RecordType *RT); + void processExplicitBindingsOnDecl(VarDecl *D); }; } // namespace clang diff --git a/clang/lib/Sema/SemaDecl.cpp b/clang/lib/Sema/SemaDecl.cpp index fece22c663d0..229c9080d558 100644 --- a/clang/lib/Sema/SemaDecl.cpp +++ b/clang/lib/Sema/SemaDecl.cpp @@ -7883,6 +7883,9 @@ NamedDecl *Sema::ActOnVariableDeclarator( // Handle attributes prior to checking for duplicates in MergeVarDecl ProcessDeclAttributes(S, NewVD, D); + if (getLangOpts().HLSL) + HLSL().ActOnVariableDeclarator(NewVD); + // FIXME: This is probably the wrong location to be doing this and we should // probably be doing this for more attributes (especially for function // pointer attributes such as format, warn_unused_result, etc.). Ideally diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 698fdbed0484..0d23c4935e91 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -40,9 +40,7 @@ #include using namespace clang; -using llvm::dxil::ResourceClass; - -enum class RegisterType { SRV, UAV, CBuffer, Sampler, C, I, Invalid }; +using RegisterType = HLSLResourceBindingAttr::RegisterType; static RegisterType getRegisterType(ResourceClass RC) { switch (RC) { @@ -58,31 +56,95 @@ static RegisterType getRegisterType(ResourceClass RC) { llvm_unreachable("unexpected ResourceClass value"); } -static RegisterType getRegisterType(StringRef Slot) { +// Converts the first letter of string Slot to RegisterType. +// Returns false if the letter does not correspond to a valid register type. 
+static bool convertToRegisterType(StringRef Slot, RegisterType *RT) { + assert(RT != nullptr); switch (Slot[0]) { case 't': case 'T': - return RegisterType::SRV; + *RT = RegisterType::SRV; + return true; case 'u': case 'U': - return RegisterType::UAV; + *RT = RegisterType::UAV; + return true; case 'b': case 'B': - return RegisterType::CBuffer; + *RT = RegisterType::CBuffer; + return true; case 's': case 'S': - return RegisterType::Sampler; + *RT = RegisterType::Sampler; + return true; case 'c': case 'C': - return RegisterType::C; + *RT = RegisterType::C; + return true; case 'i': case 'I': - return RegisterType::I; + *RT = RegisterType::I; + return true; default: - return RegisterType::Invalid; + return false; } } +static ResourceClass getResourceClass(RegisterType RT) { + switch (RT) { + case RegisterType::SRV: + return ResourceClass::SRV; + case RegisterType::UAV: + return ResourceClass::UAV; + case RegisterType::CBuffer: + return ResourceClass::CBuffer; + case RegisterType::Sampler: + return ResourceClass::Sampler; + case RegisterType::C: + case RegisterType::I: + llvm_unreachable("unexpected RegisterType value"); + } +} + +DeclBindingInfo *ResourceBindings::addDeclBindingInfo(const VarDecl *VD, + ResourceClass ResClass) { + assert(getDeclBindingInfo(VD, ResClass) == nullptr && + "DeclBindingInfo already added"); +#ifndef NDEBUG + // Verify that existing bindings for this decl are stored sequentially + // and at the end of the BindingsList + auto I = DeclToBindingListIndex.find(VD); + if (I != DeclToBindingListIndex.end()) { + for (unsigned Index = I->getSecond(); Index < BindingsList.size(); ++Index) + assert(BindingsList[Index].Decl == VD); + } +#endif + // VarDecl may have multiple entries for different resource classes. + // DeclToBindingListIndex stores the index of the first binding we saw + // for this decl. If there are any additional ones then that index + // shouldn't be updated. 
+ DeclToBindingListIndex.try_emplace(VD, BindingsList.size()); + return &BindingsList.emplace_back(VD, ResClass); +} + +DeclBindingInfo *ResourceBindings::getDeclBindingInfo(const VarDecl *VD, + ResourceClass ResClass) { + auto Entry = DeclToBindingListIndex.find(VD); + if (Entry != DeclToBindingListIndex.end()) { + for (unsigned Index = Entry->getSecond(); + Index < BindingsList.size() && BindingsList[Index].Decl == VD; + ++Index) { + if (BindingsList[Index].ResClass == ResClass) + return &BindingsList[Index]; + } + } + return nullptr; +} + +bool ResourceBindings::hasBindingInfoForDecl(const VarDecl *VD) const { + return DeclToBindingListIndex.contains(VD); +} + SemaHLSL::SemaHLSL(Sema &S) : SemaBase(S) {} Decl *SemaHLSL::ActOnStartBuffer(Scope *BufferScope, bool CBuffer, @@ -985,88 +1047,85 @@ SemaHLSL::TakeLocForHLSLAttribute(const HLSLAttributedResourceType *RT) { return LocInfo; } -// get the record decl from a var decl that we expect -// represents a resource -static CXXRecordDecl *getRecordDeclFromVarDecl(VarDecl *VD) { - const Type *Ty = VD->getType()->getPointeeOrArrayElementType(); - assert(Ty && "Resource must have an element type."); - - if (Ty->isBuiltinType()) - return nullptr; - - CXXRecordDecl *TheRecordDecl = Ty->getAsCXXRecordDecl(); - assert(TheRecordDecl && "Resource should have a resource type declaration."); - return TheRecordDecl; -} - +// Returns handle type of a resource, if the type is a resource static const HLSLAttributedResourceType * -findAttributedResourceTypeOnField(VarDecl *VD) { - assert(VD != nullptr && "expected VarDecl"); - if (RecordDecl *RD = getRecordDeclFromVarDecl(VD)) { - for (auto *FD : RD->fields()) { - if (const HLSLAttributedResourceType *AttrResType = - dyn_cast(FD->getType().getTypePtr())) - return AttrResType; +findHandleTypeOnResource(const Type *Ty) { + // If Ty is a resource class, the first field must + // be the resource handle of type HLSLAttributedResourceType + if (RecordDecl *RD = Ty->getAsCXXRecordDecl()) { + if (!RD->fields().empty()) { + const auto &FirstFD = RD->fields().begin(); + return dyn_cast( + FirstFD->getType().getTypePtr()); } } return nullptr; } -// Iterate over RecordType fields and return true if any of them matched the -// register type -static bool ContainsResourceForRegisterType(Sema &S, const RecordType *RT, - RegisterType RegType) { - llvm::SmallVector TypesToScan; - TypesToScan.emplace_back(RT); - - while (!TypesToScan.empty()) { - const Type *T = TypesToScan.pop_back_val(); - while (T->isArrayType()) - T = T->getArrayElementTypeNoTypeQual(); - if (T->isIntegralOrEnumerationType() || T->isFloatingType()) { - if (RegType == RegisterType::C) - return true; +// Walks though the global variable declaration, collects all resource binding +// requirements and adds them to Bindings +void SemaHLSL::collectResourcesOnUserRecordDecl(const VarDecl *VD, + const RecordType *RT) { + const RecordDecl *RD = RT->getDecl(); + for (FieldDecl *FD : RD->fields()) { + const Type *Ty = FD->getType()->getUnqualifiedDesugaredType(); + + // Unwrap arrays + // FIXME: Calculate array size while unwrapping + assert(!Ty->isIncompleteArrayType() && + "incomplete arrays inside user defined types are not supported"); + while (Ty->isConstantArrayType()) { + const ConstantArrayType *CAT = cast(Ty); + Ty = CAT->getElementType()->getUnqualifiedDesugaredType(); } - const RecordType *RT = T->getAs(); - if (!RT) + + if (!Ty->isRecordType()) continue; - const RecordDecl *RD = RT->getDecl(); - for (FieldDecl *FD : RD->fields()) { - const Type *FieldTy 
= FD->getType().getTypePtr(); - if (const HLSLAttributedResourceType *AttrResType = - dyn_cast(FieldTy)) { - ResourceClass RC = AttrResType->getAttrs().ResourceClass; - if (getRegisterType(RC) == RegType) - return true; - } else { - TypesToScan.emplace_back(FD->getType().getTypePtr()); - } + if (const HLSLAttributedResourceType *AttrResType = + findHandleTypeOnResource(Ty)) { + // Add a new DeclBindingInfo to Bindings if it does not already exist + ResourceClass RC = AttrResType->getAttrs().ResourceClass; + DeclBindingInfo *DBI = Bindings.getDeclBindingInfo(VD, RC); + if (!DBI) + Bindings.addDeclBindingInfo(VD, RC); + } else if (const RecordType *RT = dyn_cast(Ty)) { + // Recursively scan embedded struct or class; it would be nice to do this + // without recursion, but tricky to correctly calculate the size of the + // binding, which is something we are probably going to need to do later + // on. Hopefully nesting of structs in structs too many levels is + // unlikely. + collectResourcesOnUserRecordDecl(VD, RT); } } - return false; } -static void CheckContainsResourceForRegisterType(Sema &S, - SourceLocation &ArgLoc, - Decl *D, RegisterType RegType, - bool SpecifiedSpace) { +// Diagnore localized register binding errors for a single binding; does not +// diagnose resource binding on user record types, that will be done later +// in processResourceBindingOnDecl based on the information collected in +// collectResourcesOnVarDecl. +// Returns false if the register binding is not valid. +static bool DiagnoseLocalRegisterBinding(Sema &S, SourceLocation &ArgLoc, + Decl *D, RegisterType RegType, + bool SpecifiedSpace) { int RegTypeNum = static_cast(RegType); // check if the decl type is groupshared if (D->hasAttr()) { S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum; - return; + return false; } // Cbuffers and Tbuffers are HLSLBufferDecl types if (HLSLBufferDecl *CBufferOrTBuffer = dyn_cast(D)) { ResourceClass RC = CBufferOrTBuffer->isCBuffer() ? 
ResourceClass::CBuffer : ResourceClass::SRV; - if (RegType != getRegisterType(RC)) - S.Diag(D->getLocation(), diag::err_hlsl_binding_type_mismatch) - << RegTypeNum; - return; + if (RegType == getRegisterType(RC)) + return true; + + S.Diag(D->getLocation(), diag::err_hlsl_binding_type_mismatch) + << RegTypeNum; + return false; } // Samplers, UAVs, and SRVs are VarDecl types @@ -1075,11 +1134,13 @@ static void CheckContainsResourceForRegisterType(Sema &S, // Resource if (const HLSLAttributedResourceType *AttrResType = - findAttributedResourceTypeOnField(VD)) { - if (RegType != getRegisterType(AttrResType->getAttrs().ResourceClass)) - S.Diag(D->getLocation(), diag::err_hlsl_binding_type_mismatch) - << RegTypeNum; - return; + findHandleTypeOnResource(VD->getType().getTypePtr())) { + if (RegType == getRegisterType(AttrResType->getAttrs().ResourceClass)) + return true; + + S.Diag(D->getLocation(), diag::err_hlsl_binding_type_mismatch) + << RegTypeNum; + return false; } const clang::Type *Ty = VD->getType().getTypePtr(); @@ -1105,51 +1166,44 @@ static void CheckContainsResourceForRegisterType(Sema &S, else S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum; } - } else if (Ty->isRecordType()) { - // Class/struct types - walk the declaration and check each field and - // subclass - if (!ContainsResourceForRegisterType(S, Ty->getAs(), RegType)) - S.Diag(D->getLocation(), diag::warn_hlsl_user_defined_type_missing_member) - << RegTypeNum; - } else { - // Anything else is an error - S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum; + return false; } + if (Ty->isRecordType()) + // RecordTypes will be diagnosed in processResourceBindingOnDecl + // that is called from ActOnVariableDeclarator + return true; + + // Anything else is an error + S.Diag(ArgLoc, diag::err_hlsl_binding_type_mismatch) << RegTypeNum; + return false; } -static void ValidateMultipleRegisterAnnotations(Sema &S, Decl *TheDecl, +static bool ValidateMultipleRegisterAnnotations(Sema &S, Decl *TheDecl, RegisterType regType) { // make sure that there are no two register annotations // applied to the decl with the same register type bool RegisterTypesDetected[5] = {false}; - RegisterTypesDetected[static_cast(regType)] = true; - // we need a static map to keep track of previous conflicts - // so that we don't emit the same error multiple times - static std::map> PreviousConflicts; - for (auto it = TheDecl->attr_begin(); it != TheDecl->attr_end(); ++it) { if (HLSLResourceBindingAttr *attr = dyn_cast(*it)) { - RegisterType otherRegType = getRegisterType(attr->getSlot()); + RegisterType otherRegType = attr->getRegisterType(); if (RegisterTypesDetected[static_cast(otherRegType)]) { - if (PreviousConflicts[TheDecl].count(otherRegType)) - continue; int otherRegTypeNum = static_cast(otherRegType); S.Diag(TheDecl->getLocation(), diag::err_hlsl_duplicate_register_annotation) << otherRegTypeNum; - PreviousConflicts[TheDecl].insert(otherRegType); - } else { - RegisterTypesDetected[static_cast(otherRegType)] = true; + return false; } + RegisterTypesDetected[static_cast(otherRegType)] = true; } } + return true; } -static void DiagnoseHLSLRegisterAttribute(Sema &S, SourceLocation &ArgLoc, +static bool DiagnoseHLSLRegisterAttribute(Sema &S, SourceLocation &ArgLoc, Decl *D, RegisterType RegType, bool SpecifiedSpace) { @@ -1159,10 +1213,11 @@ static void DiagnoseHLSLRegisterAttribute(Sema &S, SourceLocation &ArgLoc, "expecting VarDecl or HLSLBufferDecl"); // check if the declaration contains resource matching the register 
type - CheckContainsResourceForRegisterType(S, ArgLoc, D, RegType, SpecifiedSpace); + if (!DiagnoseLocalRegisterBinding(S, ArgLoc, D, RegType, SpecifiedSpace)) + return false; // next, if multiple register annotations exist, check that none conflict. - ValidateMultipleRegisterAnnotations(S, D, RegType); + return ValidateMultipleRegisterAnnotations(S, D, RegType); } void SemaHLSL::handleResourceBindingAttr(Decl *TheDecl, const ParsedAttr &AL) { @@ -1203,23 +1258,23 @@ void SemaHLSL::handleResourceBindingAttr(Decl *TheDecl, const ParsedAttr &AL) { Slot = Str; } - RegisterType regType; + RegisterType RegType; + unsigned SlotNum = 0; + unsigned SpaceNum = 0; // Validate. if (!Slot.empty()) { - regType = getRegisterType(Slot); - if (regType == RegisterType::I) { - Diag(ArgLoc, diag::warn_hlsl_deprecated_register_type_i); + if (!convertToRegisterType(Slot, &RegType)) { + Diag(ArgLoc, diag::err_hlsl_binding_type_invalid) << Slot.substr(0, 1); return; } - if (regType == RegisterType::Invalid) { - Diag(ArgLoc, diag::err_hlsl_binding_type_invalid) << Slot.substr(0, 1); + if (RegType == RegisterType::I) { + Diag(ArgLoc, diag::warn_hlsl_deprecated_register_type_i); return; } - StringRef SlotNum = Slot.substr(1); - unsigned Num = 0; - if (SlotNum.getAsInteger(10, Num)) { + StringRef SlotNumStr = Slot.substr(1); + if (SlotNumStr.getAsInteger(10, SlotNum)) { Diag(ArgLoc, diag::err_hlsl_unsupported_register_number); return; } @@ -1229,20 +1284,22 @@ void SemaHLSL::handleResourceBindingAttr(Decl *TheDecl, const ParsedAttr &AL) { Diag(SpaceArgLoc, diag::err_hlsl_expected_space) << Space; return; } - StringRef SpaceNum = Space.substr(5); - unsigned Num = 0; - if (SpaceNum.getAsInteger(10, Num)) { + StringRef SpaceNumStr = Space.substr(5); + if (SpaceNumStr.getAsInteger(10, SpaceNum)) { Diag(SpaceArgLoc, diag::err_hlsl_expected_space) << Space; return; } - DiagnoseHLSLRegisterAttribute(SemaRef, ArgLoc, TheDecl, regType, - SpecifiedSpace); + if (!DiagnoseHLSLRegisterAttribute(SemaRef, ArgLoc, TheDecl, RegType, + SpecifiedSpace)) + return; HLSLResourceBindingAttr *NewAttr = HLSLResourceBindingAttr::Create(getASTContext(), Slot, Space, AL); - if (NewAttr) + if (NewAttr) { + NewAttr->setBinding(RegType, SlotNum, SpaceNum); TheDecl->addAttr(NewAttr); + } } void SemaHLSL::handleParamModifierAttr(Decl *D, const ParsedAttr &AL) { @@ -2089,6 +2146,7 @@ bool SemaHLSL::IsIntangibleType(clang::QualType QT) { CXXRecordDecl *RD = RT->getAsCXXRecordDecl(); assert(RD != nullptr && "all HLSL struct and classes should be CXXRecordDecl"); + assert(RD->isCompleteDefinition() && "expecting complete type"); return RD->isHLSLIntangible(); } @@ -2274,3 +2332,95 @@ QualType SemaHLSL::getInoutParameterType(QualType Ty) { Ty.addRestrict(); return Ty; } + +void SemaHLSL::ActOnVariableDeclarator(VarDecl *VD) { + if (VD->hasGlobalStorage()) { + // make sure the declaration has a complete type + if (SemaRef.RequireCompleteType( + VD->getLocation(), + SemaRef.getASTContext().getBaseElementType(VD->getType()), + diag::err_typecheck_decl_incomplete_type)) { + VD->setInvalidDecl(); + return; + } + + // find all resources on decl + if (IsIntangibleType(VD->getType())) + collectResourcesOnVarDecl(VD); + + // process explicit bindings + processExplicitBindingsOnDecl(VD); + } +} + +// Walks though the global variable declaration, collects all resource binding +// requirements and adds them to Bindings +void SemaHLSL::collectResourcesOnVarDecl(VarDecl *VD) { + assert(VD->hasGlobalStorage() && IsIntangibleType(VD->getType()) && + "expected global 
variable that contains HLSL resource"); + + // Cbuffers and Tbuffers are HLSLBufferDecl types + if (const HLSLBufferDecl *CBufferOrTBuffer = dyn_cast(VD)) { + Bindings.addDeclBindingInfo(VD, CBufferOrTBuffer->isCBuffer() + ? ResourceClass::CBuffer + : ResourceClass::SRV); + return; + } + + // Unwrap arrays + // FIXME: Calculate array size while unwrapping + const Type *Ty = VD->getType()->getUnqualifiedDesugaredType(); + while (Ty->isConstantArrayType()) { + const ConstantArrayType *CAT = cast(Ty); + Ty = CAT->getElementType()->getUnqualifiedDesugaredType(); + } + + // Resource (or array of resources) + if (const HLSLAttributedResourceType *AttrResType = + findHandleTypeOnResource(Ty)) { + Bindings.addDeclBindingInfo(VD, AttrResType->getAttrs().ResourceClass); + return; + } + + // User defined record type + if (const RecordType *RT = dyn_cast(Ty)) + collectResourcesOnUserRecordDecl(VD, RT); +} + +// Walks though the explicit resource binding attributes on the declaration, +// and makes sure there is a resource that matched the binding and updates +// DeclBindingInfoLists +void SemaHLSL::processExplicitBindingsOnDecl(VarDecl *VD) { + assert(VD->hasGlobalStorage() && "expected global variable"); + + for (Attr *A : VD->attrs()) { + HLSLResourceBindingAttr *RBA = dyn_cast(A); + if (!RBA) + continue; + + RegisterType RT = RBA->getRegisterType(); + assert(RT != RegisterType::I && "invalid or obsolete register type should " + "never have an attribute created"); + + if (RT == RegisterType::C) { + if (Bindings.hasBindingInfoForDecl(VD)) + SemaRef.Diag(VD->getLocation(), + diag::warn_hlsl_user_defined_type_missing_member) + << static_cast(RT); + continue; + } + + // Find DeclBindingInfo for this binding and update it, or report error + // if it does not exist (user type does to contain resources with the + // expected resource class). 
+ ResourceClass RC = getResourceClass(RT); + if (DeclBindingInfo *BI = Bindings.getDeclBindingInfo(VD, RC)) { + // update binding info + BI->setBindingAttribute(RBA, BindingType::Explicit); + } else { + SemaRef.Diag(VD->getLocation(), + diag::warn_hlsl_user_defined_type_missing_member) + << static_cast(RT); + } + } +} diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl index ea2d576e4cca..40517f393e12 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl @@ -106,7 +106,6 @@ struct Eg12{ MySRV s1; MySRV s2; }; -// expected-warning@+3{{binding type 'u' only applies to types containing UAV resources}} // expected-warning@+2{{binding type 'u' only applies to types containing UAV resources}} // expected-error@+1{{binding type 'u' cannot be applied more than once}} Eg12 e12 : register(u9) : register(u10); @@ -115,12 +114,14 @@ struct Eg13{ MySRV s1; MySRV s2; }; -// expected-warning@+4{{binding type 'u' only applies to types containing UAV resources}} // expected-warning@+3{{binding type 'u' only applies to types containing UAV resources}} -// expected-warning@+2{{binding type 'u' only applies to types containing UAV resources}} +// expected-error@+2{{binding type 'u' cannot be applied more than once}} // expected-error@+1{{binding type 'u' cannot be applied more than once}} Eg13 e13 : register(u9) : register(u10) : register(u11); +// expected-error@+1{{binding type 't' cannot be applied more than once}} +Eg13 e13_2 : register(t11) : register(t12); + struct Eg14{ MyTemplatedUAV r1; }; @@ -132,4 +133,3 @@ struct Eg15 { }; // expected no error Eg15 e15 : register(c0); - -- GitLab From 81bbe19383797d5daaa5ddd16a47cd6ff44b66e2 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 17 Oct 2024 05:31:28 +0100 Subject: [PATCH 197/329] [VPlan] Add VPSingleDefRecipe::dump() to resolve ambigous lookup (NFC). This allows calling ::dump() on various sub-classes of VPSingleDefRecipe directly, as it resolves an ambigous name lookup. Previously, calling VPWidenRecipe::dump() (and others), would result in the following errors: llvm/unittests/Transforms/Vectorize/VPlanTest.cpp:1284:19: error: member 'dump' found in multiple base classes of different types 1284 | WidenR->dump(); | ^ llvm/include/../lib/Transforms/Vectorize/VPlanValue.h:434:8: note: member found by ambiguous name lookup 434 | void dump() const; | ^ llvm/include/../lib/Transforms/Vectorize/VPlanValue.h:108:8: note: member found by ambiguous name lookup 108 | void dump() const; | ^ 1 error generated. --- llvm/lib/Transforms/Vectorize/VPlan.h | 5 +++++ llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp | 4 ++++ llvm/unittests/Transforms/Vectorize/VPlanTest.cpp | 15 +++++++++++++++ 3 files changed, 24 insertions(+) diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h index 4cef47e69f0e..fd97dda6dc1b 100644 --- a/llvm/lib/Transforms/Vectorize/VPlan.h +++ b/llvm/lib/Transforms/Vectorize/VPlan.h @@ -954,6 +954,11 @@ public: /// Return the cost of this VPSingleDefRecipe. InstructionCost computeCost(ElementCount VF, VPCostContext &Ctx) const override; + +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) + /// Print this VPSingleDefRecipe to dbgs() (for debugging). + LLVM_DUMP_METHOD void dump() const; +#endif }; /// Class to record LLVM IR flag for a recipe along with it. 
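The underlying issue is ordinary C++ name lookup: a member name inherited from
two unrelated base classes is ambiguous at the point of use, and redeclaring it
in the derived class resolves the lookup. A minimal standalone illustration
(the class names here are stand-ins, not the real VPlan types):

```cpp
#include <cstdio>

struct Base1 { // plays the role of VPDef
  void dump() const { std::puts("Base1::dump"); }
};
struct Base2 { // plays the role of the VPValue side of the hierarchy
  void dump() const { std::puts("Base2::dump"); }
};

struct Ambiguous : Base1, Base2 {};

struct Resolved : Base1, Base2 {
  // Declaring dump() here hides both inherited members, so calls through
  // Resolved are unambiguous -- the same fix as VPSingleDefRecipe::dump()
  // forwarding to VPDef::dump().
  void dump() const { Base1::dump(); }
};

int main() {
  // Ambiguous{}.dump(); // error: member 'dump' found in multiple base classes
  Resolved{}.dump();     // OK, prints "Base1::dump"
  return 0;
}
```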
diff --git a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp index 6fe30356e8c9..a38cdfc542cb 100644 --- a/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp +++ b/llvm/lib/Transforms/Vectorize/VPlanRecipes.cpp @@ -343,6 +343,10 @@ FastMathFlags VPRecipeWithIRFlags::getFastMathFlags() const { return Res; } +#if !defined(NDEBUG) || defined(LLVM_ENABLE_DUMP) +void VPSingleDefRecipe::dump() const { VPDef::dump(); } +#endif + template VPValue * VPUnrollPartAccessor::getUnrollPartOperand(VPUser &U) const { diff --git a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp index 376b00224eb5..0f170efac207 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanTest.cpp @@ -1279,6 +1279,21 @@ TEST(VPRecipeTest, dumpRecipeInPlan) { }, testing::ExitedWithCode(0), "WIDEN ir<%a> = add ir<1>, ir<2>"); + VPDef *Def = WidenR; + EXPECT_EXIT( + { + Def->dump(); + exit(0); + }, + testing::ExitedWithCode(0), "WIDEN ir<%a> = add ir<1>, ir<2>"); + + EXPECT_EXIT( + { + WidenR->dump(); + exit(0); + }, + testing::ExitedWithCode(0), "WIDEN ir<%a> = add ir<1>, ir<2>"); + // Test VPRecipeBase::dump(). VPRecipeBase *R = WidenR; EXPECT_EXIT( -- GitLab From 927af63fddb8e34f23b2974f812156767988ec5f Mon Sep 17 00:00:00 2001 From: thetruestblue <92476612+thetruestblue@users.noreply.github.com> Date: Wed, 16 Oct 2024 21:52:38 -0700 Subject: [PATCH 198/329] [SanitizerCoverage] Add an option to gate the invocation of the tracing callbacks (#108328) Implement -sanitizer-coverage-gated-trace-callbacks to gate the invocation of the tracing callbacks based on the value of a global variable, which is stored in a specific section. When this option is enabled, the instrumentation will not call into the runtime-provided callbacks for tracing, thus only incurring in a trivial branch without going through a function call. It is up to the runtime to toggle the value of the global variable in order to enable tracing. This option is only supported for trace-pc-guard. Note: will add additional support for trace-cmp in a follow up PR. 
Patch by Filippo Bigarella rdar://101626834 --- .../sanitize-coverage-gated-callbacks.c | 42 +++++++++++++ .../llvm/Transforms/Utils/Instrumentation.h | 1 + .../Instrumentation/SanitizerCoverage.cpp | 63 ++++++++++++++++++- 3 files changed, 103 insertions(+), 3 deletions(-) create mode 100644 clang/test/CodeGen/sanitize-coverage-gated-callbacks.c diff --git a/clang/test/CodeGen/sanitize-coverage-gated-callbacks.c b/clang/test/CodeGen/sanitize-coverage-gated-callbacks.c new file mode 100644 index 000000000000..9a00d91d5ad0 --- /dev/null +++ b/clang/test/CodeGen/sanitize-coverage-gated-callbacks.c @@ -0,0 +1,42 @@ +// RUN: %clang %s -target arm64-apple-darwin -emit-llvm -S -fsanitize-coverage=trace-pc-guard -mllvm -sanitizer-coverage-gated-trace-callbacks=1 -o - | FileCheck %s --check-prefixes=CHECK,GATED +// RUN: %clang %s -target arm64-apple-darwin -emit-llvm -S -fsanitize-coverage=trace-pc-guard -mllvm -sanitizer-coverage-gated-trace-callbacks=0 -o - | FileCheck %s --check-prefixes=CHECK,PLAIN +// RUN: not %clang %s -target arm64-apple-darwin -emit-llvm -S -fsanitize-coverage=trace-pc -mllvm -sanitizer-coverage-gated-trace-callbacks=1 -o /dev/null 2>&1 | FileCheck %s --check-prefixes=INCOMPATIBLE +// RUN: not %clang %s -target arm64-apple-darwin -emit-llvm -S -fsanitize-coverage=inline-8bit-counters -mllvm -sanitizer-coverage-gated-trace-callbacks=1 -o /dev/null 2>&1 | FileCheck %s --check-prefixes=INCOMPATIBLE +// RUN: not %clang %s -target arm64-apple-darwin -emit-llvm -S -fsanitize-coverage=inline-bool-flag -mllvm -sanitizer-coverage-gated-trace-callbacks=1 -o /dev/null 2>&1 | FileCheck %s --check-prefixes=INCOMPATIBLE + +// Verify that we do not emit the __sancov_gate section for "plain" trace-pc-guard +// GATED: section "__DATA,__sancov_gate" +// PLAIN-NOT: section "__DATA,__sancov_gate" + +// Produce an error for all incompatible sanitizer coverage modes. +// INCOMPATIBLE: error: 'sanitizer-coverage-gated-trace-callbacks' is only supported with trace-pc-guard + +int x[10]; + +// CHECK: define{{.*}} void @foo +void foo(int n, int m) { + // COM: Verify that we're emitting the call to __sanitizer_cov_trace_pc_guard upon + // COM: checking the value of __sancov_should_track. + // GATED: [[VAL:%.*]] = load i64, {{.*}}@__sancov_should_track + // GATED-NOT: [[VAL:%.*]] = load i64, i64* @__sancov_should_track + // GATED-NEXT: [[CMP:%.*]] = icmp ne i64 [[VAL]], 0 + // GATED-NEXT: br i1 [[CMP]], label %[[L_TRUE:.*]], label %[[L_FALSE:.*]], !prof [[WEIGHTS:!.+]] + // GATED: [[L_TRUE]]: + // GATED-NEXT: call void @__sanitizer_cov_trace_pc_guard + // GATED: br i1 [[CMP]], label %[[L_TRUE_2:.*]], label %[[L_FALSE_2:.*]] + // GATED: [[L_TRUE_2]]: + // GATED-NEXT: call void @__sanitizer_cov_trace_pc_guard + // GATED: [[WEIGHTS]] = !{!"branch_weights", i32 1, i32 100000} + + // COM: With the non-gated instrumentation, we should not emit the + // COM: __sancov_should_track global. + // PLAIN-NOT: __sancov_should_track + // But we should still be emitting the calls to the callback. 
+ // PLAIN: call void @__sanitizer_cov_trace_pc_guard + if (n) { + x[n] = 42; + if (m) { + x[m] = 41; + } + } +} diff --git a/llvm/include/llvm/Transforms/Utils/Instrumentation.h b/llvm/include/llvm/Transforms/Utils/Instrumentation.h index 1a4824a806dc..4f67d079d146 100644 --- a/llvm/include/llvm/Transforms/Utils/Instrumentation.h +++ b/llvm/include/llvm/Transforms/Utils/Instrumentation.h @@ -161,6 +161,7 @@ struct SanitizerCoverageOptions { bool TraceLoads = false; bool TraceStores = false; bool CollectControlFlow = false; + bool GatedCallbacks = false; SanitizerCoverageOptions() = default; }; diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp index 719806fdf37f..8130a719691b 100644 --- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp +++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp @@ -16,6 +16,7 @@ #include "llvm/Analysis/GlobalsModRef.h" #include "llvm/Analysis/PostDominators.h" #include "llvm/IR/Constant.h" +#include "llvm/IR/Constants.h" #include "llvm/IR/DataLayout.h" #include "llvm/IR/Dominators.h" #include "llvm/IR/EHPersonalities.h" @@ -28,6 +29,8 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Module.h" #include "llvm/IR/Type.h" +#include "llvm/IR/ValueSymbolTable.h" +#include "llvm/InitializePasses.h" #include "llvm/Support/CommandLine.h" #include "llvm/Support/SpecialCaseList.h" #include "llvm/Support/VirtualFileSystem.h" @@ -82,8 +85,10 @@ const char SanCovCountersSectionName[] = "sancov_cntrs"; const char SanCovBoolFlagSectionName[] = "sancov_bools"; const char SanCovPCsSectionName[] = "sancov_pcs"; const char SanCovCFsSectionName[] = "sancov_cfs"; +const char SanCovCallbackGateSectionName[] = "sancov_gate"; const char SanCovLowestStackName[] = "__sancov_lowest_stack"; +const char SanCovCallbackGateName[] = "__sancov_should_track"; static cl::opt ClCoverageLevel( "sanitizer-coverage-level", @@ -152,6 +157,12 @@ static cl::opt ClCollectCF("sanitizer-coverage-control-flow", cl::desc("collect control flow for each function"), cl::Hidden); +static cl::opt ClGatedCallbacks( + "sanitizer-coverage-gated-trace-callbacks", + cl::desc("Gate the invocation of the tracing callbacks on a global " + "variable. 
Currently only supported for trace-pc-guard."), + cl::Hidden, cl::init(false)); + namespace { SanitizerCoverageOptions getOptions(int LegacyCoverageLevel) { @@ -194,6 +205,7 @@ SanitizerCoverageOptions OverrideFromCL(SanitizerCoverageOptions Options) { Options.StackDepth |= ClStackDepth; Options.TraceLoads |= ClLoadTracing; Options.TraceStores |= ClStoreTracing; + Options.GatedCallbacks |= ClGatedCallbacks; if (!Options.TracePCGuard && !Options.TracePC && !Options.Inline8bitCounters && !Options.StackDepth && !Options.InlineBoolFlag && !Options.TraceLoads && !Options.TraceStores) @@ -239,8 +251,9 @@ private: const char *Section); GlobalVariable *CreatePCArray(Function &F, ArrayRef AllBlocks); void CreateFunctionLocalArrays(Function &F, ArrayRef AllBlocks); + Value *CreateFunctionLocalGateCmp(IRBuilder<> &IRB); void InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx, - bool IsLeafFunc = true); + Value *&FunctionGateCmp, bool IsLeafFunc = true); Function *CreateInitCallsForSections(Module &M, const char *CtorName, const char *InitFunctionName, Type *Ty, const char *Section); @@ -265,6 +278,7 @@ private: FunctionCallee SanCovTraceGepFunction; FunctionCallee SanCovTraceSwitchFunction; GlobalVariable *SanCovLowestStack; + GlobalVariable *SanCovCallbackGate; Type *PtrTy, *IntptrTy, *Int64Ty, *Int32Ty, *Int16Ty, *Int8Ty, *Int1Ty; Module *CurModule; std::string CurModuleUniqueId; @@ -478,6 +492,23 @@ bool ModuleSanitizerCoverage::instrumentModule() { if (Options.StackDepth && !SanCovLowestStack->isDeclaration()) SanCovLowestStack->setInitializer(Constant::getAllOnesValue(IntptrTy)); + if (Options.GatedCallbacks) { + if (!Options.TracePCGuard) { + C->emitError(StringRef("'") + ClGatedCallbacks.ArgStr + + "' is only supported with trace-pc-guard"); + return true; + } + + SanCovCallbackGate = cast( + M.getOrInsertGlobal(SanCovCallbackGateName, Int64Ty)); + SanCovCallbackGate->setSection( + getSectionName(SanCovCallbackGateSectionName)); + SanCovCallbackGate->setInitializer(Constant::getNullValue(Int64Ty)); + SanCovCallbackGate->setLinkage(GlobalVariable::LinkOnceAnyLinkage); + SanCovCallbackGate->setVisibility(GlobalVariable::HiddenVisibility); + appendToCompilerUsed(M, SanCovCallbackGate); + } + SanCovTracePC = M.getOrInsertFunction(SanCovTracePCName, VoidTy); SanCovTracePCGuard = M.getOrInsertFunction(SanCovTracePCGuardName, VoidTy, PtrTy); @@ -777,13 +808,22 @@ void ModuleSanitizerCoverage::CreateFunctionLocalArrays( FunctionPCsArray = CreatePCArray(F, AllBlocks); } +Value *ModuleSanitizerCoverage::CreateFunctionLocalGateCmp(IRBuilder<> &IRB) { + auto Load = IRB.CreateLoad(Int64Ty, SanCovCallbackGate); + Load->setNoSanitizeMetadata(); + auto Cmp = IRB.CreateIsNotNull(Load); + Cmp->setName("sancov gate cmp"); + return Cmp; +} + bool ModuleSanitizerCoverage::InjectCoverage(Function &F, ArrayRef AllBlocks, bool IsLeafFunc) { if (AllBlocks.empty()) return false; CreateFunctionLocalArrays(F, AllBlocks); + Value *FunctionGateCmp = nullptr; for (size_t i = 0, N = AllBlocks.size(); i < N; i++) - InjectCoverageAtBlock(F, *AllBlocks[i], i, IsLeafFunc); + InjectCoverageAtBlock(F, *AllBlocks[i], i, FunctionGateCmp, IsLeafFunc); return true; } @@ -946,6 +986,7 @@ void ModuleSanitizerCoverage::InjectTraceForCmp( void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB, size_t Idx, + Value *&FunctionGateCmp, bool IsLeafFunc) { BasicBlock::iterator IP = BB.getFirstInsertionPt(); bool IsEntryBB = &BB == &F.getEntryBlock(); @@ -971,7 +1012,23 @@ void 
ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
         IRB.CreateAdd(IRB.CreatePointerCast(FunctionGuardArray, IntptrTy),
                       ConstantInt::get(IntptrTy, Idx * 4)),
         PtrTy);
-    IRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge();
+    if (Options.GatedCallbacks) {
+      if (!FunctionGateCmp) {
+        // Create this in the entry block.
+        assert(IsEntryBB);
+        FunctionGateCmp = CreateFunctionLocalGateCmp(IRB);
+      }
+      // Set the branch weights in order to minimize the price paid when the
+      // gate is turned off, allowing this instrumentation to be enabled by
+      // default with as little performance cost as possible.
+      auto Weights = MDBuilder(*C).createBranchWeights(1, 100000);
+      auto ThenTerm =
+          SplitBlockAndInsertIfThen(FunctionGateCmp, &*IP, false, Weights);
+      IRBuilder<> ThenIRB(ThenTerm);
+      ThenIRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge();
+    } else {
+      IRB.CreateCall(SanCovTracePCGuard, GuardPtr)->setCannotMerge();
+    }
   }
   if (Options.Inline8bitCounters) {
     auto CounterPtr = IRB.CreateGEP(
--
GitLab

From 3142dff70401086a14ee9ae3428f65f5dfa6a2e6 Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Wed, 16 Oct 2024 22:45:40 -0700
Subject: [PATCH 199/329] [nfc][lsan] Extract significant part of the loop
 into a function (#112610)

Co-authored-by: thetruestblue <92476612+thetruestblue@users.noreply.github.com>
---
 compiler-rt/lib/lsan/lsan_common.cpp | 209 ++++++++++++++-------------
 1 file changed, 108 insertions(+), 101 deletions(-)

diff --git a/compiler-rt/lib/lsan/lsan_common.cpp b/compiler-rt/lib/lsan/lsan_common.cpp
index a1a15bf98a11..721db7872cce 100644
--- a/compiler-rt/lib/lsan/lsan_common.cpp
+++ b/compiler-rt/lib/lsan/lsan_common.cpp
@@ -399,6 +399,112 @@ static void ProcessThreadRegistry(Frontier *frontier) {
 }
 
 // Scans thread data (stacks and TLS) for heap pointers.
+static void ProcessThread(tid_t os_id, uptr sp,
+                          const InternalMmapVector &registers,
+                          InternalMmapVector &extra_ranges,
+                          Frontier *frontier) {
+  // `extra_ranges` is outside of the function and the loop to reuse mapped
+  // memory.
+  CHECK(extra_ranges.empty());
+  LOG_THREADS("Processing thread %llu.\n", os_id);
+  uptr stack_begin, stack_end, tls_begin, tls_end, cache_begin, cache_end;
+  DTLS *dtls;
+  bool thread_found =
+      GetThreadRangesLocked(os_id, &stack_begin, &stack_end, &tls_begin,
+                            &tls_end, &cache_begin, &cache_end, &dtls);
+  if (!thread_found) {
+    // If a thread can't be found in the thread registry, it's probably in the
+    // process of destruction. Log this event and move on.
+    LOG_THREADS("Thread %llu not found in registry.\n", os_id);
+    return;
+  }
+
+  if (!sp)
+    sp = stack_begin;
+
+  if (flags()->use_registers) {
+    uptr registers_begin = reinterpret_cast(registers.data());
+    uptr registers_end =
+        reinterpret_cast(registers.data() + registers.size());
+    ScanRangeForPointers(registers_begin, registers_end, frontier, "REGISTERS",
+                         kReachable);
+  }
+
+  if (flags()->use_stacks) {
+    LOG_THREADS("Stack at %p-%p (SP = %p).\n", (void *)stack_begin,
+                (void *)stack_end, (void *)sp);
+    if (sp < stack_begin || sp >= stack_end) {
+      // SP is outside the recorded stack range (e.g. the thread is running a
+      // signal handler on alternate stack, or swapcontext was used).
+      // Again, consider the entire stack range to be reachable.
+ LOG_THREADS("WARNING: stack pointer not in stack range.\n"); + uptr page_size = GetPageSizeCached(); + int skipped = 0; + while (stack_begin < stack_end && + !IsAccessibleMemoryRange(stack_begin, 1)) { + skipped++; + stack_begin += page_size; + } + LOG_THREADS("Skipped %d guard page(s) to obtain stack %p-%p.\n", skipped, + (void *)stack_begin, (void *)stack_end); + } else { + // Shrink the stack range to ignore out-of-scope values. + stack_begin = sp; + } + ScanRangeForPointers(stack_begin, stack_end, frontier, "STACK", kReachable); + GetThreadExtraStackRangesLocked(os_id, &extra_ranges); + ScanExtraStackRanges(extra_ranges, frontier); + } + + if (flags()->use_tls) { + if (tls_begin) { + LOG_THREADS("TLS at %p-%p.\n", (void *)tls_begin, (void *)tls_end); + // If the tls and cache ranges don't overlap, scan full tls range, + // otherwise, only scan the non-overlapping portions + if (cache_begin == cache_end || tls_end < cache_begin || + tls_begin > cache_end) { + ScanRangeForPointers(tls_begin, tls_end, frontier, "TLS", kReachable); + } else { + if (tls_begin < cache_begin) + ScanRangeForPointers(tls_begin, cache_begin, frontier, "TLS", + kReachable); + if (tls_end > cache_end) + ScanRangeForPointers(cache_end, tls_end, frontier, "TLS", kReachable); + } + } +# if SANITIZER_ANDROID + auto *cb = +[](void *dtls_begin, void *dtls_end, uptr /*dso_idd*/, + void *arg) -> void { + ScanRangeForPointers( + reinterpret_cast(dtls_begin), reinterpret_cast(dtls_end), + reinterpret_cast(arg), "DTLS", kReachable); + }; + + // FIXME: There might be a race-condition here (and in Bionic) if the + // thread is suspended in the middle of updating its DTLS. IOWs, we + // could scan already freed memory. (probably fine for now) + __libc_iterate_dynamic_tls(os_id, cb, frontier); +# else + if (dtls && !DTLSInDestruction(dtls)) { + ForEachDVT(dtls, [&](const DTLS::DTV &dtv, int id) { + uptr dtls_beg = dtv.beg; + uptr dtls_end = dtls_beg + dtv.size; + if (dtls_beg < dtls_end) { + LOG_THREADS("DTLS %d at %p-%p.\n", id, (void *)dtls_beg, + (void *)dtls_end); + ScanRangeForPointers(dtls_beg, dtls_end, frontier, "DTLS", + kReachable); + } + }); + } else { + // We are handling a thread with DTLS under destruction. Log about + // this and continue. + LOG_THREADS("Thread %llu has DTLS under destruction.\n", os_id); + } +# endif + } +} + static void ProcessThreads(SuspendedThreadsList const &suspended_threads, Frontier *frontier, tid_t caller_tid, uptr caller_sp) { @@ -408,7 +514,7 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, registers.clear(); extra_ranges.clear(); - const tid_t os_id = static_cast(suspended_threads.GetThreadID(i)); + const tid_t os_id = suspended_threads.GetThreadID(i); uptr sp = 0; PtraceRegistersStatus have_registers = suspended_threads.GetRegistersAndSP(i, ®isters, &sp); @@ -421,109 +527,10 @@ static void ProcessThreads(SuspendedThreadsList const &suspended_threads, sp = 0; } - LOG_THREADS("Processing thread %llu.\n", os_id); - uptr stack_begin, stack_end, tls_begin, tls_end, cache_begin, cache_end; - DTLS *dtls; - bool thread_found = - GetThreadRangesLocked(os_id, &stack_begin, &stack_end, &tls_begin, - &tls_end, &cache_begin, &cache_end, &dtls); - if (!thread_found) { - // If a thread can't be found in the thread registry, it's probably in the - // process of destruction. Log this event and move on. 
- LOG_THREADS("Thread %llu not found in registry.\n", os_id); - continue; - } - if (os_id == caller_tid) sp = caller_sp; - if (!sp) - sp = stack_begin; - - if (flags()->use_registers && have_registers) { - uptr registers_begin = reinterpret_cast(registers.data()); - uptr registers_end = - reinterpret_cast(registers.data() + registers.size()); - ScanRangeForPointers(registers_begin, registers_end, frontier, - "REGISTERS", kReachable); - } - - if (flags()->use_stacks) { - LOG_THREADS("Stack at %p-%p (SP = %p).\n", (void *)stack_begin, - (void *)stack_end, (void *)sp); - if (sp < stack_begin || sp >= stack_end) { - // SP is outside the recorded stack range (e.g. the thread is running a - // signal handler on alternate stack, or swapcontext was used). - // Again, consider the entire stack range to be reachable. - LOG_THREADS("WARNING: stack pointer not in stack range.\n"); - uptr page_size = GetPageSizeCached(); - int skipped = 0; - while (stack_begin < stack_end && - !IsAccessibleMemoryRange(stack_begin, 1)) { - skipped++; - stack_begin += page_size; - } - LOG_THREADS("Skipped %d guard page(s) to obtain stack %p-%p.\n", - skipped, (void *)stack_begin, (void *)stack_end); - } else { - // Shrink the stack range to ignore out-of-scope values. - stack_begin = sp; - } - ScanRangeForPointers(stack_begin, stack_end, frontier, "STACK", - kReachable); - GetThreadExtraStackRangesLocked(os_id, &extra_ranges); - ScanExtraStackRanges(extra_ranges, frontier); - } - - if (flags()->use_tls) { - if (tls_begin) { - LOG_THREADS("TLS at %p-%p.\n", (void *)tls_begin, (void *)tls_end); - // If the tls and cache ranges don't overlap, scan full tls range, - // otherwise, only scan the non-overlapping portions - if (cache_begin == cache_end || tls_end < cache_begin || - tls_begin > cache_end) { - ScanRangeForPointers(tls_begin, tls_end, frontier, "TLS", kReachable); - } else { - if (tls_begin < cache_begin) - ScanRangeForPointers(tls_begin, cache_begin, frontier, "TLS", - kReachable); - if (tls_end > cache_end) - ScanRangeForPointers(cache_end, tls_end, frontier, "TLS", - kReachable); - } - } -# if SANITIZER_ANDROID - auto *cb = +[](void *dtls_begin, void *dtls_end, uptr /*dso_idd*/, - void *arg) -> void { - ScanRangeForPointers(reinterpret_cast(dtls_begin), - reinterpret_cast(dtls_end), - reinterpret_cast(arg), "DTLS", - kReachable); - }; - - // FIXME: There might be a race-condition here (and in Bionic) if the - // thread is suspended in the middle of updating its DTLS. IOWs, we - // could scan already freed memory. (probably fine for now) - __libc_iterate_dynamic_tls(os_id, cb, frontier); -# else - if (dtls && !DTLSInDestruction(dtls)) { - ForEachDVT(dtls, [&](const DTLS::DTV &dtv, int id) { - uptr dtls_beg = dtv.beg; - uptr dtls_end = dtls_beg + dtv.size; - if (dtls_beg < dtls_end) { - LOG_THREADS("DTLS %d at %p-%p.\n", id, (void *)dtls_beg, - (void *)dtls_end); - ScanRangeForPointers(dtls_beg, dtls_end, frontier, "DTLS", - kReachable); - } - }); - } else { - // We are handling a thread with DTLS under destruction. Log about - // this and continue. 
- LOG_THREADS("Thread %llu has DTLS under destruction.\n", os_id); - } -# endif - } + ProcessThread(os_id, sp, registers, extra_ranges, frontier); } // Add pointers reachable from ThreadContexts -- GitLab From 23d4fe6c5c52e054bbed75e78104f59869337356 Mon Sep 17 00:00:00 2001 From: Mikael Holmen Date: Thu, 17 Oct 2024 07:28:28 +0200 Subject: [PATCH 200/329] [RISCV] Fix gcc -Wparentheses warning [NFC] Without this gcc warned like ../lib/Target/RISCV/RISCVVLOptimizer.cpp:760: warning: suggest parentheses around '&&' within '||' [-Wparentheses] 760 | VLOp.getReg() != RISCV::X0 && "Did not expect X0 VL"); | --- llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp index ee494c468151..2141a6ed1ed7 100644 --- a/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp +++ b/llvm/lib/Target/RISCV/RISCVVLOptimizer.cpp @@ -756,8 +756,8 @@ bool RISCVVLOptimizer::checkUsers(const MachineOperand *&CommonVL, const MachineOperand &VLOp = UserMI.getOperand(VLOpNum); // Looking for an immediate or a register VL that isn't X0. - assert(!VLOp.isReg() || - VLOp.getReg() != RISCV::X0 && "Did not expect X0 VL"); + assert((!VLOp.isReg() || VLOp.getReg() != RISCV::X0) && + "Did not expect X0 VL"); if (!CommonVL) { CommonVL = &VLOp; -- GitLab From 3ae6b57671744b4fe4dd76769cce0745a0f5bc31 Mon Sep 17 00:00:00 2001 From: Lang Hames Date: Thu, 17 Oct 2024 17:33:04 +1100 Subject: [PATCH 201/329] [ORC] Remove extraneous lines in comment. --- llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h | 2 -- 1 file changed, 2 deletions(-) diff --git a/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h b/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h index 6468f2dfc11a..06bc85dc40a8 100644 --- a/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h +++ b/llvm/include/llvm/ExecutionEngine/Orc/ExecutorProcessControl.h @@ -235,8 +235,6 @@ public: } /// Look up and SPS-deserialize a bootstrap map value. - /// - /// template Error getBootstrapMapValue(StringRef Key, std::optional &Val) const { Val = std::nullopt; -- GitLab From 255a99c29f9fa1a89b03a85a3a73d6f44d03c6c1 Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 17 Oct 2024 08:48:08 +0200 Subject: [PATCH 202/329] [APInt] Fix APInt constructions where value does not fit bitwidth (NFCI) (#80309) This fixes all the places that hit the new assertion added in https://github.com/llvm/llvm-project/pull/106524 in tests. That is, cases where the value passed to the APInt constructor is not an N-bit signed/unsigned integer, where N is the bit width and signedness is determined by the isSigned flag. The fixes either set the correct value for isSigned, set the implicitTrunc flag, or perform more calculations inside APInt. Note that the assertion is currently still disabled by default, so this patch is mostly NFC. 
--- clang/include/clang/Sema/Sema.h | 2 +- clang/lib/AST/ByteCode/IntegralAP.h | 7 ++-- clang/lib/CodeGen/CGVTT.cpp | 5 +-- clang/lib/CodeGen/ItaniumCXXABI.cpp | 5 +-- clang/lib/Parse/ParseInit.cpp | 6 ++-- clang/lib/Sema/SemaExpr.cpp | 5 +-- clang/lib/Sema/SemaOpenMP.cpp | 4 ++- lldb/source/Expression/DWARFExpression.cpp | 8 +++-- llvm/include/llvm/ADT/APFixedPoint.h | 4 ++- llvm/lib/Analysis/ConstantFolding.cpp | 3 +- llvm/lib/Analysis/Loads.cpp | 6 ++-- llvm/lib/Analysis/MemoryBuiltins.cpp | 2 ++ llvm/lib/Analysis/ScalarEvolution.cpp | 2 +- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 3 +- .../lib/CodeGen/SelectionDAG/SelectionDAG.cpp | 6 +++- .../SelectionDAG/SelectionDAGBuilder.cpp | 3 +- .../CodeGen/SelectionDAG/SelectionDAGISel.cpp | 10 ++++-- .../CodeGen/SelectionDAG/TargetLowering.cpp | 8 +++-- llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp | 2 +- llvm/lib/IR/Constants.cpp | 5 ++- .../Target/AArch64/AArch64ISelLowering.cpp | 32 +++++++++---------- llvm/lib/Target/AMDGPU/SIInstrInfo.cpp | 3 +- .../Target/AMDGPU/SIShrinkInstructions.cpp | 4 +-- .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 3 +- .../Hexagon/HexagonConstPropagation.cpp | 3 +- llvm/lib/Target/Hexagon/HexagonGenExtract.cpp | 2 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 4 ++- llvm/lib/Target/X86/X86ISelLowering.cpp | 6 ++-- llvm/lib/Transforms/IPO/ArgumentPromotion.cpp | 3 +- llvm/lib/Transforms/Utils/SimplifyCFG.cpp | 2 +- llvm/unittests/ADT/APFixedPointTest.cpp | 9 +++--- 31 files changed, 103 insertions(+), 64 deletions(-) diff --git a/clang/include/clang/Sema/Sema.h b/clang/include/clang/Sema/Sema.h index 0faa5aed4eec..2c5769f8469e 100644 --- a/clang/include/clang/Sema/Sema.h +++ b/clang/include/clang/Sema/Sema.h @@ -6755,7 +6755,7 @@ public: ExprResult BuildPredefinedExpr(SourceLocation Loc, PredefinedIdentKind IK); ExprResult ActOnPredefinedExpr(SourceLocation Loc, tok::TokenKind Kind); - ExprResult ActOnIntegerConstant(SourceLocation Loc, uint64_t Val); + ExprResult ActOnIntegerConstant(SourceLocation Loc, int64_t Val); bool CheckLoopHintExpr(Expr *E, SourceLocation Loc, bool AllowZero); diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h index a4d656433344..252d7243bee7 100644 --- a/clang/lib/AST/ByteCode/IntegralAP.h +++ b/clang/lib/AST/ByteCode/IntegralAP.h @@ -61,7 +61,7 @@ public: IntegralAP(APInt V) : V(V) {} /// Arbitrary value for uninitialized variables. - IntegralAP() : IntegralAP(-1, 3) {} + IntegralAP() : IntegralAP(Signed ? -1 : 7, 3) {} IntegralAP operator-() const { return IntegralAP(-V); } IntegralAP operator-(const IntegralAP &Other) const { @@ -112,7 +112,10 @@ public: template static IntegralAP from(Integral I, unsigned BitWidth) { - APInt Copy = APInt(BitWidth, static_cast(I), InputSigned); + // TODO: Avoid implicit trunc? + // See https://github.com/llvm/llvm-project/issues/112510. 
+ APInt Copy = APInt(BitWidth, static_cast(I), InputSigned, + /*implicitTrunc=*/true); return IntegralAP(Copy); } diff --git a/clang/lib/CodeGen/CGVTT.cpp b/clang/lib/CodeGen/CGVTT.cpp index 20bd2c2fc2c6..989a07d09d50 100644 --- a/clang/lib/CodeGen/CGVTT.cpp +++ b/clang/lib/CodeGen/CGVTT.cpp @@ -85,8 +85,9 @@ CodeGenVTables::EmitVTTDefinition(llvm::GlobalVariable *VTT, cast(VTable->getValueType()) ->getElementType(AddressPoint.VTableIndex)); unsigned Offset = ComponentSize * AddressPoint.AddressPointIndex; - llvm::ConstantRange InRange(llvm::APInt(32, -Offset, true), - llvm::APInt(32, VTableSize - Offset, true)); + llvm::ConstantRange InRange( + llvm::APInt(32, (int)-Offset, true), + llvm::APInt(32, (int)(VTableSize - Offset), true)); llvm::Constant *Init = llvm::ConstantExpr::getGetElementPtr( VTable->getValueType(), VTable, Idxs, /*InBounds=*/true, InRange); diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index 3cc17ebaacd9..6c2a6f9ba66f 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -2099,8 +2099,9 @@ ItaniumCXXABI::getVTableAddressPoint(BaseSubobject Base, unsigned VTableSize = ComponentSize * Layout.getVTableSize(AddressPoint.VTableIndex); unsigned Offset = ComponentSize * AddressPoint.AddressPointIndex; - llvm::ConstantRange InRange(llvm::APInt(32, -Offset, true), - llvm::APInt(32, VTableSize - Offset, true)); + llvm::ConstantRange InRange( + llvm::APInt(32, (int)-Offset, true), + llvm::APInt(32, (int)(VTableSize - Offset), true)); return llvm::ConstantExpr::getGetElementPtr( VTable->getValueType(), VTable, Indices, /*InBounds=*/true, InRange); } diff --git a/clang/lib/Parse/ParseInit.cpp b/clang/lib/Parse/ParseInit.cpp index 0a9a359cdaf9..dd59cb23236d 100644 --- a/clang/lib/Parse/ParseInit.cpp +++ b/clang/lib/Parse/ParseInit.cpp @@ -436,9 +436,9 @@ ExprResult Parser::createEmbedExpr() { ASTContext &Context = Actions.getASTContext(); SourceLocation StartLoc = ConsumeAnnotationToken(); if (Data->BinaryData.size() == 1) { - Res = IntegerLiteral::Create(Context, - llvm::APInt(CHAR_BIT, Data->BinaryData.back()), - Context.UnsignedCharTy, StartLoc); + Res = IntegerLiteral::Create( + Context, llvm::APInt(CHAR_BIT, (unsigned char)Data->BinaryData.back()), + Context.UnsignedCharTy, StartLoc); } else { auto CreateStringLiteralFromStringRef = [&](StringRef Str, QualType Ty) { llvm::APSInt ArraySize = diff --git a/clang/lib/Sema/SemaExpr.cpp b/clang/lib/Sema/SemaExpr.cpp index 044f56f2af71..6807f44562f6 100644 --- a/clang/lib/Sema/SemaExpr.cpp +++ b/clang/lib/Sema/SemaExpr.cpp @@ -3598,9 +3598,10 @@ ExprResult Sema::ActOnCharacterConstant(const Token &Tok, Scope *UDLScope) { Lit, Tok.getLocation()); } -ExprResult Sema::ActOnIntegerConstant(SourceLocation Loc, uint64_t Val) { +ExprResult Sema::ActOnIntegerConstant(SourceLocation Loc, int64_t Val) { unsigned IntSize = Context.getTargetInfo().getIntWidth(); - return IntegerLiteral::Create(Context, llvm::APInt(IntSize, Val), + return IntegerLiteral::Create(Context, + llvm::APInt(IntSize, Val, /*isSigned=*/true), Context.IntTy, Loc); } diff --git a/clang/lib/Sema/SemaOpenMP.cpp b/clang/lib/Sema/SemaOpenMP.cpp index d3e696a79b94..0232745b3c19 100644 --- a/clang/lib/Sema/SemaOpenMP.cpp +++ b/clang/lib/Sema/SemaOpenMP.cpp @@ -5697,7 +5697,9 @@ StmtResult SemaOpenMP::ActOnOpenMPCanonicalLoop(Stmt *AStmt) { llvm_unreachable("unhandled unary increment operator"); } Step = IntegerLiteral::Create( - Ctx, llvm::APInt(Ctx.getIntWidth(LogicalTy), Direction), LogicalTy, {}); + 
Ctx, + llvm::APInt(Ctx.getIntWidth(LogicalTy), Direction, /*isSigned=*/true), + LogicalTy, {}); } else if (auto *IncBin = dyn_cast(Inc)) { if (IncBin->getOpcode() == BO_AddAssign) { Step = IncBin->getRHS(); diff --git a/lldb/source/Expression/DWARFExpression.cpp b/lldb/source/Expression/DWARFExpression.cpp index 97bcd4f7eec2..f92f25ed342a 100644 --- a/lldb/source/Expression/DWARFExpression.cpp +++ b/lldb/source/Expression/DWARFExpression.cpp @@ -860,10 +860,12 @@ llvm::Expected DWARFExpression::Evaluate( // TODO: Implement a real typed stack, and store the genericness of the value // there. auto to_generic = [&](auto v) { + // TODO: Avoid implicit trunc? + // See https://github.com/llvm/llvm-project/issues/112510. bool is_signed = std::is_signed::value; - return Scalar(llvm::APSInt( - llvm::APInt(8 * opcodes.GetAddressByteSize(), v, is_signed), - !is_signed)); + return Scalar(llvm::APSInt(llvm::APInt(8 * opcodes.GetAddressByteSize(), v, + is_signed, /*implicitTrunc=*/true), + !is_signed)); }; // The default kind is a memory location. This is updated by any diff --git a/llvm/include/llvm/ADT/APFixedPoint.h b/llvm/include/llvm/ADT/APFixedPoint.h index e4aa82d7a41c..70d7f325702c 100644 --- a/llvm/include/llvm/ADT/APFixedPoint.h +++ b/llvm/include/llvm/ADT/APFixedPoint.h @@ -168,7 +168,9 @@ public: } APFixedPoint(uint64_t Val, const FixedPointSemantics &Sema) - : APFixedPoint(APInt(Sema.getWidth(), Val, Sema.isSigned()), Sema) {} + : APFixedPoint(APInt(Sema.getWidth(), Val, Sema.isSigned(), + /*implicitTrunc=*/true), + Sema) {} // Zero initialization. APFixedPoint(const FixedPointSemantics &Sema) : APFixedPoint(0, Sema) {} diff --git a/llvm/lib/Analysis/ConstantFolding.cpp b/llvm/lib/Analysis/ConstantFolding.cpp index 3af4ae31ffe8..da0fd1f07c83 100644 --- a/llvm/lib/Analysis/ConstantFolding.cpp +++ b/llvm/lib/Analysis/ConstantFolding.cpp @@ -888,7 +888,8 @@ Constant *SymbolicallyEvaluateGEP(const GEPOperator *GEP, APInt Offset = APInt( BitWidth, DL.getIndexedOffsetInType( - SrcElemTy, ArrayRef((Value *const *)Ops.data() + 1, Ops.size() - 1))); + SrcElemTy, ArrayRef((Value *const *)Ops.data() + 1, Ops.size() - 1)), + /*isSigned=*/true, /*implicitTrunc=*/true); std::optional InRange = GEP->getInRange(); if (InRange) diff --git a/llvm/lib/Analysis/Loads.cpp b/llvm/lib/Analysis/Loads.cpp index f4b202791a70..820b8e96c1d3 100644 --- a/llvm/lib/Analysis/Loads.cpp +++ b/llvm/lib/Analysis/Loads.cpp @@ -95,10 +95,8 @@ static bool isDereferenceableAndAlignedPointer( auto IsKnownDeref = [&]() { bool CheckForNonNull, CheckForFreed; - APInt KnownDerefBytes(Size.getBitWidth(), - V->getPointerDereferenceableBytes(DL, CheckForNonNull, - CheckForFreed)); - if (!KnownDerefBytes.getBoolValue() || !KnownDerefBytes.uge(Size) || + if (!Size.ule(V->getPointerDereferenceableBytes(DL, CheckForNonNull, + CheckForFreed)) || CheckForFreed) return false; if (CheckForNonNull && diff --git a/llvm/lib/Analysis/MemoryBuiltins.cpp b/llvm/lib/Analysis/MemoryBuiltins.cpp index e1abf5e4d885..dc2dc4c1733b 100644 --- a/llvm/lib/Analysis/MemoryBuiltins.cpp +++ b/llvm/lib/Analysis/MemoryBuiltins.cpp @@ -767,6 +767,8 @@ SizeOffsetAPInt ObjectSizeOffsetVisitor::visitAllocaInst(AllocaInst &I) { TypeSize ElemSize = DL.getTypeAllocSize(I.getAllocatedType()); if (ElemSize.isScalable() && Options.EvalMode != ObjectSizeOpts::Mode::Min) return ObjectSizeOffsetVisitor::unknown(); + if (!isUIntN(IntTyBits, ElemSize.getKnownMinValue())) + return ObjectSizeOffsetVisitor::unknown(); APInt Size(IntTyBits, ElemSize.getKnownMinValue()); if 
(!I.isArrayAllocation()) return SizeOffsetAPInt(align(Size, I.getAlign()), Zero); diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index a3ba8e037819..3d028ab752f2 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -6883,7 +6883,7 @@ const ConstantRange &ScalarEvolution::getRangeRef( bool CanBeNull, CanBeFreed; uint64_t DerefBytes = V->getPointerDereferenceableBytes(DL, CanBeNull, CanBeFreed); - if (DerefBytes > 1) { + if (DerefBytes > 1 && isUIntN(BitWidth, DerefBytes)) { // The highest address the object can start is DerefBytes bytes before // the end (unsigned max value). If this value is not a multiple of the // alignment, the last possible start value is the next lowest multiple diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index e09532dc4735..182c5808f8ca 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -876,7 +876,8 @@ private: } else { int64_t Start = BitcodeReader::decodeSignRotatedValue(Record[OpNum++]); int64_t End = BitcodeReader::decodeSignRotatedValue(Record[OpNum++]); - return ConstantRange(APInt(BitWidth, Start), APInt(BitWidth, End)); + return ConstantRange(APInt(BitWidth, Start, true), + APInt(BitWidth, End, true)); } } diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp index d63ed7ecf023..66c078b1d35b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAG.cpp @@ -1641,7 +1641,11 @@ SDValue SelectionDAG::getConstant(uint64_t Val, const SDLoc &DL, EVT VT, assert((EltVT.getSizeInBits() >= 64 || (uint64_t)((int64_t)Val >> EltVT.getSizeInBits()) + 1 < 2) && "getConstant with a uint64_t value that doesn't fit in the type!"); - return getConstant(APInt(EltVT.getSizeInBits(), Val), DL, VT, isT, isO); + // TODO: Avoid implicit trunc? + // See https://github.com/llvm/llvm-project/issues/112510. + return getConstant(APInt(EltVT.getSizeInBits(), Val, /*isSigned=*/false, + /*implicitTrunc=*/true), + DL, VT, isT, isO); } SDValue SelectionDAG::getConstant(const APInt &Val, const SDLoc &DL, EVT VT, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 9d8224749967..3e13364cf28a 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -4333,7 +4333,8 @@ void SelectionDAGBuilder::visitGetElementPtr(const User &I) { GTI.getSequentialElementStride(DAG.getDataLayout()); // We intentionally mask away the high bits here; ElementSize may not // fit in IdxTy. 
- APInt ElementMul(IdxSize, ElementSize.getKnownMinValue()); + APInt ElementMul(IdxSize, ElementSize.getKnownMinValue(), + /*isSigned=*/false, /*implicitTrunc=*/true); bool ElementScalable = ElementSize.isScalable(); // If this is a scalar constant or a splat vector of constants, diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp index 2a97580942df..9bd894dd952b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGISel.cpp @@ -2200,7 +2200,10 @@ ScheduleDAGSDNodes *SelectionDAGISel::CreateScheduler() { bool SelectionDAGISel::CheckAndMask(SDValue LHS, ConstantSDNode *RHS, int64_t DesiredMaskS) const { const APInt &ActualMask = RHS->getAPIntValue(); - const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS); + // TODO: Avoid implicit trunc? + // See https://github.com/llvm/llvm-project/issues/112510. + const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS, + /*isSigned=*/false, /*implicitTrunc=*/true); // If the actual mask exactly matches, success! if (ActualMask == DesiredMask) @@ -2229,7 +2232,10 @@ bool SelectionDAGISel::CheckAndMask(SDValue LHS, ConstantSDNode *RHS, bool SelectionDAGISel::CheckOrMask(SDValue LHS, ConstantSDNode *RHS, int64_t DesiredMaskS) const { const APInt &ActualMask = RHS->getAPIntValue(); - const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS); + // TODO: Avoid implicit trunc? + // See https://github.com/llvm/llvm-project/issues/112510. + const APInt &DesiredMask = APInt(LHS.getValueSizeInBits(), DesiredMaskS, + /*isSigned=*/false, /*implicitTrunc=*/true); // If the actual mask exactly matches, success! if (ActualMask == DesiredMask) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 40f030d7b936..4f42ed2ee701 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -6813,7 +6813,9 @@ TargetLowering::prepareUREMEqFold(EVT SETCCVT, SDValue REMNode, PAmts.push_back(DAG.getConstant(P, DL, SVT)); KAmts.push_back( - DAG.getConstant(APInt(ShSVT.getSizeInBits(), K), DL, ShSVT)); + DAG.getConstant(APInt(ShSVT.getSizeInBits(), K, /*isSigned=*/false, + /*implicitTrunc=*/true), + DL, ShSVT)); QAmts.push_back(DAG.getConstant(Q, DL, SVT)); return true; }; @@ -7084,7 +7086,9 @@ TargetLowering::prepareSREMEqFold(EVT SETCCVT, SDValue REMNode, PAmts.push_back(DAG.getConstant(P, DL, SVT)); AAmts.push_back(DAG.getConstant(A, DL, SVT)); KAmts.push_back( - DAG.getConstant(APInt(ShSVT.getSizeInBits(), K), DL, ShSVT)); + DAG.getConstant(APInt(ShSVT.getSizeInBits(), K, /*isSigned=*/false, + /*implicitTrunc=*/true), + DL, ShSVT)); QAmts.push_back(DAG.getConstant(Q, DL, SVT)); return true; }; diff --git a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp index 4cce4a77b343..e3b7db2380bb 100644 --- a/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp +++ b/llvm/lib/ExecutionEngine/MCJIT/MCJIT.cpp @@ -588,7 +588,7 @@ GenericValue MCJIT::runFunction(Function *F, ArrayRef ArgValues) { return rv; } case Type::VoidTyID: - rv.IntVal = APInt(32, ((int(*)())(intptr_t)FPtr)()); + rv.IntVal = APInt(32, ((int (*)())(intptr_t)FPtr)(), true); return rv; case Type::FloatTyID: rv.FloatVal = ((float(*)())(intptr_t)FPtr)(); diff --git a/llvm/lib/IR/Constants.cpp b/llvm/lib/IR/Constants.cpp index fe3a086c5772..7ae397871bde 100644 --- a/llvm/lib/IR/Constants.cpp +++ 
b/llvm/lib/IR/Constants.cpp @@ -932,7 +932,10 @@ Constant *ConstantInt::get(Type *Ty, uint64_t V, bool isSigned) { } ConstantInt *ConstantInt::get(IntegerType *Ty, uint64_t V, bool isSigned) { - return get(Ty->getContext(), APInt(Ty->getBitWidth(), V, isSigned)); + // TODO: Avoid implicit trunc? + // See https://github.com/llvm/llvm-project/issues/112510. + return get(Ty->getContext(), + APInt(Ty->getBitWidth(), V, isSigned, /*implicitTrunc=*/true)); } Constant *ConstantInt::get(Type *Ty, const APInt& V) { diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index 60150c3328aa..5e5afdb7fa0a 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -2400,10 +2400,11 @@ void AArch64TargetLowering::computeKnownBitsForTargetNode( } case AArch64ISD::BICi: { // Compute the bit cleared value. - uint64_t Mask = - ~(Op->getConstantOperandVal(1) << Op->getConstantOperandVal(2)); + APInt Mask = + ~(Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2)) + .trunc(Known.getBitWidth()); Known = DAG.computeKnownBits(Op->getOperand(0), Depth + 1); - Known &= KnownBits::makeConstant(APInt(Known.getBitWidth(), Mask)); + Known &= KnownBits::makeConstant(Mask); break; } case AArch64ISD::VLSHR: { @@ -12839,7 +12840,8 @@ static bool isEXTMask(ArrayRef M, EVT VT, bool &ReverseEXT, // Benefit form APInt to handle overflow when calculating expected element. unsigned NumElts = VT.getVectorNumElements(); unsigned MaskBits = APInt(32, NumElts * 2).logBase2(); - APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1); + APInt ExpectedElt = APInt(MaskBits, *FirstRealElt + 1, /*isSigned=*/false, + /*implicitTrunc=*/true); // The following shuffle indices must be the successive elements after the // first real element. bool FoundWrongElt = std::any_of(FirstRealElt + 1, M.end(), [&](int Elt) { @@ -14306,9 +14308,9 @@ static SDValue NormalizeBuildVector(SDValue Op, // (with operands cast to integers), then the only possibilities // are constants and UNDEFs. if (auto *CstLane = dyn_cast(Lane)) { - APInt LowBits(EltTy.getSizeInBits(), - CstLane->getZExtValue()); - Lane = DAG.getConstant(LowBits.getZExtValue(), dl, MVT::i32); + Lane = DAG.getConstant( + CstLane->getAPIntValue().trunc(EltTy.getSizeInBits()).getZExtValue(), + dl, MVT::i32); } else if (Lane.getNode()->isUndef()) { Lane = DAG.getUNDEF(MVT::i32); } else { @@ -23713,7 +23715,7 @@ static bool findMoreOptimalIndexType(const MaskedGatherScatterSDNode *N, EVT NewIndexVT = IndexVT.changeVectorElementType(MVT::i32); // Stride does not scale explicitly by 'Scale', because it happens in // the gather/scatter addressing mode. 
- Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride)); + Index = DAG.getStepVector(SDLoc(N), NewIndexVT, APInt(32, Stride, true)); return true; } @@ -28727,7 +28729,7 @@ static SDValue GenerateFixedLengthSVETBL(SDValue Op, SDValue Op1, SDValue Op2, unsigned BitsPerElt = VTOp1.getVectorElementType().getSizeInBits(); unsigned IndexLen = MinSVESize / BitsPerElt; unsigned ElementsPerVectorReg = VTOp1.getVectorNumElements(); - uint64_t MaxOffset = APInt(BitsPerElt, -1, false).getZExtValue(); + uint64_t MaxOffset = maxUIntN(BitsPerElt); EVT MaskEltType = VTOp1.getVectorElementType().changeTypeToInteger(); EVT MaskType = EVT::getVectorVT(*DAG.getContext(), MaskEltType, IndexLen); bool MinMaxEqual = (MinSVESize == MaxSVESize); @@ -29085,16 +29087,14 @@ bool AArch64TargetLowering::SimplifyDemandedBitsForTargetNode( KnownBits KnownOp0 = TLO.DAG.computeKnownBits(Op0, OriginalDemandedElts, Depth + 1); // Op0 &= ~(ConstantOperandVal(1) << ConstantOperandVal(2)) - uint64_t BitsToClear = Op->getConstantOperandVal(1) - << Op->getConstantOperandVal(2); + APInt BitsToClear = + (Op->getConstantOperandAPInt(1) << Op->getConstantOperandAPInt(2)) + .trunc(KnownOp0.getBitWidth()); APInt AlreadyZeroedBitsToClear = BitsToClear & KnownOp0.Zero; - if (APInt(Known.getBitWidth(), BitsToClear) - .isSubsetOf(AlreadyZeroedBitsToClear)) + if (BitsToClear.isSubsetOf(AlreadyZeroedBitsToClear)) return TLO.CombineTo(Op, Op0); - Known = KnownOp0 & - KnownBits::makeConstant(APInt(Known.getBitWidth(), ~BitsToClear)); - + Known = KnownOp0 & KnownBits::makeConstant(~BitsToClear); return false; } case ISD::INTRINSIC_WO_CHAIN: { diff --git a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp index abd6c7cce53c..76c1ea4e7420 100644 --- a/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp +++ b/llvm/lib/Target/AMDGPU/SIInstrInfo.cpp @@ -3441,7 +3441,8 @@ bool SIInstrInfo::foldImmediate(MachineInstr &UseMI, MachineInstr &DefMI, : AMDGPU::V_MOV_B32_e32 : Is64Bit ? AMDGPU::S_MOV_B64_IMM_PSEUDO : AMDGPU::S_MOV_B32; - APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1))); + APInt Imm(Is64Bit ? 64 : 32, getImmFor(UseMI.getOperand(1)), + /*isSigned=*/true, /*implicitTrunc=*/true); if (RI.isAGPR(*MRI, DstReg)) { if (Is64Bit || !isInlineConstant(Imm)) diff --git a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp index 78267d402b6c..f0b0e378ad66 100644 --- a/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp +++ b/llvm/lib/Target/AMDGPU/SIShrinkInstructions.cpp @@ -213,12 +213,12 @@ static unsigned canModifyToInlineImmOp32(const SIInstrInfo *TII, // that SCC is not live as S_NOT_B32 clobbers it. It's probably not worth // it, as the reasonable values are already covered by s_movk_i32. ModifiedImm = ~SrcImm; - if (TII->isInlineConstant(APInt(32, ModifiedImm))) + if (TII->isInlineConstant(APInt(32, ModifiedImm, true))) return AMDGPU::V_NOT_B32_e32; } ModifiedImm = reverseBits(SrcImm); - if (TII->isInlineConstant(APInt(32, ModifiedImm))) + if (TII->isInlineConstant(APInt(32, ModifiedImm, true))) return Scalar ? 
AMDGPU::S_BREV_B32 : AMDGPU::V_BFREV_B32_e32; return 0; diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index b908e4f367e1..0ce5f466bad2 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -1158,7 +1158,8 @@ public: bool isFPImm() const { if (!isImm()) return false; const MCConstantExpr *CE = dyn_cast(getImm()); - if (!CE) return false; + if (!CE || !isUInt<32>(CE->getValue())) + return false; int Val = ARM_AM::getFP32Imm(APInt(32, CE->getValue())); return Val != -1; } diff --git a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp index dae316ccb5e9..f68444c0b8d4 100644 --- a/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp +++ b/llvm/lib/Target/Hexagon/HexagonConstPropagation.cpp @@ -2503,7 +2503,8 @@ APInt HexagonConstEvaluator::getCmpImm(unsigned Opc, unsigned OpX, } uint64_t Val = MO.getImm(); - return APInt(32, Val, Signed); + // TODO: Is implicitTrunc correct here? + return APInt(32, Val, Signed, /*implicitTrunc=*/true); } void HexagonConstEvaluator::replaceWithNop(MachineInstr &MI) { diff --git a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp index 65bbb1364488..b16ab3931b28 100644 --- a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp +++ b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp @@ -171,7 +171,7 @@ bool HexagonGenExtract::convert(Instruction *In) { // this value. if (!LogicalSR && (SR > SL)) return false; - APInt A = APInt(BW, ~0ULL).lshr(SR).shl(SL); + APInt A = APInt(BW, ~0ULL, true).lshr(SR).shl(SL); CM = ConstantInt::get(Ctx, A); } diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp index 14249e34921e..fa14a203913e 100644 --- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp @@ -3446,7 +3446,9 @@ static std::optional isSimpleVIDSequence(SDValue Op, if (!Elt) continue; APInt ExpectedVal = - (APInt(EltSizeInBits, Idx) * *SeqStepNum).sdiv(*SeqStepDenom); + (APInt(EltSizeInBits, Idx, /*isSigned=*/false, /*implicitTrunc=*/true) * + *SeqStepNum) + .sdiv(*SeqStepDenom); APInt Addend = *Elt - ExpectedVal; if (!SeqAddend) diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index 0155409dfda0..aa6e75cbf410 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -52745,8 +52745,8 @@ static SDValue combineFMulcFCMulc(SDNode *N, SelectionDAG &DAG, if (XOR->getOpcode() == ISD::XOR && XOR.hasOneUse()) { KnownBits XORRHS = DAG.computeKnownBits(XOR.getOperand(1)); if (XORRHS.isConstant()) { - APInt ConjugationInt32 = APInt(32, 0x80000000, true); - APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL, true); + APInt ConjugationInt32 = APInt(32, 0x80000000); + APInt ConjugationInt64 = APInt(64, 0x8000000080000000ULL); if ((XORRHS.getBitWidth() == 32 && XORRHS.getConstant() == ConjugationInt32) || (XORRHS.getBitWidth() == 64 && @@ -52785,7 +52785,7 @@ static SDValue combineFaddCFmul(SDNode *N, SelectionDAG &DAG, Flags.hasNoSignedZeros(); }; auto IsVectorAllNegativeZero = [&DAG](SDValue Op) { - APInt AI = APInt(32, 0x80008000, true); + APInt AI = APInt(32, 0x80008000); KnownBits Bits = DAG.computeKnownBits(Op); return Bits.getBitWidth() == 32 && Bits.isConstant() && Bits.getConstant() == AI; diff --git a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp 
b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp index afc13232ff19..ac3f2bab5b09 100644 --- a/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp +++ b/llvm/lib/Transforms/IPO/ArgumentPromotion.cpp @@ -101,7 +101,8 @@ using OffsetAndArgPart = std::pair; static Value *createByteGEP(IRBuilderBase &IRB, const DataLayout &DL, Value *Ptr, Type *ResElemTy, int64_t Offset) { if (Offset != 0) { - APInt APOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset); + APInt APOffset(DL.getIndexTypeSizeInBits(Ptr->getType()), Offset, + /*isSigned=*/true); Ptr = IRB.CreatePtrAdd(Ptr, IRB.getInt(APOffset)); } return Ptr; diff --git a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp index 566ae2cf1936..72228b445a8b 100644 --- a/llvm/lib/Transforms/Utils/SimplifyCFG.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyCFG.cpp @@ -7243,7 +7243,7 @@ static bool reduceSwitchRange(SwitchInst *SI, IRBuilder<> &Builder, for (auto Case : SI->cases()) { auto *Orig = Case.getCaseValue(); - auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base); + auto Sub = Orig->getValue() - APInt(Ty->getBitWidth(), Base, true); Case.setValue(cast(ConstantInt::get(Ty, Sub.lshr(Shift)))); } return true; diff --git a/llvm/unittests/ADT/APFixedPointTest.cpp b/llvm/unittests/ADT/APFixedPointTest.cpp index e7aa58a83257..b71c5e16a915 100644 --- a/llvm/unittests/ADT/APFixedPointTest.cpp +++ b/llvm/unittests/ADT/APFixedPointTest.cpp @@ -240,19 +240,20 @@ void CheckIntPart(const FixedPointSemantics &Sema, int64_t IntPart) { APFixedPoint ValWithFract( APInt(Sema.getWidth(), relativeShr(IntPart, Sema.getLsbWeight()) + FullFactPart, - Sema.isSigned()), + Sema.isSigned(), /*implicitTrunc=*/true), Sema); ASSERT_EQ(ValWithFract.getIntPart(), IntPart); // Just fraction - APFixedPoint JustFract(APInt(Sema.getWidth(), FullFactPart, Sema.isSigned()), + APFixedPoint JustFract(APInt(Sema.getWidth(), FullFactPart, Sema.isSigned(), + /*implicitTrunc=*/true), Sema); ASSERT_EQ(JustFract.getIntPart(), 0); // Whole number APFixedPoint WholeNum(APInt(Sema.getWidth(), relativeShr(IntPart, Sema.getLsbWeight()), - Sema.isSigned()), + Sema.isSigned(), /*implicitTrunc=*/true), Sema); ASSERT_EQ(WholeNum.getIntPart(), IntPart); @@ -260,7 +261,7 @@ void CheckIntPart(const FixedPointSemantics &Sema, int64_t IntPart) { if (Sema.isSigned()) { APFixedPoint Negative(APInt(Sema.getWidth(), relativeShr(IntPart, Sema.getLsbWeight()), - Sema.isSigned()), + Sema.isSigned(), /*implicitTrunc=*/true), Sema); ASSERT_EQ(Negative.getIntPart(), IntPart); } -- GitLab From 267be4a7f4ac69cfd1bec5223554bbe400c5636c Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 17 Oct 2024 09:03:39 +0200 Subject: [PATCH 203/329] [MLIR] Reference issue for implicit trunc TODOs (NFC) --- mlir/include/mlir/IR/BuiltinAttributes.td | 1 + mlir/lib/IR/Builders.cpp | 1 + 2 files changed, 2 insertions(+) diff --git a/mlir/include/mlir/IR/BuiltinAttributes.td b/mlir/include/mlir/IR/BuiltinAttributes.td index 530ba7d2f11e..492b8309a5ea 100644 --- a/mlir/include/mlir/IR/BuiltinAttributes.td +++ b/mlir/include/mlir/IR/BuiltinAttributes.td @@ -702,6 +702,7 @@ def Builtin_IntegerAttr : Builtin_Attr<"Integer", "integer", } // TODO: Avoid implicit trunc? + // See https://github.com/llvm/llvm-project/issues/112510. 
IntegerType intTy = ::llvm::cast(type); APInt apValue(intTy.getWidth(), value, intTy.isSignedInteger(), /*implicitTrunc=*/true); diff --git a/mlir/lib/IR/Builders.cpp b/mlir/lib/IR/Builders.cpp index 5359432a04f3..5397fbabc5c9 100644 --- a/mlir/lib/IR/Builders.cpp +++ b/mlir/lib/IR/Builders.cpp @@ -269,6 +269,7 @@ IntegerAttr Builder::getIntegerAttr(Type type, int64_t value) { if (type.isIndex()) return IntegerAttr::get(type, APInt(64, value)); // TODO: Avoid implicit trunc? + // See https://github.com/llvm/llvm-project/issues/112510. return IntegerAttr::get(type, APInt(type.getIntOrFloatBitWidth(), value, type.isSignedInteger(), /*implicitTrunc=*/true)); -- GitLab From 1cc5290a30a0d6dffeb2e0f475558fcf3ded8e1f Mon Sep 17 00:00:00 2001 From: Stanislav Mekhanoshin Date: Thu, 17 Oct 2024 00:47:19 -0700 Subject: [PATCH 204/329] [AMDGPU] Factor out getNumUsedPhysRegs(). NFC. (#112624) I will need it from one more place. --- .../AMDGPU/AMDGPUResourceUsageAnalysis.cpp | 42 ++----------------- llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp | 9 ++++ llvm/lib/Target/AMDGPU/SIRegisterInfo.h | 5 +++ 3 files changed, 18 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp index 1ee3c40d69a3..9087442caf8d 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPUResourceUsageAnalysis.cpp @@ -146,44 +146,10 @@ AMDGPUResourceUsageAnalysis::analyzeResourceUsage( // count easily. // A tail call isn't considered a call for MachineFrameInfo's purposes. if (!FrameInfo.hasCalls() && !FrameInfo.hasTailCall()) { - MCPhysReg HighestVGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::VGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestVGPRReg = Reg; - break; - } - } - - if (ST.hasMAIInsts()) { - MCPhysReg HighestAGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::AGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestAGPRReg = Reg; - break; - } - } - Info.NumAGPR = HighestAGPRReg == AMDGPU::NoRegister - ? 0 - : TRI.getHWRegIndex(HighestAGPRReg) + 1; - } - - MCPhysReg HighestSGPRReg = AMDGPU::NoRegister; - for (MCPhysReg Reg : reverse(AMDGPU::SGPR_32RegClass.getRegisters())) { - if (MRI.isPhysRegUsed(Reg)) { - HighestSGPRReg = Reg; - break; - } - } - - // We found the maximum register index. They start at 0, so add one to get - // the number of registers. - Info.NumVGPR = HighestVGPRReg == AMDGPU::NoRegister - ? 0 - : TRI.getHWRegIndex(HighestVGPRReg) + 1; - Info.NumExplicitSGPR = HighestSGPRReg == AMDGPU::NoRegister - ? 
0
-                             : TRI.getHWRegIndex(HighestSGPRReg) + 1;
-
+  Info.NumVGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::VGPR_32RegClass);
+  Info.NumExplicitSGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::SGPR_32RegClass);
+  if (ST.hasMAIInsts())
+    Info.NumAGPR = TRI.getNumUsedPhysRegs(MRI, AMDGPU::AGPR_32RegClass);
   return Info;
 }
 
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
index de9cbe403ab6..8de16974a3e7 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.cpp
@@ -3851,3 +3851,12 @@ SIRegisterInfo::getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
   }
   return 0;
 }
+
+unsigned
+SIRegisterInfo::getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
+                                   const TargetRegisterClass &RC) const {
+  for (MCPhysReg Reg : reverse(RC.getRegisters()))
+    if (MRI.isPhysRegUsed(Reg))
+      return getHWRegIndex(Reg) + 1;
+  return 0;
+}
diff --git a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
index 99fa632c0300..e12a41371c7f 100644
--- a/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
+++ b/llvm/lib/Target/AMDGPU/SIRegisterInfo.h
@@ -457,6 +457,11 @@ public:
   // No check if the subreg is supported by the current RC is made.
   unsigned getSubRegAlignmentNumBits(const TargetRegisterClass *RC,
                                      unsigned SubReg) const;
+
+  // \returns the number of registers of a given \p RC used in a function.
+  // Does not go inside function calls.
+  unsigned getNumUsedPhysRegs(const MachineRegisterInfo &MRI,
+                              const TargetRegisterClass &RC) const;
 };
 
 namespace AMDGPU {
--
GitLab

From cb9bacf57d5c58eba28a76fd07ea2d4f9a0da847 Mon Sep 17 00:00:00 2001
From: Paschalis Mpeis
Date: Thu, 17 Oct 2024 10:59:05 +0300
Subject: [PATCH 205/329] [AArch64][BOLT] Ensure tentative code layout for
 cold BBs runs. (#96609)

When split functions is used, BOLT may skip tentative code layout
estimation in some cases, like:
- when there is no profile data for some blocks (i.e., cold blocks)
- when there are cold functions in lite mode
- when skip functions is used

However, when rewriting the binary we still need to compute PC-relative
distances between hot and cold basic blocks. Without cold layout
estimation, BOLT uses '0x0' as the address of the first cold block,
leading to incorrect estimates of any PC-relative addresses.

This affects large binaries: the relaxStub method expands more branches
than necessary into the short-jump sequence, as it wrongly believes it
has exceeded the branch distance boundary. This increases code size
with a sequence that is both larger and slower; however, the
performance regression is expected to be minimal, since this only
affects cold code that is actually called.
Example of such an unnecessary relaxation: from: ```armasm b .Ltmp1234 ``` to: ```armasm adrp x16, .Ltmp1234 add x16, x16, :lo12:.Ltmp1234 br x16 ``` --- bolt/lib/Passes/LongJmp.cpp | 23 ++++++++++++++--------- bolt/test/AArch64/split-funcs-lite.s | 27 +++++++++++++++++++++++++++ 2 files changed, 41 insertions(+), 9 deletions(-) create mode 100644 bolt/test/AArch64/split-funcs-lite.s diff --git a/bolt/lib/Passes/LongJmp.cpp b/bolt/lib/Passes/LongJmp.cpp index c483f70a836e..0b2d00300f46 100644 --- a/bolt/lib/Passes/LongJmp.cpp +++ b/bolt/lib/Passes/LongJmp.cpp @@ -324,9 +324,8 @@ uint64_t LongJmpPass::tentativeLayoutRelocColdPart( uint64_t LongJmpPass::tentativeLayoutRelocMode( const BinaryContext &BC, std::vector &SortedFunctions, uint64_t DotAddress) { - // Compute hot cold frontier - uint32_t LastHotIndex = -1u; + int64_t LastHotIndex = -1u; uint32_t CurrentIndex = 0; if (opts::HotFunctionsAtEnd) { for (BinaryFunction *BF : SortedFunctions) { @@ -351,19 +350,20 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode( // Hot CurrentIndex = 0; bool ColdLayoutDone = false; + auto runColdLayout = [&]() { + DotAddress = tentativeLayoutRelocColdPart(BC, SortedFunctions, DotAddress); + ColdLayoutDone = true; + if (opts::HotFunctionsAtEnd) + DotAddress = alignTo(DotAddress, opts::AlignText); + }; for (BinaryFunction *Func : SortedFunctions) { if (!BC.shouldEmit(*Func)) { HotAddresses[Func] = Func->getAddress(); continue; } - if (!ColdLayoutDone && CurrentIndex >= LastHotIndex) { - DotAddress = - tentativeLayoutRelocColdPart(BC, SortedFunctions, DotAddress); - ColdLayoutDone = true; - if (opts::HotFunctionsAtEnd) - DotAddress = alignTo(DotAddress, opts::AlignText); - } + if (!ColdLayoutDone && CurrentIndex >= LastHotIndex) + runColdLayout(); DotAddress = alignTo(DotAddress, Func->getMinAlignment()); uint64_t Pad = @@ -382,6 +382,11 @@ uint64_t LongJmpPass::tentativeLayoutRelocMode( DotAddress += Func->estimateConstantIslandSize(); ++CurrentIndex; } + + // Ensure that tentative code layout always runs for cold blocks. + if (!ColdLayoutDone) + runColdLayout(); + // BBs for (BinaryFunction *Func : SortedFunctions) tentativeBBLayout(*Func); diff --git a/bolt/test/AArch64/split-funcs-lite.s b/bolt/test/AArch64/split-funcs-lite.s new file mode 100644 index 000000000000..5f95eea17ae7 --- /dev/null +++ b/bolt/test/AArch64/split-funcs-lite.s @@ -0,0 +1,27 @@ +# This test checks that tentative code layout for cold blocks always runs. +# It commonly happens when using lite mode with split functions. + +# REQUIRES: system-linux, asserts + +# RUN: %clang %cflags -o %t %s +# RUN: %clang %s %cflags -Wl,-q -o %t +# RUN: link_fdata --no-lbr %s %t %t.fdata +# RUN: llvm-bolt %t -o %t.bolt --data %t.fdata -split-functions \ +# RUN: -debug 2>&1 | FileCheck %s + + .text + .globl foo + .type foo, %function +foo: +.entry_bb: +# FDATA: 1 foo #.entry_bb# 10 + cmp x0, #0 + b.eq .Lcold_bb1 + ret +.Lcold_bb1: + ret + +## Force relocation mode. +.reloc 0, R_AARCH64_NONE + +# CHECK: foo{{.*}} cold tentative: {{.*}} -- GitLab From 9d5ceccbd909398babd1ab71d62b0b708bb066c0 Mon Sep 17 00:00:00 2001 From: Hans Wennborg Date: Thu, 17 Oct 2024 09:58:05 +0200 Subject: [PATCH 206/329] Speculatively un-XFAIL TestCases/Misc/Posix/ubsan_options.cpp on Darwin After https://github.com/llvm/llvm-project/pull/111497 the test started unexpectedly passing (https://crbug.com/373891811), probably because it does actually work but wasn't run when it lived in the Misc/Linux/ directory. 
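For reference, a sketch of the hook this test exercises: the UBSan runtime calls it at startup to pick up program-provided default options. The option string below is illustrative, not the one used by the test:

```cpp
// User-defined hook queried by the UBSan runtime at startup.
extern "C" const char *__ubsan_default_options() {
  return "print_stacktrace=1";  // illustrative option string
}
```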
--- compiler-rt/test/ubsan/TestCases/Misc/Posix/ubsan_options.cpp | 3 --- 1 file changed, 3 deletions(-) diff --git a/compiler-rt/test/ubsan/TestCases/Misc/Posix/ubsan_options.cpp b/compiler-rt/test/ubsan/TestCases/Misc/Posix/ubsan_options.cpp index 284b4ba0abe4..515102715a66 100644 --- a/compiler-rt/test/ubsan/TestCases/Misc/Posix/ubsan_options.cpp +++ b/compiler-rt/test/ubsan/TestCases/Misc/Posix/ubsan_options.cpp @@ -1,9 +1,6 @@ // RUN: %clangxx -fsanitize=integer -fsanitize-recover=integer %s -o %t // RUN: not %run %t 2>&1 | FileCheck %s -// __ubsan_default_options() doesn't work on Darwin. -// XFAIL: darwin - #include extern "C" const char *__ubsan_default_options() { -- GitLab From 4cda28c1ada702a08f6960eb4c93919187c1d4d1 Mon Sep 17 00:00:00 2001 From: Byoungchan Lee Date: Thu, 17 Oct 2024 17:16:07 +0900 Subject: [PATCH 207/329] [clang-include-cleaner] Fix incorrect directory issue for writing files (#111375) If the current working directory of `clang-include-cleaner` differs from the directory of the input files specified in the compilation database, it doesn't adjust the input file paths properly. As a result, `clang-include-cleaner` either writes files to the wrong directory or fails to write files altogether. This pull request fixes the issue by adjusting the input file paths based on the directory specified in the compilation database. If that directory is not writable, `clang-include-cleaner` will write the output relative to the current working directory. Fixes #110843. --- .../include-cleaner/test/tool.cpp | 10 +++ .../include-cleaner/tool/IncludeCleaner.cpp | 70 +++++++++++++++++-- 2 files changed, 73 insertions(+), 7 deletions(-) diff --git a/clang-tools-extra/include-cleaner/test/tool.cpp b/clang-tools-extra/include-cleaner/test/tool.cpp index 2155eec189d1..d72d2317ce2b 100644 --- a/clang-tools-extra/include-cleaner/test/tool.cpp +++ b/clang-tools-extra/include-cleaner/test/tool.cpp @@ -48,3 +48,13 @@ int x = foo(); // RUN: clang-include-cleaner -edit --ignore-headers="foobar\.h,foo\.h" %t.cpp -- -I%S/Inputs/ // RUN: FileCheck --match-full-lines --check-prefix=EDIT2 %s < %t.cpp // EDIT2-NOT: {{^}}#include "foo.h"{{$}} + +// RUN: rm -rf %t.dir && mkdir -p %t.dir +// RUN: cp %s %t.cpp +// RUN: echo "[{\"directory\":\"%t.dir\",\"file\":\"../%{t:stem}.tmp.cpp\",\"command\":\":clang++ -I%S/Inputs/ ../%{t:stem}.tmp.cpp\"}]" | sed -e 's/\\/\\\\/g' > %t.dir/compile_commands.json +// RUN: pushd %t.dir +// RUN: clang-include-cleaner -p %{t:stem}.tmp.dir -edit ../%{t:stem}.tmp.cpp +// RUN: popd +// RUN: FileCheck --match-full-lines --check-prefix=EDIT3 %s < %t.cpp +// EDIT3: #include "foo.h" +// EDIT3-NOT: {{^}}#include "foobar.h"{{$}} diff --git a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp index 080099adc9a0..6bd9c40c7075 100644 --- a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp +++ b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp @@ -173,9 +173,11 @@ private: if (!HTMLReportPath.empty()) writeHTML(); - llvm::StringRef Path = - SM.getFileEntryRefForID(SM.getMainFileID())->getName(); - assert(!Path.empty() && "Main file path not known?"); + // Source File's path of compiler invocation, converted to absolute path. 
+ llvm::SmallString<256> AbsPath( + SM.getFileEntryRefForID(SM.getMainFileID())->getName()); + assert(!AbsPath.empty() && "Main file path not known?"); + SM.getFileManager().makeAbsolutePath(AbsPath); llvm::StringRef Code = SM.getBufferData(SM.getMainFileID()); auto Results = @@ -185,7 +187,7 @@ private: Results.Missing.clear(); if (!Remove) Results.Unused.clear(); - std::string Final = fixIncludes(Results, Path, Code, getStyle(Path)); + std::string Final = fixIncludes(Results, AbsPath, Code, getStyle(AbsPath)); if (Print.getNumOccurrences()) { switch (Print) { @@ -202,7 +204,7 @@ private: } if (!Results.Missing.empty() || !Results.Unused.empty()) - EditedFiles.try_emplace(Path, Final); + EditedFiles.try_emplace(AbsPath, Final); } void writeHTML() { @@ -280,6 +282,48 @@ std::function headerFilter() { }; } +// Maps absolute path of each files of each compilation commands to the +// absolute path of the input file. +llvm::Expected> +mapInputsToAbsPaths(clang::tooling::CompilationDatabase &CDB, + llvm::IntrusiveRefCntPtr VFS, + const std::vector &Inputs) { + std::map CDBToAbsPaths; + // Factory.editedFiles()` will contain the final code, along with the + // path given in the compilation database. That path can be + // absolute or relative, and if it is relative, it is relative to the + // "Directory" field in the compilation database. We need to make it + // absolute to write the final code to the correct path. + for (auto &Source : Inputs) { + llvm::SmallString<256> AbsPath(Source); + if (auto Err = VFS->makeAbsolute(AbsPath)) { + llvm::errs() << "Failed to get absolute path for " << Source << " : " + << Err.message() << '\n'; + return std::move(llvm::errorCodeToError(Err)); + } + std::vector Cmds = + CDB.getCompileCommands(AbsPath); + if (Cmds.empty()) { + // It should be found in the compilation database, even user didn't + // specify the compilation database, the `FixedCompilationDatabase` will + // create an entry from the arguments. So it is an error if we can't + // find the compile commands. + std::string ErrorMsg = + llvm::formatv("No compile commands found for {0}", AbsPath).str(); + llvm::errs() << ErrorMsg << '\n'; + return llvm::make_error( + ErrorMsg, llvm::inconvertibleErrorCode()); + } + for (const auto &Cmd : Cmds) { + llvm::SmallString<256> CDBPath(Cmd.Filename); + std::string Directory(Cmd.Directory); + llvm::sys::fs::make_absolute(Cmd.Directory, CDBPath); + CDBToAbsPaths[std::string(CDBPath)] = std::string(AbsPath); + } + } + return CDBToAbsPaths; +} + } // namespace } // namespace include_cleaner } // namespace clang @@ -305,8 +349,16 @@ int main(int argc, const char **argv) { } } - clang::tooling::ClangTool Tool(OptionsParser->getCompilations(), - OptionsParser->getSourcePathList()); + auto VFS = llvm::vfs::getRealFileSystem(); + auto &CDB = OptionsParser->getCompilations(); + // CDBToAbsPaths is a map from the path in the compilation database to the + // writable absolute path of the file. 
+ auto CDBToAbsPaths = + mapInputsToAbsPaths(CDB, VFS, OptionsParser->getSourcePathList()); + if (!CDBToAbsPaths) + return 1; + + clang::tooling::ClangTool Tool(CDB, OptionsParser->getSourcePathList()); auto HeaderFilter = headerFilter(); if (!HeaderFilter) @@ -316,6 +368,10 @@ int main(int argc, const char **argv) { if (Edit) { for (const auto &NameAndContent : Factory.editedFiles()) { llvm::StringRef FileName = NameAndContent.first(); + if (auto It = CDBToAbsPaths->find(FileName.str()); + It != CDBToAbsPaths->end()) + FileName = It->second; + const std::string &FinalCode = NameAndContent.second; if (auto Err = llvm::writeToOutput( FileName, [&](llvm::raw_ostream &OS) -> llvm::Error { -- GitLab From cb43021e5726a4462f28a999fb66a8dc20dc354b Mon Sep 17 00:00:00 2001 From: CarolineConcatto Date: Thu, 17 Oct 2024 09:22:55 +0100 Subject: [PATCH 208/329] [CLANG]Add Scalable vectors for mfloat8_t (#101644) This patch adds these new vector sizes for sve: svmfloat8_t According to the ARM ACLE PR#323[1]. [1] ARM-software/acle#323 --- .../clang/Basic/AArch64SVEACLETypes.def | 3 ++ clang/include/clang/Basic/arm_sve_sme_incl.td | 1 + .../include/clang/Serialization/ASTBitCodes.h | 2 +- clang/lib/AST/Type.cpp | 1 + clang/test/AST/ast-dump-aarch64-sve-types.c | 3 ++ clang/test/CodeGen/aarch64-sve.c | 2 + .../CodeGenCXX/aarch64-mangle-sve-vectors.cpp | 4 ++ .../test/CodeGenCXX/aarch64-sve-typeinfo.cpp | 5 +++ .../CodeGenCXX/aarch64-sve-vector-init.cpp | 17 ++++++++ clang/test/CodeGenObjC/aarch64-sve-types.m | 3 ++ clang/test/Modules/no-external-type-id.cppm | 2 +- clang/test/Sema/aarch64-sve-types.c | 3 ++ clang/test/Sema/arm-mfp8.cpp | 13 ++++++ clang/test/SemaObjC/aarch64-sve-types.m | 2 + clang/utils/TableGen/SveEmitter.cpp | 41 +++++++++++++++---- 15 files changed, 92 insertions(+), 10 deletions(-) create mode 100644 clang/test/Sema/arm-mfp8.cpp diff --git a/clang/include/clang/Basic/AArch64SVEACLETypes.def b/clang/include/clang/Basic/AArch64SVEACLETypes.def index 55ed9c36f6c5..72df1e35aaec 100644 --- a/clang/include/clang/Basic/AArch64SVEACLETypes.def +++ b/clang/include/clang/Basic/AArch64SVEACLETypes.def @@ -115,6 +115,9 @@ SVE_VECTOR_TYPE_FLOAT("__SVFloat64_t", "__SVFloat64_t", SveFloat64, SveFloat64Ty SVE_VECTOR_TYPE_BFLOAT("__SVBfloat16_t", "__SVBfloat16_t", SveBFloat16, SveBFloat16Ty, 8, 16, 1) +// This is a 8 bits opaque type. +SVE_VECTOR_TYPE_INT("__SVMfloat8_t", "__SVMfloat8_t", SveMFloat8, SveMFloat8Ty, 16, 8, 1, false) + // // x2 // diff --git a/clang/include/clang/Basic/arm_sve_sme_incl.td b/clang/include/clang/Basic/arm_sve_sme_incl.td index fdf4ba55fe93..50911fb63e81 100644 --- a/clang/include/clang/Basic/arm_sve_sme_incl.td +++ b/clang/include/clang/Basic/arm_sve_sme_incl.td @@ -162,6 +162,7 @@ def EltTyBool16 : EltType<10>; def EltTyBool32 : EltType<11>; def EltTyBool64 : EltType<12>; def EltTyBFloat16 : EltType<13>; +def EltTyMFloat8 : EltType<14>; class MemEltType { int Value = val; diff --git a/clang/include/clang/Serialization/ASTBitCodes.h b/clang/include/clang/Serialization/ASTBitCodes.h index 4b79d4b77119..d735e2dcaa8c 100644 --- a/clang/include/clang/Serialization/ASTBitCodes.h +++ b/clang/include/clang/Serialization/ASTBitCodes.h @@ -1149,7 +1149,7 @@ enum PredefinedTypeIDs { /// /// Type IDs for non-predefined types will start at /// NUM_PREDEF_TYPE_IDs. -const unsigned NUM_PREDEF_TYPE_IDS = 505; +const unsigned NUM_PREDEF_TYPE_IDS = 506; // Ensure we do not overrun the predefined types we reserved // in the enum PredefinedTypeIDs above. 
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp index f013ed11d129..6f23a1a13d05 100644 --- a/clang/lib/AST/Type.cpp +++ b/clang/lib/AST/Type.cpp @@ -2525,6 +2525,7 @@ bool Type::isSveVLSBuiltinType() const { case BuiltinType::SveBool: case BuiltinType::SveBoolx2: case BuiltinType::SveBoolx4: + case BuiltinType::SveMFloat8: return true; default: return false; diff --git a/clang/test/AST/ast-dump-aarch64-sve-types.c b/clang/test/AST/ast-dump-aarch64-sve-types.c index b5a0b00b4928..386133e05b1d 100644 --- a/clang/test/AST/ast-dump-aarch64-sve-types.c +++ b/clang/test/AST/ast-dump-aarch64-sve-types.c @@ -45,6 +45,9 @@ // CHECK: TypedefDecl {{.*}} implicit __SVBfloat16_t '__SVBfloat16_t' // CHECK-NEXT: -BuiltinType {{.*}} '__SVBfloat16_t' +// CHECK: TypedefDecl {{.*}} implicit __SVMfloat8_t '__SVMfloat8_t' +// CHECK-NEXT: -BuiltinType {{.*}} '__SVMfloat8_t' + // CHECK: TypedefDecl {{.*}} implicit __SVBool_t '__SVBool_t' // CHECK-NEXT: -BuiltinType {{.*}} '__SVBool_t' diff --git a/clang/test/CodeGen/aarch64-sve.c b/clang/test/CodeGen/aarch64-sve.c index 5f6a0178aa44..690b010e967a 100644 --- a/clang/test/CodeGen/aarch64-sve.c +++ b/clang/test/CodeGen/aarch64-sve.c @@ -13,6 +13,7 @@ // CHECK: %f16 = alloca , align 16 // CHECK: %f32 = alloca , align 16 // CHECK: %f64 = alloca , align 16 +// CHECK: %mf8 = alloca , align 16 // CHECK: %bf16 = alloca , align 16 // CHECK: %b8 = alloca , align 2 @@ -33,6 +34,7 @@ void test_locals(void) { __SVFloat32_t f32; __SVFloat64_t f64; + __SVMfloat8_t mf8; __SVBfloat16_t bf16; __SVBool_t b8; diff --git a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp index dfe31ff2ce25..3f2b0622d551 100644 --- a/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp +++ b/clang/test/CodeGenCXX/aarch64-mangle-sve-vectors.cpp @@ -17,6 +17,7 @@ void f(__SVFloat16_t, __SVFloat16_t); void f(__SVFloat32_t, __SVFloat32_t); void f(__SVFloat64_t, __SVFloat64_t); void f(__SVBfloat16_t, __SVBfloat16_t); +void f(__SVMfloat8_t, __SVMfloat8_t); void f(__SVBool_t, __SVBool_t); void f(__SVCount_t, __SVCount_t); @@ -150,6 +151,7 @@ void f(__clang_svboolx4_t, __clang_svboolx4_t); // CHECK-NEXT: call void @_Z1fu13__SVFloat16_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu13__SVFloat32_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu13__SVFloat64_tS_( zeroinitializer, zeroinitializer) +// CHECK-NEXT: call void @_Z1fu13__SVMfloat8_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu14__SVBfloat16_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu10__SVBool_tS_( zeroinitializer, zeroinitializer) // CHECK-NEXT: call void @_Z1fu11__SVCount_tS_(target("aarch64.svcount") zeroinitializer, target("aarch64.svcount") zeroinitializer) @@ -664,6 +666,7 @@ void f(__clang_svboolx4_t, __clang_svboolx4_t); // COMPAT_17-NEXT: call void @_Z1fu13__SVFloat16_tu13__SVFloat16_t( zeroinitializer, zeroinitializer) // COMPAT_17-NEXT: call void @_Z1fu13__SVFloat32_tu13__SVFloat32_t( zeroinitializer, zeroinitializer) // COMPAT_17-NEXT: call void @_Z1fu13__SVFloat64_tu13__SVFloat64_t( zeroinitializer, zeroinitializer) +// COMPAT_17-NEXT: call void @_Z1fu13__SVMfloat8_tu13__SVMfloat8_t( zeroinitializer, zeroinitializer) // COMPAT_17-NEXT: call void @_Z1fu14__SVBFloat16_tu14__SVBFloat16_t( zeroinitializer, zeroinitializer) // COMPAT_17-NEXT: call void @_Z1fu10__SVBool_tu10__SVBool_t( zeroinitializer, zeroinitializer) // COMPAT_17-NEXT: call void 
@_Z1fu11__SVCount_tu11__SVCount_t(target("aarch64.svcount") zeroinitializer, target("aarch64.svcount") zeroinitializer) @@ -1100,6 +1103,7 @@ void foo() { f(__SVFloat16_t(), __SVFloat16_t()); f(__SVFloat32_t(), __SVFloat32_t()); f(__SVFloat64_t(), __SVFloat64_t()); + f(__SVMfloat8_t(), __SVMfloat8_t()); f(__SVBfloat16_t(), __SVBfloat16_t()); f(__SVBool_t(), __SVBool_t()); f(__SVCount_t(), __SVCount_t()); diff --git a/clang/test/CodeGenCXX/aarch64-sve-typeinfo.cpp b/clang/test/CodeGenCXX/aarch64-sve-typeinfo.cpp index 7f6b2a9caae6..beab9f9078a7 100644 --- a/clang/test/CodeGenCXX/aarch64-sve-typeinfo.cpp +++ b/clang/test/CodeGenCXX/aarch64-sve-typeinfo.cpp @@ -21,6 +21,8 @@ auto &f64 = typeid(__SVFloat64_t); auto &bf16 = typeid(__SVBfloat16_t); +auto &mf8 = typeid(__SVMfloat8_t); + auto &b8 = typeid(__SVBool_t); auto &c8 = typeid(__SVCount_t); @@ -60,6 +62,9 @@ auto &c8 = typeid(__SVCount_t); // CHECK-DAG: @_ZTSu14__SVBfloat16_t = {{.*}} c"u14__SVBfloat16_t\00" // CHECK-DAG: @_ZTIu14__SVBfloat16_t = {{.*}} @_ZTVN10__cxxabiv123__fundamental_type_infoE, {{.*}} @_ZTSu14__SVBfloat16_t +// CHECK-DAG: @_ZTSu13__SVMfloat8_t = {{.*}} c"u13__SVMfloat8_t\00" +// CHECK-DAG: @_ZTIu13__SVMfloat8_t = {{.*}} @_ZTVN10__cxxabiv123__fundamental_type_infoE, {{.*}} @_ZTSu13__SVMfloat8_t + // CHECK-DAG: @_ZTSu10__SVBool_t = {{.*}} c"u10__SVBool_t\00" // CHECK-DAG: @_ZTIu10__SVBool_t = {{.*}} @_ZTVN10__cxxabiv123__fundamental_type_infoE, {{.*}} @_ZTSu10__SVBool_t diff --git a/clang/test/CodeGenCXX/aarch64-sve-vector-init.cpp b/clang/test/CodeGenCXX/aarch64-sve-vector-init.cpp index 503d77a1822a..45cf8081eb3a 100644 --- a/clang/test/CodeGenCXX/aarch64-sve-vector-init.cpp +++ b/clang/test/CodeGenCXX/aarch64-sve-vector-init.cpp @@ -12,6 +12,7 @@ // CHECK-NEXT: [[U16:%.*]] = alloca , align 16 // CHECK-NEXT: [[U32:%.*]] = alloca , align 16 // CHECK-NEXT: [[U64:%.*]] = alloca , align 16 +// CHECK-NEXT: [[MF8:%.*]] = alloca , align 16 // CHECK-NEXT: [[F16:%.*]] = alloca , align 16 // CHECK-NEXT: [[F32:%.*]] = alloca , align 16 // CHECK-NEXT: [[F64:%.*]] = alloca , align 16 @@ -64,6 +65,7 @@ // CHECK-NEXT: store zeroinitializer, ptr [[U16]], align 16 // CHECK-NEXT: store zeroinitializer, ptr [[U32]], align 16 // CHECK-NEXT: store zeroinitializer, ptr [[U64]], align 16 +// CHECK-NEXT: store zeroinitializer, ptr [[MF8]], align 16 // CHECK-NEXT: store zeroinitializer, ptr [[F16]], align 16 // CHECK-NEXT: store zeroinitializer, ptr [[F32]], align 16 // CHECK-NEXT: store zeroinitializer, ptr [[F64]], align 16 @@ -119,6 +121,7 @@ void test_locals(void) { __SVUint16_t u16{}; __SVUint32_t u32{}; __SVUint64_t u64{}; + __SVMfloat8_t mf8{}; __SVFloat16_t f16{}; __SVFloat32_t f32{}; __SVFloat64_t f64{}; @@ -282,6 +285,20 @@ void test_copy_u64(__SVUint64_t a) { __SVUint64_t b{a}; } +// CHECK-LABEL: define dso_local void @_Z13test_copy_mf8u13__SVMfloat8_t +// CHECK-SAME: ( [[A:%.*]]) #[[ATTR0]] { +// CHECK-NEXT: entry: +// CHECK-NEXT: [[A_ADDR:%.*]] = alloca , align 16 +// CHECK-NEXT: [[B:%.*]] = alloca , align 16 +// CHECK-NEXT: store [[A]], ptr [[A_ADDR]], align 16 +// CHECK-NEXT: [[TMP0:%.*]] = load , ptr [[A_ADDR]], align 16 +// CHECK-NEXT: store [[TMP0]], ptr [[B]], align 16 +// CHECK-NEXT: ret void +// +void test_copy_mf8(__SVMfloat8_t a) { + __SVMfloat8_t b{a}; +} + // CHECK-LABEL: define dso_local void @_Z13test_copy_f16u13__SVFloat16_t // CHECK-SAME: ( [[A:%.*]]) #[[ATTR0]] { // CHECK-NEXT: entry: diff --git a/clang/test/CodeGenObjC/aarch64-sve-types.m b/clang/test/CodeGenObjC/aarch64-sve-types.m index 
eae734fa4d59..a97ce4b5bd39 100644 --- a/clang/test/CodeGenObjC/aarch64-sve-types.m +++ b/clang/test/CodeGenObjC/aarch64-sve-types.m @@ -31,5 +31,8 @@ const char f64[] = @encode(__SVFloat64_t); // CHECK: error: cannot yet @encode type __SVBfloat16_t const char bf16[] = @encode(__SVBfloat16_t); +// CHECK: error: cannot yet @encode type __SVMfloat8_t +const char mf8[] = @encode(__SVMfloat8_t); + // CHECK: error: cannot yet @encode type __SVBool_t const char b8[] = @encode(__SVBool_t); diff --git a/clang/test/Modules/no-external-type-id.cppm b/clang/test/Modules/no-external-type-id.cppm index 068e52646dcc..a4ca389739fb 100644 --- a/clang/test/Modules/no-external-type-id.cppm +++ b/clang/test/Modules/no-external-type-id.cppm @@ -23,7 +23,7 @@ export module b; import a; export int b(); -// CHECK: +void test_vector_sve(svmfloat8_t a, svuint8_t c) { + a + c; // sve-error {{cannot convert between vector type 'svuint8_t' (aka '__SVUint8_t') and vector type 'svmfloat8_t' (aka '__SVMfloat8_t') as implicit conversion would cause truncation}} + a - c; // sve-error {{cannot convert between vector type 'svuint8_t' (aka '__SVUint8_t') and vector type 'svmfloat8_t' (aka '__SVMfloat8_t') as implicit conversion would cause truncation}} + a * c; // sve-error {{cannot convert between vector type 'svuint8_t' (aka '__SVUint8_t') and vector type 'svmfloat8_t' (aka '__SVMfloat8_t') as implicit conversion would cause truncation}} + a / c; // sve-error {{cannot convert between vector type 'svuint8_t' (aka '__SVUint8_t') and vector type 'svmfloat8_t' (aka '__SVMfloat8_t') as implicit conversion would cause truncation}} +} + diff --git a/clang/test/SemaObjC/aarch64-sve-types.m b/clang/test/SemaObjC/aarch64-sve-types.m index b50f43cee76f..a45e02217667 100644 --- a/clang/test/SemaObjC/aarch64-sve-types.m +++ b/clang/test/SemaObjC/aarch64-sve-types.m @@ -20,5 +20,7 @@ @property(nullable) __SVBfloat16_t bf16; // expected-error {{cannot be applied to non-pointer type}} +@property(nullable) __SVMfloat8_t mf8; // expected-error {{cannot be applied to non-pointer type}} + @property(nullable) __SVBool_t b8; // expected-error {{cannot be applied to non-pointer type}} @end diff --git a/clang/utils/TableGen/SveEmitter.cpp b/clang/utils/TableGen/SveEmitter.cpp index 82bbd04f97b4..1d79cc71dd97 100644 --- a/clang/utils/TableGen/SveEmitter.cpp +++ b/clang/utils/TableGen/SveEmitter.cpp @@ -51,7 +51,7 @@ using TypeSpec = std::string; namespace { class SVEType { - bool Float, Signed, Immediate, Void, Constant, Pointer, BFloat; + bool Float, Signed, Immediate, Void, Constant, Pointer, BFloat, MFloat; bool DefaultType, IsScalable, Predicate, PredicatePattern, PrefetchOp, Svcount; unsigned Bitwidth, ElementBitwidth, NumVectors; @@ -61,10 +61,10 @@ public: SVEType(StringRef TS, char CharMod, unsigned NumVectors = 1) : Float(false), Signed(true), Immediate(false), Void(false), - Constant(false), Pointer(false), BFloat(false), DefaultType(false), - IsScalable(true), Predicate(false), PredicatePattern(false), - PrefetchOp(false), Svcount(false), Bitwidth(128), ElementBitwidth(~0U), - NumVectors(NumVectors) { + Constant(false), Pointer(false), BFloat(false), MFloat(false), + DefaultType(false), IsScalable(true), Predicate(false), + PredicatePattern(false), PrefetchOp(false), Svcount(false), + Bitwidth(128), ElementBitwidth(~0U), NumVectors(NumVectors) { if (!TS.empty()) applyTypespec(TS); applyModifier(CharMod); @@ -82,11 +82,14 @@ public: bool isVector() const { return NumVectors > 0; } bool isScalableVector() const { return isVector() && 
IsScalable; }
   bool isFixedLengthVector() const { return isVector() && !IsScalable; }
-  bool isChar() const { return ElementBitwidth == 8; }
+  bool isChar() const { return ElementBitwidth == 8 && !MFloat; }
   bool isVoid() const { return Void && !Pointer; }
   bool isDefault() const { return DefaultType; }
-  bool isFloat() const { return Float && !BFloat; }
-  bool isBFloat() const { return BFloat && !Float; }
+  bool isFloat() const { return Float && !BFloat && !MFloat; }
+  bool isBFloat() const { return BFloat && !Float && !MFloat; }
+  bool isMFloat() const {
+    return MFloat && !BFloat && !Float;
+  }
   bool isFloatingPoint() const { return Float || BFloat; }
   bool isInteger() const {
     return !isFloatingPoint() && !Predicate && !Svcount;
@@ -454,6 +457,9 @@ std::string SVEType::builtin_str() const {
   else if (isBFloat()) {
     assert(ElementBitwidth == 16 && "Not a valid BFloat.");
     S += "y";
+  } else if (isMFloat()) {
+    assert(ElementBitwidth == 8 && "Not a valid MFloat.");
+    S += "m";
   }
 
   if (!isFloatingPoint()) {
@@ -509,6 +515,8 @@ std::string SVEType::str() const {
       S += "bool";
     else if (isBFloat())
       S += "bfloat";
+    else if (isMFloat())
+      S += "mfloat";
     else
       S += "int";
 
@@ -572,8 +580,16 @@ void SVEType::applyTypespec(StringRef TS) {
     case 'b':
       BFloat = true;
      Float = false;
+      MFloat = false;
       ElementBitwidth = 16;
       break;
+    case 'm':
+      Signed = false;
+      MFloat = true;
+      Float = false;
+      BFloat = false;
+      ElementBitwidth = 8;
+      break;
     default:
      llvm_unreachable("Unhandled type code!");
    }
@@ -1037,6 +1053,8 @@ std::string Intrinsic::replaceTemplatedArgs(std::string Name, TypeSpec TS,
       TypeCode = 'b';
     else if (T.isBFloat())
       TypeCode = "bf";
+    else if (T.isMFloat())
+      TypeCode = "mfp";
     else
       TypeCode = 'f';
     Ret.replace(Pos, NumChars, TypeCode + utostr(T.getElementSizeInBits()));
@@ -1130,6 +1148,11 @@ uint64_t SVEEmitter::encodeTypeFlags(const SVEType &T) {
     return encodeEltType("EltTyBFloat16");
   }
 
+  if (T.isMFloat()) {
+    assert(T.getElementSizeInBits() == 8 && "Not a valid MFloat.");
+    return encodeEltType("EltTyMFloat8");
+  }
+
   if (T.isPredicateVector() || T.isSvcount()) {
     switch (T.getElementSizeInBits()) {
     case 8:
@@ -1305,6 +1328,8 @@ void SVEEmitter::createHeader(raw_ostream &OS) {
   OS << "#include \n";
   OS << "#include \n";
 
+  OS << "typedef __SVMfloat8_t svmfloat8_t;\n\n";
+
   OS << "typedef __SVFloat32_t svfloat32_t;\n";
   OS << "typedef __SVFloat64_t svfloat64_t;\n";
   OS << "typedef __clang_svint8x2_t svint8x2_t;\n";
-- 
GitLab

From e1f8f84acec05997893c305c78fbf7feecf44dd7 Mon Sep 17 00:00:00 2001
From: Oliver Stannard
Date: Thu, 17 Oct 2024 10:32:44 +0200
Subject: [PATCH 209/329] [ARM] Fix frame chains with M-profile PACBTI
 (#110285)

When using AAPCS-compliant frame chains with PACBTI return address
signing, there were a number of bugs in the generation of the frame
pointer and function prologues. The most obvious was that we sometimes
would modify r11 before pushing it to the stack, so it wasn't preserved
as required by the PCS. We also sometimes did not push R11 and LR
adjacent to one another on the stack, or used R11 as a frame pointer
without pointing it at the saved value of R11, both of which are
required to have an AAPCS-compliant frame chain.

The original work of this patch was done by James Westwood, reviewed as
#82801 and #81249, with some tidy-ups done by Mark Murray and myself.
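For illustration, a sketch of the prologue shape this change produces when an
AAPCS frame chain is combined with return-address signing. The register list
here is hypothetical (the exact callee-saves vary per function; see the
SplitR11AAPCSSignRA comments in the diff below):

```armasm
pac   r12, lr, sp      ; compute the return-address authentication code in r12
push  {r4-r10, r12}    ; GPRCS1: general callee-saves plus the PAC code
push  {r11, lr}        ; GPRCS2: r11 saved unmodified, adjacent to lr
mov   r11, sp          ; frame pointer points at the saved {r11, lr} pair
vpush {d8-d15}         ; DPRCS1: FP callee-saves sit below the frame record
```

Because r12 now carries the authentication code, it has to be spilled too,
which is why a separate push is needed to keep r11 and lr adjacent and form a
valid frame record.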
--- llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp | 5 +- llvm/lib/Target/ARM/ARMCallingConv.td | 19 ++- llvm/lib/Target/ARM/ARMFrameLowering.cpp | 145 +++++++++++------ llvm/lib/Target/ARM/ARMSubtarget.cpp | 7 + llvm/lib/Target/ARM/ARMSubtarget.h | 12 ++ .../CodeGen/Thumb2/pacbti-m-frame-chain.ll | 150 ++++++++++++++++++ 6 files changed, 281 insertions(+), 57 deletions(-) create mode 100644 llvm/test/CodeGen/Thumb2/pacbti-m-frame-chain.ll diff --git a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp index 3f28ce8ca4b5..aad305cce039 100644 --- a/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp +++ b/llvm/lib/Target/ARM/ARMBaseRegisterInfo.cpp @@ -116,9 +116,12 @@ ARMBaseRegisterInfo::getCalleeSavedRegs(const MachineFunction *MF) const { return CSR_iOS_SaveList; if (PushPopSplit == ARMSubtarget::SplitR7) - return STI.createAAPCSFrameChain() ? CSR_AAPCS_SplitPush_SaveList + return STI.createAAPCSFrameChain() ? CSR_AAPCS_SplitPush_R7_SaveList : CSR_ATPCS_SplitPush_SaveList; + if (PushPopSplit == ARMSubtarget::SplitR11AAPCSSignRA) + return CSR_AAPCS_SplitPush_R11_SaveList; + return CSR_AAPCS_SaveList; } diff --git a/llvm/lib/Target/ARM/ARMCallingConv.td b/llvm/lib/Target/ARM/ARMCallingConv.td index d14424c2deca..27f175a70033 100644 --- a/llvm/lib/Target/ARM/ARMCallingConv.td +++ b/llvm/lib/Target/ARM/ARMCallingConv.td @@ -301,14 +301,17 @@ def CSR_ATPCS_SplitPush_SwiftError : CalleeSavedRegs<(sub CSR_ATPCS_SplitPush, def CSR_ATPCS_SplitPush_SwiftTail : CalleeSavedRegs<(sub CSR_ATPCS_SplitPush, R10)>; -// When enforcing an AAPCS compliant frame chain, R11 is used as the frame -// pointer even for Thumb targets, where split pushes are necessary. -// This AAPCS alternative makes sure the frame index slots match the push -// order in that case. -def CSR_AAPCS_SplitPush : CalleeSavedRegs<(add LR, R11, - R7, R6, R5, R4, - R10, R9, R8, - (sequence "D%u", 15, 8))>; +// Sometimes we need to split the push of the callee-saved GPRs into two +// regions, to ensure that the frame chain record is set up correctly. These +// list the callee-saved registers in the order they end up on the stack, which +// depends on whether the frame pointer is r7 or r11. +def CSR_AAPCS_SplitPush_R11 : CalleeSavedRegs<(add R10, R9, R8, R7, R6, R5, R4, + LR, R11, + (sequence "D%u", 15, 8))>; +def CSR_AAPCS_SplitPush_R7 : CalleeSavedRegs<(add LR, R11, + R7, R6, R5, R4, + R10, R9, R8, + (sequence "D%u", 15, 8))>; // Constructors and destructors return 'this' in the ARM C++ ABI; since 'this' // and the pointer return value are both passed in R0 in these cases, this can diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 60062a2422e4..06e26262062c 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -199,6 +199,11 @@ SpillArea getSpillArea(Register Reg, // push {r0-r10, r12} GPRCS1 // vpush {r8-d15} DPRCS1 // push {r11, lr} GPRCS2 + // + // SplitR11AAPCSSignRA: + // push {r0-r10, r12} GPRSC1 + // push {r11, lr} GPRCS2 + // vpush {r8-d15} DPRCS1 // If FPCXTNS is spilled (for CMSE secure entryfunctions), it is always at // the top of the stack frame. 
@@ -246,7 +251,8 @@ SpillArea getSpillArea(Register Reg, return SpillArea::GPRCS1; case ARM::LR: - if (Variation == ARMSubtarget::SplitR11WindowsSEH) + if (Variation == ARMSubtarget::SplitR11WindowsSEH || + Variation == ARMSubtarget::SplitR11AAPCSSignRA) return SpillArea::GPRCS2; else return SpillArea::GPRCS1; @@ -859,6 +865,9 @@ static int getMaxFPOffset(const ARMSubtarget &STI, const ARMFunctionInfo &AFI, // This is a conservative estimation: Assume the frame pointer being r7 and // pc("r15") up to r8 getting spilled before (= 8 registers). int MaxRegBytes = 8 * 4; + if (PushPopSplit == ARMSubtarget::SplitR11AAPCSSignRA) + // Here, r11 can be stored below all of r4-r15. + MaxRegBytes = 11 * 4; if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) { // Here, r11 can be stored below all of r4-r15 plus d8-d15. MaxRegBytes = 11 * 4 + 8 * 8; @@ -931,17 +940,23 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } // Determine spill area sizes, and some important frame indices. + SpillArea FramePtrSpillArea; + bool BeforeFPPush = true; for (const CalleeSavedInfo &I : CSI) { Register Reg = I.getReg(); int FI = I.getFrameIdx(); - if (Reg == FramePtr) + SpillArea Area = getSpillArea(Reg, PushPopSplit, + AFI->getNumAlignedDPRCS2Regs(), RegInfo); + + if (Reg == FramePtr) { FramePtrSpillFI = FI; + FramePtrSpillArea = Area; + } if (Reg == ARM::D8) D8SpillFI = FI; - switch (getSpillArea(Reg, PushPopSplit, AFI->getNumAlignedDPRCS2Regs(), - RegInfo)) { + switch (Area) { case SpillArea::FPCXT: FPCXTSaveSize += 4; break; @@ -968,7 +983,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // Move past FPCXT area. if (FPCXTSaveSize > 0) { LastPush = MBBI++; - DefCFAOffsetCandidates.addInst(LastPush, FPCXTSaveSize, true); + DefCFAOffsetCandidates.addInst(LastPush, FPCXTSaveSize, BeforeFPPush); } // Allocate the vararg register save area. @@ -976,13 +991,15 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, emitSPUpdate(isARM, MBB, MBBI, dl, TII, -ArgRegsSaveSize, MachineInstr::FrameSetup); LastPush = std::prev(MBBI); - DefCFAOffsetCandidates.addInst(LastPush, ArgRegsSaveSize, true); + DefCFAOffsetCandidates.addInst(LastPush, ArgRegsSaveSize, BeforeFPPush); } // Move past area 1. if (GPRCS1Size > 0) { GPRCS1Push = LastPush = MBBI++; - DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, true); + DefCFAOffsetCandidates.addInst(LastPush, GPRCS1Size, BeforeFPPush); + if (FramePtrSpillArea == SpillArea::GPRCS1) + BeforeFPPush = false; } // Determine starting offsets of spill areas. These offsets are all positive @@ -1006,7 +1023,6 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } else { DPRCSOffset = GPRCS2Offset - DPRGapSize - DPRCSSize; } - int FramePtrOffsetInPush = 0; if (HasFP) { // Offset from the CFA to the saved frame pointer, will be negative. int FPOffset = MFI.getObjectOffset(FramePtrSpillFI); @@ -1014,13 +1030,6 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, << ", FPOffset: " << FPOffset << "\n"); assert(getMaxFPOffset(STI, *AFI, MF) <= FPOffset && "Max FP estimation is wrong"); - // Offset from the top of the GPRCS1 area to the saved frame pointer, will - // be negative. 
- FramePtrOffsetInPush = FPOffset + ArgRegsSaveSize + FPCXTSaveSize; - LLVM_DEBUG(dbgs() << "FramePtrOffsetInPush=" << FramePtrOffsetInPush - << ", FramePtrSpillOffset=" - << (MFI.getObjectOffset(FramePtrSpillFI) + NumBytes) - << "\n"); AFI->setFramePtrSpillOffset(MFI.getObjectOffset(FramePtrSpillFI) + NumBytes); } @@ -1032,7 +1041,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // after DPRCS1. if (GPRCS2Size > 0 && PushPopSplit != ARMSubtarget::SplitR11WindowsSEH) { GPRCS2Push = LastPush = MBBI++; - DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size); + DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size, BeforeFPPush); + if (FramePtrSpillArea == SpillArea::GPRCS2) + BeforeFPPush = false; } // Prolog/epilog inserter assumes we correctly align DPRs on the stack, so our @@ -1045,7 +1056,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, else { emitSPUpdate(isARM, MBB, MBBI, dl, TII, -DPRGapSize, MachineInstr::FrameSetup); - DefCFAOffsetCandidates.addInst(std::prev(MBBI), DPRGapSize); + DefCFAOffsetCandidates.addInst(std::prev(MBBI), DPRGapSize, BeforeFPPush); } } @@ -1054,7 +1065,8 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // Since vpush register list cannot have gaps, there may be multiple vpush // instructions in the prologue. while (MBBI != MBB.end() && MBBI->getOpcode() == ARM::VSTMDDB_UPD) { - DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI)); + DefCFAOffsetCandidates.addInst(MBBI, sizeOfSPAdjustment(*MBBI), + BeforeFPPush); LastPush = MBBI++; } } @@ -1073,7 +1085,9 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // Move GPRCS2, if using using SplitR11WindowsSEH. if (GPRCS2Size > 0 && PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) { GPRCS2Push = LastPush = MBBI++; - DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size); + DefCFAOffsetCandidates.addInst(LastPush, GPRCS2Size, BeforeFPPush); + if (FramePtrSpillArea == SpillArea::GPRCS2) + BeforeFPPush = false; } bool NeedsWinCFIStackAlloc = NeedsWinCFI; @@ -1174,28 +1188,51 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, // into spill area 1, including the FP in R11. In either case, it // is in area one and the adjustment needs to take place just after // that push. - // FIXME: The above is not necessary true when PACBTI is enabled. - // AAPCS requires use of R11, and PACBTI gets in the way of regular pushes, - // so FP ends up on area two. MachineBasicBlock::iterator AfterPush; if (HasFP) { - AfterPush = std::next(GPRCS1Push); - unsigned PushSize = sizeOfSPAdjustment(*GPRCS1Push); - int FPOffset = PushSize + FramePtrOffsetInPush; - if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) { - AfterPush = std::next(GPRCS2Push); - emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII, - FramePtr, ARM::SP, 0, MachineInstr::FrameSetup); - } else { - emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII, - FramePtr, ARM::SP, FPOffset, - MachineInstr::FrameSetup); + MachineBasicBlock::iterator FPPushInst; + // Offset from SP immediately after the push which saved the FP to the FP + // save slot. 
+ int64_t FPOffsetAfterPush; + switch (FramePtrSpillArea) { + case SpillArea::GPRCS1: + FPPushInst = GPRCS1Push; + FPOffsetAfterPush = MFI.getObjectOffset(FramePtrSpillFI) + + ArgRegsSaveSize + FPCXTSaveSize + + sizeOfSPAdjustment(*FPPushInst); + LLVM_DEBUG(dbgs() << "Frame pointer in GPRCS1, offset " + << FPOffsetAfterPush << " after that push\n"); + break; + case SpillArea::GPRCS2: + FPPushInst = GPRCS2Push; + FPOffsetAfterPush = MFI.getObjectOffset(FramePtrSpillFI) + + ArgRegsSaveSize + FPCXTSaveSize + GPRCS1Size + + sizeOfSPAdjustment(*FPPushInst); + if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) + FPOffsetAfterPush += DPRCSSize + DPRGapSize; + LLVM_DEBUG(dbgs() << "Frame pointer in GPRCS2, offset " + << FPOffsetAfterPush << " after that push\n"); + break; + default: + llvm_unreachable("frame pointer in unknown spill area"); + break; } + AfterPush = std::next(FPPushInst); + if (PushPopSplit == ARMSubtarget::SplitR11WindowsSEH) + assert(FPOffsetAfterPush == 0); + + // Emit the MOV or ADD to set up the frame pointer register. + emitRegPlusImmediate(!AFI->isThumbFunction(), MBB, AfterPush, dl, TII, + FramePtr, ARM::SP, FPOffsetAfterPush, + MachineInstr::FrameSetup); + if (!NeedsWinCFI) { - if (FramePtrOffsetInPush + PushSize != 0) { + // Emit DWARF info to find the CFA using the frame pointer from this + // point onward. + if (FPOffsetAfterPush != 0) { unsigned CFIIndex = MF.addFrameInst(MCCFIInstruction::cfiDefCfa( nullptr, MRI->getDwarfRegNum(FramePtr, true), - FPCXTSaveSize + ArgRegsSaveSize - FramePtrOffsetInPush)); + -MFI.getObjectOffset(FramePtrSpillFI))); BuildMI(MBB, AfterPush, dl, TII.get(TargetOpcode::CFI_INSTRUCTION)) .addCFIIndex(CFIIndex) .setMIFlags(MachineInstr::FrameSetup); @@ -1708,7 +1745,8 @@ void ARMFrameLowering::emitPopInst(MachineBasicBlock &MBB, if (Reg == ARM::LR && !isTailCall && !isVarArg && !isInterrupt && !isCmseEntry && !isTrap && AFI->getArgumentStackToRestore() == 0 && STI.hasV5TOps() && MBB.succ_empty() && !hasPAC && - PushPopSplit != ARMSubtarget::SplitR11WindowsSEH) { + (PushPopSplit != ARMSubtarget::SplitR11WindowsSEH && + PushPopSplit != ARMSubtarget::SplitR11AAPCSSignRA)) { Reg = ARM::PC; // Fold the return instruction into the LDM. DeleteRet = true; @@ -2940,18 +2978,29 @@ bool ARMFrameLowering::assignCalleeSavedSpillSlots( const auto &AFI = *MF.getInfo(); if (AFI.shouldSignReturnAddress()) { // The order of register must match the order we push them, because the - // PEI assigns frame indices in that order. When compiling for return - // address sign and authenication, we use split push, therefore the orders - // we want are: - // LR, R7, R6, R5, R4, , R11, R10, R9, R8, D15-D8 - CSI.insert(find_if(CSI, - [=](const auto &CS) { - Register Reg = CS.getReg(); - return Reg == ARM::R10 || Reg == ARM::R11 || - Reg == ARM::R8 || Reg == ARM::R9 || - ARM::DPRRegClass.contains(Reg); - }), - CalleeSavedInfo(ARM::R12)); + // PEI assigns frame indices in that order. 
That order depends on the + // PushPopSplitVariation, there are only two cases which we use with return + // address signing: + switch (STI.getPushPopSplitVariation(MF)) { + case ARMSubtarget::SplitR7: + // LR, R7, R6, R5, R4, , R11, R10, R9, R8, D15-D8 + CSI.insert(find_if(CSI, + [=](const auto &CS) { + Register Reg = CS.getReg(); + return Reg == ARM::R10 || Reg == ARM::R11 || + Reg == ARM::R8 || Reg == ARM::R9 || + ARM::DPRRegClass.contains(Reg); + }), + CalleeSavedInfo(ARM::R12)); + break; + case ARMSubtarget::SplitR11AAPCSSignRA: + // With SplitR11AAPCSSignRA, R12 will always be the highest-addressed CSR + // on the stack. + CSI.insert(CSI.begin(), CalleeSavedInfo(ARM::R12)); + break; + default: + llvm_unreachable("Unexpected CSR split with return address signing"); + } } return false; diff --git a/llvm/lib/Target/ARM/ARMSubtarget.cpp b/llvm/lib/Target/ARM/ARMSubtarget.cpp index c4a782bc4091..9adfb1fab5f0 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.cpp +++ b/llvm/lib/Target/ARM/ARMSubtarget.cpp @@ -514,5 +514,12 @@ ARMSubtarget::getPushPopSplitVariation(const MachineFunction &MF) const { F.needsUnwindTableEntry() && (MFI.hasVarSizedObjects() || getRegisterInfo()->hasStackRealignment(MF))) return SplitR11WindowsSEH; + + // Returns R11SplitAAPCSBranchSigning if R11 and lr are not adjacent to each + // other in the list of callee saved registers in a frame, and branch + // signing is enabled. + if (MF.getInfo()->shouldSignReturnAddress() && + getFramePointerReg() == ARM::R11) + return SplitR11AAPCSSignRA; return NoSplit; } diff --git a/llvm/lib/Target/ARM/ARMSubtarget.h b/llvm/lib/Target/ARM/ARMSubtarget.h index 7917ddc17bdb..214c5f1b45e5 100644 --- a/llvm/lib/Target/ARM/ARMSubtarget.h +++ b/llvm/lib/Target/ARM/ARMSubtarget.h @@ -105,6 +105,18 @@ public: /// vpush {d8-d15} /// push {r11, lr} SplitR11WindowsSEH, + + /// When generating AAPCS-compilant frame chains, R11 is the frame pointer, + /// and must be pushed adjacent to the return address (LR). Normally this + /// isn't a problem, because the only register between them is r12, which is + /// the intra-procedure-call scratch register, so doesn't need to be saved. + /// However, when PACBTI is in use, r12 contains the authentication code, so + /// does need to be saved. This means that we need a separate push for R11 + /// and LR. + /// push {r0-r10, r12} + /// push {r11, lr} + /// vpush {d8-d15} + SplitR11AAPCSSignRA, }; protected: diff --git a/llvm/test/CodeGen/Thumb2/pacbti-m-frame-chain.ll b/llvm/test/CodeGen/Thumb2/pacbti-m-frame-chain.ll new file mode 100644 index 000000000000..8bcf87130c54 --- /dev/null +++ b/llvm/test/CodeGen/Thumb2/pacbti-m-frame-chain.ll @@ -0,0 +1,150 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=thumbv8.1m.main-none-eabi < %s --force-dwarf-frame-section -frame-pointer=all -mattr=+aapcs-frame-chain | FileCheck %s + +; int test1() { +; return 0; +; } +define i32 @test1() "sign-return-address"="non-leaf" { +; CHECK-LABEL: test1: +; CHECK: .cfi_sections .debug_frame +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: @ %bb.0: @ %entry +; CHECK-NEXT: pac r12, lr, sp +; CHECK-NEXT: .save {ra_auth_code} +; CHECK-NEXT: str r12, [sp, #-4]! 
+; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: .cfi_offset ra_auth_code, -4 +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push.w {r11, lr} +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: .cfi_offset lr, -8 +; CHECK-NEXT: .cfi_offset r11, -12 +; CHECK-NEXT: .setfp r11, sp +; CHECK-NEXT: mov r11, sp +; CHECK-NEXT: .cfi_def_cfa_register r11 +; CHECK-NEXT: movs r0, #0 +; CHECK-NEXT: pop.w {r11, lr} +; CHECK-NEXT: ldr r12, [sp], #4 +; CHECK-NEXT: aut r12, lr, sp +; CHECK-NEXT: bx lr +entry: + ret i32 0 +} + +; void foo(int n) { +; int a[n]; +; bar(a); +; } +define dso_local void @test2(i32 noundef %n) "sign-return-address"="non-leaf" { +; CHECK-LABEL: test2: +; CHECK: .cfi_startproc +; CHECK-NEXT: @ %bb.0: @ %entry +; CHECK-NEXT: pac r12, lr, sp +; CHECK-NEXT: .save {r4, r7, ra_auth_code} +; CHECK-NEXT: push.w {r4, r7, r12} +; CHECK-NEXT: .cfi_def_cfa_offset 12 +; CHECK-NEXT: .cfi_offset ra_auth_code, -4 +; CHECK-NEXT: .cfi_offset r7, -8 +; CHECK-NEXT: .cfi_offset r4, -12 +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push.w {r11, lr} +; CHECK-NEXT: .cfi_def_cfa_offset 20 +; CHECK-NEXT: .cfi_offset lr, -16 +; CHECK-NEXT: .cfi_offset r11, -20 +; CHECK-NEXT: .setfp r11, sp +; CHECK-NEXT: mov r11, sp +; CHECK-NEXT: .cfi_def_cfa_register r11 +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: movs r1, #7 +; CHECK-NEXT: add.w r0, r1, r0, lsl #2 +; CHECK-NEXT: bic r0, r0, #7 +; CHECK-NEXT: sub.w r0, sp, r0 +; CHECK-NEXT: mov sp, r0 +; CHECK-NEXT: bl take_ptr +; CHECK-NEXT: mov sp, r11 +; CHECK-NEXT: pop.w {r11, lr} +; CHECK-NEXT: pop.w {r4, r7, r12} +; CHECK-NEXT: aut r12, lr, sp +; CHECK-NEXT: bx lr +entry: + %vla = alloca i32, i32 %n, align 4 + call void @take_ptr(ptr noundef nonnull %vla) + ret void +} + +; void test3(int c, float e, int z) { +; if (c) +; knr(); +; take_ptr(alloca(z)); +; if (e) +; knr(); +; } +define void @test3(i32 noundef %c, float noundef %e, i32 noundef %z) "sign-return-address"="non-leaf" { +; CHECK-LABEL: test3: +; CHECK: .cfi_startproc +; CHECK-NEXT: @ %bb.0: @ %entry +; CHECK-NEXT: pac r12, lr, sp +; CHECK-NEXT: .save {r4, r5, r6, r7, ra_auth_code} +; CHECK-NEXT: push.w {r4, r5, r6, r7, r12} +; CHECK-NEXT: .cfi_def_cfa_offset 20 +; CHECK-NEXT: .cfi_offset ra_auth_code, -4 +; CHECK-NEXT: .cfi_offset r7, -8 +; CHECK-NEXT: .cfi_offset r6, -12 +; CHECK-NEXT: .cfi_offset r5, -16 +; CHECK-NEXT: .cfi_offset r4, -20 +; CHECK-NEXT: .save {r11, lr} +; CHECK-NEXT: push.w {r11, lr} +; CHECK-NEXT: .cfi_def_cfa_offset 28 +; CHECK-NEXT: .cfi_offset lr, -24 +; CHECK-NEXT: .cfi_offset r11, -28 +; CHECK-NEXT: .setfp r11, sp +; CHECK-NEXT: mov r11, sp +; CHECK-NEXT: .cfi_def_cfa_register r11 +; CHECK-NEXT: .pad #4 +; CHECK-NEXT: sub sp, #4 +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: mov r5, r2 +; CHECK-NEXT: mov r4, r1 +; CHECK-NEXT: it ne +; CHECK-NEXT: blne knr +; CHECK-NEXT: adds r0, r5, #7 +; CHECK-NEXT: bic r0, r0, #7 +; CHECK-NEXT: sub.w r0, sp, r0 +; CHECK-NEXT: mov sp, r0 +; CHECK-NEXT: bl take_ptr +; CHECK-NEXT: mov r0, r4 +; CHECK-NEXT: movs r1, #0 +; CHECK-NEXT: bl __aeabi_fcmpeq +; CHECK-NEXT: cmp r0, #0 +; CHECK-NEXT: it eq +; CHECK-NEXT: bleq knr +; CHECK-NEXT: mov sp, r11 +; CHECK-NEXT: pop.w {r11, lr} +; CHECK-NEXT: pop.w {r4, r5, r6, r7, r12} +; CHECK-NEXT: aut r12, lr, sp +; CHECK-NEXT: bx lr +entry: + %tobool.not = icmp eq i32 %c, 0 + br i1 %tobool.not, label %if.end, label %if.then + +if.then: ; preds = %entry + tail call void @knr() + br label %if.end + +if.end: ; preds = %if.then, %entry + %0 = alloca i8, i32 %z, align 8 + call void @take_ptr(ptr 
noundef nonnull %0) + %tobool1 = fcmp une float %e, 0.000000e+00 + br i1 %tobool1, label %if.then2, label %if.end3 + +if.then2: ; preds = %if.end + call void @knr() + br label %if.end3 + +if.end3: ; preds = %if.then2, %if.end + ret void +} + +declare void @knr(...) +declare void @take_ptr(ptr noundef) -- GitLab From 671976ff59ac893c2e97a95860510afa5d5e9a84 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 17 Oct 2024 09:49:51 +0100 Subject: [PATCH 210/329] [NFC][LoopVectorize] Add more simple early exit tests (#112529) I realised we are missing tests to cover more loops with multiple early exits - some countable and some uncountable. I've also added a few SVE versions of the test in the AArch64 directory. Once we can vectorise such early exit loops it's a good sanity check to make sure they also vectorise for SVE. Also, for some of the tests I expect there to be some divergence from the same tests in the top level directory once we start vectorising them. --- .../AArch64/simple_early_exit.ll | 333 ++++++++++++++++++ .../LoopVectorize/simple_early_exit.ll | 188 +++++++++- 2 files changed, 518 insertions(+), 3 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll new file mode 100644 index 000000000000..82556bdd2a5e --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/AArch64/simple_early_exit.ll @@ -0,0 +1,333 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S < %s -p loop-vectorize | FileCheck %s --check-prefixes=CHECK + +target triple = "aarch64-unknown-linux-gnu" + +declare void @init_mem(ptr, i64); + +define i64 @same_exit_block_pre_inc_use1() #1 { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1( +; CHECK-SAME: ) #[[ATTR0:[0-9]+]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX2]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX2]] +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX2]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = 
load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +; In this example the early exit block appears in the list of ExitNotTaken +; SCEVs, but is not computable. +define i64 @same_exit_block_pre_inc_use4() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use4() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i64], align 8 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i64], align 8 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP1:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP1]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i64] + %p2 = alloca [1024 x i64] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i64, ptr %p1, i64 %index + %ld1 = load i64, ptr %arrayidx, align 1 + %cmp3 = icmp ult i64 %index, %ld1 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_safe_call() #1 { +; CHECK-LABEL: define i64 @loop_contains_safe_call( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP1:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX2]] +; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ult float [[SQRT]], 3.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC1]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = 
alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds float, ptr %p1, i64 %index + %ld1 = load float, ptr %arrayidx, align 1 + %sqrt = tail call fast float @llvm.sqrt.f32(float %ld1) + %cmp = fcmp fast ult float %sqrt, 3.0e+00 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_safe_div() #1 { +; CHECK-LABEL: define i64 @loop_contains_safe_div( +; CHECK-SAME: ) #[[ATTR0]] { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[LD1]], 20000 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %div = udiv i32 %ld1, 20000 + %cmp = icmp eq i32 %div, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(8) %p2) { +; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit' +; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 +; DEBUG-NEXT: LV: We can vectorize this loop! +; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. 
+; CHECK-LABEL: define i64 @loop_contains_load_after_early_exit( +; CHECK-SAME: ptr align 8 dereferenceable(1024) [[P2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: br label [[LOOP1:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP1]] ], [ [[LD2]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %cmp = icmp eq i32 %ld1, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %arrayidx2 = getelementptr inbounds i64, ptr %p2, i64 %index + %ld2 = load i64, ptr %arrayidx2, align 8 + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ %ld2, %loop.inc ] + ret i64 %retval +} + + +; The form of the induction variables requires SCEV predicates. +define i32 @diff_exit_block_needs_scev_check(i32 %end) { +; DEBUG-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check' +; DEBUG: Found an early exit loop with symbolic max backedge taken count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32))) +; DEBUG-NEXT: LV: We can vectorize this loop! +; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. 
+; CHECK-LABEL: define i32 @diff_exit_block_needs_scev_check( +; CHECK-SAME: i32 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: [[END_CLAMPED:%.*]] = and i32 [[END]], 1023 +; CHECK-NEXT: br label [[FOR_BODY1:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP_EARLY:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_EARLY]], label [[FOUND:%.*]], label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IND_NEXT]] = add i8 [[IND]], 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32 +; CHECK-NEXT: [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY1]], label [[EXIT:%.*]] +; CHECK: found: +; CHECK-NEXT: ret i32 1 +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %p1 = alloca [1024 x i32] + %p2 = alloca [1024 x i32] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + %end.clamped = and i32 %end, 1023 + br label %for.body + +for.body: + %ind = phi i8 [ %ind.next, %for.inc ], [ 0, %entry ] + %gep.ind = phi i64 [ %gep.ind.next, %for.inc ], [ 0, %entry ] + %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %gep.ind + %0 = load i32, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %gep.ind + %1 = load i32, ptr %arrayidx2, align 4 + %cmp.early = icmp eq i32 %0, %1 + br i1 %cmp.early, label %found, label %for.inc + +for.inc: + %ind.next = add i8 %ind, 1 + %conv = zext i8 %ind.next to i32 + %gep.ind.next = add i64 %gep.ind, 1 + %cmp = icmp ult i32 %conv, %end.clamped + br i1 %cmp, label %for.body, label %exit + +found: + ret i32 1 + +exit: + ret i32 0 +} + + +declare i32 @foo(i32) readonly +declare @foo_vec() + +attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" } +attributes #1 = { "target-features"="+sve" vscale_range(1,16) } diff --git a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll index 49454ae18db7..d5e4f4d016c6 100644 --- a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll +++ b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll @@ -311,7 +311,6 @@ loop.end: } - define i64 @same_exit_block_post_inc_use() { ; CHECK-LABEL: define i64 @same_exit_block_post_inc_use() { ; CHECK-NEXT: entry: @@ -860,8 +859,8 @@ loop.end: ; There are multiple exit blocks - two of them have an exact representation for the ; exit-not-taken counts and the other is unknown, i.e. the "early exit". 
-define i64 @multiple_exits_one_early() { -; CHECK-LABEL: define i64 @multiple_exits_one_early() { +define i64 @multiple_exiting_one_early_same_exit() { +; CHECK-LABEL: define i64 @multiple_exiting_one_early_same_exit() { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 ; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 @@ -918,6 +917,189 @@ loop.end: } +define i64 @multiple_exiting_one_early_same_exit_phi_of_consts() { +; CHECK-LABEL: define i64 @multiple_exiting_one_early_same_exit_phi_of_consts() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[SEARCH]] ], [ 0, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %cmp1 = icmp ne i64 %index, 64 + br i1 %cmp1, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 128 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ 0, %loop ], [ 1, %search ], [ 0, %loop.inc ] + ret i64 %retval +} + + +define i64 @multiple_exiting_one_early_diff_exit() { +; CHECK-LABEL: define i64 @multiple_exiting_one_early_diff_exit() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], 
align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END_EARLY:%.*]], label [[LOOP_INC]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end.early: +; CHECK-NEXT: [[RET_EARLY:%.*]] = phi i64 [ [[INDEX]], [[SEARCH]] ] +; CHECK-NEXT: ret i64 [[RET_EARLY]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP]] ], [ 128, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %cmp1 = icmp ne i64 %index, 64 + br i1 %cmp1, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end.early, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 128 + br i1 %exitcond, label %loop, label %loop.end + +loop.end.early: + %ret.early = phi i64 [ %index, %search ] + ret i64 %ret.early + +loop.end: + %retval = phi i64 [ 64, %loop ], [ 128, %loop.inc ] + ret i64 %retval +} + +define i64 @multiple_exiting_one_early_diff_exit_no_phis() { +; CHECK-LABEL: define i64 @multiple_exiting_one_early_diff_exit_no_phis() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END_EARLY:%.*]], label [[LOOP_INC]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end.early: +; CHECK-NEXT: ret i64 1 +; CHECK: loop.end: +; CHECK-NEXT: ret i64 0 +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %cmp1 = icmp ne i64 %index, 64 + br i1 %cmp1, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr 
%arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end.early, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 128 + br i1 %exitcond, label %loop, label %loop.end + +loop.end.early: + ret i64 1 + +loop.end: + ret i64 0 +} + + ; We don't currently support multiple early exits. define i64 @multiple_early_exits() { ; DEBUG-LABEL: LV: Checking a loop in 'multiple_early_exits' -- GitLab From d51af6c215fce3d6d3791dbfdb3d0c6296dd0bf9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Oct 2024 09:54:24 +0100 Subject: [PATCH 211/329] [X86] Regenerate test checks with vpternlog comments --- llvm/test/CodeGen/X86/midpoint-int-vec-128.ll | 18 ++++----- llvm/test/CodeGen/X86/midpoint-int-vec-256.ll | 18 ++++----- llvm/test/CodeGen/X86/midpoint-int-vec-512.ll | 40 +++++++++---------- .../CodeGen/X86/min-legal-vector-width.ll | 18 ++++----- llvm/test/CodeGen/X86/pmul.ll | 8 ++-- .../CodeGen/X86/prefer-avx256-mask-extend.ll | 12 +++--- .../CodeGen/X86/prefer-avx256-mask-shuffle.ll | 28 ++++++------- llvm/test/CodeGen/X86/prefer-avx256-mulo.ll | 8 ++-- .../CodeGen/X86/prefer-avx256-wide-mul.ll | 2 +- .../X86/vector-shuffle-combining-avx512bw.ll | 8 ++-- 10 files changed, 80 insertions(+), 80 deletions(-) diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll index acaa9be3f89a..6fd3db3464de 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-128.ll @@ -1928,7 +1928,7 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpminuw %xmm1, %xmm0, %xmm2 ; AVX512F-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3 ; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX512F-NEXT: vpmaxuw %xmm1, %xmm0, %xmm1 ; AVX512F-NEXT: vpsubw %xmm2, %xmm1, %xmm1 @@ -1945,7 +1945,7 @@ define <8 x i16> @vec128_i16_unsigned_reg_reg(<8 x i16> %a1, <8 x i16> %a2) noun ; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %xmm2, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512VL-FALLBACK-NEXT: vpxor %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %xmm0, %xmm1, %xmm0 @@ -2500,7 +2500,7 @@ define <16 x i8> @vec128_i8_signed_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounwin ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2706,7 +2706,7 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpminub %xmm1, %xmm0, %xmm2 ; AVX512F-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3 ; 
AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm3, %xmm3 ; AVX512F-NEXT: vpmaxub %xmm1, %xmm0, %xmm1 ; AVX512F-NEXT: vpsubb %xmm2, %xmm1, %xmm1 @@ -2728,8 +2728,8 @@ define <16 x i8> @vec128_i8_unsigned_reg_reg(<16 x i8> %a1, <16 x i8> %a2) nounw ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %xmm2, %xmm0, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %xmm2, %xmm2, %xmm2 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm2, %xmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm1 = xmm2 ^ (xmm1 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm1, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2961,7 +2961,7 @@ define <16 x i8> @vec128_i8_signed_mem_reg(ptr %a1_addr, <16 x i8> %a2) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm0, %xmm1, %xmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm0, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm0 ^ (xmm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm0, %xmm2, %xmm0 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -3192,7 +3192,7 @@ define <16 x i8> @vec128_i8_signed_reg_mem(<16 x i8> %a1, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -3432,7 +3432,7 @@ define <16 x i8> @vec128_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm2, %xmm3, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %xmm2, %xmm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %xmm1, %xmm0, %xmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} xmm2 = xmm1 ^ (xmm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %xmm1, %xmm2, %xmm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %xmm0, %xmm1, %xmm0 ; AVX512VL-FALLBACK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll index 7d882b772a64..5a1c4c8a52c8 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-256.ll @@ -1434,7 +1434,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpminuw %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3 ; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpmaxuw %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubw %ymm2, %ymm1, %ymm1 @@ -1450,7 +1450,7 @@ define <16 x i16> @vec256_i16_unsigned_reg_reg(<16 x i16> %a1, <16 x i16> %a2) n ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpcmpeqw %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: 
vpternlogq $15, %ymm2, %ymm2, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} ymm2 = ~ymm2 ; AVX512VL-FALLBACK-NEXT: vpxor %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -2016,7 +2016,7 @@ define <32 x i8> @vec256_i8_signed_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounwin ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2169,7 +2169,7 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512F: # %bb.0: ; AVX512F-NEXT: vpminub %ymm1, %ymm0, %ymm2 ; AVX512F-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm3 -; AVX512F-NEXT: vpternlogq $15, %zmm3, %zmm3, %zmm3 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm3 = ~zmm3 ; AVX512F-NEXT: vpor {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm3, %ymm3 ; AVX512F-NEXT: vpmaxub %ymm1, %ymm0, %ymm1 ; AVX512F-NEXT: vpsubb %ymm2, %ymm1, %ymm1 @@ -2193,8 +2193,8 @@ define <32 x i8> @vec256_i8_unsigned_reg_reg(<32 x i8> %a1, <32 x i8> %a2) nounw ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpcmpeqb %ymm2, %ymm0, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpternlogq $15, %ymm2, %ymm2, %ymm2 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm2, %ymm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} ymm2 = ~ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm1 = ymm2 ^ (ymm1 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2372,7 +2372,7 @@ define <32 x i8> @vec256_i8_signed_mem_reg(ptr %a1_addr, <32 x i8> %a2) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm0, %ymm1, %ymm0 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm0 ^ (ymm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm2, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm1, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2550,7 +2550,7 @@ define <32 x i8> @vec256_i8_signed_reg_mem(<32 x i8> %a1, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem) ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq @@ -2733,7 +2733,7 @@ define <32 x i8> @vec256_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm3, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsrlw $1, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpcmpgtb %ymm1, %ymm0, %ymm1 -; AVX512VL-FALLBACK-NEXT: vpternlogd $108, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm2 +; AVX512VL-FALLBACK-NEXT: vpternlogd {{.*#+}} ymm2 = ymm1 ^ (ymm2 & mem) ; 
AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm2, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 ; AVX512VL-FALLBACK-NEXT: retq diff --git a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll index 366dad1612b4..5f6337e29d68 100644 --- a/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll +++ b/llvm/test/CodeGen/X86/midpoint-int-vec-512.ll @@ -288,7 +288,7 @@ define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nou ; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -315,7 +315,7 @@ define <32 x i16> @vec512_i16_signed_reg_reg(<32 x i16> %a1, <32 x i16> %a2) nou ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -365,7 +365,7 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n ; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -392,7 +392,7 @@ define <32 x i16> @vec512_i16_unsigned_reg_reg(<32 x i16> %a1, <32 x i16> %a2) n ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -445,7 +445,7 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 @@ -473,7 +473,7 @@ define <32 x i16> @vec512_i16_signed_mem_reg(ptr %a1_addr, <32 x i16> %a2) nounw ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: 
vpaddw %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 @@ -526,7 +526,7 @@ define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounw ; AVX512F-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -554,7 +554,7 @@ define <32 x i16> @vec512_i16_signed_reg_mem(<32 x i16> %a1, ptr %a2_addr) nounw ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm0, %ymm1, %ymm0 @@ -608,7 +608,7 @@ define <32 x i16> @vec512_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin ; AVX512F-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vpsubw %ymm0, %ymm6, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddw %ymm2, %ymm0, %ymm0 @@ -637,7 +637,7 @@ define <32 x i16> @vec512_i16_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwin ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubw %ymm0, %ymm6, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddw %ymm2, %ymm0, %ymm0 @@ -700,7 +700,7 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin ; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -730,7 +730,7 @@ define <64 x i8> @vec512_i8_signed_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounwin ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -784,7 +784,7 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw ; AVX512F-NEXT: vpsubb %ymm2, %ymm6, %ymm2 ; AVX512F-NEXT: vpsubb %ymm1, %ymm6, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, 
%zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -814,7 +814,7 @@ define <64 x i8> @vec512_i8_unsigned_reg_reg(<64 x i8> %a1, <64 x i8> %a2) nounw ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm2, %ymm6, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm6, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $184, %zmm4, %zmm5, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm1 ^ (zmm5 & (zmm1 ^ zmm4)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -872,7 +872,7 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind ; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 ; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 @@ -904,7 +904,7 @@ define <64 x i8> @vec512_i8_signed_mem_reg(ptr %a1_addr, <64 x i8> %a2) nounwind ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 @@ -962,7 +962,7 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX512F-NEXT: vpsubb %ymm1, %ymm7, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512F-NEXT: vpaddb %ymm3, %ymm2, %ymm2 ; AVX512F-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -994,7 +994,7 @@ define <64 x i8> @vec512_i8_signed_reg_mem(<64 x i8> %a1, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm1, %ymm7, %ymm1 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm2, %zmm1, %zmm1 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm1 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm1 = zmm5 ^ (zmm4 & (zmm1 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm1, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm2, %ymm2 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm0, %ymm1, %ymm0 @@ -1053,7 +1053,7 @@ define <64 x i8> @vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512F-NEXT: vpand %ymm6, %ymm0, %ymm0 ; AVX512F-NEXT: vpsubb %ymm0, %ymm7, %ymm0 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512F-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512F-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpaddb %ymm2, %ymm0, %ymm0 @@ -1086,7 +1086,7 @@ define <64 x i8> 
@vec512_i8_signed_mem_mem(ptr %a1_addr, ptr %a2_addr) nounwind ; AVX512VL-FALLBACK-NEXT: vpand %ymm6, %ymm0, %ymm0 ; AVX512VL-FALLBACK-NEXT: vpsubb %ymm0, %ymm7, %ymm0 ; AVX512VL-FALLBACK-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512VL-FALLBACK-NEXT: vpternlogq $226, %zmm5, %zmm4, %zmm0 +; AVX512VL-FALLBACK-NEXT: vpternlogq {{.*#+}} zmm0 = zmm5 ^ (zmm4 & (zmm0 ^ zmm5)) ; AVX512VL-FALLBACK-NEXT: vextracti64x4 $1, %zmm0, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm3, %ymm1, %ymm1 ; AVX512VL-FALLBACK-NEXT: vpaddb %ymm2, %ymm0, %ymm0 diff --git a/llvm/test/CodeGen/X86/min-legal-vector-width.ll b/llvm/test/CodeGen/X86/min-legal-vector-width.ll index e3d2ac659d43..8289e885618f 100644 --- a/llvm/test/CodeGen/X86/min-legal-vector-width.ll +++ b/llvm/test/CodeGen/X86/min-legal-vector-width.ll @@ -918,13 +918,13 @@ define dso_local void @mul256(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512-NEXT: vpandn %ymm3, %ymm4, %ymm3 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm1, %ymm1 -; CHECK-AVX512-NEXT: vpternlogq $248, %ymm4, %ymm5, %ymm1 +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm1 = ymm1 | (ymm5 & ymm4) ; CHECK-AVX512-NEXT: vpand %ymm2, %ymm4, %ymm3 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm3, %ymm0, %ymm3 ; CHECK-AVX512-NEXT: vpandn %ymm2, %ymm4, %ymm2 ; CHECK-AVX512-NEXT: vpmaddubsw %ymm2, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: vpsllw $8, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpternlogq $248, %ymm4, %ymm3, %ymm0 +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm4) ; CHECK-AVX512-NEXT: vmovdqa %ymm0, (%rdx) ; CHECK-AVX512-NEXT: vmovdqa %ymm1, 32(%rdx) ; CHECK-AVX512-NEXT: vzeroupper @@ -985,7 +985,7 @@ define dso_local void @mul512(ptr %a, ptr %b, ptr %c) "min-legal-vector-width"=" ; CHECK-AVX512-NEXT: vpandnq %zmm1, %zmm2, %zmm1 ; CHECK-AVX512-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; CHECK-AVX512-NEXT: vpsllw $8, %zmm0, %zmm0 -; CHECK-AVX512-NEXT: vpternlogq $248, %zmm2, %zmm3, %zmm0 +; CHECK-AVX512-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2) ; CHECK-AVX512-NEXT: vmovdqa64 %zmm0, (%rdx) ; CHECK-AVX512-NEXT: vzeroupper ; CHECK-AVX512-NEXT: retq @@ -1993,21 +1993,21 @@ define <32 x i8> @splatconstant_rotate_v32i8(<32 x i8> %a) nounwind "min-legal-v ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-SKX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-SKX-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-SKX-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; CHECK-SKX-NEXT: retq ; ; CHECK-AVX512-LABEL: splatconstant_rotate_v32i8: ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; CHECK-AVX512-NEXT: retq ; ; CHECK-VBMI1-LABEL: splatconstant_rotate_v32i8: ; CHECK-VBMI1: # %bb.0: ; CHECK-VBMI1-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-VBMI1-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-VBMI1-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-VBMI1-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; CHECK-VBMI1-NEXT: retq ; ; CHECK-GFNI-LABEL: splatconstant_rotate_v32i8: @@ -2025,7 +2025,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le ; CHECK-SKX: # %bb.0: ; CHECK-SKX-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-SKX-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-SKX-NEXT: vpternlogd 
$216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-SKX-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; CHECK-SKX-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-SKX-NEXT: retq ; @@ -2033,7 +2033,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le ; CHECK-AVX512: # %bb.0: ; CHECK-AVX512-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-AVX512-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-AVX512-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-AVX512-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; CHECK-AVX512-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-AVX512-NEXT: retq ; @@ -2041,7 +2041,7 @@ define <32 x i8> @splatconstant_rotate_mask_v32i8(<32 x i8> %a) nounwind "min-le ; CHECK-VBMI1: # %bb.0: ; CHECK-VBMI1-NEXT: vpsllw $4, %ymm0, %ymm1 ; CHECK-VBMI1-NEXT: vpsrlw $4, %ymm0, %ymm0 -; CHECK-VBMI1-NEXT: vpternlogd $216, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm1, %ymm0 +; CHECK-VBMI1-NEXT: vpternlogd {{.*#+}} ymm0 = ymm0 ^ (mem & (ymm0 ^ ymm1)) ; CHECK-VBMI1-NEXT: vpandd {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to8}, %ymm0, %ymm0 ; CHECK-VBMI1-NEXT: retq ; diff --git a/llvm/test/CodeGen/X86/pmul.ll b/llvm/test/CodeGen/X86/pmul.ll index 2b475644a38c..6c3d04863118 100644 --- a/llvm/test/CodeGen/X86/pmul.ll +++ b/llvm/test/CodeGen/X86/pmul.ll @@ -833,7 +833,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; AVX512F-NEXT: vpmaddubsw %ymm3, %ymm1, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm2, %zmm0 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm2 & mem) ; AVX512F-NEXT: retq ; ; AVX512BW-LABEL: mul_v64i8c: @@ -841,7 +841,7 @@ define <64 x i8> @mul_v64i8c(<64 x i8> %i) nounwind { ; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm1 # [117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0] ; AVX512BW-NEXT: vpmaddubsw {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %zmm0, %zmm0 # [0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117,0,117] ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogd $248, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to16}, %zmm1, %zmm0 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 = zmm0 | (zmm1 & mem) ; AVX512BW-NEXT: retq entry: %A = mul <64 x i8> %i, < i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117, i8 117 > @@ -978,7 +978,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; AVX512F-NEXT: vpmaddubsw %ymm1, %ymm5, %ymm1 ; AVX512F-NEXT: vpsllw $8, %ymm1, %ymm1 ; AVX512F-NEXT: vinserti64x4 $1, %ymm1, %zmm0, %zmm0 -; AVX512F-NEXT: vpternlogq $248, %zmm3, %zmm4, %zmm0 +; AVX512F-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm4 & zmm3) ; AVX512F-NEXT: retq 
; ; AVX512BW-LABEL: mul_v64i8: @@ -989,7 +989,7 @@ define <64 x i8> @mul_v64i8(<64 x i8> %i, <64 x i8> %j) nounwind { ; AVX512BW-NEXT: vpandnq %zmm1, %zmm2, %zmm1 ; AVX512BW-NEXT: vpmaddubsw %zmm1, %zmm0, %zmm0 ; AVX512BW-NEXT: vpsllw $8, %zmm0, %zmm0 -; AVX512BW-NEXT: vpternlogq $248, %zmm2, %zmm3, %zmm0 +; AVX512BW-NEXT: vpternlogq {{.*#+}} zmm0 = zmm0 | (zmm3 & zmm2) ; AVX512BW-NEXT: retq entry: %A = mul <64 x i8> %i, %j diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll index cca9d270fd49..ad08eaffab38 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-extend.ll @@ -62,7 +62,7 @@ define <16 x i8> @testv16i1_sext_v16i8(ptr %p, ptr %q) { ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -74,7 +74,7 @@ define <16 x i8> @testv16i1_sext_v16i8(ptr %p, ptr %q) { ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -109,7 +109,7 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) { ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VL-NEXT: retq ; @@ -120,7 +120,7 @@ define <16 x i16> @testv16i1_sext_v16i16(ptr %p, ptr %q) { ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: retq %in = load <8 x i32>, ptr %p @@ -242,7 +242,7 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) { ; AVX512VL-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512VL-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX512VL-NEXT: retq @@ -254,7 +254,7 @@ define <16 x i16> @testv16i1_zext_v16i16(ptr %p, ptr %q) { ; AVX512F-NEXT: vmovdqa (%rsi), %ymm0 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: kunpckbw %k0, %k1, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdw %zmm0, %ymm0 ; AVX512F-NEXT: vpsrlw $15, %ymm0, %ymm0 ; AVX512F-NEXT: retq diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll index ffc83620d3da..3699c7f75c86 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mask-shuffle.ll @@ -47,12 +47,12 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512VL-NEXT: 
vmovdqa (%rsi), %ymm1 ; AVX512VL-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VL-NEXT: vptestnmd %ymm1, %ymm1, %k2 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512VL-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512VL-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VL-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512VL-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512VL-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512VL-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512VL-NEXT: vzeroupper ; AVX512VL-NEXT: retq @@ -78,8 +78,8 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512VLBW-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512VLBW-NEXT: vptestnmd %ymm0, %ymm0, %k1 ; AVX512VLBW-NEXT: vptestnmd %ymm1, %ymm1, %k2 -; AVX512VLBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512VLBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512VLBW-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512VLBW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512VLBW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512VLBW-NEXT: vptestmd %zmm2, %zmm2, %k0 @@ -93,12 +93,12 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512F-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512F-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512F-NEXT: vptestnmd %zmm1, %zmm1, %k2 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512F-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512F-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512F-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512F-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512F-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512F-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512F-NEXT: vzeroupper ; AVX512F-NEXT: retq @@ -109,8 +109,8 @@ define <16 x i1> @shuf16i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0(ptr %a, ptr %b) { ; AVX512BW-NEXT: vmovdqa (%rsi), %ymm1 ; AVX512BW-NEXT: vptestnmd %zmm0, %zmm0, %k1 ; AVX512BW-NEXT: vptestnmd %zmm1, %zmm1, %k2 -; AVX512BW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512BW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512BW-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512BW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,18,20,3,7,7,0,3,6,1,21,3,19,7,0] ; AVX512BW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512BW-NEXT: vptestmd %zmm2, %zmm2, %k0 @@ -152,7 +152,7 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX256VL-NEXT: vmovdqa32 %ymm0, %ymm2 {%k1} {z} ; AVX256VL-NEXT: vpmovdw %ymm2, %xmm2 ; AVX256VL-NEXT: vpermq {{.*#+}} ymm2 = ymm2[1,1,1,1] -; AVX256VL-NEXT: vpternlogq $220, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %ymm1, %ymm2 +; AVX256VL-NEXT: vpternlogq {{.*#+}} ymm2 = (ymm2 & ~mem) | ymm1 ; AVX256VL-NEXT: vpmovsxwd %xmm2, %ymm1 ; AVX256VL-NEXT: vpslld $31, %ymm1, %ymm1 ; AVX256VL-NEXT: vptestmd %ymm1, %ymm1, %k1 @@ -179,12 +179,12 @@ define <32 x i1> @shuf32i1_3_6_22_12_3_7_7_0_3_6_1_13_3_21_7_0_3_6_22_12_3_7_7_0 ; AVX512NOBW-NEXT: vextracti128 $1, 
%ymm0, %xmm0 ; AVX512NOBW-NEXT: vpmovsxbd %xmm0, %zmm0 ; AVX512NOBW-NEXT: vptestmd %zmm0, %zmm0, %k2 -; AVX512NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k2} {z} -; AVX512NOBW-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 {%k1} {z} +; AVX512NOBW-NEXT: vpternlogd {{.*#+}} zmm0 {%k2} {z} = -1 +; AVX512NOBW-NEXT: vpternlogd {{.*#+}} zmm1 {%k1} {z} = -1 ; AVX512NOBW-NEXT: vpmovsxbd {{.*#+}} zmm2 = [3,6,22,12,3,7,7,0,3,6,1,13,3,21,7,0] ; AVX512NOBW-NEXT: vpermi2d %zmm0, %zmm1, %zmm2 ; AVX512NOBW-NEXT: vptestmd %zmm2, %zmm2, %k1 -; AVX512NOBW-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512NOBW-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512NOBW-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512NOBW-NEXT: vinserti128 $1, %xmm0, %ymm0, %ymm0 ; AVX512NOBW-NEXT: retq diff --git a/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll b/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll index e3a608abfda4..155ef0faadad 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-mulo.ll @@ -17,7 +17,7 @@ define <16 x i1> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX256-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX256-NEXT: vpcmpgtb %xmm0, %xmm2, %xmm2 ; AVX256-NEXT: vpcmpeqb %xmm1, %xmm2, %xmm1 -; AVX256-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1 +; AVX256-NEXT: vpternlogq {{.*#+}} xmm1 = ~xmm1 ; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX256-NEXT: vpmovsxbd %xmm2, %ymm2 ; AVX256-NEXT: vptestmd %ymm2, %ymm2, %k1 @@ -46,7 +46,7 @@ define <16 x i1> @smulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vpcmpneqd %zmm1, %zmm2, %k1 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vpmovdb %zmm0, (%rdi) -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq @@ -69,7 +69,7 @@ define <16 x i1> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX256-NEXT: vpackuswb %xmm2, %xmm1, %xmm1 ; AVX256-NEXT: vpxor %xmm2, %xmm2, %xmm2 ; AVX256-NEXT: vpcmpeqb %xmm2, %xmm1, %xmm1 -; AVX256-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1 +; AVX256-NEXT: vpternlogq {{.*#+}} xmm1 = ~xmm1 ; AVX256-NEXT: vpshufd {{.*#+}} xmm2 = xmm1[2,3,2,3] ; AVX256-NEXT: vpmovsxbd %xmm2, %ymm2 ; AVX256-NEXT: vptestmd %ymm2, %ymm2, %k1 @@ -98,7 +98,7 @@ define <16 x i1> @umulo_v16i8(<16 x i8> %a0, <16 x i8> %a1, ptr %p2) nounwind { ; AVX512-NEXT: vptestmd %zmm1, %zmm1, %k1 ; AVX512-NEXT: vpmovzxwd {{.*#+}} zmm0 = ymm0[0],zero,ymm0[1],zero,ymm0[2],zero,ymm0[3],zero,ymm0[4],zero,ymm0[5],zero,ymm0[6],zero,ymm0[7],zero,ymm0[8],zero,ymm0[9],zero,ymm0[10],zero,ymm0[11],zero,ymm0[12],zero,ymm0[13],zero,ymm0[14],zero,ymm0[15],zero ; AVX512-NEXT: vpmovdb %zmm0, (%rdi) -; AVX512-NEXT: vpternlogd $255, %zmm0, %zmm0, %zmm0 {%k1} {z} +; AVX512-NEXT: vpternlogd {{.*#+}} zmm0 {%k1} {z} = -1 ; AVX512-NEXT: vpmovdb %zmm0, %xmm0 ; AVX512-NEXT: vzeroupper ; AVX512-NEXT: retq diff --git a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll index a950a13b0d8c..c9bb3de92dcd 100644 --- a/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll +++ b/llvm/test/CodeGen/X86/prefer-avx256-wide-mul.ll @@ -64,7 +64,7 @@ define <32 x i8> @test_mul_32i8(<32 x i8> %a, <32 x i8> %b) { ; 
AVX256BW-NEXT: vpandn %ymm1, %ymm2, %ymm1 ; AVX256BW-NEXT: vpmaddubsw %ymm1, %ymm0, %ymm0 ; AVX256BW-NEXT: vpsllw $8, %ymm0, %ymm0 -; AVX256BW-NEXT: vpternlogq $248, %ymm2, %ymm3, %ymm0 +; AVX256BW-NEXT: vpternlogq {{.*#+}} ymm0 = ymm0 | (ymm3 & ymm2) ; AVX256BW-NEXT: retq ; ; AVX512BWVL-LABEL: test_mul_32i8: diff --git a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll index bbc87eda82a5..b4375cfb343b 100644 --- a/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll +++ b/llvm/test/CodeGen/X86/vector-shuffle-combining-avx512bw.ll @@ -55,10 +55,10 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) { ; X86-LABEL: combine_pshufb_identity_mask: ; X86: # %bb.0: ; X86-NEXT: kmovq {{[0-9]+}}(%esp), %k1 -; X86-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; X86-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; X86-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X86-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] -; X86-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 +; X86-NEXT: vpternlogd {{.*#+}} zmm3 = -1 ; X86-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} ; X86-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1} ; X86-NEXT: vmovdqa64 %zmm1, %zmm0 @@ -66,11 +66,11 @@ define <64 x i8> @combine_pshufb_identity_mask(<64 x i8> %x0, i64 %m) { ; ; X64-LABEL: combine_pshufb_identity_mask: ; X64: # %bb.0: -; X64-NEXT: vpternlogd $255, %zmm1, %zmm1, %zmm1 +; X64-NEXT: vpternlogd {{.*#+}} zmm1 = -1 ; X64-NEXT: vbroadcasti32x4 {{.*#+}} zmm2 = [15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0,15,14,13,12,11,10,9,8,7,6,5,4,3,2,1,0] ; X64-NEXT: # zmm2 = mem[0,1,2,3,0,1,2,3,0,1,2,3,0,1,2,3] ; X64-NEXT: kmovq %rdi, %k1 -; X64-NEXT: vpternlogd $255, %zmm3, %zmm3, %zmm3 +; X64-NEXT: vpternlogd {{.*#+}} zmm3 = -1 ; X64-NEXT: vpshufb %zmm2, %zmm0, %zmm3 {%k1} ; X64-NEXT: vpshufb %zmm2, %zmm3, %zmm1 {%k1} ; X64-NEXT: vmovdqa64 %zmm1, %zmm0 -- GitLab From 375690c0a1a1caacad1bbd243a611ae5c2970996 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 17 Oct 2024 13:15:12 +0400 Subject: [PATCH 212/329] clang/HIP: Remove REQUIRES libgcc from a test (#112412) --- clang/test/Driver/hip-include-path.hip | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/test/Driver/hip-include-path.hip b/clang/test/Driver/hip-include-path.hip index 1b4179e65c0b..5eeee2f5ce0d 100644 --- a/clang/test/Driver/hip-include-path.hip +++ b/clang/test/Driver/hip-include-path.hip @@ -1,4 +1,3 @@ -// REQUIRES: libgcc // UNSUPPORTED: system-windows // RUN: %clang -c -### --target=x86_64-unknown-linux-gnu --cuda-gpu-arch=gfx900 \ -- GitLab From 77ea619bc6cdcdf734105e0c96c92e060aadc011 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 17 Oct 2024 13:17:10 +0400 Subject: [PATCH 213/329] clang/HIP: Remove REQUIRES windows from a test (#112411) --- clang/test/Driver/hip-runtime-libs-msvc.hip | 2 -- 1 file changed, 2 deletions(-) diff --git a/clang/test/Driver/hip-runtime-libs-msvc.hip b/clang/test/Driver/hip-runtime-libs-msvc.hip index 8085e77d457e..943cd0569f4f 100644 --- a/clang/test/Driver/hip-runtime-libs-msvc.hip +++ b/clang/test/Driver/hip-runtime-libs-msvc.hip @@ -1,5 +1,3 @@ -// REQUIRES: system-windows - // RUN: touch %t.o // Test HIP runtime lib args specified by --rocm-path. 
-- GitLab From 6902b39b6ffda5ad1253147740fb04befbf82333 Mon Sep 17 00:00:00 2001 From: Ivan Butygin Date: Thu, 17 Oct 2024 12:23:11 +0300 Subject: [PATCH 214/329] [mlir] UnsignedWhenEquivalent: use greedy rewriter instead of dialect conversion (#112454) `UnsignedWhenEquivalent` doesn't really need any dialect conversion features, and switching it to normal patterns makes it more composable with other pattern-based transformations (and probably faster). --- .../mlir/Dialect/Arith/Transforms/Passes.h | 4 + .../Transforms/UnsignedWhenEquivalent.cpp | 98 ++++++++++++------- .../Arith/unsigned-when-equivalent.mlir | 20 ++-- 3 files changed, 76 insertions(+), 46 deletions(-) diff --git a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h index aee64475171a..e866ac518dbb 100644 --- a/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h +++ b/mlir/include/mlir/Dialect/Arith/Transforms/Passes.h @@ -70,6 +70,10 @@ std::unique_ptr<Pass> createArithUnsignedWhenEquivalentPass(); void populateIntRangeOptimizationsPatterns(RewritePatternSet &patterns, DataFlowSolver &solver); +/// Replace signed ops with unsigned ones where they are proven equivalent. +void populateUnsignedWhenEquivalentPatterns(RewritePatternSet &patterns, + DataFlowSolver &solver); + /// Create a pass which does optimizations based on integer range analysis. std::unique_ptr<Pass> createIntRangeOptimizationsPass(); diff --git a/mlir/lib/Dialect/Arith/Transforms/UnsignedWhenEquivalent.cpp b/mlir/lib/Dialect/Arith/Transforms/UnsignedWhenEquivalent.cpp index 4edce84bafd4..bebe0b5a7c0b 100644 --- a/mlir/lib/Dialect/Arith/Transforms/UnsignedWhenEquivalent.cpp +++ b/mlir/lib/Dialect/Arith/Transforms/UnsignedWhenEquivalent.cpp @@ -13,7 +13,8 @@ #include "mlir/Analysis/DataFlow/DeadCodeAnalysis.h" #include "mlir/Analysis/DataFlow/IntegerRangeAnalysis.h" #include "mlir/Dialect/Arith/IR/Arith.h" -#include "mlir/Transforms/DialectConversion.h" +#include "mlir/IR/PatternMatch.h" +#include "mlir/Transforms/GreedyPatternRewriteDriver.h" namespace mlir { namespace arith { @@ -29,6 +30,9 @@ using namespace mlir::dataflow; /// Succeeds when a value is statically non-negative in that it has a lower /// bound on its value (if it is treated as signed) and that bound is /// non-negative. +// TODO: IntegerRangeAnalysis internally assumes the index type is 64-bit and +// this pattern relies on that. These transformations may not be valid for a +// 32-bit index type and need more investigation.
static LogicalResult staticallyNonNegative(DataFlowSolver &solver, Value v) { auto *result = solver.lookupState<IntegerValueRangeLattice>(v); if (!result || result->getValue().isUninitialized()) @@ -85,35 +89,60 @@ static CmpIPredicate toUnsignedPred(CmpIPredicate pred) { } namespace { +class DataFlowListener : public RewriterBase::Listener { +public: + DataFlowListener(DataFlowSolver &s) : s(s) {} + +protected: + void notifyOperationErased(Operation *op) override { + s.eraseState(s.getProgramPointAfter(op)); + for (Value res : op->getResults()) + s.eraseState(res); + } + + DataFlowSolver &s; +}; + template <typename Signed, typename Unsigned> -struct ConvertOpToUnsigned : OpConversionPattern<Signed> { - using OpConversionPattern<Signed>::OpConversionPattern; +struct ConvertOpToUnsigned final : OpRewritePattern<Signed> { + ConvertOpToUnsigned(MLIRContext *context, DataFlowSolver &s) + : OpRewritePattern<Signed>(context), solver(s) {} - LogicalResult matchAndRewrite(Signed op, typename Signed::Adaptor adaptor, - ConversionPatternRewriter &rw) const override { - rw.replaceOpWithNewOp<Unsigned>(op, op->getResultTypes(), - adaptor.getOperands(), op->getAttrs()); + LogicalResult matchAndRewrite(Signed op, PatternRewriter &rw) const override { + if (failed( + staticallyNonNegative(this->solver, static_cast<Operation *>(op)))) + return failure(); + + rw.replaceOpWithNewOp<Unsigned>(op, op->getResultTypes(), op->getOperands(), + op->getAttrs()); return success(); } + +private: + DataFlowSolver &solver; }; -struct ConvertCmpIToUnsigned : OpConversionPattern<CmpIOp> { - using OpConversionPattern<CmpIOp>::OpConversionPattern; +struct ConvertCmpIToUnsigned final : OpRewritePattern<CmpIOp> { + ConvertCmpIToUnsigned(MLIRContext *context, DataFlowSolver &s) + : OpRewritePattern<CmpIOp>(context), solver(s) {} + + LogicalResult matchAndRewrite(CmpIOp op, PatternRewriter &rw) const override { + if (failed(isCmpIConvertable(this->solver, op))) + return failure(); - LogicalResult matchAndRewrite(CmpIOp op, CmpIOpAdaptor adaptor, - ConversionPatternRewriter &rw) const override { rw.replaceOpWithNewOp<CmpIOp>(op, toUnsignedPred(op.getPredicate()), op.getLhs(), op.getRhs()); return success(); } + +private: + DataFlowSolver &solver; }; struct ArithUnsignedWhenEquivalentPass : public arith::impl::ArithUnsignedWhenEquivalentBase< ArithUnsignedWhenEquivalentPass> { - /// Implementation structure: first find all equivalent ops and collect them, - /// then perform all the rewrites in a second pass over the target op. This - /// ensures that analysis results are not invalidated during rewriting.
+ void runOnOperation() override { Operation *op = getOperation(); MLIRContext *ctx = op->getContext(); @@ -123,35 +152,32 @@ struct ArithUnsignedWhenEquivalentPass if (failed(solver.initializeAndRun(op))) return signalPassFailure(); - ConversionTarget target(*ctx); - target.addLegalDialect<ArithDialect>(); - target.addDynamicallyLegalOp<DivSIOp, CeilDivSIOp, FloorDivSIOp, RemSIOp, MinSIOp, MaxSIOp, ExtSIOp>( - [&solver](Operation *op) -> std::optional<bool> { - return failed(staticallyNonNegative(solver, op)); - }); - target.addDynamicallyLegalOp<CmpIOp>( - [&solver](CmpIOp op) -> std::optional<bool> { - return failed(isCmpIConvertable(solver, op)); - }); + DataFlowListener listener(solver); RewritePatternSet patterns(ctx); - patterns.add<ConvertOpToUnsigned<DivSIOp, DivUIOp>, - ConvertOpToUnsigned<CeilDivSIOp, CeilDivUIOp>, - ConvertOpToUnsigned<FloorDivSIOp, DivUIOp>, - ConvertOpToUnsigned<RemSIOp, RemUIOp>, - ConvertOpToUnsigned<MinSIOp, MinUIOp>, - ConvertOpToUnsigned<MaxSIOp, MaxUIOp>, - ConvertOpToUnsigned<ExtSIOp, ExtUIOp>, ConvertCmpIToUnsigned>( - ctx); - - if (failed(applyPartialConversion(op, target, std::move(patterns)))) { + populateUnsignedWhenEquivalentPatterns(patterns, solver); + + GreedyRewriteConfig config; + config.listener = &listener; + + if (failed(applyPatternsAndFoldGreedily(op, std::move(patterns), config))) signalPassFailure(); - } } }; } // end anonymous namespace +void mlir::arith::populateUnsignedWhenEquivalentPatterns( + RewritePatternSet &patterns, DataFlowSolver &solver) { + patterns.add<ConvertOpToUnsigned<DivSIOp, DivUIOp>, + ConvertOpToUnsigned<CeilDivSIOp, CeilDivUIOp>, + ConvertOpToUnsigned<FloorDivSIOp, DivUIOp>, + ConvertOpToUnsigned<RemSIOp, RemUIOp>, + ConvertOpToUnsigned<MinSIOp, MinUIOp>, + ConvertOpToUnsigned<MaxSIOp, MaxUIOp>, + ConvertOpToUnsigned<ExtSIOp, ExtUIOp>, ConvertCmpIToUnsigned>( + patterns.getContext(), solver); +} + std::unique_ptr<Pass> mlir::arith::createArithUnsignedWhenEquivalentPass() { return std::make_unique<ArithUnsignedWhenEquivalentPass>(); } diff --git a/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir b/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir index 49bd74cfe912..0ea69de8b8f9 100644 --- a/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir +++ b/mlir/test/Dialect/Arith/unsigned-when-equivalent.mlir @@ -12,7 +12,7 @@ // CHECK: arith.cmpi slt // CHECK: arith.cmpi sge // CHECK: arith.cmpi sgt -func.func @not_with_maybe_overflow(%arg0 : i32) { +func.func @not_with_maybe_overflow(%arg0 : i32) -> (i32, i32, i32, i32, i32, i32, i64, i1, i1, i1, i1) { %ci32_smax = arith.constant 0x7fffffff : i32 %c1 = arith.constant 1 : i32 %c4 = arith.constant 4 : i32 @@ -29,7 +29,7 @@ func.func @not_with_maybe_overflow(%arg0 : i32) { %10 = arith.cmpi slt, %1, %c4 : i32 %11 = arith.cmpi sge, %1, %c4 : i32 %12 = arith.cmpi sgt, %1, %c4 : i32 - func.return + func.return %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12 : i32, i32, i32, i32, i32, i32, i64, i1, i1, i1, i1 } // CHECK-LABEL: func @yes_with_no_overflow // CHECK: arith.divui // CHECK: arith.ceildivui // CHECK: arith.divui // CHECK: arith.remui // CHECK: arith.minui // CHECK: arith.maxui // CHECK: arith.extui // CHECK: arith.cmpi ult // CHECK: arith.cmpi uge // CHECK: arith.cmpi ugt -func.func @yes_with_no_overflow(%arg0 : i32) { +func.func @yes_with_no_overflow(%arg0 : i32) -> (i32, i32, i32, i32, i32, i32, i64, i1, i1, i1, i1) { %ci32_almost_smax = arith.constant 0x7ffffffe : i32 %c1 = arith.constant 1 : i32 %c4 = arith.constant 4 : i32 @@ -61,7 +61,7 @@ func.func @yes_with_no_overflow(%arg0 : i32) { %10 = arith.cmpi slt, %1, %c4 : i32 %11 = arith.cmpi sge, %1, %c4 : i32 %12 = arith.cmpi sgt, %1, %c4 : i32 - func.return + func.return %2, %3, %4, %5, %6, %7, %8, %9, %10, %11, %12 : i32, i32, i32, i32, i32, i32, i64, i1, i1, i1, i1 } // CHECK-LABEL: func @preserves_structure // CHECK: scf.for %[[arg1:.*]] = // CHECK: %[[V:.*]] = arith.remui %[[arg1]] // CHECK: %[[COND:.*]] = arith.cmpi eq, %[[V]] // CHECK: scf.if %[[COND]] // CHECK: memref.store %[[arg1]] func.func @preserves_structure(%arg0 : memref<8xindex>) { %c0 = arith.constant 0 : index %c1 = arith.constant 1 : index %c4 = arith.constant 4 : index %c8 = arith.constant 8 : index scf.for %arg1 = %c0 to %c8 step %c1 { %v = arith.remsi %arg1, %c4 : index %cond = arith.cmpi eq, %v, %c0 : index scf.if %cond { memref.store %arg1, %arg0[%arg1] : memref<8xindex> } } return } func.func private @external() -> i8 // CHECK-LABEL: @dead_code -func.func @dead_code() { %0 = call @external() : () -> i8 //
CHECK: arith.floordivsi %1 = arith.floordivsi %0, %0 : i8 - return + return %1 : i8 } // Make sure this does not crash. // CHECK-LABEL: @no_integer_or_index func.func @no_integer_or_index(%arg0: vector<1xi32>) -> vector<1xi1> { // CHECK: arith.cmpi %cst_0 = arith.constant dense<[0]> : vector<1xi32> - %cmp = arith.cmpi slt, %cst_0, %cst_0 : vector<1xi32> - return + %cmp = arith.cmpi slt, %cst_0, %arg0 : vector<1xi32> + return %cmp : vector<1xi1> } // CHECK-LABEL: @gpu_func func.func @gpu_func(%arg0: memref<2x32xf32>, %arg1: memref<2x32xf32>, %arg2: mem gpu.terminator } return %arg1 : memref<2x32xf32> -} +} -- GitLab From 9b713f5d234adec266d46c9cfc3f2607793976dc Mon Sep 17 00:00:00 2001 From: Pradeep Kumar Date: Thu, 17 Oct 2024 15:03:00 +0530 Subject: [PATCH 215/329] [MLIR][NVVM] Add PTX predefined special registers (#112343) This commit adds support for the following PTX predefined special registers: * warpid * nwarpid * smid * nsmid * gridid * lanemask.* * globaltimer * envreg* Lit tests are added under nvvmir.mlir. --- mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td | 22 ++++- mlir/test/Target/LLVMIR/nvvmir.mlir | 92 ++++++++++++++++++++- 2 files changed, 109 insertions(+), 5 deletions(-) diff --git a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td index 152715f28108..5806295cedb1 100644 --- a/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td +++ b/mlir/include/mlir/Dialect/LLVMIR/NVVMOps.td @@ -139,9 +139,22 @@ class NVVM_SpecialRangeableRegisterOp<string mnemonic, list<Trait> traits = []> } //===----------------------------------------------------------------------===// -// Lane index and range +// Lane, Warp, SM, Grid index and range def NVVM_LaneIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.laneid">; def NVVM_WarpSizeOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.warpsize">; +def NVVM_WarpIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.warpid">; +def NVVM_WarpDimOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nwarpid">; +def NVVM_SmIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.smid">; +def NVVM_SmDimOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.nsmid">; +def NVVM_GridIdOp : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.gridid">; + +//===----------------------------------------------------------------------===// +// Lane Mask Comparison Ops +def NVVM_LaneMaskEqOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.eq">; +def NVVM_LaneMaskLeOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.le">; +def NVVM_LaneMaskLtOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.lt">; +def NVVM_LaneMaskGeOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.ge">; +def NVVM_LaneMaskGtOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.lanemask.gt">; //===----------------------------------------------------------------------===// // Thread index and range @@ -189,6 +202,13 @@ def NVVM_ClusterDim : NVVM_SpecialRangeableRegisterOp<"read.ptx.sreg.cluster.nct // Clock registers def NVVM_ClockOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.clock">; def NVVM_Clock64Op : NVVM_SpecialRegisterOp<"read.ptx.sreg.clock64">; +def NVVM_GlobalTimerOp : NVVM_SpecialRegisterOp<"read.ptx.sreg.globaltimer">; + +//===----------------------------------------------------------------------===// +// envreg registers +foreach index = !range(0, 32) in { + def NVVM_EnvReg # index # Op : NVVM_SpecialRegisterOp<"read.ptx.sreg.envreg" # index>; +} //===----------------------------------------------------------------------===// // NVVM approximate op definitions
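As a rough sketch of what the TableGen `foreach` above produces (hand-expanded purely for illustration, not part of the patch): `!range(0, 32)` yields the indices 0 through 31, and the `#` paste operator splices each index into both the record name and the register mnemonic, so the loop is equivalent to writing out 32 individual defs:

// Hypothetical hand-expansion of the envreg foreach, shown only to illustrate
// the TableGen paste operator; the generated records are NVVM_EnvReg0Op
// through NVVM_EnvReg31Op.
def NVVM_EnvReg0Op : NVVM_SpecialRegisterOp<"read.ptx.sreg.envreg0">;
def NVVM_EnvReg1Op : NVVM_SpecialRegisterOp<"read.ptx.sreg.envreg1">;
// ...
def NVVM_EnvReg31Op : NVVM_SpecialRegisterOp<"read.ptx.sreg.envreg31">;

Each such record should map to the NVVM intrinsic of the same name (llvm.nvvm.read.ptx.sreg.envregN), which is what the lit tests below exercise.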
diff --git a/mlir/test/Target/LLVMIR/nvvmir.mlir b/mlir/test/Target/LLVMIR/nvvmir.mlir index 7fd082a5eb3c..0471e5faf845 100644 --- a/mlir/test/Target/LLVMIR/nvvmir.mlir +++ b/mlir/test/Target/LLVMIR/nvvmir.mlir @@ -62,10 +62,94 @@ llvm.func @nvvm_special_regs() -> i32 { %29 = nvvm.read.ptx.sreg.clock : i32 // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.clock64 %30 = nvvm.read.ptx.sreg.clock64 : i64 - - // CHECK: %31 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x() - %31 = nvvm.read.ptx.sreg.tid.x range : i32 - + // CHECK: call i64 @llvm.nvvm.read.ptx.sreg.globaltimer + %31 = nvvm.read.ptx.sreg.globaltimer : i64 + // CHECK: %32 = call range(i32 0, 64) i32 @llvm.nvvm.read.ptx.sreg.tid.x() + %32 = nvvm.read.ptx.sreg.tid.x range : i32 + // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.warpid + %33 = nvvm.read.ptx.sreg.warpid : i32 + // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nwarpid + %34 = nvvm.read.ptx.sreg.nwarpid : i32 + // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.smid + %35 = nvvm.read.ptx.sreg.smid : i32 + // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.nsmid + %36 = nvvm.read.ptx.sreg.nsmid : i32 + // CHECK: call i32 @llvm.nvvm.read.ptx.sreg.gridid + %37 = nvvm.read.ptx.sreg.gridid : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg0 + %38 = nvvm.read.ptx.sreg.envreg0 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg1 + %39 = nvvm.read.ptx.sreg.envreg1 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg2 + %40 = nvvm.read.ptx.sreg.envreg2 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg3 + %41 = nvvm.read.ptx.sreg.envreg3 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg4 + %42 = nvvm.read.ptx.sreg.envreg4 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg5 + %43 = nvvm.read.ptx.sreg.envreg5 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg6 + %44 = nvvm.read.ptx.sreg.envreg6 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg7 + %45 = nvvm.read.ptx.sreg.envreg7 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg8 + %46 = nvvm.read.ptx.sreg.envreg8 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg9 + %47 = nvvm.read.ptx.sreg.envreg9 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg10 + %48 = nvvm.read.ptx.sreg.envreg10 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg11 + %49 = nvvm.read.ptx.sreg.envreg11 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg12 + %50 = nvvm.read.ptx.sreg.envreg12 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg13 + %51 = nvvm.read.ptx.sreg.envreg13 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg14 + %52 = nvvm.read.ptx.sreg.envreg14 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg15 + %53 = nvvm.read.ptx.sreg.envreg15 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg16 + %54 = nvvm.read.ptx.sreg.envreg16 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg17 + %55 = nvvm.read.ptx.sreg.envreg17 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg18 + %56 = nvvm.read.ptx.sreg.envreg18 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg19 + %57 = nvvm.read.ptx.sreg.envreg19 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg20 + %58 = nvvm.read.ptx.sreg.envreg20 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg21 + %59 = nvvm.read.ptx.sreg.envreg21 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg22 + %60 = nvvm.read.ptx.sreg.envreg22 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg23 + %61 = nvvm.read.ptx.sreg.envreg23 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg24 + %62 
= nvvm.read.ptx.sreg.envreg24 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg25 + %63 = nvvm.read.ptx.sreg.envreg25 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg26 + %64 = nvvm.read.ptx.sreg.envreg26 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg27 + %65 = nvvm.read.ptx.sreg.envreg27 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg28 + %66 = nvvm.read.ptx.sreg.envreg28 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg29 + %67 = nvvm.read.ptx.sreg.envreg29 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg30 + %68 = nvvm.read.ptx.sreg.envreg30 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.envreg31 + %69 = nvvm.read.ptx.sreg.envreg31 : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.eq + %70 = nvvm.read.ptx.sreg.lanemask.eq : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.le + %71 = nvvm.read.ptx.sreg.lanemask.le : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.lt + %72 = nvvm.read.ptx.sreg.lanemask.lt : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.ge + %73 = nvvm.read.ptx.sreg.lanemask.ge : i32 + //CHECK: call i32 @llvm.nvvm.read.ptx.sreg.lanemask.gt + %74 = nvvm.read.ptx.sreg.lanemask.gt : i32 llvm.return %1 : i32 } -- GitLab From 2ab2539ce95bd3330370e703020a28eca89ea872 Mon Sep 17 00:00:00 2001 From: Youngsuk Kim Date: Thu, 17 Oct 2024 05:40:43 -0400 Subject: [PATCH 216/329] [polly] Avoid llvm::Type::getPointerTo() (NFC) (#112651) `llvm::Type::getPointerTo()` is to be deprecated & removed soon. --- polly/lib/CodeGen/LoopGeneratorsKMP.cpp | 60 +++++++++---------------- 1 file changed, 20 insertions(+), 40 deletions(-) diff --git a/polly/lib/CodeGen/LoopGeneratorsKMP.cpp b/polly/lib/CodeGen/LoopGeneratorsKMP.cpp index 4ec5afe6aa63..45800b105ea7 100644 --- a/polly/lib/CodeGen/LoopGeneratorsKMP.cpp +++ b/polly/lib/CodeGen/LoopGeneratorsKMP.cpp @@ -28,27 +28,23 @@ void ParallelLoopGeneratorKMP::createCallSpawnThreads(Value *SubFn, if (!KMPCMicroTy) { // void (*kmpc_micro)(kmp_int32 *global_tid, kmp_int32 *bound_tid, ...) - Type *MicroParams[] = {Builder.getInt32Ty()->getPointerTo(), - Builder.getInt32Ty()->getPointerTo()}; + Type *MicroParams[] = {Builder.getPtrTy(0), Builder.getPtrTy(0)}; KMPCMicroTy = FunctionType::get(Builder.getVoidTy(), MicroParams, true); } // If F is not available, declare it. 
if (!F) { - StructType *IdentTy = - StructType::getTypeByName(M->getContext(), "struct.ident_t"); - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(), - KMPCMicroTy->getPointerTo()}; + Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty(), + Builder.getPtrTy(0)}; FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, true); F = Function::Create(Ty, Linkage, Name, M); } - Value *Task = Builder.CreatePointerBitCastOrAddrSpaceCast( - SubFn, KMPCMicroTy->getPointerTo()); + Value *Task = + Builder.CreatePointerBitCastOrAddrSpaceCast(SubFn, Builder.getPtrTy(0)); Value *Args[] = {SourceLocationInfo, Builder.getInt32(4) /* Number of arguments (w/o Task) */, @@ -77,12 +73,9 @@ void ParallelLoopGeneratorKMP::deployParallelExecution(Function *SubFn, } Function *ParallelLoopGeneratorKMP::prepareSubFnDefinition(Function *F) const { - std::vector Arguments = {Builder.getInt32Ty()->getPointerTo(), - Builder.getInt32Ty()->getPointerTo(), - LongType, - LongType, - LongType, - Builder.getPtrTy()}; + std::vector Arguments = { + Builder.getPtrTy(0), Builder.getPtrTy(0), LongType, LongType, LongType, + Builder.getPtrTy()}; FunctionType *FT = FunctionType::get(Builder.getVoidTy(), Arguments, false); Function *SubFn = Function::Create(FT, Function::InternalLinkage, @@ -320,11 +313,8 @@ Value *ParallelLoopGeneratorKMP::createCallGlobalThreadNum() { // If F is not available, declare it. if (!F) { - StructType *IdentTy = - StructType::getTypeByName(M->getContext(), "struct.ident_t"); - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Type *Params[] = {IdentTy->getPointerTo()}; + Type *Params[] = {Builder.getPtrTy(0)}; FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false); F = Function::Create(Ty, Linkage, Name, M); @@ -342,11 +332,8 @@ void ParallelLoopGeneratorKMP::createCallPushNumThreads(Value *GlobalThreadID, // If F is not available, declare it. if (!F) { - StructType *IdentTy = - StructType::getTypeByName(M->getContext(), "struct.ident_t"); - GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty(), + Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty(), Builder.getInt32Ty()}; FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false); @@ -367,20 +354,18 @@ void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID, const std::string Name = is64BitArch() ? "__kmpc_for_static_init_8" : "__kmpc_for_static_init_4"; Function *F = M->getFunction(Name); - StructType *IdentTy = - StructType::getTypeByName(M->getContext(), "struct.ident_t"); // If F is not available, declare it. if (!F) { GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage; - Type *Params[] = {IdentTy->getPointerTo(), + Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty(), Builder.getInt32Ty(), - Builder.getInt32Ty()->getPointerTo(), - LongType->getPointerTo(), - LongType->getPointerTo(), - LongType->getPointerTo(), + Builder.getPtrTy(0), + Builder.getPtrTy(0), + Builder.getPtrTy(0), + Builder.getPtrTy(0), LongType, LongType}; @@ -414,7 +399,7 @@ void ParallelLoopGeneratorKMP::createCallStaticFini(Value *GlobalThreadID) { // If F is not available, declare it. 
if (!F) {
     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
-    Type *Params[] = {IdentTy->getPointerTo(), Builder.getInt32Ty()};
+    Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty()};
     FunctionType *Ty = FunctionType::get(Builder.getVoidTy(), Params, false);
     F = Function::Create(Ty, Linkage, Name, M);
   }
@@ -432,14 +417,12 @@ void ParallelLoopGeneratorKMP::createCallDispatchInit(Value *GlobalThreadID,
   const std::string Name =
       is64BitArch() ? "__kmpc_dispatch_init_8" : "__kmpc_dispatch_init_4";
   Function *F = M->getFunction(Name);
-  StructType *IdentTy =
-      StructType::getTypeByName(M->getContext(), "struct.ident_t");
 
   // If F is not available, declare it.
   if (!F) {
     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
-    Type *Params[] = {IdentTy->getPointerTo(),
+    Type *Params[] = {Builder.getPtrTy(0),
                       Builder.getInt32Ty(),
                       Builder.getInt32Ty(),
                       LongType,
@@ -481,12 +464,9 @@ Value *ParallelLoopGeneratorKMP::createCallDispatchNext(Value *GlobalThreadID,
 
   if (!F) {
     GlobalValue::LinkageTypes Linkage = Function::ExternalLinkage;
-    Type *Params[] = {IdentTy->getPointerTo(),
-                      Builder.getInt32Ty(),
-                      Builder.getInt32Ty()->getPointerTo(),
-                      LongType->getPointerTo(),
-                      LongType->getPointerTo(),
-                      LongType->getPointerTo()};
+    Type *Params[] = {Builder.getPtrTy(0), Builder.getInt32Ty(),
+                      Builder.getPtrTy(0), Builder.getPtrTy(0),
+                      Builder.getPtrTy(0), Builder.getPtrTy(0)};
 
     FunctionType *Ty = FunctionType::get(Builder.getInt32Ty(), Params, false);
     F = Function::Create(Ty, Linkage, Name, M);
-- 
GitLab


From b584478e0068fd627b7f5e9f63574caab78cc56e Mon Sep 17 00:00:00 2001
From: Lukacma
Date: Thu, 17 Oct 2024 10:43:17 +0100
Subject: [PATCH 217/329] [AArch64] Introduce new armv9.6 features (#111677)

This patch implements the new features introduced in the 2024 release of
the ARM ISA and creates predicates that will be used by new instructions.
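As a rough sketch (not part of this patch) of how backend code might gate on the new feature bits, assuming the hasCMPBR()/hasFPRCVT()/hasSVE2p2() accessors that these Feature records generate on AArch64Subtarget:

#include "AArch64Subtarget.h" // LLVM-internal header; sketch only

// True when the three Armv9.6-A base features that are enabled by default
// (see HasV9_6aOps below) are all present on this subtarget.
static bool hasV96ABaseFeatures(const llvm::AArch64Subtarget &ST) {
  return ST.hasCMPBR() && ST.hasFPRCVT() && ST.hasSVE2p2();
}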
Co-authored-by: Caroline Concatto <caroline.concatto@arm.com>
Co-authored-by: Spencer Abson <spencer.abson@arm.com>
---
 clang/test/Driver/aarch64-v96a.c              | 42 +++++++++++++++--
 .../print-supported-extensions-aarch64.c      | 11 +++++
 llvm/lib/Target/AArch64/AArch64Features.td    | 37 ++++++++++++++-
 llvm/lib/Target/AArch64/AArch64InstrInfo.td   | 35 +++++++++++++-
 .../TargetParser/TargetParserTest.cpp         | 47 +++++++++++++++----
 5 files changed, 158 insertions(+), 14 deletions(-)

diff --git a/clang/test/Driver/aarch64-v96a.c b/clang/test/Driver/aarch64-v96a.c
index 0aaadddb2842..80c99be93433 100644
--- a/clang/test/Driver/aarch64-v96a.c
+++ b/clang/test/Driver/aarch64-v96a.c
@@ -6,7 +6,7 @@
 // RUN: %clang -target aarch64 -mlittle-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A %s
 // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A %s
 // RUN: %clang -target aarch64_be -mlittle-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A %s
-// GENERICV96A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"
+// GENERICV96A: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+cmpbr"{{.*}} "-target-feature" "+fprcvt"{{.*}} "-target-feature" "+sve2p2"
 
 // RUN: %clang -target aarch64_be -march=armv9.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s
 // RUN: %clang -target aarch64_be -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s
@@ -14,6 +14,42 @@
 // RUN: %clang -target aarch64 -mbig-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s
 // RUN: %clang -target aarch64_be -mbig-endian -march=armv9.6a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s
 // RUN: %clang -target aarch64_be -mbig-endian -march=armv9.6-a -### -c %s 2>&1 | FileCheck -check-prefix=GENERICV96A-BE %s
-// GENERICV96A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"
-//
+// GENERICV96A-BE: "-cc1"{{.*}} "-triple" "aarch64_be{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+cmpbr"{{.*}} "-target-feature" "+fprcvt"{{.*}} "-target-feature" "+sve2p2"
+
 // ===== Features supported on aarch64 =====
+
+// RUN: %clang -target aarch64 -march=armv9.6a+f8f16mm -### -c %s 2>&1 | FileCheck -check-prefix=V96A-F8F16MM %s
+// RUN: %clang -target aarch64 -march=armv9.6-a+f8f16mm -### -c %s 2>&1 | FileCheck -check-prefix=V96A-F8F16MM %s
+// V96A-F8F16MM: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+f8f16mm"
+
+// RUN: %clang -target aarch64 -march=armv9.6a+f8f32mm -### -c %s 2>&1 | FileCheck -check-prefix=V96A-F8F32MM %s
+// RUN: %clang -target aarch64 -march=armv9.6-a+f8f32mm -### -c %s 2>&1 | FileCheck -check-prefix=V96A-F8F32MM %s
+// V96A-F8F32MM: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+f8f32mm"
+
+// RUN: %clang -target aarch64 -march=armv9.6a+lsfe -### -c %s 2>&1 | FileCheck -check-prefix=V96A-LSFE %s
+// RUN: %clang -target aarch64 -march=armv9.6-a+lsfe -### -c %s 2>&1 | FileCheck -check-prefix=V96A-LSFE %s
+// V96A-LSFE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+lsfe"
+
+// RUN: %clang -target aarch64 -march=armv9.6a+sme2p2 -### -c %s 2>&1 | FileCheck
-check-prefix=V96A-SME2p2 %s +// RUN: %clang -target aarch64 -march=armv9.6-a+sme2p2 -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SME2p2 %s +// V96A-SME2p2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+sme2p2" + +// RUN: %clang -target aarch64 -march=armv9.6a+ssve-aes -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SSVE-AES %s +// RUN: %clang -target aarch64 -march=armv9.6-a+ssve-aes -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SSVE-AES %s +// V96A-SSVE-AES: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+ssve-aes" + +// RUN: %clang -target aarch64 -march=armv9.6a+sve2p2 -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE2p2 %s +// RUN: %clang -target aarch64 -march=armv9.6-a+sve2p2 -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE2p2 %s +// V96A-SVE2p2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+sve2p2" + +// RUN: %clang -target aarch64 -march=armv9.6a+sve-aes2 -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE-AES2 %s +// RUN: %clang -target aarch64 -march=armv9.6-a+sve-aes2 -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE-AES2 %s +// V96A-SVE-AES2: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+sve-aes2" + +// RUN: %clang -target aarch64 -march=armv9.6a+sve-bfscale -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE-BFSCALE %s +// RUN: %clang -target aarch64 -march=armv9.6-a+sve-bfscale -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE-BFSCALE %s +// V96A-SVE-BFSCALE: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+sve-bfscale" + +// RUN: %clang -target aarch64 -march=armv9.6a+sve-f16f32mm -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE-F16F32MM %s +// RUN: %clang -target aarch64 -march=armv9.6-a+sve-f16f32mm -### -c %s 2>&1 | FileCheck -check-prefix=V96A-SVE-F16F32MM %s +// V96A-SVE-F16F32MM: "-cc1"{{.*}} "-triple" "aarch64{{.*}}" "-target-cpu" "generic" "-target-feature" "+v9.6a"{{.*}} "-target-feature" "+sve-f16f32mm" diff --git a/clang/test/Driver/print-supported-extensions-aarch64.c b/clang/test/Driver/print-supported-extensions-aarch64.c index e6247307c721..fbc0d70c4901 100644 --- a/clang/test/Driver/print-supported-extensions-aarch64.c +++ b/clang/test/Driver/print-supported-extensions-aarch64.c @@ -8,6 +8,7 @@ // CHECK-NEXT: bf16 FEAT_BF16 Enable BFloat16 Extension // CHECK-NEXT: brbe FEAT_BRBE Enable Branch Record Buffer Extension // CHECK-NEXT: bti FEAT_BTI Enable Branch Target Identification +// CHECK-NEXT: cmpbr FEAT_CMPBR Enable Armv9.6-A base compare and branch instructions // CHECK-NEXT: fcma FEAT_FCMA Enable Armv8.3-A Floating-point complex number support // CHECK-NEXT: cpa FEAT_CPA Enable Armv9.5-A Checked Pointer Arithmetic // CHECK-NEXT: crc FEAT_CRC32 Enable Armv8.0-A CRC-32 checksum instructions @@ -18,6 +19,8 @@ // CHECK-NEXT: dotprod FEAT_DotProd Enable dot product support // CHECK-NEXT: f32mm FEAT_F32MM Enable Matrix Multiply FP32 Extension // CHECK-NEXT: f64mm FEAT_F64MM Enable Matrix Multiply FP64 Extension +// CHECK-NEXT: f8f16mm FEAT_F8F16MM Enable Armv9.6-A FP8 to Half-Precision Matrix Multiplication +// CHECK-NEXT: f8f32mm FEAT_F8F32MM Enable Armv9.6-A FP8 to Single-Precision Matrix Multiplication // CHECK-NEXT: faminmax FEAT_FAMINMAX Enable FAMIN and FAMAX instructions // CHECK-NEXT: flagm FEAT_FlagM 
Enable Armv8.4-A Flag Manipulation instructions // CHECK-NEXT: fp FEAT_FP Enable Armv8.0-A Floating Point Extensions @@ -26,6 +29,7 @@ // CHECK-NEXT: fp8dot2 FEAT_FP8DOT2 Enable FP8 2-way dot instructions // CHECK-NEXT: fp8dot4 FEAT_FP8DOT4 Enable FP8 4-way dot instructions // CHECK-NEXT: fp8fma FEAT_FP8FMA Enable Armv9.5-A FP8 multiply-add instructions +// CHECK-NEXT: fprcvt FEAT_FPRCVT Enable Armv9.6-A base convert instructions for SIMD&FP scalar register operands of different input and output sizes // CHECK-NEXT: fp16 FEAT_FP16 Enable half-precision floating-point data processing // CHECK-NEXT: gcs FEAT_GCS Enable Armv9.4-A Guarded Call Stack Extension // CHECK-NEXT: hbc FEAT_HBC Enable Armv8.8-A Hinted Conditional Branches Extension @@ -35,6 +39,7 @@ // CHECK-NEXT: ls64 FEAT_LS64, FEAT_LS64_V, FEAT_LS64_ACCDATA Enable Armv8.7-A LD64B/ST64B Accelerator Extension // CHECK-NEXT: lse FEAT_LSE Enable Armv8.1-A Large System Extension (LSE) atomic instructions // CHECK-NEXT: lse128 FEAT_LSE128 Enable Armv9.4-A 128-bit Atomic instructions +// CHECK-NEXT: lsfe FEAT_LSFE Enable Armv9.6-A base Atomic floating-point in-memory instructions // CHECK-NEXT: lut FEAT_LUT Enable Lookup Table instructions // CHECK-NEXT: mops FEAT_MOPS Enable Armv8.8-A memcpy and memset acceleration instructions // CHECK-NEXT: memtag FEAT_MTE, FEAT_MTE2 Enable Memory Tagging Extension @@ -64,20 +69,26 @@ // CHECK-NEXT: sme-lutv2 FEAT_SME_LUTv2 Enable Scalable Matrix Extension (SME) LUTv2 instructions // CHECK-NEXT: sme2 FEAT_SME2 Enable Scalable Matrix Extension 2 (SME2) instructions // CHECK-NEXT: sme2p1 FEAT_SME2p1 Enable Scalable Matrix Extension 2.1 instructions +// CHECK-NEXT: sme2p2 FEAT_SME2p2 Enable Armv9.6-A Scalable Matrix Extension 2.2 instructions // CHECK-NEXT: profile FEAT_SPE Enable Statistical Profiling extension // CHECK-NEXT: predres2 FEAT_SPECRES2 Enable Speculation Restriction Instruction // CHECK-NEXT: ssbs FEAT_SSBS, FEAT_SSBS2 Enable Speculative Store Bypass Safe bit +// CHECK-NEXT: ssve-aes FEAT_SSVE_AES Enable Armv9.6-A SVE2 AES support in streaming SVE mode // CHECK-NEXT: ssve-fp8dot2 FEAT_SSVE_FP8DOT2 Enable SVE2 FP8 2-way dot product instructions // CHECK-NEXT: ssve-fp8dot4 FEAT_SSVE_FP8DOT4 Enable SVE2 FP8 4-way dot product instructions // CHECK-NEXT: ssve-fp8fma FEAT_SSVE_FP8FMA Enable SVE2 FP8 multiply-add instructions // CHECK-NEXT: sve FEAT_SVE Enable Scalable Vector Extension (SVE) instructions +// CHECK-NEXT: sve-aes2 FEAT_SVE_AES2 Enable Armv9.6-A SVE multi-vector AES and 128-bit PMULL instructions // CHECK-NEXT: sve-b16b16 FEAT_SVE_B16B16 Enable SVE2 non-widening and SME2 Z-targeting non-widening BFloat16 instructions +// CHECK-NEXT: sve-bfscale FEAT_SVE_BFSCALE Enable Armv9.6-A SVE BFloat16 scaling instructions +// CHECK-NEXT: sve-f16f32mm FEAT_SVE_F16F32MM Enable Armv9.6-A FP16 to FP32 Matrix Multiply // CHECK-NEXT: sve2 FEAT_SVE2 Enable Scalable Vector Extension 2 (SVE2) instructions // CHECK-NEXT: sve2-aes FEAT_SVE_AES, FEAT_SVE_PMULL128 Enable AES SVE2 instructions // CHECK-NEXT: sve2-bitperm FEAT_SVE_BitPerm Enable bit permutation SVE2 instructions // CHECK-NEXT: sve2-sha3 FEAT_SVE_SHA3 Enable SHA3 SVE2 instructions // CHECK-NEXT: sve2-sm4 FEAT_SVE_SM4 Enable SM4 SVE2 instructions // CHECK-NEXT: sve2p1 FEAT_SVE2p1 Enable Scalable Vector Extension 2.1 instructions +// CHECK-NEXT: sve2p2 FEAT_SVE2p2 Enable Armv9.6-A Scalable Vector Extension 2.2 instructions // CHECK-NEXT: the FEAT_THE Enable Armv8.9-A Translation Hardening Extension // CHECK-NEXT: tlbiw FEAT_TLBIW Enable Armv9.5-A 
TLBI VMALL for Dirty State // CHECK-NEXT: tme FEAT_TME Enable Transactional Memory Extension diff --git a/llvm/lib/Target/AArch64/AArch64Features.td b/llvm/lib/Target/AArch64/AArch64Features.td index 97671bc59f6b..831f311b2364 100644 --- a/llvm/lib/Target/AArch64/AArch64Features.td +++ b/llvm/lib/Target/AArch64/AArch64Features.td @@ -522,6 +522,39 @@ def FeatureTLBIW : ExtensionWithMArch<"tlbiw", "TLBIW", "FEAT_TLBIW", // Armv9.6 Architecture Extensions //===----------------------------------------------------------------------===// +def FeatureCMPBR : ExtensionWithMArch<"cmpbr", "CMPBR", "FEAT_CMPBR", + "Enable Armv9.6-A base compare and branch instructions">; + +def FeatureF8F32MM: ExtensionWithMArch<"f8f32mm", "F8F32MM", "FEAT_F8F32MM", + "Enable Armv9.6-A FP8 to Single-Precision Matrix Multiplication">; + +def FeatureF8F16MM: ExtensionWithMArch<"f8f16mm", "F8F16MM", "FEAT_F8F16MM", + "Enable Armv9.6-A FP8 to Half-Precision Matrix Multiplication">; + +def FeatureFPRCVT: ExtensionWithMArch<"fprcvt", "FPRCVT", "FEAT_FPRCVT", + "Enable Armv9.6-A base convert instructions for SIMD&FP scalar register operands of" + " different input and output sizes">; + +def FeatureLSFE : ExtensionWithMArch<"lsfe", "LSFE", "FEAT_LSFE", + "Enable Armv9.6-A base Atomic floating-point in-memory instructions">; + +def FeatureSME2p2: ExtensionWithMArch<"sme2p2", "SME2p2", "FEAT_SME2p2", + "Enable Armv9.6-A Scalable Matrix Extension 2.2 instructions", [FeatureSME2p1]>; + +def FeatureSSVE_AES : ExtensionWithMArch<"ssve-aes", "SSVE_AES", "FEAT_SSVE_AES", + "Enable Armv9.6-A SVE2 AES support in streaming SVE mode">; + +def FeatureSVE2p2 : ExtensionWithMArch<"sve2p2", "SVE2p2", "FEAT_SVE2p2", + "Enable Armv9.6-A Scalable Vector Extension 2.2 instructions", [FeatureSVE2p1]>; + +def FeatureSVEAES2: ExtensionWithMArch<"sve-aes2", "SVE_AES2", "FEAT_SVE_AES2", + "Enable Armv9.6-A SVE multi-vector AES and 128-bit PMULL instructions">; + +def FeatureSVEBFSCALE: ExtensionWithMArch<"sve-bfscale", "SVE_BFSCALE", "FEAT_SVE_BFSCALE", + "Enable Armv9.6-A SVE BFloat16 scaling instructions">; + +def FeatureSVE_F16F32MM: ExtensionWithMArch<"sve-f16f32mm", "SVE_F16F32MM", "FEAT_SVE_F16F32MM", + "Enable Armv9.6-A FP16 to FP32 Matrix Multiply instructions">; //===----------------------------------------------------------------------===// // Other Features @@ -833,8 +866,8 @@ def HasV9_5aOps : Architecture64<9, 5, "a", "v9.5a", [HasV9_4aOps, FeatureCPA], !listconcat(HasV9_4aOps.DefaultExts, [FeatureCPA, FeatureLUT, FeatureFAMINMAX])>; def HasV9_6aOps : Architecture64<9, 6, "a", "v9.6a", - [HasV9_5aOps], - !listconcat(HasV9_5aOps.DefaultExts, [])>; + [HasV9_5aOps, FeatureCMPBR, FeatureFPRCVT, FeatureSVE2p2], + !listconcat(HasV9_5aOps.DefaultExts, [FeatureCMPBR, FeatureFPRCVT, FeatureSVE2p2])>; def HasV8_0rOps : Architecture64<8, 0, "r", "v8r", [ //v8.1 FeatureCRC, FeaturePAN, FeatureLSE, FeatureCONTEXTIDREL2, diff --git a/llvm/lib/Target/AArch64/AArch64InstrInfo.td b/llvm/lib/Target/AArch64/AArch64InstrInfo.td index 32f2c7c71d17..6c9f0986b9e3 100644 --- a/llvm/lib/Target/AArch64/AArch64InstrInfo.td +++ b/llvm/lib/Target/AArch64/AArch64InstrInfo.td @@ -213,12 +213,35 @@ def HasSMEF8F16 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8 def HasSMEF8F32 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSMEF8F32()">, AssemblerPredicateWithAll<(all_of FeatureSMEF8F32), "sme-f8f32">; +def HasCMPBR : Predicate<"Subtarget->hasCMPBR()">, + AssemblerPredicateWithAll<(all_of FeatureCMPBR), "cmpbr">; +def HasF8F32MM : 
Predicate<"Subtarget->hasF8F32MM()">, + AssemblerPredicateWithAll<(all_of FeatureF8F32MM), "f8f32mm">; +def HasF8F16MM : Predicate<"Subtarget->hasF8F16MM()">, + AssemblerPredicateWithAll<(all_of FeatureF8F16MM), "f8f16mm">; +def HasFPRCVT : Predicate<"Subtarget->hasFPRCVT()">, + AssemblerPredicateWithAll<(all_of FeatureFPRCVT), "fprcvt">; +def HasLSFE : Predicate<"Subtarget->hasLSFE()">, + AssemblerPredicateWithAll<(all_of FeatureLSFE), "lsfe">; +def HasSME2p2 : Predicate<"Subtarget->isStreaming() && Subtarget->hasSME2p2()">, + AssemblerPredicateWithAll<(all_of FeatureSME2p2), "sme2p2">; +def HasSVEAES2 : Predicate<"Subtarget->hasSVEAES2()">, + AssemblerPredicateWithAll<(all_of FeatureSVEAES2), "sve-aes2">; +def HasSVEBFSCALE : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSVEBFSCALE()">, + AssemblerPredicateWithAll<(all_of FeatureSVEBFSCALE), "sve-bfscale">; +def HasSVE_F16F32MM : Predicate<"Subtarget->isSVEAvailable() && Subtarget->hasSVE_F16F32MM()">, + AssemblerPredicateWithAll<(all_of FeatureSVE_F16F32MM), "sve-f16f32mm">; // A subset of SVE(2) instructions are legal in Streaming SVE execution mode, // they should be enabled if either has been specified. def HasSVEorSME : Predicate<"Subtarget->hasSVE() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME), "sve or sme">; +def HasSVEorSME2p2 + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSME2p2())">, + AssemblerPredicateWithAll<(any_of FeatureSVE, FeatureSME2p2), + "sve or sme2p2">; def HasSVE2orSME : Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME), @@ -227,6 +250,10 @@ def HasSVE2orSME2 : Predicate<"Subtarget->hasSVE2() || (Subtarget->isStreaming() && Subtarget->hasSME2())">, AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSME2), "sve2 or sme2">; +def HasSVE2orSSVE_AES + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_AES())">, + AssemblerPredicateWithAll<(any_of FeatureSVE2, FeatureSSVE_AES), "sve2 or ssve-aes">; def HasSVE2p1_or_HasSME : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME())">, AssemblerPredicateWithAll<(any_of FeatureSME, FeatureSVE2p1), "sme or sve2p1">; @@ -236,7 +263,13 @@ def HasSVE2p1_or_HasSME2 def HasSVE2p1_or_HasSME2p1 : Predicate<"Subtarget->hasSVE2p1() || (Subtarget->isStreaming() && Subtarget->hasSME2p1())">, AssemblerPredicateWithAll<(any_of FeatureSME2p1, FeatureSVE2p1), "sme2p1 or sve2p1">; - +def HasSVE2p2orSME2p2 + : Predicate<"Subtarget->isSVEorStreamingSVEAvailable() && (Subtarget->hasSVE2p2() || Subtarget->hasSME2p2())">, + AssemblerPredicateWithAll<(any_of FeatureSME2p2, FeatureSVE2p2), "sme2p2 or sve2p2">; +def HasSVE2p1orSSVE_AES + : Predicate<"(Subtarget->isSVEAvailable() && Subtarget->hasSVE2p1()) ||" + "(Subtarget->isSVEorStreamingSVEAvailable() && Subtarget->hasSSVE_AES())">, + AssemblerPredicateWithAll<(any_of FeatureSVE2p1, FeatureSSVE_AES), "sve2p1 or ssve-aes">; def HasSMEF16F16orSMEF8F16 : Predicate<"Subtarget->isStreaming() && (Subtarget->hasSMEF16F16() || Subtarget->hasSMEF8F16())">, AssemblerPredicateWithAll<(any_of FeatureSMEF16F16, FeatureSMEF8F16), diff --git a/llvm/unittests/TargetParser/TargetParserTest.cpp b/llvm/unittests/TargetParser/TargetParserTest.cpp index 
87b78d502780..369e53463480 100644 --- a/llvm/unittests/TargetParser/TargetParserTest.cpp +++ b/llvm/unittests/TargetParser/TargetParserTest.cpp @@ -1326,8 +1326,12 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { AArch64::AEK_CPA, AArch64::AEK_PAUTHLR, AArch64::AEK_TLBIW, AArch64::AEK_JSCVT, AArch64::AEK_FCMA, AArch64::AEK_FP8, - AArch64::AEK_SVEB16B16, - }; + AArch64::AEK_SVEB16B16, AArch64::AEK_SVE2P2, + AArch64::AEK_SME2P2, AArch64::AEK_SVE_BFSCALE, + AArch64::AEK_SVE_F16F32MM, AArch64::AEK_SVE_AES2, + AArch64::AEK_SSVE_AES, AArch64::AEK_F8F32MM, + AArch64::AEK_F8F16MM, AArch64::AEK_LSFE, + AArch64::AEK_FPRCVT, AArch64::AEK_CMPBR}; std::vector Features; @@ -1359,12 +1363,17 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+ras")); EXPECT_TRUE(llvm::is_contained(Features, "+sve")); EXPECT_TRUE(llvm::is_contained(Features, "+sve-b16b16")); + EXPECT_TRUE(llvm::is_contained(Features, "+sve-bfscale")); + EXPECT_TRUE(llvm::is_contained(Features, "+sve-f16f32mm")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-aes")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sm4")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-sha3")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2-bitperm")); + EXPECT_TRUE(llvm::is_contained(Features, "+sve-aes2")); + EXPECT_TRUE(llvm::is_contained(Features, "+ssve-aes")); EXPECT_TRUE(llvm::is_contained(Features, "+sve2p1")); + EXPECT_TRUE(llvm::is_contained(Features, "+sve2p2")); EXPECT_TRUE(llvm::is_contained(Features, "+rcpc")); EXPECT_TRUE(llvm::is_contained(Features, "+rand")); EXPECT_TRUE(llvm::is_contained(Features, "+mte")); @@ -1387,6 +1396,7 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+sme2")); EXPECT_TRUE(llvm::is_contained(Features, "+sme-b16b16")); EXPECT_TRUE(llvm::is_contained(Features, "+sme2p1")); + EXPECT_TRUE(llvm::is_contained(Features, "+sme2p2")); EXPECT_TRUE(llvm::is_contained(Features, "+hbc")); EXPECT_TRUE(llvm::is_contained(Features, "+mops")); EXPECT_TRUE(llvm::is_contained(Features, "+perfmon")); @@ -1406,6 +1416,8 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+ssve-fp8dot2")); EXPECT_TRUE(llvm::is_contained(Features, "+fp8dot4")); EXPECT_TRUE(llvm::is_contained(Features, "+ssve-fp8dot4")); + EXPECT_TRUE(llvm::is_contained(Features, "+f8f32mm")); + EXPECT_TRUE(llvm::is_contained(Features, "+f8f16mm")); EXPECT_TRUE(llvm::is_contained(Features, "+lut")); EXPECT_TRUE(llvm::is_contained(Features, "+sme-lutv2")); EXPECT_TRUE(llvm::is_contained(Features, "+sme-f8f16")); @@ -1416,6 +1428,9 @@ TEST(TargetParserTest, AArch64ExtensionFeatures) { EXPECT_TRUE(llvm::is_contained(Features, "+tlbiw")); EXPECT_TRUE(llvm::is_contained(Features, "+jsconv")); EXPECT_TRUE(llvm::is_contained(Features, "+complxnum")); + EXPECT_TRUE(llvm::is_contained(Features, "+lsfe")); + EXPECT_TRUE(llvm::is_contained(Features, "+fprcvt")); + EXPECT_TRUE(llvm::is_contained(Features, "+cmpbr")); // Assuming we listed every extension above, this should produce the same // result. 
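(As a hedged illustration, separate from the patch itself: the new names are expected to round-trip through the TargetParser roughly as below, assuming getArchExtFeature keeps its current StringRef-to-StringRef mapping of "foo"/"nofoo" to "+foo"/"-foo".)

#include "llvm/TargetParser/AArch64TargetParser.h"
#include <cassert>

// Sketch: the new Armv9.6-A extension names and their feature strings.
void checkV96ExtensionNames() {
  assert(llvm::AArch64::getArchExtFeature("cmpbr") == "+cmpbr");
  assert(llvm::AArch64::getArchExtFeature("nocmpbr") == "-cmpbr");
  assert(llvm::AArch64::getArchExtFeature("sve2p2") == "+sve2p2");
}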
@@ -1513,12 +1528,17 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"rdm", "nordm", "+rdm", "-rdm"}, {"sve", "nosve", "+sve", "-sve"}, {"sve-b16b16", "nosve-b16b16", "+sve-b16b16", "-sve-b16b16"}, + {"sve-bfscale", "nosve-bfscale", "+sve-bfscale", "-sve-bfscale"}, + {"sve-f16f32mm", "nosve-f16f32mm", "+sve-f16f32mm", "-sve-f16f32mm"}, {"sve2", "nosve2", "+sve2", "-sve2"}, {"sve2-aes", "nosve2-aes", "+sve2-aes", "-sve2-aes"}, {"sve2-sm4", "nosve2-sm4", "+sve2-sm4", "-sve2-sm4"}, {"sve2-sha3", "nosve2-sha3", "+sve2-sha3", "-sve2-sha3"}, {"sve2p1", "nosve2p1", "+sve2p1", "-sve2p1"}, + {"sve2p2", "nosve2p2", "+sve2p2", "-sve2p2"}, {"sve2-bitperm", "nosve2-bitperm", "+sve2-bitperm", "-sve2-bitperm"}, + {"sve-aes2", "nosve-aes2", "+sve-aes2", "-sve-aes2"}, + {"ssve-aes", "nossve-aes", "+ssve-aes", "-ssve-aes"}, {"dotprod", "nodotprod", "+dotprod", "-dotprod"}, {"rcpc", "norcpc", "+rcpc", "-rcpc"}, {"rng", "norng", "+rand", "-rand"}, @@ -1531,6 +1551,8 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"i8mm", "noi8mm", "+i8mm", "-i8mm"}, {"f32mm", "nof32mm", "+f32mm", "-f32mm"}, {"f64mm", "nof64mm", "+f64mm", "-f64mm"}, + {"f8f32mm", "nof8f32mm", "+f8f32mm", "-f8f32mm"}, + {"f8f16mm", "nof8f16mm", "+f8f16mm", "-f8f16mm"}, {"sme", "nosme", "+sme", "-sme"}, {"sme-fa64", "nosme-fa64", "+sme-fa64", "-sme-fa64"}, {"sme-f64f64", "nosme-f64f64", "+sme-f64f64", "-sme-f64f64"}, @@ -1539,6 +1561,7 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"sme2", "nosme2", "+sme2", "-sme2"}, {"sme-b16b16", "nosme-b16b16", "+sme-b16b16", "-sme-b16b16"}, {"sme2p1", "nosme2p1", "+sme2p1", "-sme2p1"}, + {"sme2p2", "nosme2p2", "+sme2p2", "-sme2p2"}, {"hbc", "nohbc", "+hbc", "-hbc"}, {"mops", "nomops", "+mops", "-mops"}, {"pmuv3", "nopmuv3", "+perfmon", "-perfmon"}, @@ -1557,7 +1580,9 @@ TEST(TargetParserTest, AArch64ArchExtFeature) { {"sme-lutv2", "nosme-lutv2", "+sme-lutv2", "-sme-lutv2"}, {"sme-f8f16", "nosme-f8f16", "+sme-f8f16", "-sme-f8f16"}, {"sme-f8f32", "nosme-f8f32", "+sme-f8f32", "-sme-f8f32"}, - }; + {"lsfe", "nolsfe", "+lsfe", "-lsfe"}, + {"fprcvt", "nofprcvt", "+fprcvt", "-fprcvt"}, + {"cmpbr", "nocmpbr", "+cmpbr", "-cmpbr"}}; for (unsigned i = 0; i < std::size(ArchExt); i++) { EXPECT_EQ(StringRef(ArchExt[i][2]), @@ -1783,7 +1808,7 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV8A, {"nosve", "f64mm"}, {"sve", "f64mm"}, {}}, {AArch64::ARMV8A, {"f64mm", "nosve"}, {}, {"sve", "f64mm"}}, - // sve2 -> {sve2p1, sve2-bitperm, sve2-aes, sve2-sha3, sve2-sm4} + // sve2 -> {sve2p1, sve2-bitperm, sve2-sha3, sve2-sm4} {AArch64::ARMV8A, {"nosve2", "sve2p1"}, {"sve2", "sve2p1"}, {}}, {AArch64::ARMV8A, {"sve2p1", "nosve2"}, {}, {"sve2", "sve2p1"}}, {AArch64::ARMV8A, @@ -1794,23 +1819,25 @@ AArch64ExtensionDependenciesBaseArchTestParams {"sve2-bitperm", "nosve2"}, {}, {"sve2", "sve2-bitperm"}}, - {AArch64::ARMV8A, {"nosve2", "sve2-aes"}, {"sve2", "sve2-aes"}, {}}, - {AArch64::ARMV8A, {"sve2-aes", "nosve2"}, {}, {"sve2", "sve2-aes"}}, {AArch64::ARMV8A, {"nosve2", "sve2-sha3"}, {"sve2", "sve2-sha3"}, {}}, {AArch64::ARMV8A, {"sve2-sha3", "nosve2"}, {}, {"sve2", "sve2-sha3"}}, {AArch64::ARMV8A, {"nosve2", "sve2-sm4"}, {"sve2", "sve2-sm4"}, {}}, {AArch64::ARMV8A, {"sve2-sm4", "nosve2"}, {}, {"sve2", "sve2-sm4"}}, // sve-b16b16 -> {sme-b16b16} - {AArch64::ARMV8A, + {AArch64::ARMV9_4A, {"nosve-b16b16", "sme-b16b16"}, {"sve-b16b16", "sme-b16b16"}, {}}, - {AArch64::ARMV8A, + {AArch64::ARMV9_4A, {"sme-b16b16", "nosve-b16b16"}, {}, {"sve-b16b16", "sme-b16b16"}}, + // sve2p1 -> {sve2p2} + {AArch64::ARMV9_6A, 
{"nosve2p1", "sve2p2"}, {"sve2p1", "sve2p2"}, {}}, + {AArch64::ARMV9_6A, {"sve2p2", "nosve2p1"}, {}, {"sve2p1", "sve2p2"}}, + // sme -> {sme2, sme-f16f16, sme-f64f64, sme-i16i64, sme-fa64} {AArch64::ARMV8A, {"nosme", "sme2"}, {"sme", "sme2"}, {}}, {AArch64::ARMV8A, {"sme2", "nosme"}, {}, {"sme", "sme2"}}, @@ -1858,6 +1885,10 @@ AArch64ExtensionDependenciesBaseArchTestParams {AArch64::ARMV8A, {"nosme2", "sme-b16b16"}, {"sme2", "sme-b16b16"}, {}}, {AArch64::ARMV8A, {"sme-b16b16", "nosme2"}, {}, {"sme2", "sme-b16b16"}}, + // sme2p1 -> {sme2p2} + {AArch64::ARMV9_6A, {"nosme2p1", "sme2p2"}, {"sme2p2", "sme2p1"}, {}}, + {AArch64::ARMV9_6A, {"sme2p2", "nosme2p1"}, {}, {"sme2p1", "sme2p2"}}, + // fp8 -> {sme-f8f16, sme-f8f32} {AArch64::ARMV8A, {"nofp8", "sme-f8f16"}, {"fp8", "sme-f8f16"}, {}}, {AArch64::ARMV8A, {"sme-f8f16", "nofp8"}, {}, {"fp8", "sme-f8f16"}}, -- GitLab From 4091bc61e315f187829dca877dd908a07ba9cb91 Mon Sep 17 00:00:00 2001 From: Sergio Afonso Date: Thu, 17 Oct 2024 10:46:38 +0100 Subject: [PATCH 218/329] [MLIR][OpenMP] Split region-associated op verification (#112355) This patch moves the part of operation verifiers dependent on the contents of their regions to the corresponding `verifyRegions` method. This ensures these are only triggered after the operations in the region have themselved already been verified in advance, avoiding checks based on invalid nested operations. The `LoopWrapperInterface` is also updated so that its verifier runs after operations in the region of ops with this interface have already been verified. --- mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td | 7 ++- .../Dialect/OpenMP/OpenMPOpsInterfaces.td | 1 + mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp | 51 +++++++++++++------ mlir/test/Dialect/OpenMP/invalid.mlir | 12 +++-- 4 files changed, 49 insertions(+), 22 deletions(-) diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td index 11649ef2d033..45313200d4f0 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOps.td @@ -137,7 +137,7 @@ def PrivateClauseOp : OpenMP_Op<"private", [IsolatedFromAbove, RecipeInterface]> } }]; - let hasVerifier = 1; + let hasRegionVerifier = 1; } //===----------------------------------------------------------------------===// @@ -175,6 +175,7 @@ def ParallelOp : OpenMP_Op<"parallel", traits = [ }]; let hasVerifier = 1; + let hasRegionVerifier = 1; } def TerminatorOp : OpenMP_Op<"terminator", [Terminator, Pure]> { @@ -426,6 +427,7 @@ def WsloopOp : OpenMP_Op<"wsloop", traits = [ }]; let hasVerifier = 1; + let hasRegionVerifier = 1; } //===----------------------------------------------------------------------===// @@ -479,6 +481,7 @@ def SimdOp : OpenMP_Op<"simd", traits = [ }]; let hasVerifier = 1; + let hasRegionVerifier = 1; } @@ -556,6 +559,7 @@ def DistributeOp : OpenMP_Op<"distribute", traits = [ }]; let hasVerifier = 1; + let hasRegionVerifier = 1; } //===----------------------------------------------------------------------===// @@ -693,6 +697,7 @@ def TaskloopOp : OpenMP_Op<"taskloop", traits = [ }] # clausesExtraClassDeclaration; let hasVerifier = 1; + let hasRegionVerifier = 1; } def TaskgroupOp : OpenMP_Op<"taskgroup", traits = [ diff --git a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td index 22521b08637c..8b72689dc3fd 100644 --- a/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td +++ b/mlir/include/mlir/Dialect/OpenMP/OpenMPOpsInterfaces.td 
@@ -258,6 +258,7 @@ def LoopWrapperInterface : OpInterface<"LoopWrapperInterface"> { let verify = [{ return ::llvm::cast<::mlir::omp::LoopWrapperInterface>($_op).verifyImpl(); }]; + let verifyWithRegions = 1; } def ComposableOpInterface : OpInterface<"ComposableOpInterface"> { diff --git a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp index 3217542e1056..e1df647d6a3c 100644 --- a/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp +++ b/mlir/lib/Dialect/OpenMP/IR/OpenMPDialect.cpp @@ -1760,6 +1760,18 @@ static LogicalResult verifyPrivateVarList(OpType &op) { } LogicalResult ParallelOp::verify() { + if (getAllocateVars().size() != getAllocatorVars().size()) + return emitError( + "expected equal sizes for allocate and allocator variables"); + + if (failed(verifyPrivateVarList(*this))) + return failure(); + + return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(), + getReductionByref()); +} + +LogicalResult ParallelOp::verifyRegions() { auto distributeChildOps = getOps(); if (!distributeChildOps.empty()) { if (!isComposite()) @@ -1780,16 +1792,7 @@ LogicalResult ParallelOp::verify() { return emitError() << "'omp.composite' attribute present in non-composite operation"; } - - if (getAllocateVars().size() != getAllocatorVars().size()) - return emitError( - "expected equal sizes for allocate and allocator variables"); - - if (failed(verifyPrivateVarList(*this))) - return failure(); - - return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(), - getReductionByref()); + return success(); } //===----------------------------------------------------------------------===// @@ -1979,6 +1982,11 @@ void WsloopOp::build(OpBuilder &builder, OperationState &state, } LogicalResult WsloopOp::verify() { + return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(), + getReductionByref()); +} + +LogicalResult WsloopOp::verifyRegions() { bool isCompositeChildLeaf = llvm::dyn_cast_if_present((*this)->getParentOp()); @@ -2000,8 +2008,7 @@ LogicalResult WsloopOp::verify() { << "'omp.composite' attribute missing from composite wrapper"; } - return verifyReductionVarList(*this, getReductionSyms(), getReductionVars(), - getReductionByref()); + return success(); } //===----------------------------------------------------------------------===// @@ -2037,9 +2044,6 @@ LogicalResult SimdOp::verify() { if (verifyNontemporalClause(*this, getNontemporalVars()).failed()) return failure(); - if (getNestedWrapper()) - return emitOpError() << "must wrap an 'omp.loop_nest' directly"; - bool isCompositeChildLeaf = llvm::dyn_cast_if_present((*this)->getParentOp()); @@ -2054,6 +2058,13 @@ LogicalResult SimdOp::verify() { return success(); } +LogicalResult SimdOp::verifyRegions() { + if (getNestedWrapper()) + return emitOpError() << "must wrap an 'omp.loop_nest' directly"; + + return success(); +} + //===----------------------------------------------------------------------===// // Distribute construct [2.9.4.1] //===----------------------------------------------------------------------===// @@ -2076,6 +2087,10 @@ LogicalResult DistributeOp::verify() { return emitError( "expected equal sizes for allocate and allocator variables"); + return success(); +} + +LogicalResult DistributeOp::verifyRegions() { if (LoopWrapperInterface nested = getNestedWrapper()) { if (!isComposite()) return emitError() @@ -2281,6 +2296,10 @@ LogicalResult TaskloopOp::verify() { "may not appear on the same taskloop directive"); } + return success(); +} + 
+LogicalResult TaskloopOp::verifyRegions() {
   if (LoopWrapperInterface nested = getNestedWrapper()) {
     if (!isComposite())
       return emitError()
@@ -2725,7 +2744,7 @@ void PrivateClauseOp::build(OpBuilder &odsBuilder, OperationState &odsState,
                        DataSharingClauseType::Private));
 }
 
-LogicalResult PrivateClauseOp::verify() {
+LogicalResult PrivateClauseOp::verifyRegions() {
   Type symType = getType();
 
   auto verifyTerminator = [&](Operation *terminator,
diff --git a/mlir/test/Dialect/OpenMP/invalid.mlir b/mlir/test/Dialect/OpenMP/invalid.mlir
index f7a87713aca3..fd89ec31c64a 100644
--- a/mlir/test/Dialect/OpenMP/invalid.mlir
+++ b/mlir/test/Dialect/OpenMP/invalid.mlir
@@ -136,9 +136,11 @@ func.func @invalid_nested_wrapper(%lb : index, %ub : index, %step : index) {
   // expected-error @below {{only supported nested wrapper is 'omp.simd'}}
   omp.wsloop {
     omp.distribute {
-      omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
-        omp.yield
-      }
+      omp.simd {
+        omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
+          omp.yield
+        }
+      } {omp.composite}
     } {omp.composite}
   } {omp.composite}
 }
@@ -1975,7 +1977,7 @@ func.func @taskloop(%lb: i32, %ub: i32, %step: i32) {
       omp.loop_nest (%iv) : i32 = (%lb) to (%ub) step (%step) {
         omp.yield
       }
-    } {omp.composite}
+    }
   } {omp.composite}
   return
 }
@@ -2188,7 +2190,7 @@ func.func @omp_distribute_nested_wrapper2(%lb: index, %ub: index, %step: index)
       omp.loop_nest (%iv) : index = (%lb) to (%ub) step (%step) {
         "omp.yield"() : () -> ()
       }
-    }) {omp.composite} : () -> ()
+    }) : () -> ()
   } {omp.composite}
 }
-- 
GitLab


From b091701d0190912578ac3fe91ee8fd29e9b6de6e Mon Sep 17 00:00:00 2001
From: Rajveer Singh Bharadwaj
Date: Thu, 17 Oct 2024 15:32:24 +0530
Subject: [PATCH 219/329] [mlir] Add a method on MLIRContext to retrieve the
 operations for a given dialect (#112344)

Currently we have `MLIRContext::getRegisteredOperations`, which returns
all registered operations for the given context. With the addition of
`MLIRContext::getRegisteredOperationsByDialect`, we can now retrieve the
same information for a given dialect name.

Closes #111591
---
 mlir/include/mlir/IR/MLIRContext.h |  5 +++++
 mlir/lib/IR/MLIRContext.cpp        | 24 ++++++++++++++++++++++++
 2 files changed, 29 insertions(+)

diff --git a/mlir/include/mlir/IR/MLIRContext.h b/mlir/include/mlir/IR/MLIRContext.h
index d17bbac81655..ef8dab87f131 100644
--- a/mlir/include/mlir/IR/MLIRContext.h
+++ b/mlir/include/mlir/IR/MLIRContext.h
@@ -197,6 +197,11 @@ public:
   /// operations.
   ArrayRef<RegisteredOperationName> getRegisteredOperations();
 
+  /// Return a sorted array containing the information for registered
+  /// operations filtered by dialect name.
+  ArrayRef<RegisteredOperationName>
+  getRegisteredOperationsByDialect(StringRef dialectName);
+
   /// Return true if this operation name is registered in this context.
   bool isOperationRegistered(StringRef name);
 
diff --git a/mlir/lib/IR/MLIRContext.cpp b/mlir/lib/IR/MLIRContext.cpp
index f05666fcde20..d33340f4aefc 100644
--- a/mlir/lib/IR/MLIRContext.cpp
+++ b/mlir/lib/IR/MLIRContext.cpp
@@ -711,6 +711,30 @@ ArrayRef<RegisteredOperationName> MLIRContext::getRegisteredOperations() {
   return impl->sortedRegisteredOperations;
 }
 
+/// Return information for registered operations by dialect.
+ArrayRef +MLIRContext::getRegisteredOperationsByDialect(StringRef dialectName) { + auto lowerBound = + std::lower_bound(impl->sortedRegisteredOperations.begin(), + impl->sortedRegisteredOperations.end(), dialectName, + [](auto &lhs, auto &rhs) { + return lhs.getDialect().getNamespace().compare(rhs); + }); + + if (lowerBound == impl->sortedRegisteredOperations.end() || + lowerBound->getDialect().getNamespace() != dialectName) + return ArrayRef(); + + auto upperBound = + std::upper_bound(lowerBound, impl->sortedRegisteredOperations.end(), + dialectName, [](auto &lhs, auto &rhs) { + return lhs.compare(rhs.getDialect().getNamespace()); + }); + + size_t count = std::distance(lowerBound, upperBound); + return ArrayRef(&*lowerBound, count); +} + bool MLIRContext::isOperationRegistered(StringRef name) { return RegisteredOperationName::lookup(name, this).has_value(); } -- GitLab From ad45eb4a9c74a878998efe8fd734f5ae7af5003d Mon Sep 17 00:00:00 2001 From: John Brawn Date: Thu, 17 Oct 2024 11:15:08 +0100 Subject: [PATCH 220/329] [ARM] Fix problems with register list in vscclrm (#111825) The register list in vscclrm is unusual in three ways: * The encoded size can be zero, meaning the list contains only vpr. * Double-precision registers past d15 are permitted even when the subtarget doesn't have them, they are instead ignored when the instruction executes. * The single-precision variant allows double-precision registers d16 onwards, which are encoded as a pair of single-precision registers. Fixing this also incidentally changes a vlldm/vlstm error message: when the first register is in the range d16-d31 we now get the "operand must be exactly..." error instead of "register expected". --- llvm/lib/Target/ARM/ARMRegisterInfo.td | 4 +- .../lib/Target/ARM/AsmParser/ARMAsmParser.cpp | 51 +++++++++++++----- .../ARM/Disassembler/ARMDisassembler.cpp | 52 +++++++++++------- .../ARM/MCTargetDesc/ARMInstPrinter.cpp | 2 +- .../ARM/MCTargetDesc/ARMMCCodeEmitter.cpp | 19 +++++-- llvm/test/MC/ARM/vlstm-vlldm-diag.s | 8 +++ llvm/test/MC/ARM/vscclrm-asm.s | 51 ++++++++++++++++++ llvm/test/MC/Disassembler/ARM/vscclrm.txt | 53 ++++++++++++++++++- 8 files changed, 202 insertions(+), 38 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMRegisterInfo.td b/llvm/lib/Target/ARM/ARMRegisterInfo.td index 212f22651f9f..f37d0fe542b4 100644 --- a/llvm/lib/Target/ARM/ARMRegisterInfo.td +++ b/llvm/lib/Target/ARM/ARMRegisterInfo.td @@ -200,9 +200,9 @@ def FPEXC : ARMReg<8, "fpexc">; def FPINST : ARMReg<9, "fpinst">; def FPINST2 : ARMReg<10, "fpinst2">; // These encodings aren't actual instruction encodings, their encoding depends -// on the instruction they are used in and for VPR 32 was chosen such that it +// on the instruction they are used in and for VPR 64 was chosen such that it // always comes last in spr_reglist_with_vpr. 
-def VPR : ARMReg<32, "vpr">; +def VPR : ARMReg<64, "vpr">; def FPSCR_NZCVQC : ARMReg<2, "fpscr_nzcvqc">; def P0 : ARMReg<13, "p0">; diff --git a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp index 0ce5f466bad2..54eb0118d778 100644 --- a/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp +++ b/llvm/lib/Target/ARM/AsmParser/ARMAsmParser.cpp @@ -446,8 +446,8 @@ class ARMAsmParser : public MCTargetAsmParser { int tryParseShiftRegister(OperandVector &); std::optional tryParseShiftToken(); bool parseRegisterList(OperandVector &, bool EnforceOrder = true, - bool AllowRAAC = false, - bool AllowOutOfBoundReg = false); + bool AllowRAAC = false, bool IsLazyLoadStore = false, + bool IsVSCCLRM = false); bool parseMemory(OperandVector &); bool parseOperand(OperandVector &, StringRef Mnemonic); bool parseImmExpr(int64_t &Out); @@ -3811,6 +3811,10 @@ public: Kind = k_FPSRegisterListWithVPR; else Kind = k_SPRRegisterList; + } else if (Regs.front().second == ARM::VPR) { + assert(Regs.size() == 1 && + "Register list starting with VPR expected to only contain VPR"); + Kind = k_FPSRegisterListWithVPR; } if (Kind == k_RegisterList && Regs.back().second == ARM::APSR) @@ -4608,7 +4612,8 @@ insertNoDuplicates(SmallVectorImpl> &Regs, /// Parse a register list. bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, - bool AllowRAAC, bool AllowOutOfBoundReg) { + bool AllowRAAC, bool IsLazyLoadStore, + bool IsVSCCLRM) { MCAsmParser &Parser = getParser(); if (Parser.getTok().isNot(AsmToken::LCurly)) return TokError("Token is not a Left Curly Brace"); @@ -4618,15 +4623,23 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, // Check the first register in the list to see what register class // this is a list of. - MCRegister Reg = tryParseRegister(); + bool AllowOutOfBoundReg = IsLazyLoadStore || IsVSCCLRM; + MCRegister Reg = tryParseRegister(AllowOutOfBoundReg); if (!Reg) return Error(RegLoc, "register expected"); if (!AllowRAAC && Reg == ARM::RA_AUTH_CODE) return Error(RegLoc, "pseudo-register not allowed"); - // The reglist instructions have at most 16 registers, so reserve + // The reglist instructions have at most 32 registers, so reserve // space for that many. int EReg = 0; - SmallVector, 16> Registers; + SmallVector, 32> Registers; + + // Single-precision VSCCLRM can have double-precision registers in the + // register list. When VSCCLRMAdjustEncoding is true then we've switched from + // single-precision to double-precision and we pretend that these registers + // are encoded as S32 onwards, which we can do by adding 16 to the encoding + // value. + bool VSCCLRMAdjustEncoding = false; // Allow Q regs and just interpret them as the two D sub-registers. 
if (ARMMCRegisterClasses[ARM::QPRRegClassID].contains(Reg)) { @@ -4645,6 +4658,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, RC = &ARMMCRegisterClasses[ARM::SPRRegClassID]; else if (ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg)) RC = &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID]; + else if (Reg == ARM::VPR) + RC = &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID]; else return Error(RegLoc, "invalid register in register list"); @@ -4685,6 +4700,8 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, while (Reg != EndReg) { Reg = getNextRegister(Reg); EReg = MRI->getEncodingValue(Reg); + if (VSCCLRMAdjustEncoding) + EReg += 16; if (!insertNoDuplicates(Registers, EReg, Reg)) { Warning(AfterMinusLoc, StringRef("duplicated register (") + ARMInstPrinter::getRegisterName(Reg) + @@ -4696,6 +4713,7 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, Parser.Lex(); // Eat the comma. RegLoc = Parser.getTok().getLoc(); MCRegister OldReg = Reg; + int EOldReg = EReg; const AsmToken RegTok = Parser.getTok(); Reg = tryParseRegister(AllowOutOfBoundReg); if (!Reg) @@ -4727,6 +4745,12 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, } continue; } + // VSCCLRM can switch from single-precision to double-precision only when + // S31 is followed by D16. + if (IsVSCCLRM && OldReg == ARM::S31 && Reg == ARM::D16) { + VSCCLRMAdjustEncoding = true; + RC = &ARMMCRegisterClasses[ARM::FPWithVPRRegClassID]; + } // The register must be in the same register class as the first. if ((Reg == ARM::RA_AUTH_CODE && RC != &ARMMCRegisterClasses[ARM::GPRRegClassID]) || @@ -4736,8 +4760,10 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, // exception is CLRM, which is order-independent anyway, so // there's no potential for confusion if you write clrm {r2,r1} // instead of clrm {r1,r2}. - if (EnforceOrder && - MRI->getEncodingValue(Reg) < MRI->getEncodingValue(OldReg)) { + EReg = MRI->getEncodingValue(Reg); + if (VSCCLRMAdjustEncoding) + EReg += 16; + if (EnforceOrder && EReg < EOldReg) { if (ARMMCRegisterClasses[ARM::GPRRegClassID].contains(Reg)) Warning(RegLoc, "register list not in ascending order"); else if (!ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID].contains(Reg)) @@ -4746,9 +4772,9 @@ bool ARMAsmParser::parseRegisterList(OperandVector &Operands, bool EnforceOrder, // VFP register lists must also be contiguous. 
if (RC != &ARMMCRegisterClasses[ARM::GPRRegClassID] && RC != &ARMMCRegisterClasses[ARM::GPRwithAPSRnospRegClassID] && - Reg != OldReg + 1) + EReg != EOldReg + 1) return Error(RegLoc, "non-contiguous register range"); - EReg = MRI->getEncodingValue(Reg); + if (!insertNoDuplicates(Registers, EReg, Reg)) { Warning(RegLoc, "duplicated register (" + RegTok.getString() + ") in register list"); @@ -6336,9 +6362,10 @@ bool ARMAsmParser::parseOperand(OperandVector &Operands, StringRef Mnemonic) { case AsmToken::LBrac: return parseMemory(Operands); case AsmToken::LCurly: { - bool AllowOutOfBoundReg = Mnemonic == "vlldm" || Mnemonic == "vlstm"; + bool IsLazyLoadStore = Mnemonic == "vlldm" || Mnemonic == "vlstm"; + bool IsVSCCLRM = Mnemonic == "vscclrm"; return parseRegisterList(Operands, !Mnemonic.starts_with("clr"), false, - AllowOutOfBoundReg); + IsLazyLoadStore, IsVSCCLRM); } case AsmToken::Dollar: case AsmToken::Hash: { diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index fa5dd10cfdaa..be29e4b481c0 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -1529,15 +1529,19 @@ static const uint16_t DPRDecoderTable[] = { ARM::D28, ARM::D29, ARM::D30, ARM::D31 }; -static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, - uint64_t Address, - const MCDisassembler *Decoder) { +// Does this instruction/subtarget permit use of registers d16-d31? +static bool PermitsD32(const MCInst &Inst, const MCDisassembler *Decoder) { + if (Inst.getOpcode() == ARM::VSCCLRMD || Inst.getOpcode() == ARM::VSCCLRMS) + return true; const FeatureBitset &featureBits = ((const MCDisassembler*)Decoder)->getSubtargetInfo().getFeatureBits(); + return featureBits[ARM::FeatureD32]; +} - bool hasD32 = featureBits[ARM::FeatureD32]; - - if (RegNo > 31 || (!hasD32 && RegNo > 15)) +static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, + uint64_t Address, + const MCDisassembler *Decoder) { + if (RegNo > (PermitsD32(Inst, Decoder) ? 31 : 15)) return MCDisassembler::Fail; unsigned Register = DPRDecoderTable[RegNo]; @@ -1816,10 +1820,11 @@ static DecodeStatus DecodeDPRRegListOperand(MCInst &Inst, unsigned Val, unsigned regs = fieldFromInstruction(Val, 1, 7); // In case of unpredictable encoding, tweak the operands. - if (regs == 0 || regs > 16 || (Vd + regs) > 32) { - regs = Vd + regs > 32 ? 32 - Vd : regs; + unsigned MaxReg = PermitsD32(Inst, Decoder) ? 32 : 16; + if (regs == 0 || (Vd + regs) > MaxReg) { + regs = Vd + regs > MaxReg ? 
MaxReg - Vd : regs; regs = std::max( 1u, regs); - regs = std::min(16u, regs); + regs = std::min(MaxReg, regs); S = MCDisassembler::SoftFail; } @@ -6447,20 +6452,31 @@ static DecodeStatus DecodeVSCCLRM(MCInst &Inst, unsigned Insn, uint64_t Address, Inst.addOperand(MCOperand::createImm(ARMCC::AL)); Inst.addOperand(MCOperand::createReg(0)); - if (Inst.getOpcode() == ARM::VSCCLRMD) { - unsigned reglist = (fieldFromInstruction(Insn, 1, 7) << 1) | - (fieldFromInstruction(Insn, 12, 4) << 8) | + unsigned regs = fieldFromInstruction(Insn, 0, 8); + if (regs == 0) { + // Register list contains only VPR + } else if (Inst.getOpcode() == ARM::VSCCLRMD) { + unsigned reglist = regs | (fieldFromInstruction(Insn, 12, 4) << 8) | (fieldFromInstruction(Insn, 22, 1) << 12); if (!Check(S, DecodeDPRRegListOperand(Inst, reglist, Address, Decoder))) { return MCDisassembler::Fail; } } else { - unsigned reglist = fieldFromInstruction(Insn, 0, 8) | - (fieldFromInstruction(Insn, 22, 1) << 8) | - (fieldFromInstruction(Insn, 12, 4) << 9); - if (!Check(S, DecodeSPRRegListOperand(Inst, reglist, Address, Decoder))) { - return MCDisassembler::Fail; - } + unsigned Vd = (fieldFromInstruction(Insn, 12, 4) << 1) | + fieldFromInstruction(Insn, 22, 1); + // Registers past s31 are permitted and treated as being half of a d + // register, though both halves of each d register must be present. + unsigned max_reg = Vd + regs; + if (max_reg > 64 || (max_reg > 32 && (max_reg & 1))) + S = MCDisassembler::SoftFail; + unsigned max_sreg = std::min(32u, max_reg); + unsigned max_dreg = std::min(32u, max_reg / 2); + for (unsigned i = Vd; i < max_sreg; ++i) + if (!Check(S, DecodeSPRRegisterClass(Inst, i, Address, Decoder))) + return MCDisassembler::Fail; + for (unsigned i = 16; i < max_dreg; ++i) + if (!Check(S, DecodeDPRRegisterClass(Inst, i, Address, Decoder))) + return MCDisassembler::Fail; } Inst.addOperand(MCOperand::createReg(ARM::VPR)); diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp index 5636cc6287ac..e4a2f8c8f2ea 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMInstPrinter.cpp @@ -851,7 +851,7 @@ void ARMInstPrinter::printPKHASRShiftImm(const MCInst *MI, unsigned OpNum, void ARMInstPrinter::printRegisterList(const MCInst *MI, unsigned OpNum, const MCSubtargetInfo &STI, raw_ostream &O) { - if (MI->getOpcode() != ARM::t2CLRM) { + if (MI->getOpcode() != ARM::t2CLRM && MI->getOpcode() != ARM::VSCCLRMS) { assert(is_sorted(drop_begin(*MI, OpNum), [&](const MCOperand &LHS, const MCOperand &RHS) { return MRI.getEncodingValue(LHS.getReg()) < diff --git a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp index 92427b41f0bb..f24ac799b2dd 100644 --- a/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp +++ b/llvm/lib/Target/ARM/MCTargetDesc/ARMMCCodeEmitter.cpp @@ -1743,15 +1743,28 @@ getRegisterListOpValue(const MCInst &MI, unsigned Op, unsigned Binary = 0; - if (SPRRegs || DPRRegs) { + if (SPRRegs || DPRRegs || Reg == ARM::VPR) { // VLDM/VSTM/VSCCLRM unsigned RegNo = CTX.getRegisterInfo()->getEncodingValue(Reg); unsigned NumRegs = (MI.getNumOperands() - Op) & 0xff; Binary |= (RegNo & 0x1f) << 8; - // Ignore VPR - if (MI.getOpcode() == ARM::VSCCLRMD || MI.getOpcode() == ARM::VSCCLRMS) + if (MI.getOpcode() == ARM::VSCCLRMD) + // Ignore VPR --NumRegs; + else if (MI.getOpcode() == ARM::VSCCLRMS) { + // The register list can contain both S registers and D 
registers, with D + // registers counting as two registers. VPR doesn't count towards the + // number of registers. + NumRegs = 0; + for (unsigned I = Op, E = MI.getNumOperands(); I < E; ++I) { + Reg = MI.getOperand(I).getReg(); + if (ARMMCRegisterClasses[ARM::SPRRegClassID].contains(Reg)) + NumRegs += 1; + else if (ARMMCRegisterClasses[ARM::DPRRegClassID].contains(Reg)) + NumRegs += 2; + } + } if (SPRRegs) Binary |= NumRegs; else diff --git a/llvm/test/MC/ARM/vlstm-vlldm-diag.s b/llvm/test/MC/ARM/vlstm-vlldm-diag.s index b57f535c6a25..7aa48b96ff2f 100644 --- a/llvm/test/MC/ARM/vlstm-vlldm-diag.s +++ b/llvm/test/MC/ARM/vlstm-vlldm-diag.s @@ -36,6 +36,14 @@ vlldm r8, {d3 - d31} // ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) // ERR-NEXT: vlldm r8, {d3 - d31} +vlstm r8, {d31} +// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) +// ERR-NEXT: vlstm r8, {d31} + +vlldm r8, {d31} +// ERR: error: operand must be exactly {d0-d15} (T1) or {d0-d31} (T2) +// ERR-NEXT: vlldm r8, {d31} + vlstm r8, {d0 - d35} // ERR: error: register expected // ERR-NEXT: vlstm r8, {d0 - d35} diff --git a/llvm/test/MC/ARM/vscclrm-asm.s b/llvm/test/MC/ARM/vscclrm-asm.s index 0989b38b07c0..0d2054df4fd3 100644 --- a/llvm/test/MC/ARM/vscclrm-asm.s +++ b/llvm/test/MC/ARM/vscclrm-asm.s @@ -35,11 +35,62 @@ it hi // CHECK: vscclrmhi {s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, vpr} @ encoding: [0xdf,0xec,0x1d,0x1a] vscclrmhi {s3-s31, vpr} +// CHECK: vscclrm {vpr} @ encoding: [0x9f,0xec,0x00,0x0a] +vscclrm {vpr} + +// CHECK: vscclrm {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0x9f,0xec,0x40,0x0b] +vscclrm {d0-d31, vpr} + +// CHECK: vscclrm {d31, vpr} @ encoding: [0xdf,0xec,0x02,0xfb] +vscclrm {d31, vpr} + +// CHECK: vscclrm {s31, d16, vpr} @ encoding: [0xdf,0xec,0x03,0xfa] +vscclrm {s31, d16, vpr} + +// CHECK: vscclrm {s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0x9f,0xec,0x40,0x0a] +vscclrm {s0-s31, d16-d31, vpr} + +// CHECK: vscclrm {s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0x9f,0xec,0x40,0x0a] +vscclrm {s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} + // ERROR: non-contiguous register range vscclrm {s0, s3-s4, vpr} +// ERROR: non-contiguous register range +vscclrm {s31, d16, s30, vpr} + // ERROR: register expected vscclrm {s32, vpr} +// ERROR: register expected +vscclrm {d32, vpr} + +// ERROR: register expected +vscclrm {s31-s32, vpr} + +// ERROR: register expected +vscclrm {d31-d32, vpr} + // ERROR: invalid operand for instruction vscclrm {s0-s1} + +// ERROR: register list not in ascending order +vscclrm {vpr, s0} + +// ERROR: register list not in ascending order +vscclrm {vpr, s31} + +// ERROR: register list not in ascending order +vscclrm {vpr, d0} + +// ERROR: register list not in 
ascending order +vscclrm {vpr, d31} + +// ERROR: invalid register in register list +vscclrm {s0, d0, vpr} + +// ERROR: invalid register in register list +vscclrm {s0, d1, vpr} + +// ERROR: invalid register in register list +vscclrm {d16, s31, vpr} diff --git a/llvm/test/MC/Disassembler/ARM/vscclrm.txt b/llvm/test/MC/Disassembler/ARM/vscclrm.txt index 8a89cfb76e4a..ef3868eb1569 100644 --- a/llvm/test/MC/Disassembler/ARM/vscclrm.txt +++ b/llvm/test/MC/Disassembler/ARM/vscclrm.txt @@ -1,5 +1,7 @@ -# RUN: llvm-mc -disassemble -triple=thumbv8.1m.main-none-eabi -mattr=+8msecext -show-encoding %s 2>&1 | FileCheck %s -# RUN: llvm-mc -disassemble -triple=thumbv8.1m.main-none-eabi -mattr=+mve.fp,+8msecext -show-encoding %s 2>&1 | FileCheck %s +# RUN: llvm-mc -disassemble -triple=thumbv8.1m.main-none-eabi -mattr=+8msecext -show-encoding %s 2> %t | FileCheck %s +# RUN: FileCheck --check-prefix=WARN < %t %s +# RUN: llvm-mc -disassemble -triple=thumbv8.1m.main-none-eabi -mattr=+mve.fp,+8msecext -show-encoding %s 2> %t | FileCheck %s +# RUN: FileCheck --check-prefix=WARN < %t %s [0x9f 0xec 0x04 0x0a] # CHECK: vscclrm {s0, s1, s2, s3, vpr} @@ -27,3 +29,50 @@ [0xdf 0xec 0x1d 0x1a] # CHECK: vscclrmhi {s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, vpr} + +[0xdf,0xec,0x03,0xfa] +# CHECK: vscclrm {s31, d16, vpr} @ encoding: [0xdf,0xec,0x03,0xfa] + +[0x9f,0xec,0x40,0x0a] +# CHECK: vscclrm {s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0x9f,0xec,0x40,0x0a] + +# If the list size is zero then we get a list of only vpr, and the Vd register +# doesn't matter. + +[0x9f,0xec,0x00,0x0b] +# CHECK: vscclrm {vpr} @ encoding: [0x9f,0xec,0x00,0x0b] + +[0xdf,0xec,0x00,0xfb] +# CHECK: vscclrm {vpr} @ encoding: [0x9f,0xec,0x00,0x0b] + +[0x9f,0xec,0x00,0x0a] +# CHECK: vscclrm {vpr} @ encoding: [0x9f,0xec,0x00,0x0a] + +[0xdf,0xec,0x00,0xfa] +# CHECK: vscclrm {vpr} @ encoding: [0x9f,0xec,0x00,0x0a] + +# In double-precision if Vd+size goes past 31 the excess registers are ignored +# and we get a warning. + +[0x9f,0xec,0xfe,0x0b] +# WARN: [[@LINE-1]]:2: warning: potentially undefined instruction encoding +# CHECK: vscclrm {d0, d1, d2, d3, d4, d5, d6, d7, d8, d9, d10, d11, d12, d13, d14, d15, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0x9f,0xec,0x40,0x0b] + +[0xdf,0xec,0x04,0xfb] +# WARN: [[@LINE-1]]:2: warning: potentially undefined instruction encoding +# CHECK: vscclrm {d31, vpr} @ encoding: [0xdf,0xec,0x02,0xfb] + +# In single-precision if Vd+size goes past 63, or if the encoding suggests half +# a d registers, then the excess registers are ignored and we get a warning. 
+ +[0x9f,0xec,0xff,0x0a] +# WARN: [[@LINE-1]]:2: warning: potentially undefined instruction encoding +# CHECK: vscclrm {s0, s1, s2, s3, s4, s5, s6, s7, s8, s9, s10, s11, s12, s13, s14, s15, s16, s17, s18, s19, s20, s21, s22, s23, s24, s25, s26, s27, s28, s29, s30, s31, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0x9f,0xec,0x40,0x0a] + +[0xdf,0xec,0x02,0xfa] +# WARN: [[@LINE-1]]:2: warning: potentially undefined instruction encoding +# CHECK: vscclrm {s31, vpr} @ encoding: [0xdf,0xec,0x01,0xfa] + +[0xdf,0xec,0x23,0xfa] +# WARN: [[@LINE-1]]:2: warning: potentially undefined instruction encoding +vscclrm {s31, d16, d17, d18, d19, d20, d21, d22, d23, d24, d25, d26, d27, d28, d29, d30, d31, vpr} @ encoding: [0xdf,0xec,0x21,0xfa] -- GitLab From 53d89ef34005f4dc4f764db0c009130bb52a6a78 Mon Sep 17 00:00:00 2001 From: SpencerAbson Date: Thu, 17 Oct 2024 11:16:06 +0100 Subject: [PATCH 221/329] [AArch64][Clang][NEON] Remove undefined vcmla intrinsics (#112575) arm_neon.td currently generates the same 24 `vcmla` intrinsic prototypes for each of the f16, f32, and f64 base types. This is incorrect, the only valid vcmla intrinsics for the f64 base type are: - `vcmlaq_f64` - `vcmlaq_rot90_f64` - `vcmlaq_rot180_f64` - `vcmlaq_rot270_f64` (see ACLE https://github.com/ARM-software/acle/blob/main/neon_intrinsics/advsimd.md) This patch removes the incorrect intrinsic prototypes. --- clang/include/clang/Basic/arm_neon.td | 7 ++++-- clang/test/Sema/aarch64-neon-target.c | 4 ++-- clang/test/Sema/aarch64-vcmla-undef.c | 31 +++++++++++++++++++++++++++ 3 files changed, 38 insertions(+), 4 deletions(-) create mode 100644 clang/test/Sema/aarch64-vcmla-undef.c diff --git a/clang/include/clang/Basic/arm_neon.td b/clang/include/clang/Basic/arm_neon.td index 8652b5e3a9c9..ec829f566ef5 100644 --- a/clang/include/clang/Basic/arm_neon.td +++ b/clang/include/clang/Basic/arm_neon.td @@ -1968,13 +1968,16 @@ let TargetGuard = "v8.3a,neon" in { def VCADDQ_ROT90 : SInst<"vcaddq_rot90", "QQQ", "f">; def VCADDQ_ROT270 : SInst<"vcaddq_rot270", "QQQ", "f">; - defm VCMLA_F32 : VCMLA_ROTS<"f", "uint64x1_t", "uint64x2_t">; + defm VCMLA_F32 : VCMLA_ROTS<"f", "uint64x1_t", "uint64x2_t">; } let ArchGuard = "defined(__aarch64__) || defined(__arm64ec__)", TargetGuard = "v8.3a,neon" in { def VCADDQ_ROT90_FP64 : SInst<"vcaddq_rot90", "QQQ", "d">; def VCADDQ_ROT270_FP64 : SInst<"vcaddq_rot270", "QQQ", "d">; - defm VCMLA_FP64 : VCMLA_ROTS<"d", "uint64x2_t", "uint64x2_t">; + def VCMLAQ_FP64 : SInst<"vcmlaq", "QQQQ", "d">; + def VCMLAQ_ROT90_FP64 : SInst<"vcmlaq_rot90", "QQQQ", "d">; + def VCMLAQ_ROT180_FP64 : SInst<"vcmlaq_rot180", "QQQQ", "d">; + def VCMLAQ_ROT270_FP64 : SInst<"vcmlaq_rot270", "QQQQ", "d">; } // V8.2-A BFloat intrinsics diff --git a/clang/test/Sema/aarch64-neon-target.c b/clang/test/Sema/aarch64-neon-target.c index fd84b7f2eb00..07d763ec84bd 100644 --- a/clang/test/Sema/aarch64-neon-target.c +++ b/clang/test/Sema/aarch64-neon-target.c @@ -58,7 +58,7 @@ __attribute__((target("arch=armv8.3-a+fp16"))) void test_v83(float32x4_t v4f32, float16x4_t v4f16, float64x2_t v2f64) { vcaddq_rot90_f32(v4f32, v4f32); vcmla_rot90_f16(v4f16, v4f16, v4f16); - vcmlaq_rot270_laneq_f64(v2f64, v2f64, v2f64, 1); + vcmlaq_rot270_f64(v2f64, v2f64, v2f64); } __attribute__((target("arch=armv8.5-a"))) @@ -95,7 +95,7 @@ void undefined(uint32x2_t v2i32, uint32x4_t v4i32, uint16x8_t v8i16, uint8x16_t // 8.3 - complex vcaddq_rot90_f32(v4f32, v4f32); // expected-error {{always_inline function 
'vcaddq_rot90_f32' requires target feature 'v8.3a'}} vcmla_rot90_f16(v4f16, v4f16, v4f16); // expected-error {{always_inline function 'vcmla_rot90_f16' requires target feature 'v8.3a'}} - vcmlaq_rot270_laneq_f64(v2f64, v2f64, v2f64, 1); // expected-error {{always_inline function 'vcmlaq_rot270_f64' requires target feature 'v8.3a'}} + vcmlaq_rot270_f64(v2f64, v2f64, v2f64); // expected-error {{always_inline function 'vcmlaq_rot270_f64' requires target feature 'v8.3a'}} // 8.5 - frint vrnd32xq_f32(v4f32); // expected-error {{always_inline function 'vrnd32xq_f32' requires target feature 'v8.5a'}} diff --git a/clang/test/Sema/aarch64-vcmla-undef.c b/clang/test/Sema/aarch64-vcmla-undef.c new file mode 100644 index 000000000000..8a777ff61563 --- /dev/null +++ b/clang/test/Sema/aarch64-vcmla-undef.c @@ -0,0 +1,31 @@ +// RUN: %clang_cc1 -triple aarch64-linux-gnu -target-feature +neon -target-feature +v8.3a -ffreestanding -fsyntax-only -verify -verify-ignore-unexpected=note %s + +// REQUIRES: aarch64-registered-target + +#include + +void test(float64x1_t v1f64, float64x2_t v2f64) { + vcmla_f64(v1f64, v1f64, v1f64); // expected-error {{call to undeclared function 'vcmla_f64'}} + vcmla_lane_f64(v1f64, v1f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmla_lane_f64'}} + vcmla_laneq_f64(v1f64, v1f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmla_laneq_f64'}} + vcmlaq_lane_f64(v2f64, v2f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmlaq_lane_f64'}} + vcmlaq_laneq_f64(v2f64, v2f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmlaq_laneq_f64'}} + + vcmla_rot90_f64(v1f64, v1f64, v1f64); // expected-error {{call to undeclared function 'vcmla_rot90_f64'}} + vcmla_rot90_lane_f64(v1f64, v1f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmla_rot90_lane_f64'}} + vcmla_rot90_laneq_f64(v1f64, v1f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmla_rot90_laneq_f64'}} + vcmlaq_rot90_lane_f64(v2f64, v2f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmlaq_rot90_lane_f64'}} + vcmlaq_rot90_laneq_f64(v2f64, v2f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmlaq_rot90_laneq_f64'}} + + vcmla_rot180_f64(v1f64, v1f64, v1f64); // expected-error {{call to undeclared function 'vcmla_rot180_f64'}} + vcmla_rot180_lane_f64(v1f64, v1f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmla_rot180_lane_f64'}} + vcmla_rot180_laneq_f64(v1f64, v1f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmla_rot180_laneq_f64'}} + vcmlaq_rot180_lane_f64(v2f64, v2f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmlaq_rot180_lane_f64'}} + vcmlaq_rot180_laneq_f64(v2f64, v2f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmlaq_rot180_laneq_f64'}} + + vcmla_rot270_f64(v1f64, v1f64, v1f64); // expected-error {{call to undeclared function 'vcmla_rot270_f64'}} + vcmla_rot270_lane_f64(v1f64, v1f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmla_rot270_lane_f64'}} + vcmla_rot270_laneq_f64(v1f64, v1f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmla_rot270_laneq_f64'}} + vcmlaq_rot270_lane_f64(v2f64, v2f64, v1f64, 0); // expected-error {{call to undeclared function 'vcmlaq_rot270_lane_f64'}} + vcmlaq_rot270_laneq_f64(v1f64, v1f64, v2f64, 0); // expected-error {{call to undeclared function 'vcmlaq_rot270_laneq_f64'}} +} -- GitLab From 5b4071c7554ab4feeae4817e3d41013016308586 Mon Sep 17 00:00:00 2001 From: Timm 
Baeder Date: Thu, 17 Oct 2024 12:26:44 +0200 Subject: [PATCH 222/329] [clang][bytecode] Explicitly truncate in IntegralAP::from() (#112683) Add Integral::toAPInt(), which truncates to the given BitWidth, similar to the toAPSInt() we already have. --- clang/lib/AST/ByteCode/Integral.h | 9 ++++++--- clang/lib/AST/ByteCode/IntegralAP.h | 7 +------ 2 files changed, 7 insertions(+), 9 deletions(-) diff --git a/clang/lib/AST/ByteCode/Integral.h b/clang/lib/AST/ByteCode/Integral.h index e06ec1669259..be537d22d5af 100644 --- a/clang/lib/AST/ByteCode/Integral.h +++ b/clang/lib/AST/ByteCode/Integral.h @@ -122,11 +122,14 @@ public: APSInt toAPSInt() const { return APSInt(APInt(Bits, static_cast(V), Signed), !Signed); } - APSInt toAPSInt(unsigned NumBits) const { + APSInt toAPSInt(unsigned BitWidth) const { return APSInt(toAPInt(BitWidth)); } + APInt toAPInt(unsigned BitWidth) const { if constexpr (Signed) - return APSInt(toAPSInt().sextOrTrunc(NumBits), !Signed); + return APInt(Bits, static_cast(V), Signed) + .sextOrTrunc(BitWidth); else - return APSInt(toAPSInt().zextOrTrunc(NumBits), !Signed); + return APInt(Bits, static_cast(V), Signed) + .zextOrTrunc(BitWidth); } APValue toAPValue(const ASTContext &) const { return APValue(toAPSInt()); } diff --git a/clang/lib/AST/ByteCode/IntegralAP.h b/clang/lib/AST/ByteCode/IntegralAP.h index 252d7243bee7..f8aeaaca398f 100644 --- a/clang/lib/AST/ByteCode/IntegralAP.h +++ b/clang/lib/AST/ByteCode/IntegralAP.h @@ -112,12 +112,7 @@ public: template static IntegralAP from(Integral I, unsigned BitWidth) { - // TODO: Avoid implicit trunc? - // See https://github.com/llvm/llvm-project/issues/112510. - APInt Copy = APInt(BitWidth, static_cast(I), InputSigned, - /*implicitTrunc=*/true); - - return IntegralAP(Copy); + return IntegralAP(I.toAPInt(BitWidth)); } static IntegralAP zero(int32_t BitWidth) { -- GitLab From 125168744810fffff4aba039208afd9ffe1d11b1 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Timm=20B=C3=A4der?= Date: Thu, 17 Oct 2024 12:38:23 +0200 Subject: [PATCH 223/329] [clang][bytecode][NFC] Remove a leftover dump call --- clang/lib/AST/ByteCode/Interp.cpp | 1 - 1 file changed, 1 deletion(-) diff --git a/clang/lib/AST/ByteCode/Interp.cpp b/clang/lib/AST/ByteCode/Interp.cpp index 40137de19c4e..fdc4b38b8aa6 100644 --- a/clang/lib/AST/ByteCode/Interp.cpp +++ b/clang/lib/AST/ByteCode/Interp.cpp @@ -1040,7 +1040,6 @@ bool Free(InterpState &S, CodePtr OpPC, bool DeleteIsArrayForm, return nullptr; }; - AllocType->dump(); if (const FunctionDecl *VirtualDelete = getVirtualOperatorDelete(AllocType); VirtualDelete && -- GitLab From 4d228e1ebdd652ad3c95e64c0f1bae17145e9e1b Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 17 Oct 2024 12:43:06 +0200 Subject: [PATCH 224/329] [mlir][vector] Escape variable usage in test Otherwise the shell might expand this in the command line. 
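To illustrate the failure mode (a sketch of the mechanism, not an observed
transcript): with the flag unquoted, a POSIX shell may substitute its own
(typically empty) $IDX_TYPE before FileCheck ever sees the argument:

    FileCheck -D$IDX_TYPE=i32 %s      # shell may rewrite this to: FileCheck -D=i32 %s
    FileCheck '-D$IDX_TYPE=i32' %s    # single quotes defer the expansion to FileCheck

Hence the RUN lines below wrap the whole -D flag in single quotes.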
--- mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir b/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir index 9cfd6885fba9..8f01cc2b8d44 100644 --- a/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir +++ b/mlir/test/Conversion/VectorToLLVM/vector-xfer-to-llvm.mlir @@ -1,5 +1,5 @@ -// RUN: mlir-opt %s -convert-vector-to-llvm -split-input-file | FileCheck -D$IDX_TYPE=i32 %s -// RUN: mlir-opt %s --convert-vector-to-llvm='force-32bit-vector-indices=0' | FileCheck -D$IDX_TYPE=i64 %s +// RUN: mlir-opt %s -convert-vector-to-llvm -split-input-file | FileCheck '-D$IDX_TYPE=i32' %s +// RUN: mlir-opt %s --convert-vector-to-llvm='force-32bit-vector-indices=0' | FileCheck '-D$IDX_TYPE=i64' %s func.func @transfer_read_write_1d(%A : memref, %base: index) -> vector<17xf32> { %f7 = arith.constant 7.0: f32 -- GitLab From 584e00a3161ca51ef9b47acb37a653aa881de0a6 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Thu, 17 Oct 2024 18:46:26 +0800 Subject: [PATCH 225/329] [ARM] Fix -Wunused-variable in ARMFrameLowering.cpp (NFC) /llvm-project/llvm/lib/Target/ARM/ARMFrameLowering.cpp:1028:9: error: unused variable 'FPOffset' [-Werror,-Wunused-variable] int FPOffset = MFI.getObjectOffset(FramePtrSpillFI); ^ 1 error generated. --- llvm/lib/Target/ARM/ARMFrameLowering.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 06e26262062c..57e2d5525a1a 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -1025,7 +1025,7 @@ void ARMFrameLowering::emitPrologue(MachineFunction &MF, } if (HasFP) { // Offset from the CFA to the saved frame pointer, will be negative. - int FPOffset = MFI.getObjectOffset(FramePtrSpillFI); + [[maybe_unused]] int FPOffset = MFI.getObjectOffset(FramePtrSpillFI); LLVM_DEBUG(dbgs() << "FramePtrSpillFI: " << FramePtrSpillFI << ", FPOffset: " << FPOffset << "\n"); assert(getMaxFPOffset(STI, *AFI, MF) <= FPOffset && -- GitLab From 2954d1f7bc8fa77c51768855d9df98f5559a5c5e Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Thu, 17 Oct 2024 18:51:40 +0800 Subject: [PATCH 226/329] [include-cleaner] Fix -Wpessimizing-move in IncludeCleaner.cpp (NFC) /llvm-project/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp:302:14: error: moving a temporary object prevents copy elision [-Werror,-Wpessimizing-move] return std::move(llvm::errorCodeToError(Err)); ^ /llvm-project/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp:302:14: note: remove std::move call here return std::move(llvm::errorCodeToError(Err)); ^~~~~~~~~~ ~ 1 error generated. 
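A minimal standalone illustration of what the warning objects to (a sketch,
not code from the tree):

    #include <utility>

    struct Widget {};
    Widget make();

    // Returning std::move(temporary) forces a move and blocks copy elision.
    Widget bad() { return std::move(make()); }  // -Wpessimizing-move fires here

    // Returning the prvalue directly allows elision (no copy, no move).
    Widget good() { return make(); }

Here, llvm::errorCodeToError(Err) already yields a prvalue that can initialize
the return value directly, so the std::move wrapper only gets in the way.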
--- clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp index 6bd9c40c7075..f85dbc0e0c31 100644 --- a/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp +++ b/clang-tools-extra/include-cleaner/tool/IncludeCleaner.cpp @@ -299,7 +299,7 @@ mapInputsToAbsPaths(clang::tooling::CompilationDatabase &CDB, if (auto Err = VFS->makeAbsolute(AbsPath)) { llvm::errs() << "Failed to get absolute path for " << Source << " : " << Err.message() << '\n'; - return std::move(llvm::errorCodeToError(Err)); + return llvm::errorCodeToError(Err); } std::vector Cmds = CDB.getCompileCommands(AbsPath); -- GitLab From ab90d2793cf56758a91f7a7ae027850af2455d3e Mon Sep 17 00:00:00 2001 From: Nashe Mncube Date: Thu, 17 Oct 2024 11:56:00 +0100 Subject: [PATCH 227/329] [llvm][ARM]Add widen global arrays pass (#107120) - Pass optimizes memcpy's by padding out destinations and sources to a full word to make backend generate full word loads instead of loading a single byte (ldrb) and/or half word (ldrh). Only pads destination when it's a stack allocated constant size array and source when it's constant array. Heuristic to decide whether to pad or not is very basic and could be improved to allow more examples to be padded. - Pass works within GlobalOpt but is disabled by default on all targets except ARM. --- .../llvm/Analysis/TargetTransformInfo.h | 11 ++ .../llvm/Analysis/TargetTransformInfoImpl.h | 4 + llvm/lib/Analysis/TargetTransformInfo.cpp | 6 + .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 33 ++++ llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 3 + llvm/lib/Transforms/IPO/GlobalOpt.cpp | 165 ++++++++++++++++++ .../GlobalOpt/ARM/arm-widen-dest-non-array.ll | 39 +++++ .../GlobalOpt/ARM/arm-widen-global-dest.ll | 28 +++ .../GlobalOpt/ARM/arm-widen-non-byte-array.ll | 22 +++ .../ARM/arm-widen-non-const-global.ll | 21 +++ .../ARM/arm-widen-string-multi-use.ll | 33 ++++ .../GlobalOpt/ARM/arm-widen-strings-1.ll | 21 +++ .../GlobalOpt/ARM/arm-widen-strings-2.ll | 21 +++ .../arm-widen-strings-lengths-dont-match.ll | 26 +++ .../arm-widen-strings-more-than-64-bytes.ll | 28 +++ .../ARM/arm-widen-strings-ptrtoint.ll | 54 ++++++ .../ARM/arm-widen-strings-struct-test.ll | 45 +++++ .../ARM/arm-widen-strings-volatile.ll | 29 +++ 18 files changed, 589 insertions(+) create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-const-global.ll create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll create mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h 
b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 0459941fe05c..0dc513d8e65b 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1819,6 +1819,10 @@ public: /// \return The maximum number of function arguments the target supports. unsigned getMaxNumArgs() const; + /// \return For an array of given Size, return alignment boundary to + /// pad to. Default is no padding. + unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const; + /// @} private: @@ -2225,6 +2229,8 @@ public: getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; + virtual unsigned getNumBytesToPadGlobalArray(unsigned Size, + Type *ArrayType) const = 0; }; template @@ -3026,6 +3032,11 @@ public: unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } + + unsigned getNumBytesToPadGlobalArray(unsigned Size, + Type *ArrayType) const override { + return Impl.getNumBytesToPadGlobalArray(Size, ArrayType); + } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index dbdfb4d8cdfa..0b7792f89a05 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1006,6 +1006,10 @@ public: unsigned getMaxNumArgs() const { return UINT_MAX; } + unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const { + return 0; + } + protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index a47462b61e03..607047336376 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1383,6 +1383,12 @@ bool TargetTransformInfo::isVectorShiftByScalarCheap(Type *Ty) const { return TTIImpl->isVectorShiftByScalarCheap(Ty); } +unsigned +TargetTransformInfo::getNumBytesToPadGlobalArray(unsigned Size, + Type *ArrayType) const { + return TTIImpl->getNumBytesToPadGlobalArray(Size, ArrayType); +} + TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 835ae98efb85..9f6e5e5ab142 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -56,6 +56,10 @@ static cl::opt AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops")); +static cl::opt UseWidenGlobalArrays( + "widen-global-strings", cl::Hidden, cl::init(true), + cl::desc("Enable the widening of global strings to alignment boundaries")); + extern cl::opt EnableTailPredication; extern cl::opt EnableMaskedGatherScatters; @@ -2805,3 +2809,32 @@ bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I, } return true; } + +unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size, + Type *ArrayType) const { + if (!UseWidenGlobalArrays) { + LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n"); + return false; + } + + // Don't modify none integer array types + if (!ArrayType || !ArrayType->isArrayTy() || + !ArrayType->getArrayElementType()->isIntegerTy()) + return 0; + + // We pad to 4 byte boundaries + if (Size 
% 4 == 0) + return 0; + + unsigned NumBytesToPad = 4 - (Size % 4); + unsigned NewSize = Size + NumBytesToPad; + + // Max number of bytes that memcpy allows for lowering to load/stores before + // it uses library function (__aeabi_memcpy). + unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold(); + + if (NewSize > MaxMemIntrinsicSize) + return 0; + + return NumBytesToPad; +} diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index b0a75134ee02..3a4f940088b2 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -337,6 +337,9 @@ public: bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl &Ops) const; + + unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const; + /// @} }; diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index aae4926e027f..4647c65a5c85 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -92,6 +92,8 @@ STATISTIC(NumInternalFunc, "Number of internal functions"); STATISTIC(NumColdCC, "Number of functions marked coldcc"); STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs"); STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed"); +STATISTIC(NumGlobalArraysPadded, + "Number of global arrays padded to alignment boundary"); static cl::opt EnableColdCCStressTest("enable-coldcc-stress-test", @@ -2029,6 +2031,165 @@ OptimizeFunctions(Module &M, return Changed; } +static bool callInstIsMemcpy(CallInst *CI) { + if (!CI) + return false; + + Function *F = CI->getCalledFunction(); + if (!F || !F->isIntrinsic() || F->getIntrinsicID() != Intrinsic::memcpy) + return false; + + return true; +} + +static bool destArrayCanBeWidened(CallInst *CI) { + auto *IsVolatile = dyn_cast(CI->getArgOperand(3)); + auto *Alloca = dyn_cast(CI->getArgOperand(0)); + + if (!Alloca || !IsVolatile || IsVolatile->isOne()) + return false; + + if (!Alloca->isStaticAlloca()) + return false; + + if (!Alloca->getAllocatedType()->isArrayTy()) + return false; + + return true; +} + +static GlobalVariable *widenGlobalVariable(GlobalVariable *OldVar, Function *F, + unsigned NumBytesToPad, + unsigned NumBytesToCopy) { + if (!OldVar->hasInitializer()) + return nullptr; + + ConstantDataArray *DataArray = + dyn_cast(OldVar->getInitializer()); + if (!DataArray) + return nullptr; + + // Update to be word aligned (memcpy(...,X,...)) + // create replacement with padded null bytes. + StringRef Data = DataArray->getRawDataValues(); + std::vector StrData(Data.begin(), Data.end()); + for (unsigned int p = 0; p < NumBytesToPad; p++) + StrData.push_back('\0'); + auto Arr = ArrayRef(StrData.data(), NumBytesToCopy + NumBytesToPad); + // Create new padded version of global variable. + Constant *SourceReplace = ConstantDataArray::get(F->getContext(), Arr); + GlobalVariable *NewGV = new GlobalVariable( + *(F->getParent()), SourceReplace->getType(), true, OldVar->getLinkage(), + SourceReplace, SourceReplace->getName()); + // Copy any other attributes from original global variable + // e.g. 
unnamed_addr
+  NewGV->copyAttributesFrom(OldVar);
+  NewGV->takeName(OldVar);
+  return NewGV;
+}
+
+static void widenDestArray(CallInst *CI, const unsigned NumBytesToPad,
+                           const unsigned NumBytesToCopy,
+                           ConstantDataArray *SourceDataArray) {
+
+  auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
+  if (Alloca) {
+    unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
+    unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad;
+    unsigned NumElementsToCopy = divideCeil(TotalBytes, ElementByteWidth);
+    // Update destination array to be word aligned (memcpy(X,...,...))
+    IRBuilder<> BuildAlloca(Alloca);
+    AllocaInst *NewAlloca = BuildAlloca.CreateAlloca(ArrayType::get(
+        Alloca->getAllocatedType()->getArrayElementType(), NumElementsToCopy));
+    NewAlloca->takeName(Alloca);
+    NewAlloca->setAlignment(Alloca->getAlign());
+    Alloca->replaceAllUsesWith(NewAlloca);
+    Alloca->eraseFromParent();
+  }
+}
+
+static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar,
+                                        const unsigned NumBytesToPad,
+                                        const unsigned NumBytesToCopy,
+                                        ConstantInt *BytesToCopyOp,
+                                        ConstantDataArray *SourceDataArray) {
+  auto *NewSourceGV =
+      widenGlobalVariable(SourceVar, F, NumBytesToPad, NumBytesToCopy);
+  if (!NewSourceGV)
+    return false;
+
+  // Update the arguments of remaining uses that are memcpys.
+  for (auto *User : SourceVar->users()) {
+    auto *CI = dyn_cast<CallInst>(User);
+    if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI))
+      continue;
+
+    if (CI->getArgOperand(1) != SourceVar)
+      continue;
+
+    widenDestArray(CI, NumBytesToPad, NumBytesToCopy, SourceDataArray);
+
+    CI->setArgOperand(2, ConstantInt::get(BytesToCopyOp->getType(),
+                                          NumBytesToCopy + NumBytesToPad));
+  }
+  SourceVar->replaceAllUsesWith(NewSourceGV);
+
+  NumGlobalArraysPadded++;
+  return true;
+}
+
+static bool tryWidenGlobalArraysUsedByMemcpy(
+    GlobalVariable *GV,
+    function_ref<TargetTransformInfo &(Function &)> GetTTI) {
+
+  if (!GV->hasInitializer() || !GV->isConstant() || !GV->hasLocalLinkage() ||
+      !GV->hasGlobalUnnamedAddr())
+    return false;
+
+  for (auto *User : GV->users()) {
+    CallInst *CI = dyn_cast<CallInst>(User);
+    if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI))
+      continue;
+
+    Function *F = CI->getCalledFunction();
+
+    auto *BytesToCopyOp = dyn_cast<ConstantInt>(CI->getArgOperand(2));
+    if (!BytesToCopyOp)
+      continue;
+
+    ConstantDataArray *SourceDataArray =
+        dyn_cast<ConstantDataArray>(GV->getInitializer());
+    if (!SourceDataArray)
+      continue;
+
+    unsigned NumBytesToCopy = BytesToCopyOp->getZExtValue();
+
+    auto *Alloca = dyn_cast<AllocaInst>(CI->getArgOperand(0));
+    uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements();
+    uint64_t SZSize = SourceDataArray->getType()->getNumElements();
+    unsigned ElementByteWidth = SourceDataArray->getElementByteSize();
+    // Calculate the number of elements to copy while avoiding floored
+    // division of integers returning wrong values, e.g. copying one byte
+    // from an array of i16 would yield 0 elements to copy as opposed to 1.
+ unsigned NumElementsToCopy = divideCeil(NumBytesToCopy, ElementByteWidth); + + // For safety purposes lets add a constraint and only pad when + // NumElementsToCopy == destination array size == + // source which is a constant + if (NumElementsToCopy != DZSize || DZSize != SZSize) + continue; + + unsigned NumBytesToPad = GetTTI(*F).getNumBytesToPadGlobalArray( + NumBytesToCopy, SourceDataArray->getType()); + if (NumBytesToPad) { + return tryWidenGlobalArrayAndDests(F, GV, NumBytesToPad, NumBytesToCopy, + BytesToCopyOp, SourceDataArray); + } + } + return false; +} + static bool OptimizeGlobalVars(Module &M, function_ref GetTTI, @@ -2058,6 +2219,10 @@ OptimizeGlobalVars(Module &M, continue; } + // For global variable arrays called in a memcpy + // we try to pad to nearest valid alignment boundary + Changed |= tryWidenGlobalArraysUsedByMemcpy(&GV, GetTTI); + Changed |= processGlobal(GV, GetTTI, GetTLI, LookupDomTree); } return Changed; diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll new file mode 100644 index 000000000000..ab04e0a5bc69 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s + +@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1 + +define void @memcpy_struct() { +; CHECK-LABEL: define void @memcpy_struct() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SOMETHING:%.*]] = alloca { i8, i8, i8 }, align 1 +; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) +; CHECK-NEXT: ret void +; +entry: + %something = alloca {i8, i8, i8}, align 1 + %call1 = call i32 @bar(ptr nonnull %something) + call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) + ret void +} + + +@.i8_multi = private unnamed_addr constant [2 x [3 x i8]] [[3 x i8] [i8 1, i8 2, i8 3], [3 x i8] [i8 4, i8 5, i8 6]] , align 1 + +define void @memcpy_array_multidimensional() { +; CHECK-LABEL: define void @memcpy_array_multidimensional() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [2 x [3 x i8]], align 1 +; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false) +; CHECK-NEXT: ret void +; +entry: + %something = alloca [2 x [3 x i8]], align 1 + %call1 = call i32 @bar(ptr nonnull %something) + call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false) + ret void +} + +declare i32 @bar(...) 
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll new file mode 100644 index 000000000000..f435ffdeed2c --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s + +; CHECK: [3 x i8] +@other = private unnamed_addr global [3 x i8] [i8 1, i8 2, i8 3] , align 1 +; CHECK: [4 x i8] +@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1 + +define void @memcpy_multiple() { +; CHECK-LABEL: define void @memcpy_multiple() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [4 x i8], align 1 +; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) +; CHECK-NEXT: [[CALL3:%.*]] = call i32 @bar(ptr nonnull @other) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false) +; CHECK-NEXT: ret void +; +entry: + %something = alloca [3 x i8], align 1 + %call1 = call i32 @bar(ptr nonnull %something) + %call2 = call i32 @bar(ptr nonnull @other) + call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) + call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) + ret void +} + +declare i32 @bar(...) diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll new file mode 100644 index 000000000000..c7ca7271fd3d --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll @@ -0,0 +1,22 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s + +@.i16 = private unnamed_addr constant [5 x i16] [i16 1, i16 2, i16 3, i16 4, i16 5] , align 1 + +define void @memcpy_i16_array() { +; CHECK-LABEL: define void @memcpy_i16_array() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SOMETHING1:%.*]] = alloca [6 x i16], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 12, i1 false) +; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]]) +; CHECK-NEXT: ret void +; +entry: + %something = alloca [5 x i16], align 1 + call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) %something, ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 10, i1 false) + %call2 = call i32 @bar(ptr nonnull %something) + ret void +} + + +declare i32 @bar(...) 
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-const-global.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-const-global.ll new file mode 100644 index 000000000000..3d9c42fe1f3d --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-const-global.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s + +@.str = unnamed_addr global [3 x i8] c"12\00", align 1 + +define void @foo() { +; CHECK-LABEL: define void @foo() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [3 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.str, i32 3, i1 false) +; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) +; CHECK-NEXT: ret void +; +entry: + %something = alloca [3 x i8], align 1 + call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.str, i32 3, i1 false) + %call1 = call i32 @bar(ptr nonnull %something) + ret void +} + +declare i32 @bar(...) diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll new file mode 100644 index 000000000000..e37925a78d2c --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll @@ -0,0 +1,33 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s + +@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1 + +define void @memcpy_multiple() { +; CHECK-LABEL: define void @memcpy_multiple() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SOMETHING2:%.*]] = alloca [4 x i8], align 1 +; CHECK-NEXT: [[SOMETHING1:%.*]] = alloca [4 x i8], align 1 +; CHECK-NEXT: [[SOMETHING3:%.*]] = alloca [4 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING2]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING3]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false) +; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING2]]) +; CHECK-NEXT: [[CALL3:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]]) +; CHECK-NEXT: [[CALL4:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING3]]) +; CHECK-NEXT: ret void +; +entry: + %something = alloca [3 x i8], align 1 + %something1 = alloca [3 x i8], align 1 + %something2 = alloca [3 x i8], align 1 + call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) + call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something1, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) + call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something2, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, 
i1 false) + %call2 = call i32 @bar(ptr nonnull %something) + %call3 = call i32 @bar(ptr nonnull %something1) + %call4 = call i32 @bar(ptr nonnull %something2) + ret void +} + +declare i32 @bar(...) diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll new file mode 100644 index 000000000000..8ea9e2804370 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s + +@.str = private unnamed_addr constant [10 x i8] c"123456789\00", align 1 + +define void @foo() { +; CHECK-LABEL: define void @foo() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [12 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 12, i1 false) +; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) +; CHECK-NEXT: ret void +; +entry: + %something = alloca [10 x i8], align 1 + call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) %something, ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 10, i1 false) + %call2 = call i32 @bar(ptr nonnull %something) + ret void +} + +declare i32 @bar(...) diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll new file mode 100644 index 000000000000..ad3620b14ea2 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll @@ -0,0 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s + +@.str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1 + +define void @foo() { +; CHECK-LABEL: define void @foo() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [64 x i8], align 1 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(62) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(62) @.str, i32 64, i1 false) +; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) +; CHECK-NEXT: ret void +; +entry: + %something = alloca [62 x i8], align 1 + call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(62) %something, ptr noundef nonnull align 1 dereferenceable(62) @.str, i32 62, i1 false) + %call2 = call i32 @bar(ptr nonnull %something) + ret void +} + +declare i32 @bar(...) 
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll new file mode 100644 index 000000000000..b8e02c3f996d --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s +; CHECK: [17 x i8] +@.str = private unnamed_addr constant [17 x i8] c"aaaaaaaaaaaaaaaa\00", align 1 + +; Function Attrs: nounwind +define void @foo() { +; CHECK-LABEL: define void @foo() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [20 x i8], align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 20, ptr nonnull [[SOMETHING]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 1 [[SOMETHING]], ptr align 1 @.str, i32 17, i1 false) +; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 20, ptr nonnull [[SOMETHING]]) +; CHECK-NEXT: ret void +; +entry: + %something = alloca [20 x i8], align 1 + call void @llvm.lifetime.start(i64 20, ptr nonnull %something) #3 + call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %something, ptr align 1 @.str, i32 17, i1 false) + %call2 = call i32 @bar(ptr nonnull %something) #3 + call void @llvm.lifetime.end(i64 20, ptr nonnull %something) #3 + ret void +} + +declare i32 @bar(...) #2 diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll new file mode 100644 index 000000000000..4ac31aa2f976 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s + +; CHECK: [65 x i8] +; CHECK-NOT: [68 x i8] +@.str = private unnamed_addr constant [65 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzz\00", align 1 + +; Function Attrs: nounwind +define void @foo() { +; CHECK-LABEL: define void @foo() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [65 x i8], align 1 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 65, ptr nonnull [[SOMETHING]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 1 [[SOMETHING]], ptr align 1 @.str, i32 65, i1 false) +; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 65, ptr nonnull [[SOMETHING]]) +; CHECK-NEXT: ret void +; +entry: + %something = alloca [65 x i8], align 1 + call void @llvm.lifetime.start(i64 65, ptr nonnull %something) #3 + call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %something, ptr align 1 @.str, i32 65, i1 false) + %call2 = call i32 @bar(ptr nonnull %something) #3 + call void @llvm.lifetime.end(i64 65, ptr nonnull %something) #3 + ret void +} + +declare i32 @bar(...) 
#2 diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll new file mode 100644 index 000000000000..64f57884cd39 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll @@ -0,0 +1,54 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s + +@f.string1 = private unnamed_addr constant [45 x i8] c"The quick brown dog jumps over the lazy fox.\00", align 1 + +; Function Attrs: nounwind +define i32 @f() { +; CHECK-LABEL: define i32 @f() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[STRING1:%.*]] = alloca [48 x i8], align 1 +; CHECK-NEXT: [[POS:%.*]] = alloca i32, align 4 +; CHECK-NEXT: [[TOKEN:%.*]] = alloca ptr, align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 45, ptr [[STRING1]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[STRING1]], ptr align 1 @f.string1, i32 48, i1 false) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[POS]]) +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TOKEN]]) +; CHECK-NEXT: [[CALL:%.*]] = call ptr @strchr(ptr [[STRING1]], i32 101) +; CHECK-NEXT: store ptr [[CALL]], ptr [[TOKEN]], align 4 +; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TOKEN]], align 4 +; CHECK-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP1]] to i32 +; CHECK-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[STRING1]] to i32 +; CHECK-NEXT: [[SUB_PTR_SUB:%.*]] = sub i32 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]] +; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB_PTR_SUB]], 1 +; CHECK-NEXT: store i32 [[ADD]], ptr [[POS]], align 4 +; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[POS]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TOKEN]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[POS]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 45, ptr [[STRING1]]) +; CHECK-NEXT: ret i32 [[TMP2]] +; +entry: + %string1 = alloca [45 x i8], align 1 + %pos = alloca i32, align 4 + %token = alloca ptr, align 4 + call void @llvm.lifetime.start.p0i8(i64 45, ptr %string1) + call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %string1, ptr align 1 @f.string1, i32 45, i1 false) + call void @llvm.lifetime.start.p0i8(i64 4, ptr %pos) + call void @llvm.lifetime.start.p0i8(i64 4, ptr %token) + %call = call ptr @strchr(ptr %string1, i32 101) + store ptr %call, ptr %token, align 4 + %0 = load ptr, ptr %token, align 4 + %sub.ptr.lhs.cast = ptrtoint ptr %0 to i32 + %sub.ptr.rhs.cast = ptrtoint ptr %string1 to i32 + %sub.ptr.sub = sub i32 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast + %add = add nsw i32 %sub.ptr.sub, 1 + store i32 %add, ptr %pos, align 4 + %1 = load i32, ptr %pos, align 4 + call void @llvm.lifetime.end.p0i8(i64 4, ptr %token) + call void @llvm.lifetime.end.p0i8(i64 4, ptr %pos) + call void @llvm.lifetime.end.p0i8(i64 45, ptr %string1) + ret i32 %1 +} + +declare ptr @strchr(ptr, i32) diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll new file mode 100644 index 000000000000..5367572704b1 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s +%struct.P = type { i32, 
[13 x i8] } + +; CHECK-NOT: [16 x i8] +@.str = private unnamed_addr constant [13 x i8] c"hello world\0A\00", align 1 + +; Function Attrs: nounwind +define i32 @main() { +; CHECK-LABEL: define i32 @main() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[P:%.*]] = alloca [[STRUCT_P:%.*]], align 4 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 20, ptr nonnull [[P]]) +; CHECK-NEXT: store i32 10, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]] +; CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [[STRUCT_P]], ptr [[P]], i32 0, i32 1, i32 0 +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[ARRAYDECAY]], ptr align 1 @.str, i32 13, i1 false) +; CHECK-NEXT: [[PUTS:%.*]] = call i32 @puts(ptr [[ARRAYDECAY]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 20, ptr nonnull [[P]]) +; CHECK-NEXT: ret i32 0 +; +entry: + %p = alloca %struct.P, align 4 + call void @llvm.lifetime.start(i64 20, ptr nonnull %p) #2 + store i32 10, ptr %p, align 4, !tbaa !1 + %arraydecay = getelementptr inbounds %struct.P, ptr %p, i32 0, i32 1, i32 0 + call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %arraydecay, ptr align 1 @.str, i32 13, i1 false) + %puts = call i32 @puts(ptr %arraydecay) + call void @llvm.lifetime.end(i64 20, ptr nonnull %p) #2 + ret i32 0 +} + +declare i32 @puts(ptr nocapture readonly) #2 + +!1 = !{!2, !3, i64 0} +!2 = !{!"P", !3, i64 0, !4, i64 4} +!3 = !{!"int", !4, i64 0} +!4 = !{!"omnipotent char", !5, i64 0} +!5 = !{!"Simple C/C++ TBAA"} +;. +; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]], i64 0} +; CHECK: [[META1]] = !{!"P", [[META2]], i64 0, [[META3:![0-9]+]], i64 4} +; CHECK: [[META2]] = !{!"int", [[META3]], i64 0} +; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} +; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} +;. diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll new file mode 100644 index 000000000000..b735a7788742 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll @@ -0,0 +1,29 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s + +; CHECK-NOT: [64 x i8] +@.str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1 + +; Function Attrs: nounwind +define void @foo() { +; CHECK-LABEL: define void @foo() local_unnamed_addr { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [62 x i8], align 1 +; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [62 x i8], ptr [[SOMETHING]], i32 0, i32 0 +; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 62, ptr nonnull [[TMP0]]) +; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 1 [[TMP0]], ptr align 1 @.str, i32 62, i1 true) +; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[TMP0]]) +; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 62, ptr nonnull [[TMP0]]) +; CHECK-NEXT: ret void +; +entry: + %something = alloca [62 x i8], align 1 + %0 = getelementptr inbounds [62 x i8], ptr %something, i32 0, i32 0 + call void @llvm.lifetime.start(i64 62, ptr nonnull %0) #3 + call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %0, ptr align 1 @.str, i32 62, i1 true) + %call2 = call i32 @bar(ptr nonnull %0) #3 + call void @llvm.lifetime.end(i64 62, ptr nonnull %0) #3 + ret void +} + +declare i32 @bar(...) 
#2
--
GitLab

From 83953c7df107af26ebf9ab82e01623c991637199 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi
Date: Tue, 15 Oct 2024 21:12:10 +0900
Subject: [PATCH 228/329] APInt.cpp: Prune a stray semicolon.

---
 llvm/lib/Support/APInt.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/llvm/lib/Support/APInt.cpp b/llvm/lib/Support/APInt.cpp
index 23e365f16d8f..ea8295f95c75 100644
--- a/llvm/lib/Support/APInt.cpp
+++ b/llvm/lib/Support/APInt.cpp
@@ -2227,7 +2227,7 @@ void APInt::toString(SmallVectorImpl<char> &Str, unsigned Radix, bool Signed,
   while (*Prefix) {
     Str.push_back(*Prefix);
     ++Prefix;
-  };
+  }

   // We insert the digits backward, then reverse them to get the right order.
   unsigned StartDig = Str.size();
--
GitLab

From 5bcc66dc009893c55aefdcd16a0ca2dad315481b Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi
Date: Tue, 15 Oct 2024 21:14:45 +0900
Subject: [PATCH 229/329] VisitIfStmt: Prune a redundant condition.

`S->isConsteval()` is evaluated at the top of this method.
Likely a mis-merge in #75425.
---
 clang/lib/CodeGen/CoverageMappingGen.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/clang/lib/CodeGen/CoverageMappingGen.cpp b/clang/lib/CodeGen/CoverageMappingGen.cpp
index 07015834bc84..577a0f571e16 100644
--- a/clang/lib/CodeGen/CoverageMappingGen.cpp
+++ b/clang/lib/CodeGen/CoverageMappingGen.cpp
@@ -2066,7 +2066,7 @@ struct CounterCoverageMappingBuilder
       GapRegionCounter = OutCount;
     }

-    if (!S->isConsteval() && !llvm::EnableSingleByteCoverage)
+    if (!llvm::EnableSingleByteCoverage)
       // Create Branch Region around condition.
       createBranchRegion(S->getCond(), ThenCount,
                          subtractCounters(ParentCount, ThenCount));
--
GitLab

From 40d9561b2d5651e3d2ffa057d2b89cb8d5146fb9 Mon Sep 17 00:00:00 2001
From: NAKAMURA Takumi
Date: Tue, 15 Oct 2024 21:21:16 +0900
Subject: [PATCH 230/329] InstrProfilingMerge.c: Fix potential misalignment in `SrcBitmapStart`

Currently this is not an issue, but it will become one if the bitmap is
ever located after single-byte counters.
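A sketch of the arithmetic involved (assuming the usual 8-byte section
rounding performed by __llvm_profile_get_num_padding_bytes(); illustrative,
not an excerpt from the runtime):

    /* With 8-byte counters, NumCounters * 8 is already a multiple of 8 and
       the padding is 0 -- which is why the old code happened to work. */
    uint64_t CounterSectionSize = 10; /* e.g. 10 single-byte counters */
    uint64_t Padding = __llvm_profile_get_num_padding_bytes(CounterSectionSize);
    /* Padding == 6 here, so the bitmap starts 16 bytes after the counters. */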
--- compiler-rt/lib/profile/InstrProfilingMerge.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/compiler-rt/lib/profile/InstrProfilingMerge.c b/compiler-rt/lib/profile/InstrProfilingMerge.c index c0706b73e166..7cf1679811eb 100644 --- a/compiler-rt/lib/profile/InstrProfilingMerge.c +++ b/compiler-rt/lib/profile/InstrProfilingMerge.c @@ -154,7 +154,8 @@ int __llvm_profile_merge_from_buffer(const char *ProfileData, SrcCountersStart = (char *)SrcDataEnd; SrcCountersEnd = SrcCountersStart + Header->NumCounters * __llvm_profile_counter_entry_size(); - SrcBitmapStart = SrcCountersEnd; + SrcBitmapStart = SrcCountersEnd + __llvm_profile_get_num_padding_bytes( SrcCountersEnd - SrcCountersStart); SrcNameStart = SrcBitmapStart + Header->NumBitmapBytes; SrcValueProfDataStart = SrcNameStart + getDistanceFromCounterToValueProf(Header); -- GitLab From 9c80eb7c83c6471d4126ef46f85bf673787de521 Mon Sep 17 00:00:00 2001 From: Benjamin Kramer Date: Thu, 17 Oct 2024 13:14:37 +0200 Subject: [PATCH 231/329] Silence -Wswitch after cb43021e5726a4462f28a999fb66a8dc20dc354b lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp:4885:13: warning: enumeration value 'SveMFloat8' not handled in switch [-Wswitch] 4885 | switch (llvm::cast<clang::BuiltinType>(qual_type)->getKind()) { | --- lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp | 1 + 1 file changed, 1 insertion(+) diff --git a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp index fe0c53a7e9a3..50115a638b95 100644 --- a/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp +++ b/lldb/source/Plugins/TypeSystem/Clang/TypeSystemClang.cpp @@ -5065,6 +5065,7 @@ lldb::Encoding TypeSystemClang::GetEncoding(lldb::opaque_compiler_type_t type, case clang::BuiltinType::SveUint64x2: case clang::BuiltinType::SveUint64x3: case clang::BuiltinType::SveUint64x4: + case clang::BuiltinType::SveMFloat8: case clang::BuiltinType::SveFloat16: case clang::BuiltinType::SveBFloat16: case clang::BuiltinType::SveBFloat16x2: -- GitLab From 2f0b4f43fc5c1e7587c4d00daa9cc230df2f8a2d Mon Sep 17 00:00:00 2001 From: jeanPerier Date: Thu, 17 Oct 2024 13:25:09 +0200 Subject: [PATCH 232/329] [flang][extension] support concatenation with absent optional (#112678) Fix #112593 by adding lowering support for concatenation with an absent optional _assumed length_ dummy argument because: 1. Most compilers seem to support it (most likely by accident). 2. This actually makes the compiler codegen simpler. Codegen was going out of its way to poke the LLVM optimizer bear by producing an undef argument for the length. I insist on the fact that no compiler supports this with _explicit length_ optional arguments: the executable will segfault. I would discourage users from relying on that "feature" because runtime checks for bad optional dereferences will kick in when it is used (for instance, "nagfor -C=present" will produce an executable that aborts with an error message; Flang does not have such a runtime check option so far). Hence, I am not updating the Extensions.md document because this is not something I think we should advertise.
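For readers unfamiliar with the boxchar ABI, the self-contained C++ sketch below models what the change means at the call boundary. The struct and functions are illustrative stand-ins for a lowered !fir.boxchar<1> and the test's @foo3, not actual flang types; the point is that an absent optional now lowers to a well-defined (null, 0) pair instead of (null, undef).

#include <cstdio>

struct BoxChar {    // stand-in for a lowered !fir.boxchar<1> pair
  const char *Data; // null when the optional dummy argument is absent
  long Len;         // character length in elements
};

// PRESENT(c) in Fortran corresponds to a null check on the data pointer,
// so the zero length of an absent argument is never read by valid code.
static bool present(BoxChar C) { return C.Data != nullptr; }

int main() {
  BoxChar Absent{nullptr, 0}; // the length used to lower to undef
  std::printf("%d\n", present(Absent)); // prints 0
  return 0;
}

Passing a defined zero keeps the call well-formed for LLVM's optimizer without changing the result of any conforming program.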
--- flang/lib/Optimizer/CodeGen/CodeGen.cpp | 14 +------------- flang/test/Fir/optional.fir | 2 +- 2 files changed, 2 insertions(+), 14 deletions(-) diff --git a/flang/lib/Optimizer/CodeGen/CodeGen.cpp b/flang/lib/Optimizer/CodeGen/CodeGen.cpp index 9b624efa0538..68b8c6613585 100644 --- a/flang/lib/Optimizer/CodeGen/CodeGen.cpp +++ b/flang/lib/Optimizer/CodeGen/CodeGen.cpp @@ -3373,19 +3373,7 @@ struct AbsentOpConversion : public fir::FIROpConversion { matchAndRewrite(fir::AbsentOp absent, OpAdaptor, mlir::ConversionPatternRewriter &rewriter) const override { mlir::Type ty = convertType(absent.getType()); - mlir::Location loc = absent.getLoc(); - - if (mlir::isa(absent.getType())) { - auto structTy = mlir::cast(ty); - assert(!structTy.isOpaque() && !structTy.getBody().empty()); - auto undefStruct = rewriter.create(loc, ty); - auto nullField = - rewriter.create(loc, structTy.getBody()[0]); - rewriter.replaceOpWithNewOp( - absent, undefStruct, nullField, 0); - } else { - rewriter.replaceOpWithNewOp(absent, ty); - } + rewriter.replaceOpWithNewOp(absent, ty); return mlir::success(); } }; diff --git a/flang/test/Fir/optional.fir b/flang/test/Fir/optional.fir index 3b350d6fa941..bded8b5332a3 100644 --- a/flang/test/Fir/optional.fir +++ b/flang/test/Fir/optional.fir @@ -47,7 +47,7 @@ func.func @foo3(%arg0: !fir.boxchar<1>) -> i1 { // CHECK-LABEL: @bar3 func.func @bar3() -> i1 { %0 = fir.absent !fir.boxchar<1> - // CHECK: call i1 @foo3(ptr null, i64 undef) + // CHECK: call i1 @foo3(ptr null, i64 0) %1 = fir.call @foo3(%0) : (!fir.boxchar<1>) -> i1 return %1 : i1 } -- GitLab From e21c80ac73a9da5c86c20dbce37c9227a17ab06a Mon Sep 17 00:00:00 2001 From: Mariya Podchishchaeva Date: Thu, 17 Oct 2024 13:42:35 +0200 Subject: [PATCH 233/329] [clang] Reject if constexpr in C (#112685) Fixes https://github.com/llvm/llvm-project/issues/112587 --- clang/lib/Parse/ParseStmt.cpp | 11 +++++++---- clang/test/Sema/constexpr.c | 7 +++++++ 2 files changed, 14 insertions(+), 4 deletions(-) diff --git a/clang/lib/Parse/ParseStmt.cpp b/clang/lib/Parse/ParseStmt.cpp index 6480e88316a7..60d647da48f0 100644 --- a/clang/lib/Parse/ParseStmt.cpp +++ b/clang/lib/Parse/ParseStmt.cpp @@ -1518,10 +1518,13 @@ StmtResult Parser::ParseIfStatement(SourceLocation *TrailingElseLoc) { SourceLocation ConstevalLoc; if (Tok.is(tok::kw_constexpr)) { - Diag(Tok, getLangOpts().CPlusPlus17 ? diag::warn_cxx14_compat_constexpr_if - : diag::ext_constexpr_if); - IsConstexpr = true; - ConsumeToken(); + // C23 supports constexpr keyword, but only for object definitions. + if (getLangOpts().CPlusPlus) { + Diag(Tok, getLangOpts().CPlusPlus17 ? 
diag::warn_cxx14_compat_constexpr_if + : diag::ext_constexpr_if); + IsConstexpr = true; + ConsumeToken(); + } } else { if (Tok.is(tok::exclaim)) { NotLocation = ConsumeToken(); diff --git a/clang/test/Sema/constexpr.c b/clang/test/Sema/constexpr.c index 0cf9491c4a42..eaa000b3b977 100644 --- a/clang/test/Sema/constexpr.c +++ b/clang/test/Sema/constexpr.c @@ -367,3 +367,10 @@ struct S10 { constexpr struct S10 c = { 255 }; // FIXME-expected-error@-1 {{constexpr initializer evaluates to 255 which is not exactly representable in 'long long' bit-field with width 8}} // See: GH#101299 + +void constexprif() { + if constexpr (300) {} //expected-error {{expected '(' after 'if'}} +} +void constevalif() { + if consteval (300) {} //expected-error {{expected '(' after 'if'}} +} -- GitLab From 2882bb192b07674bc11fd5ddd5a4fa6cea194628 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 17 Oct 2024 12:46:08 +0100 Subject: [PATCH 234/329] [lldb][docs] Add link to LoongArch tracking issue https://github.com/llvm/llvm-project/issues/112693 will be tracking the overall state of LoongArch support. This means anyone can check without having to track down an expert. --- lldb/docs/index.rst | 8 +++++--- 1 file changed, 5 insertions(+), 3 deletions(-) diff --git a/lldb/docs/index.rst b/lldb/docs/index.rst index 2c7d7dbfad9a..b91077d66089 100644 --- a/lldb/docs/index.rst +++ b/lldb/docs/index.rst @@ -81,9 +81,11 @@ are welcome: expected to work, with functionality improving rapidly. ARM and AArch64 support is more experimental, with more known issues than the others. -RISC-V support is in active development, refer to the -`tracking issue `_ -for the current status. +Support for the following architectures is in active development. For their +current state, follow the links to their respective issues: + +* `RISC-V `_ +* `LoongArch `_ Get Involved ------------ -- GitLab From f42785d0c8886a65fbdd160b0ef47baa5931e582 Mon Sep 17 00:00:00 2001 From: Jie Fu Date: Thu, 17 Oct 2024 19:47:51 +0800 Subject: [PATCH 235/329] [Polly] Remove unused variable 'IdentTy' in LoopGeneratorsKMP.cpp (NFC) /llvm-project/polly/lib/CodeGen/LoopGeneratorsKMP.cpp:396:15: error: unused variable 'IdentTy' [-Werror,-Wunused-variable] StructType *IdentTy = ^ /llvm-project/polly/lib/CodeGen/LoopGeneratorsKMP.cpp:460:15: error: unused variable 'IdentTy' [-Werror,-Wunused-variable] StructType *IdentTy = ^ 2 errors generated. --- polly/lib/CodeGen/LoopGeneratorsKMP.cpp | 4 ---- 1 file changed, 4 deletions(-) diff --git a/polly/lib/CodeGen/LoopGeneratorsKMP.cpp b/polly/lib/CodeGen/LoopGeneratorsKMP.cpp index 45800b105ea7..0cfe18b0c121 100644 --- a/polly/lib/CodeGen/LoopGeneratorsKMP.cpp +++ b/polly/lib/CodeGen/LoopGeneratorsKMP.cpp @@ -393,8 +393,6 @@ void ParallelLoopGeneratorKMP::createCallStaticInit(Value *GlobalThreadID, void ParallelLoopGeneratorKMP::createCallStaticFini(Value *GlobalThreadID) { const std::string Name = "__kmpc_for_static_fini"; Function *F = M->getFunction(Name); - StructType *IdentTy = - StructType::getTypeByName(M->getContext(), "struct.ident_t"); // If F is not available, declare it. if (!F) { @@ -457,8 +455,6 @@ Value *ParallelLoopGeneratorKMP::createCallDispatchNext(Value *GlobalThreadID, const std::string Name = is64BitArch() ? "__kmpc_dispatch_next_8" : "__kmpc_dispatch_next_4"; Function *F = M->getFunction(Name); - StructType *IdentTy = - StructType::getTypeByName(M->getContext(), "struct.ident_t"); // If F is not available, declare it. 
if (!F) { -- GitLab From 067e8b8dc54b2558548c248ae851a0e01cb05878 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Thu, 17 Oct 2024 15:49:13 +0400 Subject: [PATCH 236/329] DAG: Lower fcNormal is.fpclass to compare with inf (#100389) --- .../CodeGen/SelectionDAG/TargetLowering.cpp | 25 +++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp index 4f42ed2ee701..758b3a5fc526 100644 --- a/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/TargetLowering.cpp @@ -8817,6 +8817,31 @@ SDValue TargetLowering::expandIS_FPCLASS(EVT ResultVT, SDValue Op, IsOrdered ? OrderedOp : UnorderedOp); } } + + if (FPTestMask == fcNormal) { + // TODO: Handle unordered + ISD::CondCode IsFiniteOp = IsInvertedFP ? ISD::SETUGE : ISD::SETOLT; + ISD::CondCode IsNormalOp = IsInvertedFP ? ISD::SETOLT : ISD::SETUGE; + + if (isCondCodeLegalOrCustom(IsFiniteOp, + OperandVT.getScalarType().getSimpleVT()) && + isCondCodeLegalOrCustom(IsNormalOp, + OperandVT.getScalarType().getSimpleVT()) && + isFAbsFree(OperandVT)) { + // isnormal(x) --> fabs(x) < infinity && !(fabs(x) < smallest_normal) + SDValue Inf = + DAG.getConstantFP(APFloat::getInf(Semantics), DL, OperandVT); + SDValue SmallestNormal = DAG.getConstantFP( + APFloat::getSmallestNormalized(Semantics), DL, OperandVT); + + SDValue Abs = DAG.getNode(ISD::FABS, DL, OperandVT, Op); + SDValue IsFinite = DAG.getSetCC(DL, ResultVT, Abs, Inf, IsFiniteOp); + SDValue IsNormal = + DAG.getSetCC(DL, ResultVT, Abs, SmallestNormal, IsNormalOp); + unsigned LogicOp = IsInvertedFP ? ISD::OR : ISD::AND; + return DAG.getNode(LogicOp, DL, ResultVT, IsFinite, IsNormal); + } + } } // Some checks may be represented as inversion of simpler check, for example -- GitLab From 8268bc48eb32b006700f6f6b7da0971a2336ab42 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Oct 2024 11:33:14 +0100 Subject: [PATCH 237/329] [DAG] Avoid SDLoc duplication in FP<->INT combines. NFC. --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 62 +++++++++---------- 1 file changed, 29 insertions(+), 33 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ca91d35573c3..67e1b731ba5f 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -17964,7 +17964,7 @@ SDValue DAGCombiner::visitFPOW(SDNode *N) { return SDValue(); } -static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG, +static SDValue foldFPToIntToFP(SDNode *N, const SDLoc &DL, SelectionDAG &DAG, const TargetLowering &TLI) { // We only do this if the target has legal ftrunc. Otherwise, we'd likely be // replacing casts with a libcall. 
We also must be allowed to ignore -0.0 @@ -17982,11 +17982,11 @@ static SDValue foldFPToIntToFP(SDNode *N, SelectionDAG &DAG, SDValue N0 = N->getOperand(0); if (N->getOpcode() == ISD::SINT_TO_FP && N0.getOpcode() == ISD::FP_TO_SINT && N0.getOperand(0).getValueType() == VT) - return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0)); + return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0)); if (N->getOpcode() == ISD::UINT_TO_FP && N0.getOpcode() == ISD::FP_TO_UINT && N0.getOperand(0).getValueType() == VT) - return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0.getOperand(0)); + return DAG.getNode(ISD::FTRUNC, DL, VT, N0.getOperand(0)); return SDValue(); } @@ -17995,17 +17995,17 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT OpVT = N0.getValueType(); + SDLoc DL(N); // [us]itofp(undef) = 0, because the result value is bounded. if (N0.isUndef()) - return DAG.getConstantFP(0.0, SDLoc(N), VT); + return DAG.getConstantFP(0.0, DL, VT); // fold (sint_to_fp c1) -> c1fp if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && // ...but only if the target supports immediate floating-point values - (!LegalOperations || - TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) - return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) + return DAG.getNode(ISD::SINT_TO_FP, DL, VT, N0); // If the input is a legal type, and SINT_TO_FP is not legal on this target, // but UINT_TO_FP is legal on this target, try to convert. @@ -18013,31 +18013,27 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { hasOperation(ISD::UINT_TO_FP, OpVT)) { // If the sign bit is known to be zero, we can change this to UINT_TO_FP. if (DAG.SignBitIsZero(N0)) - return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); + return DAG.getNode(ISD::UINT_TO_FP, DL, VT, N0); } // The next optimizations are desirable only if SELECT_CC can be lowered. // fold (sint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), -1.0, 0.0) if (N0.getOpcode() == ISD::SETCC && N0.getValueType() == MVT::i1 && !VT.isVector() && - (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { - SDLoc DL(N); + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(-1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT)); - } // fold (sint_to_fp (zext (setcc x, y, cc))) -> // (select (setcc x, y, cc), 1.0, 0.0) if (N0.getOpcode() == ISD::ZERO_EXTEND && N0.getOperand(0).getOpcode() == ISD::SETCC && !VT.isVector() && - (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { - SDLoc DL(N); + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) return DAG.getSelect(DL, VT, N0.getOperand(0), DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT)); - } - if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI)) + if (SDValue FTrunc = foldFPToIntToFP(N, DL, DAG, TLI)) return FTrunc; return SDValue(); @@ -18047,17 +18043,17 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); EVT OpVT = N0.getValueType(); + SDLoc DL(N); // [us]itofp(undef) = 0, because the result value is bounded. 
if (N0.isUndef()) - return DAG.getConstantFP(0.0, SDLoc(N), VT); + return DAG.getConstantFP(0.0, DL, VT); // fold (uint_to_fp c1) -> c1fp if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && // ...but only if the target supports immediate floating-point values - (!LegalOperations || - TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) - return DAG.getNode(ISD::UINT_TO_FP, SDLoc(N), VT, N0); + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) + return DAG.getNode(ISD::UINT_TO_FP, DL, VT, N0); // If the input is a legal type, and UINT_TO_FP is not legal on this target, // but SINT_TO_FP is legal on this target, try to convert. @@ -18065,25 +18061,23 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { hasOperation(ISD::SINT_TO_FP, OpVT)) { // If the sign bit is known to be zero, we can change this to SINT_TO_FP. if (DAG.SignBitIsZero(N0)) - return DAG.getNode(ISD::SINT_TO_FP, SDLoc(N), VT, N0); + return DAG.getNode(ISD::SINT_TO_FP, DL, VT, N0); } // fold (uint_to_fp (setcc x, y, cc)) -> (select (setcc x, y, cc), 1.0, 0.0) if (N0.getOpcode() == ISD::SETCC && !VT.isVector() && - (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) { - SDLoc DL(N); + (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) return DAG.getSelect(DL, VT, N0, DAG.getConstantFP(1.0, DL, VT), DAG.getConstantFP(0.0, DL, VT)); - } - if (SDValue FTrunc = foldFPToIntToFP(N, DAG, TLI)) + if (SDValue FTrunc = foldFPToIntToFP(N, DL, DAG, TLI)) return FTrunc; return SDValue(); } // Fold (fp_to_{s/u}int ({s/u}int_to_fpx)) -> zext x, sext x, trunc x, or x -static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { +static SDValue FoldIntToFPToInt(SDNode *N, const SDLoc &DL, SelectionDAG &DAG) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); @@ -18113,12 +18107,12 @@ static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { // represented exactly in the float range. if (APFloat::semanticsPrecision(Sem) >= ActualSize) { if (VT.getScalarSizeInBits() > SrcVT.getScalarSizeInBits()) { - unsigned ExtOp = IsInputSigned && IsOutputSigned ? ISD::SIGN_EXTEND - : ISD::ZERO_EXTEND; - return DAG.getNode(ExtOp, SDLoc(N), VT, Src); + unsigned ExtOp = + IsInputSigned && IsOutputSigned ? 
ISD::SIGN_EXTEND : ISD::ZERO_EXTEND; + return DAG.getNode(ExtOp, DL, VT, Src); } if (VT.getScalarSizeInBits() < SrcVT.getScalarSizeInBits()) - return DAG.getNode(ISD::TRUNCATE, SDLoc(N), VT, Src); + return DAG.getNode(ISD::TRUNCATE, DL, VT, Src); return DAG.getBitcast(VT, Src); } return SDValue(); @@ -18127,6 +18121,7 @@ static SDValue FoldIntToFPToInt(SDNode *N, SelectionDAG &DAG) { SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); // fold (fp_to_sint undef) -> undef if (N0.isUndef()) @@ -18134,14 +18129,15 @@ SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) { // fold (fp_to_sint c1fp) -> c1 if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FP_TO_SINT, SDLoc(N), VT, N0); + return DAG.getNode(ISD::FP_TO_SINT, DL, VT, N0); - return FoldIntToFPToInt(N, DAG); + return FoldIntToFPToInt(N, DL, DAG); } SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) { SDValue N0 = N->getOperand(0); EVT VT = N->getValueType(0); + SDLoc DL(N); // fold (fp_to_uint undef) -> undef if (N0.isUndef()) @@ -18149,9 +18145,9 @@ SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) { // fold (fp_to_uint c1fp) -> c1 if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FP_TO_UINT, SDLoc(N), VT, N0); + return DAG.getNode(ISD::FP_TO_UINT, DL, VT, N0); - return FoldIntToFPToInt(N, DAG); + return FoldIntToFPToInt(N, DL, DAG); } SDValue DAGCombiner::visitXROUND(SDNode *N) { -- GitLab From 784c15a282803b23b451b51c533eb5df93fda874 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Oct 2024 11:52:59 +0100 Subject: [PATCH 238/329] [DAG] visitSINT_TO_FP/UINT_TO_FP - use FoldConstantArithmetic to attempt to constant fold Don't rely on isConstantIntBuildVectorOrConstantInt followed by getNode() to constant fold - FoldConstantArithmetic will do all of this for us. Cleanup for #112682 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 67e1b731ba5f..ff1ee01b8e43 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18002,10 +18002,10 @@ SDValue DAGCombiner::visitSINT_TO_FP(SDNode *N) { return DAG.getConstantFP(0.0, DL, VT); // fold (sint_to_fp c1) -> c1fp - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - // ...but only if the target supports immediate floating-point values - (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) - return DAG.getNode(ISD::SINT_TO_FP, DL, VT, N0); + // ...but only if the target supports immediate floating-point values + if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::SINT_TO_FP, DL, VT, {N0})) + return C; // If the input is a legal type, and SINT_TO_FP is not legal on this target, // but UINT_TO_FP is legal on this target, try to convert.
@@ -18050,10 +18050,10 @@ SDValue DAGCombiner::visitUINT_TO_FP(SDNode *N) { return DAG.getConstantFP(0.0, DL, VT); // fold (uint_to_fp c1) -> c1fp - if (DAG.isConstantIntBuildVectorOrConstantInt(N0) && - // ...but only if the target supports immediate floating-point values - (!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) - return DAG.getNode(ISD::UINT_TO_FP, DL, VT, N0); + // ...but only if the target supports immediate floating-point values + if ((!LegalOperations || TLI.isOperationLegalOrCustom(ISD::ConstantFP, VT))) + if (SDValue C = DAG.FoldConstantArithmetic(ISD::UINT_TO_FP, DL, VT, {N0})) + return C; // If the input is a legal type, and UINT_TO_FP is not legal on this target, // but SINT_TO_FP is legal on this target, try to convert. -- GitLab From 5692a0c6f846f9b1bacd445f4adedadf66c558ea Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Oct 2024 11:56:51 +0100 Subject: [PATCH 239/329] [DAG] visitFP_TO_SINT/FP_TO_UINT - use FoldConstantArithmetic to attempt to constant fold Don't rely on isConstantFPBuildVectorOrConstantFP followed by getNode() to constant fold - FoldConstantArithmetic will do all of this for us. Cleanup for #112682 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index ff1ee01b8e43..8efa74a3ea72 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18128,8 +18128,8 @@ SDValue DAGCombiner::visitFP_TO_SINT(SDNode *N) { return DAG.getUNDEF(VT); // fold (fp_to_sint c1fp) -> c1 - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FP_TO_SINT, DL, VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FP_TO_SINT, DL, VT, {N0})) + return C; return FoldIntToFPToInt(N, DL, DAG); } @@ -18144,8 +18144,8 @@ SDValue DAGCombiner::visitFP_TO_UINT(SDNode *N) { return DAG.getUNDEF(VT); // fold (fp_to_uint c1fp) -> c1 - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FP_TO_UINT, DL, VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FP_TO_UINT, DL, VT, {N0})) + return C; return FoldIntToFPToInt(N, DL, DAG); } -- GitLab From bf5cf82dd471a7c561d0f0a60ff4c8eaa1d20ff9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Oct 2024 12:15:19 +0100 Subject: [PATCH 240/329] Fix MSVC signed/unsigned mismatch warning. NFC. --- llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp index be29e4b481c0..814b71d17319 100644 --- a/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp +++ b/llvm/lib/Target/ARM/Disassembler/ARMDisassembler.cpp @@ -1541,7 +1541,7 @@ static bool PermitsD32(const MCInst &Inst, const MCDisassembler *Decoder) { static DecodeStatus DecodeDPRRegisterClass(MCInst &Inst, unsigned RegNo, uint64_t Address, const MCDisassembler *Decoder) { - if (RegNo > (PermitsD32(Inst, Decoder) ? 31 : 15)) + if (RegNo > (PermitsD32(Inst, Decoder) ? 31u : 15u)) return MCDisassembler::Fail; unsigned Register = DPRDecoderTable[RegNo]; -- GitLab From cf046c8717468d4a4ff8d8080dcb1ba316edbea9 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Oct 2024 12:51:11 +0100 Subject: [PATCH 241/329] [DAG] visitSIGN_EXTEND_INREG - avoid SDLoc duplication. NFC.
--- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 61 +++++++++---------- 1 file changed, 28 insertions(+), 33 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 8efa74a3ea72..644054361dd3 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -14812,14 +14812,15 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { EVT ExtVT = cast(N1)->getVT(); unsigned VTBits = VT.getScalarSizeInBits(); unsigned ExtVTBits = ExtVT.getScalarSizeInBits(); + SDLoc DL(N); // sext_vector_inreg(undef) = 0 because the top bit will all be the same. if (N0.isUndef()) - return DAG.getConstant(0, SDLoc(N), VT); + return DAG.getConstant(0, DL, VT); // fold (sext_in_reg c1) -> c1 if (DAG.isConstantIntBuildVectorOrConstantInt(N0)) - return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0, N1); + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N0, N1); // If the input is already sign extended, just drop the extension. if (ExtVTBits >= DAG.ComputeMaxSignificantBits(N0)) @@ -14828,8 +14829,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { // fold (sext_in_reg (sext_in_reg x, VT2), VT1) -> (sext_in_reg x, minVT) pt2 if (N0.getOpcode() == ISD::SIGN_EXTEND_INREG && ExtVT.bitsLT(cast(N0.getOperand(1))->getVT())) - return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, N0.getOperand(0), - N1); + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, N0.getOperand(0), N1); // fold (sext_in_reg (sext x)) -> (sext x) // fold (sext_in_reg (aext x)) -> (sext x) @@ -14841,7 +14841,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { if ((N00Bits <= ExtVTBits || DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits) && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) - return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N00); } // fold (sext_in_reg (*_extend_vector_inreg x)) -> (sext_vector_inreg x) @@ -14859,7 +14859,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { DAG.ComputeMaxSignificantBits(N00) <= ExtVTBits))) && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND_VECTOR_INREG, VT))) - return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, SDLoc(N), VT, N00); + return DAG.getNode(ISD::SIGN_EXTEND_VECTOR_INREG, DL, VT, N00); } // fold (sext_in_reg (zext x)) -> (sext x) @@ -14868,12 +14868,12 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { SDValue N00 = N0.getOperand(0); if (N00.getScalarValueSizeInBits() == ExtVTBits && (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, VT))) - return DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), VT, N00); + return DAG.getNode(ISD::SIGN_EXTEND, DL, VT, N00); } // fold (sext_in_reg x) -> (zext_in_reg x) if the sign bit is known zero. if (DAG.MaskedValueIsZero(N0, APInt::getOneBitSet(VTBits, ExtVTBits - 1))) - return DAG.getZeroExtendInReg(N0, SDLoc(N), ExtVT); + return DAG.getZeroExtendInReg(N0, DL, ExtVT); // fold operands of sext_in_reg based on knowledge that the top bits are not // demanded. @@ -14895,7 +14895,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { // extended enough. 
unsigned InSignBits = DAG.ComputeNumSignBits(N0.getOperand(0)); if (((VTBits - ExtVTBits) - ShAmt->getZExtValue()) < InSignBits) - return DAG.getNode(ISD::SRA, SDLoc(N), VT, N0.getOperand(0), + return DAG.getNode(ISD::SRA, DL, VT, N0.getOperand(0), N0.getOperand(1)); } } @@ -14904,37 +14904,33 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { // If sextload is not supported by target, we can only do the combine when // load has one use. Doing otherwise can block folding the extload with other // extends that the target does support. - if (ISD::isEXTLoad(N0.getNode()) && - ISD::isUNINDEXEDLoad(N0.getNode()) && + if (ISD::isEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && ExtVT == cast(N0)->getMemoryVT() && ((!LegalOperations && cast(N0)->isSimple() && N0.hasOneUse()) || TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) { - LoadSDNode *LN0 = cast(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, - LN0->getChain(), - LN0->getBasePtr(), ExtVT, - LN0->getMemOperand()); + auto *LN0 = cast(N0); + SDValue ExtLoad = + DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), + LN0->getBasePtr(), ExtVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); AddToWorklist(ExtLoad.getNode()); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + return SDValue(N, 0); // Return N so it doesn't get rechecked! } // fold (sext_inreg (zextload x)) -> (sextload x) iff load has one use if (ISD::isZEXTLoad(N0.getNode()) && ISD::isUNINDEXEDLoad(N0.getNode()) && - N0.hasOneUse() && - ExtVT == cast(N0)->getMemoryVT() && + N0.hasOneUse() && ExtVT == cast(N0)->getMemoryVT() && ((!LegalOperations && cast(N0)->isSimple()) && TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT))) { - LoadSDNode *LN0 = cast(N0); - SDValue ExtLoad = DAG.getExtLoad(ISD::SEXTLOAD, SDLoc(N), VT, - LN0->getChain(), - LN0->getBasePtr(), ExtVT, - LN0->getMemOperand()); + auto *LN0 = cast(N0); + SDValue ExtLoad = + DAG.getExtLoad(ISD::SEXTLOAD, DL, VT, LN0->getChain(), + LN0->getBasePtr(), ExtVT, LN0->getMemOperand()); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); - return SDValue(N, 0); // Return N so it doesn't get rechecked! + return SDValue(N, 0); // Return N so it doesn't get rechecked! 
} // fold (sext_inreg (masked_load x)) -> (sext_masked_load x) @@ -14944,7 +14940,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { Ld->getExtensionType() != ISD::LoadExtType::NON_EXTLOAD && TLI.isLoadExtLegal(ISD::SEXTLOAD, VT, ExtVT)) { SDValue ExtMaskedLoad = DAG.getMaskedLoad( - VT, SDLoc(N), Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), + VT, DL, Ld->getChain(), Ld->getBasePtr(), Ld->getOffset(), Ld->getMask(), Ld->getPassThru(), ExtVT, Ld->getMemOperand(), Ld->getAddressingMode(), ISD::SEXTLOAD, Ld->isExpandingLoad()); CombineTo(N, ExtMaskedLoad); @@ -14955,15 +14951,14 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { // fold (sext_inreg (masked_gather x)) -> (sext_masked_gather x) if (auto *GN0 = dyn_cast(N0)) { - if (SDValue(GN0, 0).hasOneUse() && - ExtVT == GN0->getMemoryVT() && + if (SDValue(GN0, 0).hasOneUse() && ExtVT == GN0->getMemoryVT() && TLI.isVectorLoadExtDesirable(SDValue(SDValue(GN0, 0)))) { SDValue Ops[] = {GN0->getChain(), GN0->getPassThru(), GN0->getMask(), GN0->getBasePtr(), GN0->getIndex(), GN0->getScale()}; SDValue ExtLoad = DAG.getMaskedGather( - DAG.getVTList(VT, MVT::Other), ExtVT, SDLoc(N), Ops, - GN0->getMemOperand(), GN0->getIndexType(), ISD::SEXTLOAD); + DAG.getVTList(VT, MVT::Other), ExtVT, DL, Ops, GN0->getMemOperand(), + GN0->getIndexType(), ISD::SEXTLOAD); CombineTo(N, ExtLoad); CombineTo(N0.getNode(), ExtLoad, ExtLoad.getValue(1)); @@ -14976,7 +14971,7 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { if (ExtVTBits <= 16 && N0.getOpcode() == ISD::OR) { if (SDValue BSwap = MatchBSwapHWordLow(N0.getNode(), N0.getOperand(0), N0.getOperand(1), false)) - return DAG.getNode(ISD::SIGN_EXTEND_INREG, SDLoc(N), VT, BSwap, N1); + return DAG.getNode(ISD::SIGN_EXTEND_INREG, DL, VT, BSwap, N1); } // Fold (iM_signext_inreg @@ -14993,8 +14988,8 @@ SDValue DAGCombiner::visitSIGN_EXTEND_INREG(SDNode *N) { (!LegalOperations || TLI.isOperationLegal(ISD::SIGN_EXTEND, InnerExtVT))) { SDValue SignExtExtendee = - DAG.getNode(ISD::SIGN_EXTEND, SDLoc(N), InnerExtVT, Extendee); - return DAG.getNode(ISD::EXTRACT_SUBVECTOR, SDLoc(N), VT, SignExtExtendee, + DAG.getNode(ISD::SIGN_EXTEND, DL, InnerExtVT, Extendee); + return DAG.getNode(ISD::EXTRACT_SUBVECTOR, DL, VT, SignExtExtendee, N0.getOperand(1)); } } -- GitLab From c980a20b105c9298a5975b6944417f17cf772b6b Mon Sep 17 00:00:00 2001 From: Graham Hunter Date: Thu, 17 Oct 2024 13:17:24 +0100 Subject: [PATCH 242/329] [AArch64][SVE] Enable max vector bandwidth for SVE (#109671) Returns true for shouldMaximizeVectorBandwidth when the register type is a scalable vector and SVE or streaming SVE are available. 
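As a back-of-the-envelope check on the vectorization factors in the updated tests below, this small C++ sketch computes how many elements fill one 128-bit SVE granule (the per-vscale unit); the helper is illustrative, not LLVM API.

#include <cstdio>

// Elements of a given bit width that fit in one 128-bit SVE granule.
static unsigned maxElementsPerGranule(unsigned ElementBits) {
  const unsigned GranuleBits = 128;
  return GranuleBits / ElementBits;
}

int main() {
  // Maximizing bandwidth sizes the VF by the narrowest element type: an
  // i8 loop now gets a VF of vscale x 16 rather than the vscale x 4
  // implied by a 32-bit widest type, matching the updated CHECK lines.
  std::printf("i8:  vscale x %u\n", maxElementsPerGranule(8));  // 16
  std::printf("i32: vscale x %u\n", maxElementsPerGranule(32)); // 4
  return 0;
}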
--- .../AArch64/AArch64TargetTransformInfo.cpp | 6 +- .../AArch64/conditional-branches-cost.ll | 80 ++++++++---- .../scalable-vectorization-cost-tuning.ll | 12 +- .../AArch64/scalable-vectorization.ll | 4 +- .../LoopVectorize/AArch64/store-costs-sve.ll | 119 +++++++++++------- .../LoopVectorize/AArch64/sve2-histcnt.ll | 88 ++++++++----- .../AArch64/type-shrinkage-zext-costs.ll | 28 ++--- .../AArch64/wider-VF-for-callinst.ll | 2 +- 8 files changed, 213 insertions(+), 126 deletions(-) diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index 0227532e94c7..d33d0aa58554 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -333,8 +333,10 @@ AArch64TTIImpl::getInlineCallPenalty(const Function *F, const CallBase &Call, bool AArch64TTIImpl::shouldMaximizeVectorBandwidth( TargetTransformInfo::RegisterKind K) const { assert(K != TargetTransformInfo::RGK_Scalar); - return (K == TargetTransformInfo::RGK_FixedWidthVector && - ST->isNeonAvailable()); + return ((K == TargetTransformInfo::RGK_FixedWidthVector && + ST->isNeonAvailable()) || + (K == TargetTransformInfo::RGK_ScalableVector && + ST->isSVEorStreamingSVEAvailable())); } /// Calculate the cost of materializing a 64-bit value. This helper diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll index 7f325ce1a1f0..01fca39296da 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/conditional-branches-cost.ll @@ -732,9 +732,20 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; DEFAULT-LABEL: define void @multiple_exit_conditions( ; DEFAULT-SAME: ptr [[SRC:%.*]], ptr noalias [[DST:%.*]]) #[[ATTR2:[0-9]+]] { ; DEFAULT-NEXT: entry: -; DEFAULT-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] +; DEFAULT-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 32 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 257, [[TMP8]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; DEFAULT: vector.ph: -; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 2048 +; DEFAULT-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 32 +; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 257, [[TMP3]] +; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 257, [[N_MOD_VF]] +; DEFAULT-NEXT: [[TMP17:%.*]] = mul i64 [[N_VEC]], 8 +; DEFAULT-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP17]] +; DEFAULT-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2 +; DEFAULT-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 32 ; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] ; DEFAULT: vector.body: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] @@ -742,20 +753,39 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; DEFAULT-NEXT: [[TMP0:%.*]] = add i64 [[OFFSET_IDX]], 0 ; DEFAULT-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]] ; DEFAULT-NEXT: [[TMP1:%.*]] = load i16, ptr [[SRC]], align 2 -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <8 x i16> poison, i16 [[TMP1]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = 
shufflevector <8 x i16> [[BROADCAST_SPLATINSERT]], <8 x i16> poison, <8 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP2:%.*]] = or <8 x i16> [[BROADCAST_SPLAT]], -; DEFAULT-NEXT: [[TMP3:%.*]] = uitofp <8 x i16> [[TMP2]] to <8 x double> +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[TMP1]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP9:%.*]] = or [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) +; DEFAULT-NEXT: [[TMP10:%.*]] = or [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) +; DEFAULT-NEXT: [[TMP11:%.*]] = or [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) +; DEFAULT-NEXT: [[TMP12:%.*]] = or [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) +; DEFAULT-NEXT: [[TMP13:%.*]] = uitofp [[TMP9]] to +; DEFAULT-NEXT: [[TMP14:%.*]] = uitofp [[TMP10]] to +; DEFAULT-NEXT: [[TMP15:%.*]] = uitofp [[TMP11]] to +; DEFAULT-NEXT: [[TMP16:%.*]] = uitofp [[TMP12]] to ; DEFAULT-NEXT: [[TMP4:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0 -; DEFAULT-NEXT: store <8 x double> [[TMP3]], ptr [[TMP4]], align 8 -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 8 -; DEFAULT-NEXT: [[TMP5:%.*]] = icmp eq i64 [[INDEX_NEXT]], 256 -; DEFAULT-NEXT: br i1 [[TMP5]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; DEFAULT-NEXT: [[TMP18:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP19:%.*]] = mul i64 [[TMP18]], 8 +; DEFAULT-NEXT: [[TMP20:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP19]] +; DEFAULT-NEXT: [[TMP21:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP22:%.*]] = mul i64 [[TMP21]], 16 +; DEFAULT-NEXT: [[TMP23:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP22]] +; DEFAULT-NEXT: [[TMP24:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP25:%.*]] = mul i64 [[TMP24]], 24 +; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i64 [[TMP25]] +; DEFAULT-NEXT: store [[TMP13]], ptr [[TMP4]], align 8 +; DEFAULT-NEXT: store [[TMP14]], ptr [[TMP20]], align 8 +; DEFAULT-NEXT: store [[TMP15]], ptr [[TMP23]], align 8 +; DEFAULT-NEXT: store [[TMP16]], ptr [[TMP26]], align 8 +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP6]] +; DEFAULT-NEXT: [[TMP27:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; DEFAULT-NEXT: br i1 [[TMP27]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; DEFAULT: middle.block: -; DEFAULT-NEXT: br i1 false, label [[EXIT:%.*]], label [[SCALAR_PH]] +; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 257, [[N_VEC]] +; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label [[SCALAR_PH]] ; DEFAULT: scalar.ph: ; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi ptr [ [[IND_END]], [[MIDDLE_BLOCK]] ], [ [[DST]], [[ENTRY:%.*]] ] -; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ 512, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL1:%.*]] = phi i64 [ [[IND_END1]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY]] ] ; DEFAULT-NEXT: br label [[LOOP:%.*]] ; DEFAULT: vector.scevcheck: ; DEFAULT-NEXT: unreachable @@ -780,7 +810,7 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; PRED-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; PRED: vector.ph: ; PRED-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() 
-; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 ; PRED-NEXT: [[TMP2:%.*]] = sub i64 [[TMP1]], 1 ; PRED-NEXT: [[N_RND_UP:%.*]] = add i64 257, [[TMP2]] ; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[N_RND_UP]], [[TMP1]] @@ -789,31 +819,31 @@ define void @multiple_exit_conditions(ptr %src, ptr noalias %dst) #1 { ; PRED-NEXT: [[IND_END:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP3]] ; PRED-NEXT: [[IND_END1:%.*]] = mul i64 [[N_VEC]], 2 ; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 2 +; PRED-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 8 ; PRED-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() -; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 2 +; PRED-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 8 ; PRED-NEXT: [[TMP8:%.*]] = sub i64 257, [[TMP7]] ; PRED-NEXT: [[TMP9:%.*]] = icmp ugt i64 257, [[TMP7]] ; PRED-NEXT: [[TMP10:%.*]] = select i1 [[TMP9]], i64 [[TMP8]], i64 0 -; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 0, i64 257) +; PRED-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 0, i64 257) ; PRED-NEXT: br label [[VECTOR_BODY:%.*]] ; PRED: vector.body: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] +; PRED-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] ; PRED-NEXT: [[OFFSET_IDX:%.*]] = mul i64 [[INDEX]], 8 ; PRED-NEXT: [[TMP11:%.*]] = add i64 [[OFFSET_IDX]], 0 ; PRED-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP11]] ; PRED-NEXT: [[TMP12:%.*]] = load i16, ptr [[SRC]], align 2 -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[TMP12]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; PRED-NEXT: [[TMP13:%.*]] = or [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) -; PRED-NEXT: [[TMP14:%.*]] = uitofp [[TMP13]] to +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[TMP12]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PRED-NEXT: [[TMP13:%.*]] = or [[BROADCAST_SPLAT]], shufflevector ( insertelement ( poison, i16 1, i64 0), poison, zeroinitializer) +; PRED-NEXT: [[TMP14:%.*]] = uitofp [[TMP13]] to ; PRED-NEXT: [[TMP15:%.*]] = getelementptr double, ptr [[NEXT_GEP]], i32 0 -; PRED-NEXT: call void @llvm.masked.store.nxv2f64.p0( [[TMP14]], ptr [[TMP15]], i32 8, [[ACTIVE_LANE_MASK]]) +; PRED-NEXT: call void @llvm.masked.store.nxv8f64.p0( [[TMP14]], ptr [[TMP15]], i32 8, [[ACTIVE_LANE_MASK]]) ; PRED-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP5]] -; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv2i1.i64(i64 [[INDEX]], i64 [[TMP10]]) -; PRED-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer) -; PRED-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 +; PRED-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv8i1.i64(i64 [[INDEX]], i64 [[TMP10]]) +; PRED-NEXT: [[TMP16:%.*]] = xor [[ACTIVE_LANE_MASK_NEXT]], shufflevector ( insertelement ( poison, i1 true, i64 0), poison, 
zeroinitializer) +; PRED-NEXT: [[TMP17:%.*]] = extractelement [[TMP16]], i32 0 ; PRED-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; PRED: middle.block: ; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll index f28f77bf1b15..59da1e10fd2a 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization-cost-tuning.ll @@ -1,23 +1,23 @@ ; REQUIRES: asserts ; RUN: opt -mtriple=aarch64 -mattr=+sve \ ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4 +; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16 ; RUN: opt -mtriple=aarch64 -mattr=+sve -mcpu=generic \ ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE4 +; RUN: | FileCheck %s --check-prefixes=GENERIC,VF-VSCALE16 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-v1 \ ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE4 +; RUN: | FileCheck %s --check-prefixes=NEOVERSE-V1,VF-VSCALE16 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \ ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4 +; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16 ; RUN: opt -mtriple=aarch64 -mcpu=neoverse-n2 \ ; RUN: -force-target-instruction-cost=1 -passes=loop-vectorize -S -debug-only=loop-vectorize < %s 2>&1 \ -; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE4 +; RUN: | FileCheck %s --check-prefixes=NEOVERSE-N2,VF-VSCALE16 ; GENERIC: LV: Vector loop of width vscale x 2 costs: 3 (assuming a minimum vscale of 2). ; GENERIC: LV: Vector loop of width vscale x 4 costs: 1 (assuming a minimum vscale of 2). @@ -29,7 +29,7 @@ ; NEOVERSE-N2: LV: Vector loop of width vscale x 4 costs: 3 (assuming a minimum vscale of 1). ; VF-4: <4 x i32> -; VF-VSCALE4: <16 x i32> +; VF-VSCALE16: define void @test0(ptr %a, ptr %b, ptr %c) #0 { entry: br label %loop diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll index e83eb729b521..a84932a2290d 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/scalable-vectorization.ll @@ -8,8 +8,8 @@ ; (maximized bandwidth for i8 in the loop). 
define void @test0(ptr %a, ptr %b, ptr %c) #0 { ; CHECK: LV: Checking a loop in 'test0' -; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 4 -; CHECK_SCALABLE_ON: LV: Selecting VF: 16 +; CHECK_SCALABLE_ON: LV: Found feasible scalable VF = vscale x 16 +; CHECK_SCALABLE_ON: LV: Selecting VF: vscale x 16 ; CHECK_SCALABLE_DISABLED-NOT: LV: Found feasible scalable VF ; CHECK_SCALABLE_DISABLED: LV: Selecting VF: 16 ; CHECK_SCALABLE_ON_MAXBW: LV: Found feasible scalable VF = vscale x 16 diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll index 7d2fc348480a..a4861ad0b261 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/store-costs-sve.ll @@ -145,7 +145,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; DEFAULT-NEXT: iter.check: ; DEFAULT-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 2 +; DEFAULT-NEXT: [[TMP1:%.*]] = mul i64 [[TMP0]], 8 ; DEFAULT-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]] ; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; DEFAULT: vector.memcheck: @@ -155,59 +155,72 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; DEFAULT-NEXT: br i1 [[FOUND_CONFLICT]], label [[VEC_EPILOG_SCALAR_PH]], label [[VECTOR_MAIN_LOOP_ITER_CHECK:%.*]] ; DEFAULT: vector.main.loop.iter.check: -; DEFAULT-NEXT: br i1 true, label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] +; DEFAULT-NEXT: [[TMP9:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP3:%.*]] = mul i64 [[TMP9]], 32 +; DEFAULT-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 0, [[TMP3]] +; DEFAULT-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; DEFAULT: vector.ph: -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT3:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT4:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT3]], <16 x i16> poison, <16 x i32> zeroinitializer -; DEFAULT-NEXT: [[TMP7:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT4]] to <16 x i8> +; DEFAULT-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP5:%.*]] = mul i64 [[TMP4]], 32 +; DEFAULT-NEXT: [[N_MOD_VF1:%.*]] = urem i64 0, [[TMP5]] +; DEFAULT-NEXT: [[N_VEC1:%.*]] = sub i64 0, [[N_MOD_VF1]] +; DEFAULT-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP7:%.*]] = mul i64 [[TMP6]], 32 +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[X]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP8:%.*]] = trunc [[BROADCAST_SPLAT]] to ; DEFAULT-NEXT: br label [[VECTOR_BODY:%.*]] ; DEFAULT: vector.body: ; DEFAULT-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP2:%.*]] = add i64 [[INDEX]], 0 -; DEFAULT-NEXT: [[TMP4:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META5:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i64> poison, i64 [[TMP4]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT1]], <16 x i64> poison, <16 x i32> zeroinitializer -; 
DEFAULT-NEXT: [[TMP5:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT2]] to <16 x i8> -; DEFAULT-NEXT: [[TMP8:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]] -; DEFAULT-NEXT: [[TMP9:%.*]] = and <16 x i8> [[TMP5]], [[TMP7]] +; DEFAULT-NEXT: [[TMP14:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META5:![0-9]+]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT2:%.*]] = insertelement poison, i64 [[TMP14]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT3:%.*]] = shufflevector [[BROADCAST_SPLATINSERT2]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP11:%.*]] = trunc [[BROADCAST_SPLAT3]] to +; DEFAULT-NEXT: [[TMP22:%.*]] = and [[TMP11]], [[TMP8]] +; DEFAULT-NEXT: [[TMP13:%.*]] = and [[TMP11]], [[TMP8]] ; DEFAULT-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP2]] ; DEFAULT-NEXT: [[TMP12:%.*]] = getelementptr i8, ptr [[TMP10]], i32 0 -; DEFAULT-NEXT: [[TMP13:%.*]] = getelementptr i8, ptr [[TMP10]], i32 16 -; DEFAULT-NEXT: store <16 x i8> [[TMP8]], ptr [[TMP12]], align 1, !alias.scope [[META8:![0-9]+]], !noalias [[META5]] -; DEFAULT-NEXT: store <16 x i8> [[TMP9]], ptr [[TMP13]], align 1, !alias.scope [[META8]], !noalias [[META5]] -; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 32 -; DEFAULT-NEXT: [[TMP14:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; DEFAULT-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] +; DEFAULT-NEXT: [[TMP16:%.*]] = call i64 @llvm.vscale.i64() +; DEFAULT-NEXT: [[TMP23:%.*]] = mul i64 [[TMP16]], 16 +; DEFAULT-NEXT: [[TMP24:%.*]] = getelementptr i8, ptr [[TMP10]], i64 [[TMP23]] +; DEFAULT-NEXT: store [[TMP22]], ptr [[TMP12]], align 1, !alias.scope [[META8:![0-9]+]], !noalias [[META5]] +; DEFAULT-NEXT: store [[TMP13]], ptr [[TMP24]], align 1, !alias.scope [[META8]], !noalias [[META5]] +; DEFAULT-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]] +; DEFAULT-NEXT: [[TMP25:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC1]] +; DEFAULT-NEXT: br i1 [[TMP25]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; DEFAULT: middle.block: -; DEFAULT-NEXT: br i1 true, label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; DEFAULT-NEXT: [[CMP_N1:%.*]] = icmp eq i64 0, [[N_VEC1]] +; DEFAULT-NEXT: br i1 [[CMP_N1]], label [[EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] ; DEFAULT: vec.epilog.iter.check: +; DEFAULT-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 0, [[N_VEC1]] ; DEFAULT-NEXT: [[TMP15:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP16:%.*]] = mul i64 [[TMP15]], 2 -; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP16]] +; DEFAULT-NEXT: [[TMP31:%.*]] = mul i64 [[TMP15]], 8 +; DEFAULT-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP31]] ; DEFAULT-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[VEC_EPILOG_SCALAR_PH]], label [[VEC_EPILOG_PH]] ; DEFAULT: vec.epilog.ph: -; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] +; DEFAULT-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC1]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MAIN_LOOP_ITER_CHECK]] ] ; DEFAULT-NEXT: [[TMP17:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 2 +; DEFAULT-NEXT: [[TMP18:%.*]] = mul i64 [[TMP17]], 8 ; DEFAULT-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP18]] ; DEFAULT-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] ; DEFAULT-NEXT: [[TMP19:%.*]] = call i64 @llvm.vscale.i64() -; DEFAULT-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 2 -; DEFAULT-NEXT: 
[[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, i16 [[X]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, zeroinitializer -; DEFAULT-NEXT: [[TMP24:%.*]] = trunc [[BROADCAST_SPLAT7]] to +; DEFAULT-NEXT: [[TMP20:%.*]] = mul i64 [[TMP19]], 8 +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT6:%.*]] = insertelement poison, i16 [[X]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT7:%.*]] = shufflevector [[BROADCAST_SPLATINSERT6]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP32:%.*]] = trunc [[BROADCAST_SPLAT7]] to ; DEFAULT-NEXT: br label [[VEC_EPILOG_VECTOR_BODY:%.*]] ; DEFAULT: vec.epilog.vector.body: ; DEFAULT-NEXT: [[INDEX5:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT8:%.*]], [[VEC_EPILOG_VECTOR_BODY]] ] ; DEFAULT-NEXT: [[TMP21:%.*]] = add i64 [[INDEX5]], 0 -; DEFAULT-NEXT: [[TMP22:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META11:![0-9]+]] -; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i64 [[TMP22]], i64 0 -; DEFAULT-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer -; DEFAULT-NEXT: [[TMP23:%.*]] = trunc [[BROADCAST_SPLAT]] to -; DEFAULT-NEXT: [[TMP25:%.*]] = and [[TMP23]], [[TMP24]] +; DEFAULT-NEXT: [[TMP33:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META11:![0-9]+]] +; DEFAULT-NEXT: [[BROADCAST_SPLATINSERT9:%.*]] = insertelement poison, i64 [[TMP33]], i64 0 +; DEFAULT-NEXT: [[BROADCAST_SPLAT10:%.*]] = shufflevector [[BROADCAST_SPLATINSERT9]], poison, zeroinitializer +; DEFAULT-NEXT: [[TMP29:%.*]] = trunc [[BROADCAST_SPLAT10]] to +; DEFAULT-NEXT: [[TMP30:%.*]] = and [[TMP29]], [[TMP32]] ; DEFAULT-NEXT: [[TMP26:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP21]] ; DEFAULT-NEXT: [[TMP27:%.*]] = getelementptr i8, ptr [[TMP26]], i32 0 -; DEFAULT-NEXT: store [[TMP25]], ptr [[TMP27]], align 1, !alias.scope [[META14:![0-9]+]], !noalias [[META11]] +; DEFAULT-NEXT: store [[TMP30]], ptr [[TMP27]], align 1, !alias.scope [[META14:![0-9]+]], !noalias [[META11]] ; DEFAULT-NEXT: [[INDEX_NEXT8]] = add nuw i64 [[INDEX5]], [[TMP20]] ; DEFAULT-NEXT: [[TMP28:%.*]] = icmp eq i64 [[INDEX_NEXT8]], [[N_VEC]] ; DEFAULT-NEXT: br i1 [[TMP28]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[VEC_EPILOG_VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] @@ -215,7 +228,7 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; DEFAULT-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] ; DEFAULT-NEXT: br i1 [[CMP_N]], label [[EXIT]], label [[VEC_EPILOG_SCALAR_PH]] ; DEFAULT: vec.epilog.scalar.ph: -; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ 0, [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; DEFAULT-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC1]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[VECTOR_MEMCHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] ; DEFAULT-NEXT: br label [[LOOP:%.*]] ; DEFAULT: loop: ; DEFAULT-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[VEC_EPILOG_SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] @@ -234,7 +247,10 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; PRED-LABEL: define void @trunc_store( ; PRED-SAME: ptr [[DST:%.*]], ptr [[SRC:%.*]], i16 [[X:%.*]]) #[[ATTR1:[0-9]+]] { ; PRED-NEXT: entry: -; PRED-NEXT: br i1 true, label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] +; PRED-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() +; PRED-NEXT: [[TMP1:%.*]] = mul i64 [[TMP7]], 16 +; PRED-NEXT: 
[[MIN_ITERS_CHECK:%.*]] = icmp ult i64 0, [[TMP1]] +; PRED-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; PRED: vector.memcheck: ; PRED-NEXT: [[SCEVGEP:%.*]] = getelementptr i8, ptr [[SRC]], i64 8 ; PRED-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[DST]], [[SCEVGEP]] @@ -242,28 +258,35 @@ define void @trunc_store(ptr %dst, ptr %src, i16 %x) #1 { ; PRED-NEXT: [[FOUND_CONFLICT:%.*]] = and i1 [[BOUND0]], [[BOUND1]] ; PRED-NEXT: br i1 [[FOUND_CONFLICT]], label [[SCALAR_PH]], label [[VECTOR_PH:%.*]] ; PRED: vector.ph: -; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement <16 x i16> poison, i16 [[X]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector <16 x i16> [[BROADCAST_SPLATINSERT1]], <16 x i16> poison, <16 x i32> zeroinitializer -; PRED-NEXT: [[TMP3:%.*]] = trunc <16 x i16> [[BROADCAST_SPLAT2]] to <16 x i8> +; PRED-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() +; PRED-NEXT: [[TMP3:%.*]] = mul i64 [[TMP2]], 16 +; PRED-NEXT: [[N_MOD_VF:%.*]] = urem i64 0, [[TMP3]] +; PRED-NEXT: [[N_VEC:%.*]] = sub i64 0, [[N_MOD_VF]] +; PRED-NEXT: [[TMP4:%.*]] = call i64 @llvm.vscale.i64() +; PRED-NEXT: [[TMP11:%.*]] = mul i64 [[TMP4]], 16 +; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i16 [[X]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer +; PRED-NEXT: [[TMP12:%.*]] = trunc [[BROADCAST_SPLAT]] to ; PRED-NEXT: br label [[VECTOR_BODY:%.*]] ; PRED: vector.body: ; PRED-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PRED-NEXT: [[TMP0:%.*]] = add i64 [[INDEX]], 0 -; PRED-NEXT: [[TMP1:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META4:![0-9]+]] -; PRED-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <16 x i64> poison, i64 [[TMP1]], i64 0 -; PRED-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <16 x i64> [[BROADCAST_SPLATINSERT]], <16 x i64> poison, <16 x i32> zeroinitializer -; PRED-NEXT: [[TMP2:%.*]] = trunc <16 x i64> [[BROADCAST_SPLAT]] to <16 x i8> -; PRED-NEXT: [[TMP4:%.*]] = and <16 x i8> [[TMP2]], [[TMP3]] +; PRED-NEXT: [[TMP8:%.*]] = load i64, ptr [[SRC]], align 8, !alias.scope [[META4:![0-9]+]] +; PRED-NEXT: [[BROADCAST_SPLATINSERT1:%.*]] = insertelement poison, i64 [[TMP8]], i64 0 +; PRED-NEXT: [[BROADCAST_SPLAT2:%.*]] = shufflevector [[BROADCAST_SPLATINSERT1]], poison, zeroinitializer +; PRED-NEXT: [[TMP9:%.*]] = trunc [[BROADCAST_SPLAT2]] to +; PRED-NEXT: [[TMP10:%.*]] = and [[TMP9]], [[TMP12]] ; PRED-NEXT: [[TMP5:%.*]] = getelementptr i8, ptr [[DST]], i64 [[TMP0]] ; PRED-NEXT: [[TMP6:%.*]] = getelementptr i8, ptr [[TMP5]], i32 0 -; PRED-NEXT: store <16 x i8> [[TMP4]], ptr [[TMP6]], align 1, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] -; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], 16 -; PRED-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; PRED-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] +; PRED-NEXT: store [[TMP10]], ptr [[TMP6]], align 1, !alias.scope [[META7:![0-9]+]], !noalias [[META4]] +; PRED-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP11]] +; PRED-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] +; PRED-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP9:![0-9]+]] ; PRED: middle.block: -; PRED-NEXT: br i1 true, label [[EXIT:%.*]], label [[SCALAR_PH]] +; PRED-NEXT: [[CMP_N:%.*]] = icmp eq i64 0, [[N_VEC]] +; PRED-NEXT: br i1 [[CMP_N]], label [[EXIT:%.*]], label 
[[SCALAR_PH]] ; PRED: scalar.ph: -; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ 0, [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] +; PRED-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY:%.*]] ], [ 0, [[VECTOR_MEMCHECK]] ] ; PRED-NEXT: br label [[LOOP:%.*]] ; PRED: loop: ; PRED-NEXT: [[IV:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT:%.*]], [[LOOP]] ] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll index 0c41477f285d..04ac89518502 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve2-histcnt.ll @@ -313,36 +313,68 @@ for.exit: define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) #0 { ; CHECK-LABEL: define void @histogram_8bit( ; CHECK-SAME: ptr noalias [[BUCKETS:%.*]], ptr readonly [[INDICES:%.*]], i64 [[N:%.*]]) #[[ATTR0]] { -; CHECK-NEXT: entry: +; CHECK-NEXT: iter.check: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP5]], 2 +; CHECK-NEXT: [[TMP9:%.*]] = shl nuw nsw i64 [[TMP5]], 3 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[N]], [[TMP9]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[ENTRY:%.*]] +; CHECK: vector.main.loop.iter.check: +; CHECK-NEXT: [[TMP6:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP7:%.*]] = shl nuw nsw i64 [[TMP6]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK1:%.*]] = icmp ult i64 [[N]], [[TMP7]] +; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK1]], label [[VEC_EPILOG_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -4 +; CHECK-NEXT: [[DOTNEG:%.*]] = mul nsw i64 [[TMP2]], -16 ; CHECK-NEXT: [[N_VEC:%.*]] = and i64 [[N]], [[DOTNEG]] ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2 +; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 4 ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: vector.body: -; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[ENTRY]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] +; CHECK-NEXT: [[IV:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[ARRAYIDX]], align 4 -; CHECK-NEXT: [[TMP6:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP7:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], [[TMP6]] -; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i8( [[TMP7]], i8 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[ARRAYIDX]], align 4 +; CHECK-NEXT: [[TMP10:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP20:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], [[TMP10]] +; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv16p0.i8( [[TMP20]], i8 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] -; CHECK-NEXT: br i1 [[CMP_N]], 
label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] -; CHECK: scalar.ph: -; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[MIDDLE_BLOCK]] ], [ 0, [[ENTRY1:%.*]] ] +; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[VEC_EPILOG_ITER_CHECK:%.*]] +; CHECK: vec.epilog.iter.check: +; CHECK-NEXT: [[N_VEC_REMAINING:%.*]] = sub i64 [[N]], [[N_VEC]] +; CHECK-NEXT: [[TMP11:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP12:%.*]] = shl nuw nsw i64 [[TMP11]], 3 +; CHECK-NEXT: [[MIN_EPILOG_ITERS_CHECK:%.*]] = icmp ult i64 [[N_VEC_REMAINING]], [[TMP12]] +; CHECK-NEXT: br i1 [[MIN_EPILOG_ITERS_CHECK]], label [[SCALAR_PH]], label [[VEC_EPILOG_PH]] +; CHECK: vec.epilog.ph: +; CHECK-NEXT: [[VEC_EPILOG_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[TMP13:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[DOTNEG8:%.*]] = mul nsw i64 [[TMP13]], -8 +; CHECK-NEXT: [[N_VEC3:%.*]] = and i64 [[N]], [[DOTNEG8]] +; CHECK-NEXT: [[TMP14:%.*]] = call i64 @llvm.vscale.i64() +; CHECK-NEXT: [[TMP15:%.*]] = shl nuw nsw i64 [[TMP14]], 3 ; CHECK-NEXT: br label [[FOR_BODY1:%.*]] +; CHECK: vec.epilog.vector.body: +; CHECK-NEXT: [[INDEX4:%.*]] = phi i64 [ [[VEC_EPILOG_RESUME_VAL]], [[VEC_EPILOG_PH]] ], [ [[INDEX_NEXT6:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[TMP16:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[INDEX4]] +; CHECK-NEXT: [[WIDE_LOAD5:%.*]] = load , ptr [[TMP16]], align 4 +; CHECK-NEXT: [[TMP17:%.*]] = zext [[WIDE_LOAD5]] to +; CHECK-NEXT: [[TMP18:%.*]] = getelementptr inbounds i8, ptr [[BUCKETS]], [[TMP17]] +; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv8p0.i8( [[TMP18]], i8 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) +; CHECK-NEXT: [[INDEX_NEXT6]] = add nuw i64 [[INDEX4]], [[TMP15]] +; CHECK-NEXT: [[TMP19:%.*]] = icmp eq i64 [[INDEX_NEXT6]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[TMP19]], label [[VEC_EPILOG_MIDDLE_BLOCK:%.*]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK: vec.epilog.middle.block: +; CHECK-NEXT: [[CMP_N7:%.*]] = icmp eq i64 [[N]], [[N_VEC3]] +; CHECK-NEXT: br i1 [[CMP_N7]], label [[FOR_EXIT]], label [[SCALAR_PH]] +; CHECK: vec.epilog.scalar.ph: +; CHECK-NEXT: [[BC_RESUME_VAL:%.*]] = phi i64 [ [[N_VEC3]], [[VEC_EPILOG_MIDDLE_BLOCK]] ], [ [[N_VEC]], [[VEC_EPILOG_ITER_CHECK]] ], [ 0, [[ITER_CHECK:%.*]] ] +; CHECK-NEXT: br label [[FOR_BODY2:%.*]] ; CHECK: for.body: -; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY1]] ] +; CHECK-NEXT: [[IV1:%.*]] = phi i64 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[IV_NEXT1:%.*]], [[FOR_BODY2]] ] ; CHECK-NEXT: [[GEP_INDICES:%.*]] = getelementptr inbounds i32, ptr [[INDICES]], i64 [[IV1]] ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[GEP_INDICES]], align 4 ; CHECK-NEXT: [[IDXPROM1:%.*]] = zext i32 [[TMP0]] to i64 @@ -352,7 +384,7 @@ define void @histogram_8bit(ptr noalias %buckets, ptr readonly %indices, i64 %N) ; CHECK-NEXT: store i8 [[INC]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP11:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY2]], !llvm.loop [[LOOP12:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -393,7 +425,7 @@ define void @histogram_float(ptr noalias %buckets, ptr readonly %indices, i64 %N ; CHECK-NEXT: 
store float [[INC]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -436,7 +468,7 @@ define void @histogram_varying_increment(ptr noalias %buckets, ptr readonly %ind ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP12]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP13]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -494,7 +526,7 @@ define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP21]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP5]] ; CHECK-NEXT: [[TMP11:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP14:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -512,7 +544,7 @@ define void @simple_histogram_user_interleave(ptr noalias %buckets, ptr readonly ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP15:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -564,7 +596,7 @@ define void @histogram_array_3op_gep(i64 noundef %N) #0 { ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP11]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP16:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -582,7 +614,7 @@ define void @histogram_array_3op_gep(i64 noundef %N) #0 { ; CHECK-NEXT: store i32 [[INC]], ptr [[ARRAYIDX6]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP17:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; 
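The hunks above and below mostly renumber !llvm.loop metadata, since epilogue-vectorizing @histogram_8bit introduces an extra vectorized loop (the new LOOP11). The operation all of these checks revolve around is the histogram intrinsic; for reference, one widened bucket update has roughly this shape (a sketch with assumed value names and vector factor, not a line taken from the tests):

 declare void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr>, i32, <vscale x 4 x i1>)

 ; Compute one bucket address per lane, then increment each selected bucket
 ; by 1: buckets[indices[lane]] += 1 for every lane where %mask is set.
 %gep = getelementptr inbounds i32, ptr %buckets, <vscale x 4 x i64> %idx
 call void @llvm.experimental.vector.histogram.add.nxv4p0.i32(<vscale x 4 x ptr> %gep, i32 1, <vscale x 4 x i1> %mask)
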
@@ -634,7 +666,7 @@ define void @histogram_array_4op_gep_nonzero_const_idx(i64 noundef %N, ptr reado ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv4p0.i32( [[TMP7]], i32 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[IV_NEXT]] = add nuw i64 [[IV]], [[TMP4]] ; CHECK-NEXT: [[TMP8:%.*]] = icmp eq i64 [[IV_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP18:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP8]], label [[MIDDLE_BLOCK:%.*]], label [[FOR_BODY]], !llvm.loop [[LOOP19:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -652,7 +684,7 @@ define void @histogram_array_4op_gep_nonzero_const_idx(i64 noundef %N, ptr reado ; CHECK-NEXT: store i32 [[INC]], ptr [[GEP_BUCKET]], align 4 ; CHECK-NEXT: [[IV_NEXT1]] = add nuw nsw i64 [[IV1]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT1]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP19:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY1]], !llvm.loop [[LOOP20:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -701,13 +733,13 @@ define void @simple_histogram_tailfold(ptr noalias %buckets, ptr readonly %indic ; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], [[TMP1]] ; CHECK-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv4i1.i64(i64 [[INDEX]], i64 [[TMP6]]) ; CHECK-NEXT: [[TMP11:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP20:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP11]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP21:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: br i1 true, label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] ; CHECK: scalar.ph: ; CHECK-NEXT: br label [[FOR_BODY:%.*]] ; CHECK: for.body: -; CHECK-NEXT: br i1 poison, label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP21:![0-9]+]] +; CHECK-NEXT: br i1 poison, label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -774,7 +806,7 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr % ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] ; CHECK-NEXT: [[TMP16:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP22:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -795,7 +827,7 @@ define void @simple_histogram_rtdepcheck(ptr noalias %buckets, ptr %array, ptr % ; CHECK-NEXT: store i32 [[IV_TRUNC]], ptr [[IDX_ADDR]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP23:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; @@ -887,7 +919,7 @@ define void 
@simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i ; CHECK-NEXT: call void @llvm.experimental.vector.histogram.add.nxv2p0.i64( [[TMP6]], i64 1, shufflevector ( insertelement ( poison, i1 true, i64 0), poison, zeroinitializer)) ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP4]] ; CHECK-NEXT: [[TMP7:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] -; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP24:![0-9]+]] +; CHECK-NEXT: br i1 [[TMP7]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] ; CHECK: middle.block: ; CHECK-NEXT: [[CMP_N:%.*]] = icmp eq i64 [[N]], [[N_VEC]] ; CHECK-NEXT: br i1 [[CMP_N]], label [[FOR_EXIT:%.*]], label [[SCALAR_PH]] @@ -904,7 +936,7 @@ define void @simple_histogram_64b(ptr noalias %buckets, ptr readonly %indices, i ; CHECK-NEXT: store i64 [[INC]], ptr [[GEP_BUCKET]], align 4 ; CHECK-NEXT: [[IV_NEXT]] = add nuw nsw i64 [[IV]], 1 ; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[IV_NEXT]], [[N]] -; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP25:![0-9]+]] +; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_EXIT]], label [[FOR_BODY]], !llvm.loop [[LOOP26:![0-9]+]] ; CHECK: for.exit: ; CHECK-NEXT: ret void ; diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll index dec3c286345a..691c743be7d7 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/type-shrinkage-zext-costs.ll @@ -24,25 +24,25 @@ define void @zext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = zext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = zext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2 +; CHECK-NEXT: store 
[[TMP11]], ptr [[TMP12]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] @@ -107,25 +107,25 @@ define void @sext_i8_i16(ptr noalias nocapture readonly %p, ptr noalias nocaptur ; CHECK-NEXT: [[TMP1:%.*]] = zext i32 [[TMP0]] to i64 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 8 +; CHECK-NEXT: [[TMP4:%.*]] = mul i64 [[TMP3]], 16 ; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 8 +; CHECK-NEXT: [[TMP6:%.*]] = mul i64 [[TMP5]], 16 ; CHECK-NEXT: [[N_MOD_VF:%.*]] = urem i64 [[TMP2]], [[TMP6]] ; CHECK-NEXT: [[N_VEC:%.*]] = sub i64 [[TMP2]], [[N_MOD_VF]] ; CHECK-NEXT: [[TMP7:%.*]] = call i64 @llvm.vscale.i64() -; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 8 +; CHECK-NEXT: [[TMP8:%.*]] = mul i64 [[TMP7]], 16 ; CHECK-NEXT: br label [[VECTOR_BODY:%.*]] ; CHECK: vector.body: ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; CHECK-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[INDEX]] -; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 -; CHECK-NEXT: [[TMP10:%.*]] = sext [[WIDE_LOAD]] to -; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) +; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load , ptr [[TMP9]], align 1 +; CHECK-NEXT: [[TMP10:%.*]] = sext [[WIDE_LOAD]] to +; CHECK-NEXT: [[TMP11:%.*]] = add [[TMP10]], trunc ( shufflevector ( insertelement ( poison, i32 2, i64 0), poison, zeroinitializer) to ) ; CHECK-NEXT: [[TMP12:%.*]] = getelementptr inbounds i16, ptr [[Q]], i64 [[INDEX]] -; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2 +; CHECK-NEXT: store [[TMP11]], ptr [[TMP12]], align 2 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP8]] ; CHECK-NEXT: [[TMP13:%.*]] = icmp eq i64 [[INDEX_NEXT]], [[N_VEC]] ; CHECK-NEXT: br i1 [[TMP13]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll index 4a2f9d07ed91..a1a13f1e0c37 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/wider-VF-for-callinst.ll @@ -1,6 +1,6 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py ; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -S | FileCheck %s --check-prefixes=WIDE -; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth-for-vector-calls=false -S | FileCheck %s --check-prefixes=NARROW +; RUN: opt < %s -passes=loop-vectorize,instsimplify -force-vector-interleave=1 -vectorizer-maximize-bandwidth=false -vectorizer-maximize-bandwidth-for-vector-calls=false -S | FileCheck %s --check-prefixes=NARROW target triple = "aarch64-unknown-linux-gnu" -- GitLab From 4a2bd78f5b0d0661c23dff9c4b93a393a49dbf9a Mon Sep 17 00:00:00 2001 From: gxlayer 
<151722229+guoxin049@users.noreply.github.com>
Date: Thu, 17 Oct 2024 20:25:06 +0800
Subject: [PATCH 243/329] [ARM] Fix -mno-omit-leaf-frame-pointer flag doesn't work on 32-bit ARM (#109628)

This change makes the -mno-omit-leaf-frame-pointer flag work on 32-bit
ARM architectures, addressing the bug reported in #108019.
---
 clang/docs/ReleaseNotes.rst | 5 +
 llvm/docs/ReleaseNotes.md | 6 +
 .../llvm/CodeGen/TargetFrameLowering.h | 6 -
 llvm/lib/CodeGen/TargetOptionsImpl.cpp | 8 -
 llvm/lib/Target/ARM/ARMFrameLowering.cpp | 7 +-
 llvm/lib/Target/ARM/ARMFrameLowering.h | 2 +-
 .../CodeGen/ARM/2011-03-15-LdStMultipleBug.ll | 2 +-
 .../CodeGen/ARM/2011-12-19-sjlj-clobber.ll | 2 +-
 llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll | 2 +-
 llvm/test/CodeGen/ARM/atomic-load-store.ll | 56 ++--
 llvm/test/CodeGen/ARM/call-tc.ll | 4 +-
 llvm/test/CodeGen/ARM/debug-frame.ll | 2 +-
 llvm/test/CodeGen/ARM/ehabi.ll | 2 +-
 llvm/test/CodeGen/ARM/fast-isel-frameaddr.ll | 6 +-
 llvm/test/CodeGen/ARM/frame-chain.ll | 11 +-
 llvm/test/CodeGen/ARM/ifcvt5.ll | 2 +-
 llvm/test/CodeGen/ARM/ldrd.ll | 4 +-
 .../CodeGen/ARM/stack-frame-layout-remarks.ll | 9 +-
 llvm/test/CodeGen/ARM/stack-size-section.ll | 2 +-
 llvm/test/CodeGen/ARM/swifterror.ll | 302 +++++++++---------
 llvm/test/CodeGen/ARM/v7k-abi-align.ll | 4 +-
 llvm/test/CodeGen/Thumb/frame-chain.ll | 16 +-
 llvm/test/CodeGen/Thumb2/frame-pointer.ll | 2 +-
 llvm/test/CodeGen/Thumb2/frameless.ll | 4 +-
 llvm/test/CodeGen/Thumb2/frameless2.ll | 2 +-
 llvm/test/CodeGen/Thumb2/machine-licm.ll | 4 +-
 .../Inputs/arm_generated_funcs.ll | 2 +-
 .../arm_generated_funcs.ll.generated.expected | 2 +-
 ...rm_generated_funcs.ll.nogenerated.expected | 2 +-
 29 files changed, 234 insertions(+), 244 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index dc5564b6db11..9977e8bd3ca6 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -612,6 +612,11 @@ X86 Support
 Arm and AArch64 Support
 ^^^^^^^^^^^^^^^^^^^^^^^
 
+- In the ARM target, the frame pointer (FP) of a leaf function can be retained
+  by using the ``-fno-omit-frame-pointer`` option. To eliminate the FP in leaf
+  functions after enabling ``-fno-omit-frame-pointer``, additionally pass the
+  ``-momit-leaf-frame-pointer`` option.
+
 Android Support
 ^^^^^^^^^^^^^^^
 
diff --git a/llvm/docs/ReleaseNotes.md b/llvm/docs/ReleaseNotes.md
index dcdd7a25c7fb..f8bc7e79239b 100644
--- a/llvm/docs/ReleaseNotes.md
+++ b/llvm/docs/ReleaseNotes.md
@@ -125,6 +125,12 @@ Changes to the ARM Backend
   the required alignment space with a sequence of `0x0` bytes (the requested
   fill value) rather than NOPs.
 
+* The default behavior for frame pointers in leaf functions has been updated.
+  When the `-fno-omit-frame-pointer` option is specified, `FPKeepKindStr` is
+  set to `-mframe-pointer=all`, meaning the frame pointer (FP) is now retained
+  in leaf functions by default. To eliminate the frame pointer in leaf functions,
+  you must explicitly use the `-momit-leaf-frame-pointer` option.
+
 Changes to the AVR Backend
 --------------------------
 
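As a concrete illustration of the release-note entries above, consider a leaf function in LLVM IR (a minimal sketch, not taken from the patch; the function, value names, and triple are assumed). Before this change the ARM backend always omitted the frame pointer for such a function; with it, llc honors the attribute that -fno-omit-frame-pointer maps to:

 ; leaf.ll -- sketch; try: llc -mtriple=armv7-linux-gnueabihf leaf.ll
 ; "frame-pointer"="all" (the IR form of -fno-omit-frame-pointer) now keeps
 ; the frame pointer even though @leaf makes no calls.
 define i32 @leaf(i32 %x) "frame-pointer"="all" {
   %r = add i32 %x, 4
   ret i32 %r
 }
 ; With "frame-pointer"="non-leaf" instead (the effect of adding
 ; -momit-leaf-frame-pointer), the frame pointer may still be omitted here.
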
diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
index 3df9e56db38a..9882d8511875 100644
--- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h
+++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h
@@ -277,12 +277,6 @@ public:
     return false;
   }
 
-  /// Return true if the target wants to keep the frame pointer regardless of
-  /// the function attribute "frame-pointer".
-  virtual bool keepFramePointer(const MachineFunction &MF) const {
-    return false;
-  }
-
   /// hasFP - Return true if the specified function should have a dedicated
   /// frame pointer register. For most targets this is true only if the function
   /// has variable sized allocas or if frame pointer elimination is disabled.
diff --git a/llvm/lib/CodeGen/TargetOptionsImpl.cpp b/llvm/lib/CodeGen/TargetOptionsImpl.cpp
index 5bf1d265092f..01ffaed585ea 100644
--- a/llvm/lib/CodeGen/TargetOptionsImpl.cpp
+++ b/llvm/lib/CodeGen/TargetOptionsImpl.cpp
@@ -22,10 +22,6 @@ using namespace llvm;
 /// DisableFramePointerElim - This returns true if frame pointer elimination
 /// optimization should be disabled for the given machine function.
 bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {
-  // Check to see if the target want to forcibly keep frame pointer.
-  if (MF.getSubtarget().getFrameLowering()->keepFramePointer(MF))
-    return true;
-
   const Function &F = MF.getFunction();
 
   if (!F.hasFnAttribute("frame-pointer"))
@@ -41,10 +37,6 @@ bool TargetOptions::DisableFramePointerElim(const MachineFunction &MF) const {
 }
 
 bool TargetOptions::FramePointerIsReserved(const MachineFunction &MF) const {
-  // Check to see if the target want to forcibly keep frame pointer.
-  if (MF.getSubtarget().getFrameLowering()->keepFramePointer(MF))
-    return true;
-
   const Function &F = MF.getFunction();
 
   if (!F.hasFnAttribute("frame-pointer"))
diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
index 57e2d5525a1a..2706efa83fc3 100644
--- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp
@@ -330,6 +330,10 @@ bool ARMFrameLowering::hasFP(const MachineFunction &MF) const {
   const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo();
   const MachineFrameInfo &MFI = MF.getFrameInfo();
 
+  // Check to see if the target wants to forcibly keep the frame pointer.
+  if (keepFramePointer(MF))
+    return true;
+
   // ABI-required frame pointer.
   if (MF.getTarget().Options.DisableFramePointerElim(MF))
     return true;
@@ -2403,7 +2407,8 @@ void ARMFrameLowering::determineCalleeSaves(MachineFunction &MF,
   // to take advantage the eliminateFrameIndex machinery. This also ensures it
   // is spilled in the order specified by getCalleeSavedRegs() to make it easier
   // to combine multiple loads / stores.
- bool CanEliminateFrame = !(requiresAAPCSFrameRecord(MF) && hasFP(MF)); + bool CanEliminateFrame = !(requiresAAPCSFrameRecord(MF) && hasFP(MF)) && + !MF.getTarget().Options.DisableFramePointerElim(MF); bool CS1Spilled = false; bool LRSpilled = false; unsigned NumGPRSpills = 0; diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h index 3f55884f80a4..3c5bc00cb449 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -41,7 +41,7 @@ public: MutableArrayRef CSI, const TargetRegisterInfo *TRI) const override; - bool keepFramePointer(const MachineFunction &MF) const override; + bool keepFramePointer(const MachineFunction &MF) const; bool enableCalleeSaveSkip(const MachineFunction &MF) const override; diff --git a/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll b/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll index 233230e416c1..71a1678669a8 100644 --- a/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll +++ b/llvm/test/CodeGen/ARM/2011-03-15-LdStMultipleBug.ll @@ -9,7 +9,7 @@ @oStruct = external global %struct.Outer, align 4 -define void @main(i8 %val8) nounwind { +define void @main(i8 %val8) nounwind "frame-pointer"="none" { ; CHECK-LABEL: main: ; CHECK: @ %bb.0: @ %for.body.lr.ph ; CHECK-NEXT: movw r0, :lower16:(L_oStruct$non_lazy_ptr-(LPC0_0+4)) diff --git a/llvm/test/CodeGen/ARM/2011-12-19-sjlj-clobber.ll b/llvm/test/CodeGen/ARM/2011-12-19-sjlj-clobber.ll index 6728b9d4584c..af6ccdc8f4e1 100644 --- a/llvm/test/CodeGen/ARM/2011-12-19-sjlj-clobber.ll +++ b/llvm/test/CodeGen/ARM/2011-12-19-sjlj-clobber.ll @@ -3,7 +3,7 @@ ; Radar 10567930: Make sure that all the caller-saved registers are saved and ; restored in a function with setjmp/longjmp EH. In particular, r6 was not ; being saved here. -; CHECK: push {r4, r5, r6, r7, lr} +; CHECK: push.w {r4, r5, r6, r7, r8, r10, r11, lr} %0 = type opaque %struct.NSConstantString = type { ptr, i32, ptr, i32 } diff --git a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll index aa79e4156dac..b6adc995091c 100644 --- a/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll +++ b/llvm/test/CodeGen/ARM/arm-shrink-wrapping.ll @@ -1732,7 +1732,7 @@ if.end: ; Another infinite loop test this time with two nested infinite loop. 
; infiniteloop3 ; bx lr -define void @infiniteloop3() "frame-pointer"="all" { +define void @infiniteloop3() "frame-pointer"="none" { ; ARM-LABEL: infiniteloop3: ; ARM: @ %bb.0: @ %entry ; ARM-NEXT: mov r0, #0 diff --git a/llvm/test/CodeGen/ARM/atomic-load-store.ll b/llvm/test/CodeGen/ARM/atomic-load-store.ll index 14e49bf3c937..560dfde356c2 100644 --- a/llvm/test/CodeGen/ARM/atomic-load-store.ll +++ b/llvm/test/CodeGen/ARM/atomic-load-store.ll @@ -324,18 +324,17 @@ define void @test_old_store_64bit(ptr %p, i64 %v) { ; ; ARMOPTNONE-LABEL: test_old_store_64bit: ; ARMOPTNONE: @ %bb.0: -; ARMOPTNONE-NEXT: push {r4, r5, r7, lr} -; ARMOPTNONE-NEXT: add r7, sp, #8 -; ARMOPTNONE-NEXT: push {r8, r10, r11} -; ARMOPTNONE-NEXT: sub sp, sp, #24 -; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill -; ARMOPTNONE-NEXT: dmb ish -; ARMOPTNONE-NEXT: ldr r1, [r0] -; ARMOPTNONE-NEXT: ldr r0, [r0, #4] -; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill +; ARMOPTNONE-NEXT: push {r4, r5, r7, r8, r10, r11, lr} +; ARMOPTNONE-NEXT: add r7, sp, #20 +; ARMOPTNONE-NEXT: sub sp, sp, #24 +; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: ldr r1, [r0] +; ARMOPTNONE-NEXT: ldr r0, [r0, #4] +; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill ; ARMOPTNONE-NEXT: b LBB5_1 ; ARMOPTNONE-NEXT: LBB5_1: @ %atomicrmw.start ; ARMOPTNONE-NEXT: @ =>This Loop Header: Depth=1 @@ -382,8 +381,7 @@ define void @test_old_store_64bit(ptr %p, i64 %v) { ; ARMOPTNONE-NEXT: LBB5_5: @ %atomicrmw.end ; ARMOPTNONE-NEXT: dmb ish ; ARMOPTNONE-NEXT: sub sp, r7, #20 -; ARMOPTNONE-NEXT: pop {r8, r10, r11} -; ARMOPTNONE-NEXT: pop {r4, r5, r7, pc} +; ARMOPTNONE-NEXT: pop {r4, r5, r7, r8, r10, r11, pc} ; ; THUMBTWO-LABEL: test_old_store_64bit: ; THUMBTWO: @ %bb.0: @@ -864,20 +862,19 @@ define void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { ; ; ARMOPTNONE-LABEL: store_atomic_f64__seq_cst: ; ARMOPTNONE: @ %bb.0: -; ARMOPTNONE-NEXT: push {r4, r5, r7, lr} -; ARMOPTNONE-NEXT: add r7, sp, #8 -; ARMOPTNONE-NEXT: push {r8, r10, r11} -; ARMOPTNONE-NEXT: sub sp, sp, #24 -; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill -; ARMOPTNONE-NEXT: vmov d16, r1, r2 -; ARMOPTNONE-NEXT: vmov r1, r2, d16 -; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill -; ARMOPTNONE-NEXT: dmb ish -; ARMOPTNONE-NEXT: ldr r1, [r0] -; ARMOPTNONE-NEXT: ldr r0, [r0, #4] -; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill -; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill +; ARMOPTNONE-NEXT: push {r4, r5, r7, r8, r10, r11, lr} +; ARMOPTNONE-NEXT: add r7, sp, #20 +; ARMOPTNONE-NEXT: sub sp, sp, #24 +; ARMOPTNONE-NEXT: str r0, [sp, #4] @ 4-byte Spill +; ARMOPTNONE-NEXT: vmov d16, r1, r2 +; ARMOPTNONE-NEXT: vmov r1, r2, d16 +; ARMOPTNONE-NEXT: str r2, [sp, #8] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r1, [sp, #12] @ 4-byte Spill +; ARMOPTNONE-NEXT: dmb ish +; ARMOPTNONE-NEXT: ldr r1, [r0] +; ARMOPTNONE-NEXT: ldr r0, [r0, #4] +; ARMOPTNONE-NEXT: str r1, [sp, #16] @ 4-byte Spill +; ARMOPTNONE-NEXT: str r0, [sp, #20] @ 4-byte Spill ; ARMOPTNONE-NEXT: b LBB13_1 ; ARMOPTNONE-NEXT: LBB13_1: @ %atomicrmw.start ; ARMOPTNONE-NEXT: @ =>This Loop Header: Depth=1 @@ -924,8 +921,7 @@ define 
void @store_atomic_f64__seq_cst(ptr %ptr, double %val1) { ; ARMOPTNONE-NEXT: LBB13_5: @ %atomicrmw.end ; ARMOPTNONE-NEXT: dmb ish ; ARMOPTNONE-NEXT: sub sp, r7, #20 -; ARMOPTNONE-NEXT: pop {r8, r10, r11} -; ARMOPTNONE-NEXT: pop {r4, r5, r7, pc} +; ARMOPTNONE-NEXT: pop {r4, r5, r7, r8, r10, r11, pc} ; ; THUMBTWO-LABEL: store_atomic_f64__seq_cst: ; THUMBTWO: @ %bb.0: diff --git a/llvm/test/CodeGen/ARM/call-tc.ll b/llvm/test/CodeGen/ARM/call-tc.ll index 18d83bdc03e2..9c70bac0322f 100644 --- a/llvm/test/CodeGen/ARM/call-tc.ll +++ b/llvm/test/CodeGen/ARM/call-tc.ll @@ -17,7 +17,7 @@ define void @t1() "frame-pointer"="all" { ret void } -define void @t2() "frame-pointer"="all" { +define void @t2() "frame-pointer"="none" { ; CHECKV6-LABEL: t2: ; CHECKV6: bx r0 ; CHECKT2D-LABEL: t2: @@ -102,7 +102,7 @@ bb: ; Make sure codegenprep is duplicating ret instructions to enable tail calls. ; rdar://11140249 -define i32 @t8(i32 %x) nounwind ssp "frame-pointer"="all" { +define i32 @t8(i32 %x) nounwind ssp "frame-pointer"="none" { entry: ; CHECKT2D-LABEL: t8: ; CHECKT2D-NOT: push diff --git a/llvm/test/CodeGen/ARM/debug-frame.ll b/llvm/test/CodeGen/ARM/debug-frame.ll index faeafdf45dc3..72e7cfcab487 100644 --- a/llvm/test/CodeGen/ARM/debug-frame.ll +++ b/llvm/test/CodeGen/ARM/debug-frame.ll @@ -526,7 +526,7 @@ entry: ; Test 4 ;------------------------------------------------------------------------------- -define void @test4() nounwind { +define void @test4() nounwind "frame-pointer"="none" { entry: ret void } diff --git a/llvm/test/CodeGen/ARM/ehabi.ll b/llvm/test/CodeGen/ARM/ehabi.ll index fea497076030..d1a4e9a6bcca 100644 --- a/llvm/test/CodeGen/ARM/ehabi.ll +++ b/llvm/test/CodeGen/ARM/ehabi.ll @@ -575,7 +575,7 @@ entry: ; Test 4 ;------------------------------------------------------------------------------- -define void @test4() nounwind { +define void @test4() nounwind "frame-pointer"="none" { entry: ret void } diff --git a/llvm/test/CodeGen/ARM/fast-isel-frameaddr.ll b/llvm/test/CodeGen/ARM/fast-isel-frameaddr.ll index e29ddd52f3d0..8a7bfbe42900 100644 --- a/llvm/test/CodeGen/ARM/fast-isel-frameaddr.ll +++ b/llvm/test/CodeGen/ARM/fast-isel-frameaddr.ll @@ -16,7 +16,7 @@ entry: ; DARWIN-THUMB2: mov r0, r7 ; LINUX-ARM-LABEL: frameaddr_index0: -; LINUX-ARM: push {r11, lr} +; LINUX-ARM: push {r11} ; LINUX-ARM: mov r11, sp ; LINUX-ARM: mov r0, r11 @@ -42,7 +42,7 @@ entry: ; DARWIN-THUMB2: ldr r0, [r7] ; LINUX-ARM-LABEL: frameaddr_index1: -; LINUX-ARM: push {r11, lr} +; LINUX-ARM: push {r11} ; LINUX-ARM: mov r11, sp ; LINUX-ARM: ldr r0, [r11] @@ -73,7 +73,7 @@ entry: ; DARWIN-THUMB2: ldr r0, [r0] ; LINUX-ARM-LABEL: frameaddr_index3: -; LINUX-ARM: push {r11, lr} +; LINUX-ARM: push {r11} ; LINUX-ARM: mov r11, sp ; LINUX-ARM: ldr r0, [r11] ; LINUX-ARM: ldr r0, [r0] diff --git a/llvm/test/CodeGen/ARM/frame-chain.ll b/llvm/test/CodeGen/ARM/frame-chain.ll index e37213e4aaf8..7b722cd5fcef 100644 --- a/llvm/test/CodeGen/ARM/frame-chain.ll +++ b/llvm/test/CodeGen/ARM/frame-chain.ll @@ -10,11 +10,14 @@ define dso_local noundef i32 @leaf(i32 noundef %0) { ; LEAF-FP-LABEL: leaf: ; LEAF-FP: @ %bb.0: -; LEAF-FP-NEXT: .pad #4 -; LEAF-FP-NEXT: sub sp, sp, #4 -; LEAF-FP-NEXT: str r0, [sp] +; LEAF-FP-NEXT: .save {r11, lr} +; LEAF-FP-NEXT: push {r11, lr} +; LEAF-FP-NEXT: .setfp r11, sp +; LEAF-FP-NEXT: mov r11, sp +; LEAF-FP-NEXT: push {r0} ; LEAF-FP-NEXT: add r0, r0, #4 -; LEAF-FP-NEXT: add sp, sp, #4 +; LEAF-FP-NEXT: mov sp, r11 +; LEAF-FP-NEXT: pop {r11, lr} ; LEAF-FP-NEXT: mov pc, lr ; ; LEAF-FP-AAPCS-LABEL: leaf: diff 
--git a/llvm/test/CodeGen/ARM/ifcvt5.ll b/llvm/test/CodeGen/ARM/ifcvt5.ll index dc9a3400b691..30a92eb34989 100644 --- a/llvm/test/CodeGen/ARM/ifcvt5.ll +++ b/llvm/test/CodeGen/ARM/ifcvt5.ll @@ -5,7 +5,7 @@ @x = external global ptr ; [#uses=1] -define void @foo(i32 %a) "frame-pointer"="all" { +define void @foo(i32 %a) "frame-pointer"="none" { ; A8-LABEL: foo: ; A8: @ %bb.0: @ %entry ; A8-NEXT: movw r1, :lower16:(L_x$non_lazy_ptr-(LPC0_0+8)) diff --git a/llvm/test/CodeGen/ARM/ldrd.ll b/llvm/test/CodeGen/ARM/ldrd.ll index cf5c2dfe5ef6..3cf10f0e64b4 100644 --- a/llvm/test/CodeGen/ARM/ldrd.ll +++ b/llvm/test/CodeGen/ARM/ldrd.ll @@ -168,7 +168,7 @@ define void @ldrd_postupdate_inc(ptr %p0) "frame-pointer"="all" { ; NORMAL: strd r1, r2, [r0], #-8 ; CONSERVATIVE-NOT: strd ; CHECK: bx lr -define ptr @strd_postupdate_dec(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="all" { +define ptr @strd_postupdate_dec(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="none" { %p0.1 = getelementptr i32, ptr %p0, i32 1 store i32 %v0, ptr %p0 store i32 %v1, ptr %p0.1 @@ -180,7 +180,7 @@ define ptr @strd_postupdate_dec(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="all" ; NORMAL: strd r1, r2, [r0], #8 ; CONSERVATIVE-NOT: strd ; CHECK: bx lr -define ptr @strd_postupdate_inc(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="all" { +define ptr @strd_postupdate_inc(ptr %p0, i32 %v0, i32 %v1) "frame-pointer"="none" { %p0.1 = getelementptr i32, ptr %p0, i32 1 store i32 %v0, ptr %p0 store i32 %v1, ptr %p0.1 diff --git a/llvm/test/CodeGen/ARM/stack-frame-layout-remarks.ll b/llvm/test/CodeGen/ARM/stack-frame-layout-remarks.ll index c76dc24bae7e..ea059e49c9f9 100644 --- a/llvm/test/CodeGen/ARM/stack-frame-layout-remarks.ll +++ b/llvm/test/CodeGen/ARM/stack-frame-layout-remarks.ll @@ -51,7 +51,7 @@ declare void @llvm.dbg.declare(metadata, metadata, metadata) #0 ; BOTH: [SP-8]{{.+}}8{{.+}}4 ; DEBUG: a @ dot.c:13 ; STRIPPED-NOT: a @ dot.c:13 -define void @cleanup_array(ptr %0) #1 { +define void @cleanup_array(ptr %0) #3 { %2 = alloca ptr, align 8 store ptr %0, ptr %2, align 8 call void @llvm.dbg.declare(metadata ptr %2, metadata !41, metadata !DIExpression()), !dbg !46 @@ -62,7 +62,7 @@ define void @cleanup_array(ptr %0) #1 { ; BOTH: [SP-8]{{.+}}8{{.+}}4 ; DEBUG: res @ dot.c:21 ; STRIPPED-NOT: res @ dot.c:21 -define void @cleanup_result(ptr %0) #1 { +define void @cleanup_result(ptr %0) #3 { %2 = alloca ptr, align 8 store ptr %0, ptr %2, align 8 call void @llvm.dbg.declare(metadata ptr %2, metadata !47, metadata !DIExpression()), !dbg !51 @@ -92,7 +92,7 @@ define void @cleanup_result(ptr %0) #1 { ; BOTH: [SP-40]{{.+}}4{{.+}}4 ; DEBUG: i @ dot.c:55 ; STRIPPED-NOT: i @ dot.c:55 -define i32 @do_work(ptr %0, ptr %1, ptr %2) #1 { +define i32 @do_work(ptr %0, ptr %1, ptr %2) #3 { %4 = alloca i32, align 4 %5 = alloca ptr, align 8 %6 = alloca ptr, align 8 @@ -144,7 +144,7 @@ define i32 @do_work(ptr %0, ptr %1, ptr %2) #1 { ; BOTH: [SP-20]{{.+}}4{{.*}}4 ; DEBUG: i @ dot.c:69 ; STRIPPED-NOT: i @ dot.c:69 -define ptr @gen_array(i32 %0) #1 { +define ptr @gen_array(i32 %0) #3 { %2 = alloca ptr, align 8 %3 = alloca i32, align 4 %4 = alloca ptr, align 8 @@ -227,6 +227,7 @@ uselistorder ptr @llvm.dbg.declare, { 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, attributes #0 = { nocallback nofree nosync nounwind readnone speculatable willreturn } attributes #1 = { "frame-pointer"="all" } attributes #2 = { ssp "stack-protector-buffer-size"="5" "frame-pointer"="all" } +attributes #3 = { "frame-pointer"="none" } !llvm.dbg.cu = !{!0, !2} !llvm.module.flags = !{!18, !19, !20, 
!21, !22, !23, !24} diff --git a/llvm/test/CodeGen/ARM/stack-size-section.ll b/llvm/test/CodeGen/ARM/stack-size-section.ll index fb23e358d856..8272389719a6 100644 --- a/llvm/test/CodeGen/ARM/stack-size-section.ll +++ b/llvm/test/CodeGen/ARM/stack-size-section.ll @@ -29,4 +29,4 @@ define void @dynalloc(i32 %N) #0 { ret void } -attributes #0 = { "frame-pointer"="all" } +attributes #0 = { "frame-pointer"="none" } diff --git a/llvm/test/CodeGen/ARM/swifterror.ll b/llvm/test/CodeGen/ARM/swifterror.ll index 4f950ba68760..f002c54fc60c 100644 --- a/llvm/test/CodeGen/ARM/swifterror.ll +++ b/llvm/test/CodeGen/ARM/swifterror.ll @@ -79,18 +79,17 @@ define float @caller(ptr %error_ref) { ; ; CHECK-O0-LABEL: caller: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r8} -; CHECK-O0-NEXT: sub sp, sp, #12 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #12 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: bl _foo -; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill -; CHECK-O0-NEXT: movw r0, #0 -; CHECK-O0-NEXT: cmp r8, r0 -; CHECK-O0-NEXT: bne LBB1_2 +; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: bl _foo +; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: movw r0, #0 +; CHECK-O0-NEXT: cmp r8, r0 +; CHECK-O0-NEXT: bne LBB1_2 ; CHECK-O0-NEXT: @ %bb.1: @ %cont ; CHECK-O0-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload @@ -101,8 +100,7 @@ define float @caller(ptr %error_ref) { ; CHECK-O0-NEXT: bl _free ; CHECK-O0-NEXT: mov r0, #1065353216 ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r8} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: caller: ; CHECK-ANDROID: @ %bb.0: @ %entry @@ -176,12 +174,11 @@ define float @caller2(ptr %error_ref) { ; ; CHECK-O0-LABEL: caller2: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r8} -; CHECK-O0-NEXT: sub sp, sp, #16 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #16 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-O0-NEXT: LBB2_1: @ %bb_loop ; CHECK-O0-NEXT: @ =>This Inner Loop Header: Depth=1 ; CHECK-O0-NEXT: mov r8, #0 @@ -209,8 +206,7 @@ define float @caller2(ptr %error_ref) { ; CHECK-O0-NEXT: bl _free ; CHECK-O0-NEXT: mov r0, #1065353216 ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r8} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: caller2: ; CHECK-ANDROID: @ %bb.0: @ %entry @@ -585,21 +581,20 @@ define float @caller3(ptr %error_ref) { ; ; CHECK-O0-LABEL: caller3: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r8} -; CHECK-O0-NEXT: sub sp, sp, #44 -; CHECK-O0-NEXT: bfc sp, #0, #3 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #44 +; CHECK-O0-NEXT: bfc sp, #0, #3 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: add r0, sp, #16 -; CHECK-O0-NEXT: mov r1, #1 -; CHECK-O0-NEXT: bl _foo_sret -; CHECK-O0-NEXT: str r8, [sp, #8] @ 4-byte Spill -; CHECK-O0-NEXT: movw r0, #0 -; 
CHECK-O0-NEXT: cmp r8, r0 -; CHECK-O0-NEXT: bne LBB6_2 +; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: add r0, sp, #16 +; CHECK-O0-NEXT: mov r1, #1 +; CHECK-O0-NEXT: bl _foo_sret +; CHECK-O0-NEXT: str r8, [sp, #8] @ 4-byte Spill +; CHECK-O0-NEXT: movw r0, #0 +; CHECK-O0-NEXT: cmp r8, r0 +; CHECK-O0-NEXT: bne LBB6_2 ; CHECK-O0-NEXT: @ %bb.1: @ %cont ; CHECK-O0-NEXT: ldr r1, [sp, #4] @ 4-byte Reload ; CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload @@ -610,8 +605,7 @@ define float @caller3(ptr %error_ref) { ; CHECK-O0-NEXT: bl _free ; CHECK-O0-NEXT: mov r0, #1065353216 ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r8} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: caller3: ; CHECK-ANDROID: @ %bb.0: @ %entry @@ -809,27 +803,26 @@ define float @caller4(ptr %error_ref) { ; ; CHECK-O0-LABEL: caller4: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r8} -; CHECK-O0-NEXT: sub sp, sp, #24 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #24 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: mov r0, #10 -; CHECK-O0-NEXT: str r0, [r7, #-12] -; CHECK-O0-NEXT: mov r0, #11 -; CHECK-O0-NEXT: str r0, [sp, #12] -; CHECK-O0-NEXT: mov r0, #12 -; CHECK-O0-NEXT: str r0, [sp, #8] -; CHECK-O0-NEXT: ldr r0, [r7, #-12] -; CHECK-O0-NEXT: ldr r1, [sp, #12] -; CHECK-O0-NEXT: ldr r2, [sp, #8] -; CHECK-O0-NEXT: bl _foo_vararg -; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill -; CHECK-O0-NEXT: movw r0, #0 -; CHECK-O0-NEXT: cmp r8, r0 -; CHECK-O0-NEXT: bne LBB8_2 +; CHECK-O0-NEXT: str r0, [sp] @ 4-byte Spill +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: mov r0, #10 +; CHECK-O0-NEXT: str r0, [r7, #-12] +; CHECK-O0-NEXT: mov r0, #11 +; CHECK-O0-NEXT: str r0, [sp, #12] +; CHECK-O0-NEXT: mov r0, #12 +; CHECK-O0-NEXT: str r0, [sp, #8] +; CHECK-O0-NEXT: ldr r0, [r7, #-12] +; CHECK-O0-NEXT: ldr r1, [sp, #12] +; CHECK-O0-NEXT: ldr r2, [sp, #8] +; CHECK-O0-NEXT: bl _foo_vararg +; CHECK-O0-NEXT: str r8, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: movw r0, #0 +; CHECK-O0-NEXT: cmp r8, r0 +; CHECK-O0-NEXT: bne LBB8_2 ; CHECK-O0-NEXT: @ %bb.1: @ %cont ; CHECK-O0-NEXT: ldr r1, [sp] @ 4-byte Reload ; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload @@ -840,8 +833,7 @@ define float @caller4(ptr %error_ref) { ; CHECK-O0-NEXT: bl _free ; CHECK-O0-NEXT: mov r0, #1065353216 ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r8} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: caller4: ; CHECK-ANDROID: @ %bb.0: @ %entry @@ -995,14 +987,12 @@ define swiftcc void @swifterror_reg_clobber(ptr nocapture %err) { ; ; CHECK-O0-LABEL: swifterror_reg_clobber: ; CHECK-O0: @ %bb.0: -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r8} +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 ; CHECK-O0-NEXT: @ InlineAsm Start ; CHECK-O0-NEXT: nop ; CHECK-O0-NEXT: @ InlineAsm End -; CHECK-O0-NEXT: pop {r8} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: swifterror_reg_clobber: ; CHECK-ANDROID: @ %bb.0: @@ -1048,36 +1038,34 @@ define swiftcc void @params_in_reg(i32, i32, i32, i32, ptr swiftself, ptr nocapt ; ; CHECK-O0-LABEL: params_in_reg: ; CHECK-O0: @ %bb.0: -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; 
CHECK-O0-NEXT: push {r10} -; CHECK-O0-NEXT: sub sp, sp, #28 -; CHECK-O0-NEXT: bfc sp, #0, #3 -; CHECK-O0-NEXT: str r8, [sp, #20] @ 4-byte Spill -; CHECK-O0-NEXT: str r10, [sp] @ 4-byte Spill -; CHECK-O0-NEXT: str r3, [sp, #16] @ 4-byte Spill -; CHECK-O0-NEXT: str r2, [sp, #12] @ 4-byte Spill -; CHECK-O0-NEXT: str r1, [sp, #8] @ 4-byte Spill -; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: push {r7, r10, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #28 +; CHECK-O0-NEXT: bfc sp, #0, #3 +; CHECK-O0-NEXT: str r8, [sp, #20] @ 4-byte Spill +; CHECK-O0-NEXT: str r10, [sp] @ 4-byte Spill +; CHECK-O0-NEXT: str r3, [sp, #16] @ 4-byte Spill +; CHECK-O0-NEXT: str r2, [sp, #12] @ 4-byte Spill +; CHECK-O0-NEXT: str r1, [sp, #8] @ 4-byte Spill +; CHECK-O0-NEXT: str r0, [sp, #4] @ 4-byte Spill ; CHECK-O0-NEXT: @ implicit-def: $r0 -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: mov r0, #1 -; CHECK-O0-NEXT: mov r1, #2 -; CHECK-O0-NEXT: mov r2, #3 -; CHECK-O0-NEXT: mov r3, #4 -; CHECK-O0-NEXT: mov r10, r8 -; CHECK-O0-NEXT: bl _params_in_reg2 -; CHECK-O0-NEXT: ldr r10, [sp] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r1, [sp, #8] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r2, [sp, #12] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r3, [sp, #16] @ 4-byte Reload -; CHECK-O0-NEXT: mov r9, r8 -; CHECK-O0-NEXT: ldr r8, [sp, #20] @ 4-byte Reload -; CHECK-O0-NEXT: bl _params_in_reg2 -; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r10} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: mov r0, #1 +; CHECK-O0-NEXT: mov r1, #2 +; CHECK-O0-NEXT: mov r2, #3 +; CHECK-O0-NEXT: mov r3, #4 +; CHECK-O0-NEXT: mov r10, r8 +; CHECK-O0-NEXT: bl _params_in_reg2 +; CHECK-O0-NEXT: ldr r10, [sp] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r0, [sp, #4] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r1, [sp, #8] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r2, [sp, #12] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r3, [sp, #16] @ 4-byte Reload +; CHECK-O0-NEXT: mov r9, r8 +; CHECK-O0-NEXT: ldr r8, [sp, #20] @ 4-byte Reload +; CHECK-O0-NEXT: bl _params_in_reg2 +; CHECK-O0-NEXT: sub sp, r7, #4 +; CHECK-O0-NEXT: pop {r7, r10, pc} ; ; CHECK-ANDROID-LABEL: params_in_reg: ; CHECK-ANDROID: @ %bb.0: @@ -1165,65 +1153,63 @@ define swiftcc { i32, i32, i32, i32} @params_and_return_in_reg(i32, i32, i32, i3 ; ; CHECK-O0-LABEL: params_and_return_in_reg: ; CHECK-O0: @ %bb.0: -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r10} -; CHECK-O0-NEXT: sub sp, sp, #76 -; CHECK-O0-NEXT: bfc sp, #0, #3 -; CHECK-O0-NEXT: str r8, [sp, #24] @ 4-byte Spill -; CHECK-O0-NEXT: str r10, [sp, #4] @ 4-byte Spill -; CHECK-O0-NEXT: str r3, [sp, #20] @ 4-byte Spill -; CHECK-O0-NEXT: str r2, [sp, #16] @ 4-byte Spill -; CHECK-O0-NEXT: str r1, [sp, #12] @ 4-byte Spill -; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill +; CHECK-O0-NEXT: push {r7, r10, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #76 +; CHECK-O0-NEXT: bfc sp, #0, #3 +; CHECK-O0-NEXT: str r8, [sp, #24] @ 4-byte Spill +; CHECK-O0-NEXT: str r10, [sp, #4] @ 4-byte Spill +; CHECK-O0-NEXT: str r3, [sp, #20] @ 4-byte Spill +; CHECK-O0-NEXT: str r2, [sp, #16] @ 4-byte Spill +; CHECK-O0-NEXT: str r1, [sp, #12] @ 4-byte Spill +; CHECK-O0-NEXT: str r0, [sp, #8] @ 4-byte Spill ; CHECK-O0-NEXT: @ implicit-def: $r0 -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: str r8, [sp, #28] @ 4-byte Spill -; CHECK-O0-NEXT: mov r0, #1 -; CHECK-O0-NEXT: str r0, [sp, #32] @ 4-byte Spill -; CHECK-O0-NEXT: 
mov r1, #2 -; CHECK-O0-NEXT: str r1, [sp, #36] @ 4-byte Spill -; CHECK-O0-NEXT: mov r2, #3 -; CHECK-O0-NEXT: str r2, [sp, #40] @ 4-byte Spill -; CHECK-O0-NEXT: mov r3, #4 -; CHECK-O0-NEXT: str r3, [sp, #44] @ 4-byte Spill -; CHECK-O0-NEXT: mov r10, r8 -; CHECK-O0-NEXT: bl _params_in_reg2 -; CHECK-O0-NEXT: ldr r10, [sp, #4] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r1, [sp, #12] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r2, [sp, #16] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r3, [sp, #20] @ 4-byte Reload -; CHECK-O0-NEXT: mov r9, r8 -; CHECK-O0-NEXT: ldr r8, [sp, #24] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #48] @ 4-byte Spill -; CHECK-O0-NEXT: bl _params_and_return_in_reg2 -; CHECK-O0-NEXT: ldr r10, [sp, #28] @ 4-byte Reload -; CHECK-O0-NEXT: mov r9, r0 -; CHECK-O0-NEXT: ldr r0, [sp, #32] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #52] @ 4-byte Spill -; CHECK-O0-NEXT: mov r9, r1 -; CHECK-O0-NEXT: ldr r1, [sp, #36] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #56] @ 4-byte Spill -; CHECK-O0-NEXT: mov r9, r2 -; CHECK-O0-NEXT: ldr r2, [sp, #40] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #60] @ 4-byte Spill -; CHECK-O0-NEXT: mov r9, r3 -; CHECK-O0-NEXT: ldr r3, [sp, #44] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #64] @ 4-byte Spill -; CHECK-O0-NEXT: mov r9, r8 -; CHECK-O0-NEXT: ldr r8, [sp, #48] @ 4-byte Reload -; CHECK-O0-NEXT: str r9, [sp, #68] @ 4-byte Spill -; CHECK-O0-NEXT: bl _params_in_reg2 -; CHECK-O0-NEXT: ldr r0, [sp, #52] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r1, [sp, #56] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r2, [sp, #60] @ 4-byte Reload -; CHECK-O0-NEXT: ldr r3, [sp, #64] @ 4-byte Reload -; CHECK-O0-NEXT: mov r9, r8 -; CHECK-O0-NEXT: ldr r8, [sp, #68] @ 4-byte Reload -; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r10} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: str r8, [sp, #28] @ 4-byte Spill +; CHECK-O0-NEXT: mov r0, #1 +; CHECK-O0-NEXT: str r0, [sp, #32] @ 4-byte Spill +; CHECK-O0-NEXT: mov r1, #2 +; CHECK-O0-NEXT: str r1, [sp, #36] @ 4-byte Spill +; CHECK-O0-NEXT: mov r2, #3 +; CHECK-O0-NEXT: str r2, [sp, #40] @ 4-byte Spill +; CHECK-O0-NEXT: mov r3, #4 +; CHECK-O0-NEXT: str r3, [sp, #44] @ 4-byte Spill +; CHECK-O0-NEXT: mov r10, r8 +; CHECK-O0-NEXT: bl _params_in_reg2 +; CHECK-O0-NEXT: ldr r10, [sp, #4] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r0, [sp, #8] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r1, [sp, #12] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r2, [sp, #16] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r3, [sp, #20] @ 4-byte Reload +; CHECK-O0-NEXT: mov r9, r8 +; CHECK-O0-NEXT: ldr r8, [sp, #24] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #48] @ 4-byte Spill +; CHECK-O0-NEXT: bl _params_and_return_in_reg2 +; CHECK-O0-NEXT: ldr r10, [sp, #28] @ 4-byte Reload +; CHECK-O0-NEXT: mov r9, r0 +; CHECK-O0-NEXT: ldr r0, [sp, #32] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #52] @ 4-byte Spill +; CHECK-O0-NEXT: mov r9, r1 +; CHECK-O0-NEXT: ldr r1, [sp, #36] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #56] @ 4-byte Spill +; CHECK-O0-NEXT: mov r9, r2 +; CHECK-O0-NEXT: ldr r2, [sp, #40] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #60] @ 4-byte Spill +; CHECK-O0-NEXT: mov r9, r3 +; CHECK-O0-NEXT: ldr r3, [sp, #44] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #64] @ 4-byte Spill +; CHECK-O0-NEXT: mov r9, r8 +; CHECK-O0-NEXT: ldr r8, [sp, #48] @ 4-byte Reload +; CHECK-O0-NEXT: str r9, [sp, #68] @ 4-byte Spill +; CHECK-O0-NEXT: bl _params_in_reg2 +; CHECK-O0-NEXT: ldr r0, [sp, #52] @ 
4-byte Reload +; CHECK-O0-NEXT: ldr r1, [sp, #56] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r2, [sp, #60] @ 4-byte Reload +; CHECK-O0-NEXT: ldr r3, [sp, #64] @ 4-byte Reload +; CHECK-O0-NEXT: mov r9, r8 +; CHECK-O0-NEXT: ldr r8, [sp, #68] @ 4-byte Reload +; CHECK-O0-NEXT: sub sp, r7, #4 +; CHECK-O0-NEXT: pop {r7, r10, pc} ; ; CHECK-ANDROID-LABEL: params_and_return_in_reg: ; CHECK-ANDROID: @ %bb.0: @@ -1339,19 +1325,17 @@ define swiftcc ptr @testAssign(ptr %error_ref) { ; ; CHECK-O0-LABEL: testAssign: ; CHECK-O0: @ %bb.0: @ %entry -; CHECK-O0-NEXT: push {r7, lr} -; CHECK-O0-NEXT: mov r7, sp -; CHECK-O0-NEXT: push {r8} -; CHECK-O0-NEXT: sub sp, sp, #8 +; CHECK-O0-NEXT: push {r7, r8, lr} +; CHECK-O0-NEXT: add r7, sp, #4 +; CHECK-O0-NEXT: sub sp, sp, #8 ; CHECK-O0-NEXT: @ implicit-def: $r1 -; CHECK-O0-NEXT: mov r8, #0 -; CHECK-O0-NEXT: bl _foo2 -; CHECK-O0-NEXT: str r8, [sp] @ 4-byte Spill +; CHECK-O0-NEXT: mov r8, #0 +; CHECK-O0-NEXT: bl _foo2 +; CHECK-O0-NEXT: str r8, [sp] @ 4-byte Spill ; CHECK-O0-NEXT: @ %bb.1: @ %a ; CHECK-O0-NEXT: ldr r0, [sp] @ 4-byte Reload ; CHECK-O0-NEXT: sub sp, r7, #4 -; CHECK-O0-NEXT: pop {r8} -; CHECK-O0-NEXT: pop {r7, pc} +; CHECK-O0-NEXT: pop {r7, r8, pc} ; ; CHECK-ANDROID-LABEL: testAssign: ; CHECK-ANDROID: @ %bb.0: @ %entry diff --git a/llvm/test/CodeGen/ARM/v7k-abi-align.ll b/llvm/test/CodeGen/ARM/v7k-abi-align.ll index 20c7aea5dcbe..b27c4354f432 100644 --- a/llvm/test/CodeGen/ARM/v7k-abi-align.ll +++ b/llvm/test/CodeGen/ARM/v7k-abi-align.ll @@ -117,7 +117,7 @@ define void @test_dpr_unwind_align_no_dprs() "frame-pointer"="all" { ; 128-bit vectors should use 128-bit (i.e. correctly aligned) slots on ; the stack. -define <4 x float> @test_v128_stack_pass([8 x double], float, <4 x float> %in) "frame-pointer"="all" { +define <4 x float> @test_v128_stack_pass([8 x double], float, <4 x float> %in) "frame-pointer"="none" { ; CHECK-LABEL: test_v128_stack_pass: ; CHECK: add r[[ADDR:[0-9]+]], sp, #16 ; CHECK: vld1.64 {d0, d1}, [r[[ADDR]]:128] @@ -140,7 +140,7 @@ define void @test_v128_stack_pass_varargs(<4 x float> %in) "frame-pointer"="all" ; To be compatible with AAPCS's va_start model (store r0-r3 at incoming SP, give ; a single pointer), 64-bit quantities must be pass -define i64 @test_64bit_gpr_align(i32, i64 %r2_r3, i32 %sp) "frame-pointer"="all" { +define i64 @test_64bit_gpr_align(i32, i64 %r2_r3, i32 %sp) "frame-pointer"="none" { ; CHECK-LABEL: test_64bit_gpr_align: ; CHECK: ldr [[RHS:r[0-9]+]], [sp] ; CHECK: adds r0, [[RHS]], r2 diff --git a/llvm/test/CodeGen/Thumb/frame-chain.ll b/llvm/test/CodeGen/Thumb/frame-chain.ll index eb62ce09caf1..e68fc626be98 100644 --- a/llvm/test/CodeGen/Thumb/frame-chain.ll +++ b/llvm/test/CodeGen/Thumb/frame-chain.ll @@ -8,12 +8,16 @@ define dso_local noundef i32 @leaf(i32 noundef %0) { ; LEAF-FP-LABEL: leaf: ; LEAF-FP: @ %bb.0: -; LEAF-FP-NEXT: .pad #4 -; LEAF-FP-NEXT: sub sp, #4 -; LEAF-FP-NEXT: str r0, [sp] -; LEAF-FP-NEXT: adds r0, r0, #4 -; LEAF-FP-NEXT: add sp, #4 -; LEAF-FP-NEXT: bx lr +; LEAF-FP-NEXT: .save {r7, lr} +; LEAF-FP-NEXT: push {r7, lr} +; LEAF-FP-NEXT: .setfp r7, sp +; LEAF-FP-NEXT: add r7, sp, #0 +; LEAF-FP-NEXT: .pad #4 +; LEAF-FP-NEXT: sub sp, #4 +; LEAF-FP-NEXT: str r0, [sp] +; LEAF-FP-NEXT: adds r0, r0, #4 +; LEAF-FP-NEXT: add sp, #4 +; LEAF-FP-NEXT: pop {r7, pc} ; ; LEAF-FP-AAPCS-LABEL: leaf: ; LEAF-FP-AAPCS: @ %bb.0: diff --git a/llvm/test/CodeGen/Thumb2/frame-pointer.ll b/llvm/test/CodeGen/Thumb2/frame-pointer.ll index ae3c1c8a50e2..85c919a50d88 100644 --- a/llvm/test/CodeGen/Thumb2/frame-pointer.ll +++ 
b/llvm/test/CodeGen/Thumb2/frame-pointer.ll @@ -14,7 +14,7 @@ define void @leaf() { ; Leaf function, frame pointer is requested but we don't need any stack frame, ; so don't create a frame pointer. -define void @leaf_nofpelim() "frame-pointer"="all" { +define void @leaf_nofpelim() "frame-pointer"="none" { ; CHECK-LABEL: leaf_nofpelim: ; CHECK-NOT: push ; CHECK-NOT: sp diff --git a/llvm/test/CodeGen/Thumb2/frameless.ll b/llvm/test/CodeGen/Thumb2/frameless.ll index 01e0414de37d..44914136b1f8 100644 --- a/llvm/test/CodeGen/Thumb2/frameless.ll +++ b/llvm/test/CodeGen/Thumb2/frameless.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -frame-pointer=all | not grep mov -; RUN: llc < %s -mtriple=thumbv7-linux -frame-pointer=all | not grep mov +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -frame-pointer=none | not grep mov +; RUN: llc < %s -mtriple=thumbv7-linux -frame-pointer=none | not grep mov define void @t() nounwind readnone { ret void diff --git a/llvm/test/CodeGen/Thumb2/frameless2.ll b/llvm/test/CodeGen/Thumb2/frameless2.ll index 4750527ae555..4848deaf8a1e 100644 --- a/llvm/test/CodeGen/Thumb2/frameless2.ll +++ b/llvm/test/CodeGen/Thumb2/frameless2.ll @@ -1,4 +1,4 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -frame-pointer=all | not grep r7 +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -frame-pointer=none | not grep r7 %struct.noise3 = type { [3 x [17 x i32]] } %struct.noiseguard = type { i32, i32, i32 } diff --git a/llvm/test/CodeGen/Thumb2/machine-licm.ll b/llvm/test/CodeGen/Thumb2/machine-licm.ll index 5a2ec9280de7..a2f379f7b543 100644 --- a/llvm/test/CodeGen/Thumb2/machine-licm.ll +++ b/llvm/test/CodeGen/Thumb2/machine-licm.ll @@ -1,5 +1,5 @@ -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=dynamic-no-pic -frame-pointer=all | FileCheck %s -; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=pic -frame-pointer=all | FileCheck %s --check-prefix=PIC +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=dynamic-no-pic -frame-pointer=none | FileCheck %s +; RUN: llc < %s -mtriple=thumbv7-apple-darwin -mcpu=cortex-a8 -relocation-model=pic -frame-pointer=none | FileCheck %s --check-prefix=PIC ; rdar://7353541 ; rdar://7354376 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll index bae66d456f89..174cca4fab09 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll @@ -60,4 +60,4 @@ define dso_local i32 @main() #0 { ret i32 0 } -attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } +attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="none" } diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.generated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.generated.expected index de5571f64361..2dfb725f5566 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.generated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.generated.expected @@ -61,7 +61,7 @@ define dso_local i32 @main() #0 { ret i32 0 } -attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } +attributes #0 = { noredzone 
nounwind ssp uwtable "frame-pointer"="none" } ; CHECK-LABEL: check_boundaries: ; CHECK: @ %bb.0: ; CHECK-NEXT: sub sp, sp, #20 diff --git a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.nogenerated.expected b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.nogenerated.expected index 4f623384ade6..85d3389cdaaf 100644 --- a/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.nogenerated.expected +++ b/llvm/test/tools/UpdateTestChecks/update_llc_test_checks/Inputs/arm_generated_funcs.ll.nogenerated.expected @@ -121,4 +121,4 @@ define dso_local i32 @main() #0 { ret i32 0 } -attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="all" } +attributes #0 = { noredzone nounwind ssp uwtable "frame-pointer"="none" } -- GitLab From 0f7d148db45e782373c5d6a0faf745986753982b Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 17 Oct 2024 12:28:57 +0200 Subject: [PATCH 244/329] [InstCombine] Add shared helper for logical and bitwise and/or (NFC) Add a helper for shared folds between logical and bitwise and/or and move the and/or of icmp and fcmp folds in there. This makes it easier to extend to more folds. A possible extension would be to base the current and/or of icmp reassociation logic on this helper, so that it for example also applies to fcmp. --- .../InstCombine/InstCombineAndOrXor.cpp | 47 ++++++++++++------- .../InstCombine/InstCombineInternal.h | 3 ++ .../InstCombine/InstCombineSelect.cpp | 20 ++------ 3 files changed, 37 insertions(+), 33 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp index c8407e8ba5ab..8112255a0b6c 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp @@ -2722,13 +2722,15 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { return BinaryOperator::CreateAnd(Builder.CreateNot(A), B); } + if (Value *Res = + foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/true, /*IsLogical=*/false)) + return replaceInstUsesWith(I, Res); + { ICmpInst *LHS = dyn_cast(Op0); ICmpInst *RHS = dyn_cast(Op1); - if (LHS && RHS) - if (Value *Res = foldAndOrOfICmps(LHS, RHS, I, /* IsAnd */ true)) - return replaceInstUsesWith(I, Res); + // TODO: Base this on foldBooleanAndOr instead? // TODO: Make this recursive; it's a little tricky because an arbitrary // number of 'and' instructions might have to be created. if (LHS && match(Op1, m_OneUse(m_LogicalAnd(m_Value(X), m_Value(Y))))) { @@ -2767,11 +2769,6 @@ Instruction *InstCombinerImpl::visitAnd(BinaryOperator &I) { } } - if (FCmpInst *LHS = dyn_cast(I.getOperand(0))) - if (FCmpInst *RHS = dyn_cast(I.getOperand(1))) - if (Value *Res = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ true)) - return replaceInstUsesWith(I, Res); - if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder)) return FoldedFCmps; @@ -3523,6 +3520,27 @@ Value *InstCombinerImpl::foldAndOrOfICmps(ICmpInst *LHS, ICmpInst *RHS, return foldAndOrOfICmpsUsingRanges(LHS, RHS, IsAnd); } +/// If IsLogical is true, then the and/or is in select form and the transform +/// must be poison-safe. 
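+/// As a rough illustration (the IR names here are for exposition only): the
+/// bitwise form `%r = and i1 %a, %b` and the logical form
+/// `%r = select i1 %a, i1 %b, i1 false` both reach this helper as an "and"
+/// (IsAnd=true); only IsLogical differs, so the icmp/fcmp folds below are
+/// shared between the two forms instead of being duplicated at each caller.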
+Value *InstCombinerImpl::foldBooleanAndOr(Value *LHS, Value *RHS, + Instruction &I, bool IsAnd, + bool IsLogical) { + if (!LHS->getType()->isIntOrIntVectorTy(1)) + return nullptr; + + if (auto *LHSCmp = dyn_cast(LHS)) + if (auto *RHSCmp = dyn_cast(RHS)) + if (Value *Res = foldAndOrOfICmps(LHSCmp, RHSCmp, I, IsAnd, IsLogical)) + return Res; + + if (auto *LHSCmp = dyn_cast(LHS)) + if (auto *RHSCmp = dyn_cast(RHS)) + if (Value *Res = foldLogicOfFCmps(LHSCmp, RHSCmp, IsAnd, IsLogical)) + return Res; + + return nullptr; +} + static Value *foldOrOfInversions(BinaryOperator &I, InstCombiner::BuilderTy &Builder) { assert(I.getOpcode() == Instruction::Or && @@ -3804,13 +3822,15 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { if (SwappedForXor) std::swap(Op0, Op1); + if (Value *Res = + foldBooleanAndOr(Op0, Op1, I, /*IsAnd=*/false, /*IsLogical=*/false)) + return replaceInstUsesWith(I, Res); + { ICmpInst *LHS = dyn_cast(Op0); ICmpInst *RHS = dyn_cast(Op1); - if (LHS && RHS) - if (Value *Res = foldAndOrOfICmps(LHS, RHS, I, /* IsAnd */ false)) - return replaceInstUsesWith(I, Res); + // TODO: Base this on foldBooleanAndOr instead? // TODO: Make this recursive; it's a little tricky because an arbitrary // number of 'or' instructions might have to be created. Value *X, *Y; @@ -3850,11 +3870,6 @@ Instruction *InstCombinerImpl::visitOr(BinaryOperator &I) { } } - if (FCmpInst *LHS = dyn_cast(I.getOperand(0))) - if (FCmpInst *RHS = dyn_cast(I.getOperand(1))) - if (Value *Res = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ false)) - return replaceInstUsesWith(I, Res); - if (Instruction *FoldedFCmps = reassociateFCmps(I, Builder)) return FoldedFCmps; diff --git a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h index 7c6f42de77fc..7a060cdab2d3 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineInternal.h +++ b/llvm/lib/Transforms/InstCombine/InstCombineInternal.h @@ -425,6 +425,9 @@ private: Instruction *foldLogicOfIsFPClass(BinaryOperator &Operator, Value *LHS, Value *RHS); + Value *foldBooleanAndOr(Value *LHS, Value *RHS, Instruction &I, bool IsAnd, + bool IsLogical); + Instruction * canonicalizeConditionalNegationViaMathToSelect(BinaryOperator &i); diff --git a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp index ed44f0596f32..c5f39a4c381e 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineSelect.cpp @@ -3143,12 +3143,6 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { SI, Builder.CreateLogicalOr(A, Builder.CreateOr(B, FalseVal))); } - if (auto *LHS = dyn_cast(CondVal)) - if (auto *RHS = dyn_cast(FalseVal)) - if (Value *V = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ false, - /*IsSelectLogical*/ true)) - return replaceInstUsesWith(SI, V); - // (A && B) || (C && B) --> (A || C) && B if (match(CondVal, m_LogicalAnd(m_Value(A), m_Value(B))) && match(FalseVal, m_LogicalAnd(m_Value(C), m_Value(D))) && @@ -3191,12 +3185,6 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { SI, Builder.CreateLogicalAnd(A, Builder.CreateAnd(B, TrueVal))); } - if (auto *LHS = dyn_cast(CondVal)) - if (auto *RHS = dyn_cast(TrueVal)) - if (Value *V = foldLogicOfFCmps(LHS, RHS, /*IsAnd*/ true, - /*IsSelectLogical*/ true)) - return replaceInstUsesWith(SI, V); - // (A || B) && (C || B) --> (A && C) || B if (match(CondVal, m_LogicalOr(m_Value(A), m_Value(B))) && match(TrueVal, m_LogicalOr(m_Value(C), m_Value(D))) 
&& @@ -3305,11 +3293,9 @@ Instruction *InstCombinerImpl::foldSelectOfBools(SelectInst &SI) { return replaceInstUsesWith(SI, Op1); } - if (auto *ICmp0 = dyn_cast(CondVal)) - if (auto *ICmp1 = dyn_cast(Op1)) - if (auto *V = foldAndOrOfICmps(ICmp0, ICmp1, SI, IsAnd, - /* IsLogical */ true)) - return replaceInstUsesWith(SI, V); + if (auto *V = foldBooleanAndOr(CondVal, Op1, SI, IsAnd, + /*IsLogical=*/true)) + return replaceInstUsesWith(SI, V); } // select (a || b), c, false -> select a, c, false -- GitLab From 370fd74361be476ff17ecf8fa3c36ae9f51b9e0e Mon Sep 17 00:00:00 2001 From: Nashe Mncube Date: Thu, 17 Oct 2024 13:38:01 +0100 Subject: [PATCH 245/329] Revert "[llvm][ARM]Add widen global arrays pass" (#112701) Reverts llvm/llvm-project#107120 Unexpected build failures in post-commit pipelines. Needs investigation --- .../llvm/Analysis/TargetTransformInfo.h | 11 -- .../llvm/Analysis/TargetTransformInfoImpl.h | 4 - llvm/lib/Analysis/TargetTransformInfo.cpp | 6 - .../lib/Target/ARM/ARMTargetTransformInfo.cpp | 33 ---- llvm/lib/Target/ARM/ARMTargetTransformInfo.h | 3 - llvm/lib/Transforms/IPO/GlobalOpt.cpp | 165 ------------------ .../GlobalOpt/ARM/arm-widen-dest-non-array.ll | 39 ----- .../GlobalOpt/ARM/arm-widen-global-dest.ll | 28 --- .../GlobalOpt/ARM/arm-widen-non-byte-array.ll | 22 --- .../ARM/arm-widen-non-const-global.ll | 21 --- .../ARM/arm-widen-string-multi-use.ll | 33 ---- .../GlobalOpt/ARM/arm-widen-strings-1.ll | 21 --- .../GlobalOpt/ARM/arm-widen-strings-2.ll | 21 --- .../arm-widen-strings-lengths-dont-match.ll | 26 --- .../arm-widen-strings-more-than-64-bytes.ll | 28 --- .../ARM/arm-widen-strings-ptrtoint.ll | 54 ------ .../ARM/arm-widen-strings-struct-test.ll | 45 ----- .../ARM/arm-widen-strings-volatile.ll | 29 --- 18 files changed, 589 deletions(-) delete mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll delete mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll delete mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll delete mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-const-global.ll delete mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll delete mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll delete mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll delete mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll delete mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll delete mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll delete mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll delete mode 100644 llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll diff --git a/llvm/include/llvm/Analysis/TargetTransformInfo.h b/llvm/include/llvm/Analysis/TargetTransformInfo.h index 0dc513d8e65b..0459941fe05c 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfo.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfo.h @@ -1819,10 +1819,6 @@ public: /// \return The maximum number of function arguments the target supports. unsigned getMaxNumArgs() const; - /// \return For an array of given Size, return alignment boundary to - /// pad to. Default is no padding. 
- unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const; - /// @} private: @@ -2229,8 +2225,6 @@ public: getVPLegalizationStrategy(const VPIntrinsic &PI) const = 0; virtual bool hasArmWideBranch(bool Thumb) const = 0; virtual unsigned getMaxNumArgs() const = 0; - virtual unsigned getNumBytesToPadGlobalArray(unsigned Size, - Type *ArrayType) const = 0; }; template @@ -3032,11 +3026,6 @@ public: unsigned getMaxNumArgs() const override { return Impl.getMaxNumArgs(); } - - unsigned getNumBytesToPadGlobalArray(unsigned Size, - Type *ArrayType) const override { - return Impl.getNumBytesToPadGlobalArray(Size, ArrayType); - } }; template diff --git a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h index 0b7792f89a05..dbdfb4d8cdfa 100644 --- a/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h +++ b/llvm/include/llvm/Analysis/TargetTransformInfoImpl.h @@ -1006,10 +1006,6 @@ public: unsigned getMaxNumArgs() const { return UINT_MAX; } - unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const { - return 0; - } - protected: // Obtain the minimum required size to hold the value (without the sign) // In case of a vector it returns the min required size for one element. diff --git a/llvm/lib/Analysis/TargetTransformInfo.cpp b/llvm/lib/Analysis/TargetTransformInfo.cpp index 607047336376..a47462b61e03 100644 --- a/llvm/lib/Analysis/TargetTransformInfo.cpp +++ b/llvm/lib/Analysis/TargetTransformInfo.cpp @@ -1383,12 +1383,6 @@ bool TargetTransformInfo::isVectorShiftByScalarCheap(Type *Ty) const { return TTIImpl->isVectorShiftByScalarCheap(Ty); } -unsigned -TargetTransformInfo::getNumBytesToPadGlobalArray(unsigned Size, - Type *ArrayType) const { - return TTIImpl->getNumBytesToPadGlobalArray(Size, ArrayType); -} - TargetTransformInfo::Concept::~Concept() = default; TargetIRAnalysis::TargetIRAnalysis() : TTICallback(&getDefaultTTI) {} diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp index 9f6e5e5ab142..835ae98efb85 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.cpp @@ -56,10 +56,6 @@ static cl::opt AllowWLSLoops("allow-arm-wlsloops", cl::Hidden, cl::init(true), cl::desc("Enable the generation of WLS loops")); -static cl::opt UseWidenGlobalArrays( - "widen-global-strings", cl::Hidden, cl::init(true), - cl::desc("Enable the widening of global strings to alignment boundaries")); - extern cl::opt EnableTailPredication; extern cl::opt EnableMaskedGatherScatters; @@ -2809,32 +2805,3 @@ bool ARMTTIImpl::isProfitableToSinkOperands(Instruction *I, } return true; } - -unsigned ARMTTIImpl::getNumBytesToPadGlobalArray(unsigned Size, - Type *ArrayType) const { - if (!UseWidenGlobalArrays) { - LLVM_DEBUG(dbgs() << "Padding global arrays disabled\n"); - return false; - } - - // Don't modify none integer array types - if (!ArrayType || !ArrayType->isArrayTy() || - !ArrayType->getArrayElementType()->isIntegerTy()) - return 0; - - // We pad to 4 byte boundaries - if (Size % 4 == 0) - return 0; - - unsigned NumBytesToPad = 4 - (Size % 4); - unsigned NewSize = Size + NumBytesToPad; - - // Max number of bytes that memcpy allows for lowering to load/stores before - // it uses library function (__aeabi_memcpy). 
- unsigned MaxMemIntrinsicSize = getMaxMemIntrinsicInlineSizeThreshold(); - - if (NewSize > MaxMemIntrinsicSize) - return 0; - - return NumBytesToPad; -} diff --git a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h index 3a4f940088b2..b0a75134ee02 100644 --- a/llvm/lib/Target/ARM/ARMTargetTransformInfo.h +++ b/llvm/lib/Target/ARM/ARMTargetTransformInfo.h @@ -337,9 +337,6 @@ public: bool isProfitableToSinkOperands(Instruction *I, SmallVectorImpl &Ops) const; - - unsigned getNumBytesToPadGlobalArray(unsigned Size, Type *ArrayType) const; - /// @} }; diff --git a/llvm/lib/Transforms/IPO/GlobalOpt.cpp b/llvm/lib/Transforms/IPO/GlobalOpt.cpp index 4647c65a5c85..aae4926e027f 100644 --- a/llvm/lib/Transforms/IPO/GlobalOpt.cpp +++ b/llvm/lib/Transforms/IPO/GlobalOpt.cpp @@ -92,8 +92,6 @@ STATISTIC(NumInternalFunc, "Number of internal functions"); STATISTIC(NumColdCC, "Number of functions marked coldcc"); STATISTIC(NumIFuncsResolved, "Number of statically resolved IFuncs"); STATISTIC(NumIFuncsDeleted, "Number of IFuncs removed"); -STATISTIC(NumGlobalArraysPadded, - "Number of global arrays padded to alignment boundary"); static cl::opt EnableColdCCStressTest("enable-coldcc-stress-test", @@ -2031,165 +2029,6 @@ OptimizeFunctions(Module &M, return Changed; } -static bool callInstIsMemcpy(CallInst *CI) { - if (!CI) - return false; - - Function *F = CI->getCalledFunction(); - if (!F || !F->isIntrinsic() || F->getIntrinsicID() != Intrinsic::memcpy) - return false; - - return true; -} - -static bool destArrayCanBeWidened(CallInst *CI) { - auto *IsVolatile = dyn_cast(CI->getArgOperand(3)); - auto *Alloca = dyn_cast(CI->getArgOperand(0)); - - if (!Alloca || !IsVolatile || IsVolatile->isOne()) - return false; - - if (!Alloca->isStaticAlloca()) - return false; - - if (!Alloca->getAllocatedType()->isArrayTy()) - return false; - - return true; -} - -static GlobalVariable *widenGlobalVariable(GlobalVariable *OldVar, Function *F, - unsigned NumBytesToPad, - unsigned NumBytesToCopy) { - if (!OldVar->hasInitializer()) - return nullptr; - - ConstantDataArray *DataArray = - dyn_cast(OldVar->getInitializer()); - if (!DataArray) - return nullptr; - - // Update to be word aligned (memcpy(...,X,...)) - // create replacement with padded null bytes. - StringRef Data = DataArray->getRawDataValues(); - std::vector StrData(Data.begin(), Data.end()); - for (unsigned int p = 0; p < NumBytesToPad; p++) - StrData.push_back('\0'); - auto Arr = ArrayRef(StrData.data(), NumBytesToCopy + NumBytesToPad); - // Create new padded version of global variable. - Constant *SourceReplace = ConstantDataArray::get(F->getContext(), Arr); - GlobalVariable *NewGV = new GlobalVariable( - *(F->getParent()), SourceReplace->getType(), true, OldVar->getLinkage(), - SourceReplace, SourceReplace->getName()); - // Copy any other attributes from original global variable - // e.g. 
unamed_addr - NewGV->copyAttributesFrom(OldVar); - NewGV->takeName(OldVar); - return NewGV; -} - -static void widenDestArray(CallInst *CI, const unsigned NumBytesToPad, - const unsigned NumBytesToCopy, - ConstantDataArray *SourceDataArray) { - - auto *Alloca = dyn_cast(CI->getArgOperand(0)); - if (Alloca) { - unsigned ElementByteWidth = SourceDataArray->getElementByteSize(); - unsigned int TotalBytes = NumBytesToCopy + NumBytesToPad; - unsigned NumElementsToCopy = divideCeil(TotalBytes, ElementByteWidth); - // Update destination array to be word aligned (memcpy(X,...,...)) - IRBuilder<> BuildAlloca(Alloca); - AllocaInst *NewAlloca = BuildAlloca.CreateAlloca(ArrayType::get( - Alloca->getAllocatedType()->getArrayElementType(), NumElementsToCopy)); - NewAlloca->takeName(Alloca); - NewAlloca->setAlignment(Alloca->getAlign()); - Alloca->replaceAllUsesWith(NewAlloca); - Alloca->eraseFromParent(); - } -} - -static bool tryWidenGlobalArrayAndDests(Function *F, GlobalVariable *SourceVar, - const unsigned NumBytesToPad, - const unsigned NumBytesToCopy, - ConstantInt *BytesToCopyOp, - ConstantDataArray *SourceDataArray) { - auto *NewSourceGV = - widenGlobalVariable(SourceVar, F, NumBytesToPad, NumBytesToCopy); - if (!NewSourceGV) - return false; - - // Update arguments of remaining uses that - // are memcpys. - for (auto *User : SourceVar->users()) { - auto *CI = dyn_cast(User); - if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI)) - continue; - - if (CI->getArgOperand(1) != SourceVar) - continue; - - widenDestArray(CI, NumBytesToPad, NumBytesToCopy, SourceDataArray); - - CI->setArgOperand(2, ConstantInt::get(BytesToCopyOp->getType(), - NumBytesToCopy + NumBytesToPad)); - } - SourceVar->replaceAllUsesWith(NewSourceGV); - - NumGlobalArraysPadded++; - return true; -} - -static bool tryWidenGlobalArraysUsedByMemcpy( - GlobalVariable *GV, - function_ref GetTTI) { - - if (!GV->hasInitializer() || !GV->isConstant() || !GV->hasLocalLinkage() || - !GV->hasGlobalUnnamedAddr()) - return false; - - for (auto *User : GV->users()) { - CallInst *CI = dyn_cast(User); - if (!callInstIsMemcpy(CI) || !destArrayCanBeWidened(CI)) - continue; - - Function *F = CI->getCalledFunction(); - - auto *BytesToCopyOp = dyn_cast(CI->getArgOperand(2)); - if (!BytesToCopyOp) - continue; - - ConstantDataArray *SourceDataArray = - dyn_cast(GV->getInitializer()); - if (!SourceDataArray) - continue; - - unsigned NumBytesToCopy = BytesToCopyOp->getZExtValue(); - - auto *Alloca = dyn_cast(CI->getArgOperand(0)); - uint64_t DZSize = Alloca->getAllocatedType()->getArrayNumElements(); - uint64_t SZSize = SourceDataArray->getType()->getNumElements(); - unsigned ElementByteWidth = SourceDataArray->getElementByteSize(); - // Calculate the number of elements to copy while avoiding floored - // division of integers returning wrong values i.e. copying one byte - // from an array of i16 would yield 0 elements to copy as supposed to 1. 
- unsigned NumElementsToCopy = divideCeil(NumBytesToCopy, ElementByteWidth); - - // For safety purposes lets add a constraint and only pad when - // NumElementsToCopy == destination array size == - // source which is a constant - if (NumElementsToCopy != DZSize || DZSize != SZSize) - continue; - - unsigned NumBytesToPad = GetTTI(*F).getNumBytesToPadGlobalArray( - NumBytesToCopy, SourceDataArray->getType()); - if (NumBytesToPad) { - return tryWidenGlobalArrayAndDests(F, GV, NumBytesToPad, NumBytesToCopy, - BytesToCopyOp, SourceDataArray); - } - } - return false; -} - static bool OptimizeGlobalVars(Module &M, function_ref GetTTI, @@ -2219,10 +2058,6 @@ OptimizeGlobalVars(Module &M, continue; } - // For global variable arrays called in a memcpy - // we try to pad to nearest valid alignment boundary - Changed |= tryWidenGlobalArraysUsedByMemcpy(&GV, GetTTI); - Changed |= processGlobal(GV, GetTTI, GetTLI, LookupDomTree); } return Changed; diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll deleted file mode 100644 index ab04e0a5bc69..000000000000 --- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-dest-non-array.ll +++ /dev/null @@ -1,39 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s - -@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1 - -define void @memcpy_struct() { -; CHECK-LABEL: define void @memcpy_struct() local_unnamed_addr { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SOMETHING:%.*]] = alloca { i8, i8, i8 }, align 1 -; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) -; CHECK-NEXT: ret void -; -entry: - %something = alloca {i8, i8, i8}, align 1 - %call1 = call i32 @bar(ptr nonnull %something) - call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) - ret void -} - - -@.i8_multi = private unnamed_addr constant [2 x [3 x i8]] [[3 x i8] [i8 1, i8 2, i8 3], [3 x i8] [i8 4, i8 5, i8 6]] , align 1 - -define void @memcpy_array_multidimensional() { -; CHECK-LABEL: define void @memcpy_array_multidimensional() local_unnamed_addr { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [2 x [3 x i8]], align 1 -; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false) -; CHECK-NEXT: ret void -; -entry: - %something = alloca [2 x [3 x i8]], align 1 - %call1 = call i32 @bar(ptr nonnull %something) - call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8_multi, i32 3, i1 false) - ret void -} - -declare i32 @bar(...) 
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll deleted file mode 100644 index f435ffdeed2c..000000000000 --- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-global-dest.ll +++ /dev/null @@ -1,28 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s - -; CHECK: [3 x i8] -@other = private unnamed_addr global [3 x i8] [i8 1, i8 2, i8 3] , align 1 -; CHECK: [4 x i8] -@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1 - -define void @memcpy_multiple() { -; CHECK-LABEL: define void @memcpy_multiple() local_unnamed_addr { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [4 x i8], align 1 -; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) -; CHECK-NEXT: [[CALL3:%.*]] = call i32 @bar(ptr nonnull @other) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false) -; CHECK-NEXT: ret void -; -entry: - %something = alloca [3 x i8], align 1 - %call1 = call i32 @bar(ptr nonnull %something) - %call2 = call i32 @bar(ptr nonnull @other) - call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) @other, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) - call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) - ret void -} - -declare i32 @bar(...) diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll deleted file mode 100644 index c7ca7271fd3d..000000000000 --- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-byte-array.ll +++ /dev/null @@ -1,22 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s - -@.i16 = private unnamed_addr constant [5 x i16] [i16 1, i16 2, i16 3, i16 4, i16 5] , align 1 - -define void @memcpy_i16_array() { -; CHECK-LABEL: define void @memcpy_i16_array() local_unnamed_addr { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SOMETHING1:%.*]] = alloca [6 x i16], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 12, i1 false) -; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]]) -; CHECK-NEXT: ret void -; -entry: - %something = alloca [5 x i16], align 1 - call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) %something, ptr noundef nonnull align 1 dereferenceable(10) @.i16, i32 10, i1 false) - %call2 = call i32 @bar(ptr nonnull %something) - ret void -} - - -declare i32 @bar(...) 
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-const-global.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-const-global.ll deleted file mode 100644 index 3d9c42fe1f3d..000000000000 --- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-non-const-global.ll +++ /dev/null @@ -1,21 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s - -@.str = unnamed_addr global [3 x i8] c"12\00", align 1 - -define void @foo() { -; CHECK-LABEL: define void @foo() local_unnamed_addr { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [3 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(3) @.str, i32 3, i1 false) -; CHECK-NEXT: [[CALL1:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) -; CHECK-NEXT: ret void -; -entry: - %something = alloca [3 x i8], align 1 - call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.str, i32 3, i1 false) - %call1 = call i32 @bar(ptr nonnull %something) - ret void -} - -declare i32 @bar(...) diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll deleted file mode 100644 index e37925a78d2c..000000000000 --- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-string-multi-use.ll +++ /dev/null @@ -1,33 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s - -@.i8 = private unnamed_addr constant [3 x i8] [i8 1, i8 2, i8 3] , align 1 - -define void @memcpy_multiple() { -; CHECK-LABEL: define void @memcpy_multiple() local_unnamed_addr { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SOMETHING2:%.*]] = alloca [4 x i8], align 1 -; CHECK-NEXT: [[SOMETHING1:%.*]] = alloca [4 x i8], align 1 -; CHECK-NEXT: [[SOMETHING3:%.*]] = alloca [4 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING2]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING1]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) [[SOMETHING3]], ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 4, i1 false) -; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING2]]) -; CHECK-NEXT: [[CALL3:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING1]]) -; CHECK-NEXT: [[CALL4:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING3]]) -; CHECK-NEXT: ret void -; -entry: - %something = alloca [3 x i8], align 1 - %something1 = alloca [3 x i8], align 1 - %something2 = alloca [3 x i8], align 1 - call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) - call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something1, ptr noundef nonnull align 1 dereferenceable(3) @.i8, i32 3, i1 false) - call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(3) %something2, ptr noundef nonnull align 1 dereferenceable(3) @.i8, 
i32 3, i1 false) - %call2 = call i32 @bar(ptr nonnull %something) - %call3 = call i32 @bar(ptr nonnull %something1) - %call4 = call i32 @bar(ptr nonnull %something2) - ret void -} - -declare i32 @bar(...) diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll deleted file mode 100644 index 8ea9e2804370..000000000000 --- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-1.ll +++ /dev/null @@ -1,21 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s - -@.str = private unnamed_addr constant [10 x i8] c"123456789\00", align 1 - -define void @foo() { -; CHECK-LABEL: define void @foo() local_unnamed_addr { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [12 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 12, i1 false) -; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) -; CHECK-NEXT: ret void -; -entry: - %something = alloca [10 x i8], align 1 - call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(10) %something, ptr noundef nonnull align 1 dereferenceable(10) @.str, i32 10, i1 false) - %call2 = call i32 @bar(ptr nonnull %something) - ret void -} - -declare i32 @bar(...) diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll deleted file mode 100644 index ad3620b14ea2..000000000000 --- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-2.ll +++ /dev/null @@ -1,21 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s - -@.str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1 - -define void @foo() { -; CHECK-LABEL: define void @foo() local_unnamed_addr { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [64 x i8], align 1 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(62) [[SOMETHING]], ptr noundef nonnull align 1 dereferenceable(62) @.str, i32 64, i1 false) -; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) -; CHECK-NEXT: ret void -; -entry: - %something = alloca [62 x i8], align 1 - call void @llvm.memcpy.p0.p0.i32(ptr noundef nonnull align 1 dereferenceable(62) %something, ptr noundef nonnull align 1 dereferenceable(62) @.str, i32 62, i1 false) - %call2 = call i32 @bar(ptr nonnull %something) - ret void -} - -declare i32 @bar(...) 
diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll deleted file mode 100644 index b8e02c3f996d..000000000000 --- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-lengths-dont-match.ll +++ /dev/null @@ -1,26 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s -; CHECK: [17 x i8] -@.str = private unnamed_addr constant [17 x i8] c"aaaaaaaaaaaaaaaa\00", align 1 - -; Function Attrs: nounwind -define void @foo() { -; CHECK-LABEL: define void @foo() local_unnamed_addr { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [20 x i8], align 1 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 20, ptr nonnull [[SOMETHING]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 1 [[SOMETHING]], ptr align 1 @.str, i32 17, i1 false) -; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 20, ptr nonnull [[SOMETHING]]) -; CHECK-NEXT: ret void -; -entry: - %something = alloca [20 x i8], align 1 - call void @llvm.lifetime.start(i64 20, ptr nonnull %something) #3 - call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %something, ptr align 1 @.str, i32 17, i1 false) - %call2 = call i32 @bar(ptr nonnull %something) #3 - call void @llvm.lifetime.end(i64 20, ptr nonnull %something) #3 - ret void -} - -declare i32 @bar(...) #2 diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll deleted file mode 100644 index 4ac31aa2f976..000000000000 --- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-more-than-64-bytes.ll +++ /dev/null @@ -1,28 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s - -; CHECK: [65 x i8] -; CHECK-NOT: [68 x i8] -@.str = private unnamed_addr constant [65 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaazzz\00", align 1 - -; Function Attrs: nounwind -define void @foo() { -; CHECK-LABEL: define void @foo() local_unnamed_addr { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [65 x i8], align 1 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 65, ptr nonnull [[SOMETHING]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 1 [[SOMETHING]], ptr align 1 @.str, i32 65, i1 false) -; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[SOMETHING]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 65, ptr nonnull [[SOMETHING]]) -; CHECK-NEXT: ret void -; -entry: - %something = alloca [65 x i8], align 1 - call void @llvm.lifetime.start(i64 65, ptr nonnull %something) #3 - call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %something, ptr align 1 @.str, i32 65, i1 false) - %call2 = call i32 @bar(ptr nonnull %something) #3 - call void @llvm.lifetime.end(i64 65, ptr nonnull %something) #3 - ret void -} - -declare i32 @bar(...) 
#2 diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll deleted file mode 100644 index 64f57884cd39..000000000000 --- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-ptrtoint.ll +++ /dev/null @@ -1,54 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s - -@f.string1 = private unnamed_addr constant [45 x i8] c"The quick brown dog jumps over the lazy fox.\00", align 1 - -; Function Attrs: nounwind -define i32 @f() { -; CHECK-LABEL: define i32 @f() local_unnamed_addr { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[STRING1:%.*]] = alloca [48 x i8], align 1 -; CHECK-NEXT: [[POS:%.*]] = alloca i32, align 4 -; CHECK-NEXT: [[TOKEN:%.*]] = alloca ptr, align 4 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 45, ptr [[STRING1]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[STRING1]], ptr align 1 @f.string1, i32 48, i1 false) -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[POS]]) -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 4, ptr [[TOKEN]]) -; CHECK-NEXT: [[CALL:%.*]] = call ptr @strchr(ptr [[STRING1]], i32 101) -; CHECK-NEXT: store ptr [[CALL]], ptr [[TOKEN]], align 4 -; CHECK-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TOKEN]], align 4 -; CHECK-NEXT: [[SUB_PTR_LHS_CAST:%.*]] = ptrtoint ptr [[TMP1]] to i32 -; CHECK-NEXT: [[SUB_PTR_RHS_CAST:%.*]] = ptrtoint ptr [[STRING1]] to i32 -; CHECK-NEXT: [[SUB_PTR_SUB:%.*]] = sub i32 [[SUB_PTR_LHS_CAST]], [[SUB_PTR_RHS_CAST]] -; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[SUB_PTR_SUB]], 1 -; CHECK-NEXT: store i32 [[ADD]], ptr [[POS]], align 4 -; CHECK-NEXT: [[TMP2:%.*]] = load i32, ptr [[POS]], align 4 -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[TOKEN]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 4, ptr [[POS]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 45, ptr [[STRING1]]) -; CHECK-NEXT: ret i32 [[TMP2]] -; -entry: - %string1 = alloca [45 x i8], align 1 - %pos = alloca i32, align 4 - %token = alloca ptr, align 4 - call void @llvm.lifetime.start.p0i8(i64 45, ptr %string1) - call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %string1, ptr align 1 @f.string1, i32 45, i1 false) - call void @llvm.lifetime.start.p0i8(i64 4, ptr %pos) - call void @llvm.lifetime.start.p0i8(i64 4, ptr %token) - %call = call ptr @strchr(ptr %string1, i32 101) - store ptr %call, ptr %token, align 4 - %0 = load ptr, ptr %token, align 4 - %sub.ptr.lhs.cast = ptrtoint ptr %0 to i32 - %sub.ptr.rhs.cast = ptrtoint ptr %string1 to i32 - %sub.ptr.sub = sub i32 %sub.ptr.lhs.cast, %sub.ptr.rhs.cast - %add = add nsw i32 %sub.ptr.sub, 1 - store i32 %add, ptr %pos, align 4 - %1 = load i32, ptr %pos, align 4 - call void @llvm.lifetime.end.p0i8(i64 4, ptr %token) - call void @llvm.lifetime.end.p0i8(i64 4, ptr %pos) - call void @llvm.lifetime.end.p0i8(i64 45, ptr %string1) - ret i32 %1 -} - -declare ptr @strchr(ptr, i32) diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll deleted file mode 100644 index 5367572704b1..000000000000 --- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-struct-test.ll +++ /dev/null @@ -1,45 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s -%struct.P = 
type { i32, [13 x i8] } - -; CHECK-NOT: [16 x i8] -@.str = private unnamed_addr constant [13 x i8] c"hello world\0A\00", align 1 - -; Function Attrs: nounwind -define i32 @main() { -; CHECK-LABEL: define i32 @main() local_unnamed_addr { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[P:%.*]] = alloca [[STRUCT_P:%.*]], align 4 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 20, ptr nonnull [[P]]) -; CHECK-NEXT: store i32 10, ptr [[P]], align 4, !tbaa [[TBAA0:![0-9]+]] -; CHECK-NEXT: [[ARRAYDECAY:%.*]] = getelementptr inbounds [[STRUCT_P]], ptr [[P]], i32 0, i32 1, i32 0 -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr align 1 [[ARRAYDECAY]], ptr align 1 @.str, i32 13, i1 false) -; CHECK-NEXT: [[PUTS:%.*]] = call i32 @puts(ptr [[ARRAYDECAY]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 20, ptr nonnull [[P]]) -; CHECK-NEXT: ret i32 0 -; -entry: - %p = alloca %struct.P, align 4 - call void @llvm.lifetime.start(i64 20, ptr nonnull %p) #2 - store i32 10, ptr %p, align 4, !tbaa !1 - %arraydecay = getelementptr inbounds %struct.P, ptr %p, i32 0, i32 1, i32 0 - call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 %arraydecay, ptr align 1 @.str, i32 13, i1 false) - %puts = call i32 @puts(ptr %arraydecay) - call void @llvm.lifetime.end(i64 20, ptr nonnull %p) #2 - ret i32 0 -} - -declare i32 @puts(ptr nocapture readonly) #2 - -!1 = !{!2, !3, i64 0} -!2 = !{!"P", !3, i64 0, !4, i64 4} -!3 = !{!"int", !4, i64 0} -!4 = !{!"omnipotent char", !5, i64 0} -!5 = !{!"Simple C/C++ TBAA"} -;. -; CHECK: [[TBAA0]] = !{[[META1:![0-9]+]], [[META2:![0-9]+]], i64 0} -; CHECK: [[META1]] = !{!"P", [[META2]], i64 0, [[META3:![0-9]+]], i64 4} -; CHECK: [[META2]] = !{!"int", [[META3]], i64 0} -; CHECK: [[META3]] = !{!"omnipotent char", [[META4:![0-9]+]], i64 0} -; CHECK: [[META4]] = !{!"Simple C/C++ TBAA"} -;. diff --git a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll b/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll deleted file mode 100644 index b735a7788742..000000000000 --- a/llvm/test/Transforms/GlobalOpt/ARM/arm-widen-strings-volatile.ll +++ /dev/null @@ -1,29 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 -; RUN: opt < %s -mtriple=arm-none-eabi -passes=globalopt -S | FileCheck %s - -; CHECK-NOT: [64 x i8] -@.str = private unnamed_addr constant [62 x i8] c"aaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaaa\00", align 1 - -; Function Attrs: nounwind -define void @foo() { -; CHECK-LABEL: define void @foo() local_unnamed_addr { -; CHECK-NEXT: [[ENTRY:.*:]] -; CHECK-NEXT: [[SOMETHING:%.*]] = alloca [62 x i8], align 1 -; CHECK-NEXT: [[TMP0:%.*]] = getelementptr inbounds [62 x i8], ptr [[SOMETHING]], i32 0, i32 0 -; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 62, ptr nonnull [[TMP0]]) -; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i32(ptr nonnull align 1 [[TMP0]], ptr align 1 @.str, i32 62, i1 true) -; CHECK-NEXT: [[CALL2:%.*]] = call i32 @bar(ptr nonnull [[TMP0]]) -; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 62, ptr nonnull [[TMP0]]) -; CHECK-NEXT: ret void -; -entry: - %something = alloca [62 x i8], align 1 - %0 = getelementptr inbounds [62 x i8], ptr %something, i32 0, i32 0 - call void @llvm.lifetime.start(i64 62, ptr nonnull %0) #3 - call void @llvm.memcpy.p0i8.p0i8.i32(ptr align 1 nonnull %0, ptr align 1 @.str, i32 62, i1 true) - %call2 = call i32 @bar(ptr nonnull %0) #3 - call void @llvm.lifetime.end(i64 62, ptr nonnull %0) #3 - ret void -} - -declare i32 @bar(...) 
#2 -- GitLab From 095d49da76be09143582e07a807c86d3b4334dec Mon Sep 17 00:00:00 2001 From: Yingwei Zheng Date: Thu, 17 Oct 2024 20:43:48 +0800 Subject: [PATCH 246/329] [InstCombine] Set `samesign` when converting signed predicates into unsigned (#112642) Alive2: https://alive2.llvm.org/ce/z/6cqdt- --- .../InstCombine/InstCombineCompares.cpp | 12 +- .../ValueTracking/non-negative-phi-bits.ll | 2 +- .../test/Transforms/InstCombine/call-guard.ll | 2 +- llvm/test/Transforms/InstCombine/cast_phi.ll | 2 +- .../Transforms/InstCombine/cmp-intrinsic.ll | 4 +- .../InstCombine/fold-ctpop-of-not.ll | 10 +- .../InstCombine/fold-log2-ceil-idiom.ll | 18 +- .../InstCombine/gep-combine-loop-invariant.ll | 2 +- .../Transforms/InstCombine/icmp-mul-zext.ll | 2 +- llvm/test/Transforms/InstCombine/icmp-mul.ll | 8 +- .../Transforms/InstCombine/icmp-ne-pow2.ll | 4 +- .../InstCombine/icmp-of-trunc-ext.ll | 12 +- .../test/Transforms/InstCombine/icmp-range.ll | 10 +- llvm/test/Transforms/InstCombine/icmp-shr.ll | 10 +- ...al-to-icmp-eq-of-lshr-val-by-bits-and-0.ll | 2 +- ...al-to-icmp-ne-of-lshr-val-by-bits-and-0.ll | 2 +- .../Transforms/InstCombine/icmp-vscale.ll | 4 +- llvm/test/Transforms/InstCombine/icmp.ll | 8 +- .../icmp_sdiv_with_and_without_range.ll | 2 +- .../InstCombine/indexed-gep-compares.ll | 2 +- llvm/test/Transforms/InstCombine/ispow2.ll | 106 ++++----- .../InstCombine/load-bitcast-select.ll | 2 +- .../InstCombine/lshr-and-negC-icmpeq-zero.ll | 2 +- llvm/test/Transforms/InstCombine/memchr.ll | 2 +- .../phi-known-bits-operand-order.ll | 4 +- llvm/test/Transforms/InstCombine/pr100298.ll | 4 +- llvm/test/Transforms/InstCombine/pr63791.ll | 2 +- llvm/test/Transforms/InstCombine/rem.ll | 4 +- .../remove-loop-phi-multiply-by-zero.ll | 16 +- llvm/test/Transforms/InstCombine/strchr-1.ll | 2 +- llvm/test/Transforms/InstCombine/sub.ll | 4 +- .../AArch64/sve-interleaved-accesses.ll | 18 +- .../sve-interleaved-masked-accesses.ll | 220 +++++++++--------- .../LoopVectorize/AArch64/sve-widen-phi.ll | 2 +- .../x86-interleaved-accesses-masked-group.ll | 2 +- .../LoopVectorize/interleaved-accesses.ll | 6 +- .../AArch64/matrix-extract-insert.ll | 26 +-- .../Transforms/PhaseOrdering/X86/ctlz-loop.ll | 2 +- .../PhaseOrdering/gvn-replacement-vs-hoist.ll | 2 +- 39 files changed, 274 insertions(+), 270 deletions(-) diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp index 18a6fdcec172..fb6d7a72f2f6 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp @@ -6771,11 +6771,15 @@ Instruction *InstCombinerImpl::foldICmpUsingKnownBits(ICmpInst &I) { } // Turn a signed comparison into an unsigned one if both operands are known to - // have the same sign. - if (I.isSigned() && + // have the same sign. Set samesign if possible (except for equality + // predicates). 
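+  // Sketch of the effect (illustrative values, not taken from a test): with
+  // the sign bit of both operands known zero,
+  //   %c = icmp slt i32 %a, %b   -->   %c = icmp samesign ult i32 %a, %b
+  // and an already-unsigned compare merely gains the samesign flag, which
+  // later folds can exploit.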
+ if ((I.isSigned() || (I.isUnsigned() && !I.hasSameSign())) && ((Op0Known.Zero.isNegative() && Op1Known.Zero.isNegative()) || - (Op0Known.One.isNegative() && Op1Known.One.isNegative()))) - return new ICmpInst(I.getUnsignedPredicate(), Op0, Op1); + (Op0Known.One.isNegative() && Op1Known.One.isNegative()))) { + I.setPredicate(I.getUnsignedPredicate()); + I.setSameSign(); + return &I; + } return nullptr; } diff --git a/llvm/test/Analysis/ValueTracking/non-negative-phi-bits.ll b/llvm/test/Analysis/ValueTracking/non-negative-phi-bits.ll index 410320665e36..97f6bf927be2 100644 --- a/llvm/test/Analysis/ValueTracking/non-negative-phi-bits.ll +++ b/llvm/test/Analysis/ValueTracking/non-negative-phi-bits.ll @@ -8,7 +8,7 @@ define void @test() #0 { ; CHECK: for.body: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ult i64 [[INDVARS_IV]], 39 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp samesign ult i64 [[INDVARS_IV]], 39 ; CHECK-NEXT: br i1 [[EXITCOND]], label [[FOR_END:%.*]], label [[FOR_BODY]] ; CHECK: for.end: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/InstCombine/call-guard.ll b/llvm/test/Transforms/InstCombine/call-guard.ll index 6b31c78118d0..bc5f319e64f7 100644 --- a/llvm/test/Transforms/InstCombine/call-guard.ll +++ b/llvm/test/Transforms/InstCombine/call-guard.ll @@ -43,7 +43,7 @@ define void @test_guard_adjacent_diff_cond2(i32 %V1, i32 %V2) { ; CHECK-NEXT: [[TMP1:%.*]] = and i32 [[V1:%.*]], [[V2:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = icmp slt i32 [[TMP1]], 0 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[V1]], 255 -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[AND]], 129 +; CHECK-NEXT: [[C:%.*]] = icmp samesign ult i32 [[AND]], 129 ; CHECK-NEXT: [[TMP3:%.*]] = and i1 [[TMP2]], [[C]] ; CHECK-NEXT: call void (i1, ...) 
@llvm.experimental.guard(i1 [[TMP3]], i32 123) [ "deopt"() ] ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/InstCombine/cast_phi.ll b/llvm/test/Transforms/InstCombine/cast_phi.ll index 99da3ac1c7c8..2819b7d05f7b 100644 --- a/llvm/test/Transforms/InstCombine/cast_phi.ll +++ b/llvm/test/Transforms/InstCombine/cast_phi.ll @@ -1,5 +1,5 @@ ; NOTE: Assertions have been autogenerated by utils/update_test_checks.py -; RUN: opt < %s -passes=instcombine -S | FileCheck %s +; RUN: opt < %s -passes="instcombine" -S | FileCheck %s target datalayout = "n32:64" diff --git a/llvm/test/Transforms/InstCombine/cmp-intrinsic.ll b/llvm/test/Transforms/InstCombine/cmp-intrinsic.ll index 9a9f359fa80b..67815e41ecd3 100644 --- a/llvm/test/Transforms/InstCombine/cmp-intrinsic.ll +++ b/llvm/test/Transforms/InstCombine/cmp-intrinsic.ll @@ -386,7 +386,7 @@ define i1 @cttz_ugt_other_multiuse_i33(i33 %x, ptr %p) { ; CHECK-LABEL: @cttz_ugt_other_multiuse_i33( ; CHECK-NEXT: [[TZ:%.*]] = tail call range(i33 0, 34) i33 @llvm.cttz.i33(i33 [[X:%.*]], i1 false) ; CHECK-NEXT: store i33 [[TZ]], ptr [[P:%.*]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i33 [[TZ]], 16 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i33 [[TZ]], 16 ; CHECK-NEXT: ret i1 [[CMP]] ; %tz = tail call i33 @llvm.cttz.i33(i33 %x, i1 false) @@ -430,7 +430,7 @@ define <2 x i1> @cttz_ult_other_multiuse_v2i32(<2 x i32> %x, ptr %p) { ; CHECK-LABEL: @cttz_ult_other_multiuse_v2i32( ; CHECK-NEXT: [[TZ:%.*]] = tail call range(i32 0, 33) <2 x i32> @llvm.cttz.v2i32(<2 x i32> [[X:%.*]], i1 false) ; CHECK-NEXT: store <2 x i32> [[TZ]], ptr [[P:%.*]], align 8 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult <2 x i32> [[TZ]], +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult <2 x i32> [[TZ]], ; CHECK-NEXT: ret <2 x i1> [[CMP]] ; %tz = tail call <2 x i32> @llvm.cttz.v2i32(<2 x i32> %x, i1 false) diff --git a/llvm/test/Transforms/InstCombine/fold-ctpop-of-not.ll b/llvm/test/Transforms/InstCombine/fold-ctpop-of-not.ll index 4626d19bd289..6d5fde364c23 100644 --- a/llvm/test/Transforms/InstCombine/fold-ctpop-of-not.ll +++ b/llvm/test/Transforms/InstCombine/fold-ctpop-of-not.ll @@ -160,7 +160,7 @@ define i1 @fold_cmp_ult_ctpop_c(i8 %x, i8 %y, i1 %cond) { ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 -16, [[Y:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[COND:%.*]], i8 [[X:%.*]], i8 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[TMP2]]) -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP3]], 3 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i8 [[TMP3]], 3 ; CHECK-NEXT: ret i1 [[R]] ; %nx = xor i8 %x, -1 @@ -176,7 +176,7 @@ define i1 @fold_cmp_sle_ctpop_c(i8 %x, i8 %y, i1 %cond) { ; CHECK-NEXT: [[TMP1:%.*]] = sub i8 -16, [[Y:%.*]] ; CHECK-NEXT: [[TMP2:%.*]] = select i1 [[COND:%.*]], i8 [[X:%.*]], i8 [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[TMP2]]) -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP3]], 4 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i8 [[TMP3]], 4 ; CHECK-NEXT: ret i1 [[R]] ; %nx = xor i8 %x, -1 @@ -191,7 +191,7 @@ define i1 @fold_cmp_ult_ctpop_c_no_not_inst_save_fail(i8 %x) { ; CHECK-LABEL: @fold_cmp_ult_ctpop_c_no_not_inst_save_fail( ; CHECK-NEXT: [[NX:%.*]] = xor i8 [[X:%.*]], -2 ; CHECK-NEXT: [[CNT:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[NX]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[CNT]], 5 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ult i8 [[CNT]], 5 ; CHECK-NEXT: ret i1 [[R]] ; %nx = xor i8 %x, -2 @@ -203,7 +203,7 @@ define i1 @fold_cmp_ult_ctpop_c_no_not_inst_save_fail(i8 %x) { define <2 x i1> 
@fold_cmp_ugt_ctpop_c(<2 x i8> %x) { ; CHECK-LABEL: @fold_cmp_ugt_ctpop_c( ; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[X:%.*]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult <2 x i8> [[TMP1]], +; CHECK-NEXT: [[R:%.*]] = icmp samesign ult <2 x i8> [[TMP1]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; %nx = xor <2 x i8> %x, @@ -216,7 +216,7 @@ define <2 x i1> @fold_cmp_ugt_ctpop_c_out_of_range_fail(<2 x i8> %x) { ; CHECK-LABEL: @fold_cmp_ugt_ctpop_c_out_of_range_fail( ; CHECK-NEXT: [[NX:%.*]] = xor <2 x i8> [[X:%.*]], ; CHECK-NEXT: [[CNT:%.*]] = call range(i8 0, 9) <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[NX]]) -; CHECK-NEXT: [[R:%.*]] = icmp ugt <2 x i8> [[CNT]], +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt <2 x i8> [[CNT]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; %nx = xor <2 x i8> %x, diff --git a/llvm/test/Transforms/InstCombine/fold-log2-ceil-idiom.ll b/llvm/test/Transforms/InstCombine/fold-log2-ceil-idiom.ll index 17e51e73201b..656673e7cb20 100644 --- a/llvm/test/Transforms/InstCombine/fold-log2-ceil-idiom.ll +++ b/llvm/test/Transforms/InstCombine/fold-log2-ceil-idiom.ll @@ -118,7 +118,7 @@ define i32 @log2_ceil_idiom_x_may_be_zero(i32 %x) { ; CHECK-NEXT: [[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[X]], i1 false) ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[CTLZ]], 31 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i32 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i32 [[RET]] @@ -139,7 +139,7 @@ define i4 @log2_ceil_idiom_trunc_too_short(i32 %x) { ; CHECK-NEXT: [[TRUNC:%.*]] = trunc i32 [[CTLZ]] to i4 ; CHECK-NEXT: [[XOR:%.*]] = xor i4 [[TRUNC]], -1 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i4 ; CHECK-NEXT: [[RET:%.*]] = add i4 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i4 [[RET]] @@ -160,7 +160,7 @@ define i32 @log2_ceil_idiom_mismatched_operands(i32 %x, i32 %y) { ; CHECK-NEXT: [[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[X]], i1 true) ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[CTLZ]], 31 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[Y]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i32 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i32 [[RET]] @@ -180,7 +180,7 @@ define i32 @log2_ceil_idiom_wrong_constant(i32 %x) { ; CHECK-NEXT: [[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[X]], i1 true) ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[CTLZ]], 30 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i32 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i32 [[RET]] @@ -220,7 +220,7 @@ define i32 @log2_ceil_idiom_not_a_power2_test2(i32 %x) { ; CHECK-NEXT: [[CTLZ:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[X]], i1 true) ; CHECK-NEXT: [[XOR:%.*]] = xor i32 
[[CTLZ]], 31 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 2 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i32 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i32 [[RET]] @@ -241,7 +241,7 @@ define i32 @log2_ceil_idiom_multiuse2(i32 %x) { ; CHECK-NEXT: call void @use32(i32 [[CTLZ]]) ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[CTLZ]], 31 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i32 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i32 [[RET]] @@ -263,7 +263,7 @@ define i32 @log2_ceil_idiom_multiuse3(i32 %x) { ; CHECK-NEXT: [[XOR:%.*]] = xor i32 [[CTLZ]], 31 ; CHECK-NEXT: call void @use32(i32 [[XOR]]) ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i32 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i32 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i32 [[RET]] @@ -286,7 +286,7 @@ define i5 @log2_ceil_idiom_trunc_multiuse4(i32 %x) { ; CHECK-NEXT: call void @use5(i5 [[TRUNC]]) ; CHECK-NEXT: [[XOR:%.*]] = xor i5 [[TRUNC]], -1 ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i5 ; CHECK-NEXT: [[RET:%.*]] = add i5 [[XOR]], [[ZEXT]] ; CHECK-NEXT: ret i5 [[RET]] @@ -310,7 +310,7 @@ define i64 @log2_ceil_idiom_zext_multiuse5(i32 %x) { ; CHECK-NEXT: [[EXT:%.*]] = zext nneg i32 [[XOR]] to i64 ; CHECK-NEXT: call void @use64(i64 [[EXT]]) ; CHECK-NEXT: [[CTPOP:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: [[ZEXT:%.*]] = zext i1 [[CMP]] to i64 ; CHECK-NEXT: [[RET:%.*]] = add nuw nsw i64 [[EXT]], [[ZEXT]] ; CHECK-NEXT: ret i64 [[RET]] diff --git a/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll b/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll index b2bc1abeaba5..1cb7cf99bea3 100644 --- a/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll +++ b/llvm/test/Transforms/InstCombine/gep-combine-loop-invariant.ll @@ -217,7 +217,7 @@ define float @gep_cross_loop(ptr %_arg_, ptr %_arg_3, float %_arg_8) { ; CHECK: for.cond.i: ; CHECK-NEXT: [[IDX:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[ADD11_I:%.*]], [[FOR_BODY_I:%.*]] ] ; CHECK-NEXT: [[SUM:%.*]] = phi float [ 0.000000e+00, [[ENTRY]] ], [ [[ADD_I:%.*]], [[FOR_BODY_I]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[IDX]], 17 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i64 [[IDX]], 17 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY_I]], label [[FOR_COND_I_I_I_PREHEADER:%.*]] ; CHECK: for.cond.i.i.i.preheader: ; CHECK-NEXT: ret float [[SUM]] diff --git a/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll b/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll index 07536f271ceb..653b818f7eb5 100644 --- a/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll +++ 
b/llvm/test/Transforms/InstCombine/icmp-mul-zext.ll @@ -13,7 +13,7 @@ define i32 @sterix(i32, i8, i64) { ; CHECK-NEXT: [[SHR:%.*]] = lshr i32 [[MUL]], [[SH_PROM]] ; CHECK-NEXT: [[CONV2:%.*]] = zext i32 [[SHR]] to i64 ; CHECK-NEXT: [[MUL3:%.*]] = mul nuw nsw i64 [[CONV]], [[CONV2]] -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp ult i64 [[MUL3]], 4294967296 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp samesign ult i64 [[MUL3]], 4294967296 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[LOR_RHS:%.*]], label [[LOR_END:%.*]] ; CHECK: lor.rhs: ; CHECK-NEXT: [[AND:%.*]] = and i64 [[TMP2]], [[MUL3]] diff --git a/llvm/test/Transforms/InstCombine/icmp-mul.ll b/llvm/test/Transforms/InstCombine/icmp-mul.ll index a14f342ae248..c4543c9deef3 100644 --- a/llvm/test/Transforms/InstCombine/icmp-mul.ll +++ b/llvm/test/Transforms/InstCombine/icmp-mul.ll @@ -849,7 +849,7 @@ define i1 @not_mul_of_bool(i32 %x, i8 %y) { ; CHECK-NEXT: [[Q:%.*]] = and i32 [[X:%.*]], 3 ; CHECK-NEXT: [[Z:%.*]] = zext i8 [[Y:%.*]] to i32 ; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[Q]], [[Z]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 255 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i32 [[M]], 255 ; CHECK-NEXT: ret i1 [[R]] ; %q = and i32 %x, 3 @@ -866,7 +866,7 @@ define i1 @not_mul_of_bool_commute(i32 %x, i32 %y) { ; CHECK-NEXT: [[X30:%.*]] = lshr i32 [[X:%.*]], 30 ; CHECK-NEXT: [[Y8:%.*]] = and i32 [[Y:%.*]], 255 ; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[Y8]], [[X30]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 255 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i32 [[M]], 255 ; CHECK-NEXT: ret i1 [[R]] ; %x30 = lshr i32 %x, 30 @@ -935,7 +935,7 @@ define i1 @not_mul_of_pow2(i32 %x, i8 %y) { ; CHECK-NEXT: [[Q:%.*]] = and i32 [[X:%.*]], 6 ; CHECK-NEXT: [[Z:%.*]] = zext i8 [[Y:%.*]] to i32 ; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[Q]], [[Z]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 1530 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i32 [[M]], 1530 ; CHECK-NEXT: ret i1 [[R]] ; %q = and i32 %x, 6 @@ -952,7 +952,7 @@ define i1 @not_mul_of_pow2_commute(i32 %x, i32 %y) { ; CHECK-NEXT: [[X30:%.*]] = and i32 [[X:%.*]], 12 ; CHECK-NEXT: [[Y8:%.*]] = and i32 [[Y:%.*]], 255 ; CHECK-NEXT: [[M:%.*]] = mul nuw nsw i32 [[Y8]], [[X30]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[M]], 3060 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i32 [[M]], 3060 ; CHECK-NEXT: ret i1 [[R]] ; %x30 = and i32 %x, 12 diff --git a/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll b/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll index 618f5d641dc1..b19909a23448 100644 --- a/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll +++ b/llvm/test/Transforms/InstCombine/icmp-ne-pow2.ll @@ -350,7 +350,7 @@ define i32 @not_pow2_32_nonconst_assume(i32 %x, i32 %y) { define i32 @pow2_or_zero_32_nonconst_assume(i32 %x, i32 %y) { ; CHECK-LABEL: @pow2_or_zero_32_nonconst_assume( ; CHECK-NEXT: [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[Y:%.*]]) -; CHECK-NEXT: [[YP2:%.*]] = icmp ult i32 [[CTPOP]], 2 +; CHECK-NEXT: [[YP2:%.*]] = icmp samesign ult i32 [[CTPOP]], 2 ; CHECK-NEXT: call void @llvm.assume(i1 [[YP2]]) ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y]] ; CHECK-NEXT: [[CMP:%.*]] = icmp ne i32 [[AND]], 0 @@ -426,7 +426,7 @@ False: define i32 @pow2_or_zero_32_nonconst_assume_br(i32 %x, i32 %y) { ; CHECK-LABEL: @pow2_or_zero_32_nonconst_assume_br( ; CHECK-NEXT: [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[Y:%.*]]) -; CHECK-NEXT: [[YP2:%.*]] = icmp ult i32 [[CTPOP]], 2 +; CHECK-NEXT: [[YP2:%.*]] = icmp samesign ult i32 [[CTPOP]], 2 ; CHECK-NEXT: 
call void @llvm.assume(i1 [[YP2]]) ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], [[Y]] ; CHECK-NEXT: [[CMP_NOT:%.*]] = icmp eq i32 [[AND]], 0 diff --git a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll index f2a02fac90b1..2c2de5dbf09f 100644 --- a/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll +++ b/llvm/test/Transforms/InstCombine/icmp-of-trunc-ext.ll @@ -70,7 +70,7 @@ define i1 @icmp_trunc_x_trunc_y_2_illegal_anyways(i33 %x, i63 %y) { ; CHECK-NEXT: call void @llvm.assume(i1 [[X_LB_ONLY]]) ; CHECK-NEXT: call void @llvm.assume(i1 [[Y_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = zext nneg i33 [[X]] to i63 -; CHECK-NEXT: [[R:%.*]] = icmp ult i63 [[Y]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = icmp samesign ult i63 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_lb_only = icmp ult i33 %x, 512 @@ -90,7 +90,7 @@ define i1 @icmp_trunc_x_trunc_y_3(i64 %x, i32 %y) { ; CHECK-NEXT: call void @llvm.assume(i1 [[X_LB_ONLY]]) ; CHECK-NEXT: call void @llvm.assume(i1 [[Y_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw nsw i64 [[X]] to i32 -; CHECK-NEXT: [[R:%.*]] = icmp ule i32 [[Y]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = icmp samesign ule i32 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_lb_only = icmp ult i64 %x, 123 @@ -152,7 +152,7 @@ define i1 @icmp_trunc_x_trunc_y_swap0(i33 %x, i32 %y) { ; CHECK-NEXT: call void @llvm.assume(i1 [[X_LB_ONLY]]) ; CHECK-NEXT: call void @llvm.assume(i1 [[Y_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw nsw i33 [[X]] to i32 -; CHECK-NEXT: [[R:%.*]] = icmp uge i32 [[Y]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = icmp samesign uge i32 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_lb_only = icmp ult i33 %x, 65536 @@ -172,7 +172,7 @@ define i1 @icmp_trunc_x_trunc_y_swap1(i33 %x, i32 %y) { ; CHECK-NEXT: call void @llvm.assume(i1 [[X_LB_ONLY]]) ; CHECK-NEXT: call void @llvm.assume(i1 [[Y_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = trunc nuw nsw i33 [[X]] to i32 -; CHECK-NEXT: [[R:%.*]] = icmp ule i32 [[Y]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = icmp samesign ule i32 [[Y]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_lb_only = icmp ult i33 %x, 65536 @@ -190,7 +190,7 @@ define i1 @icmp_trunc_x_zext_y(i32 %x, i8 %y) { ; CHECK-NEXT: [[X_LB_ONLY:%.*]] = icmp ult i32 [[X:%.*]], 65536 ; CHECK-NEXT: call void @llvm.assume(i1 [[X_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[X]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i32 [[X]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_lb_only = icmp ult i32 %x, 65536 @@ -206,7 +206,7 @@ define i1 @icmp_trunc_x_zext_y_2(i32 %x, i8 %y) { ; CHECK-NEXT: [[X_LB_ONLY:%.*]] = icmp ult i32 [[X:%.*]], 65536 ; CHECK-NEXT: call void @llvm.assume(i1 [[X_LB_ONLY]]) ; CHECK-NEXT: [[TMP1:%.*]] = zext i8 [[Y:%.*]] to i32 -; CHECK-NEXT: [[R:%.*]] = icmp ule i32 [[X]], [[TMP1]] +; CHECK-NEXT: [[R:%.*]] = icmp samesign ule i32 [[X]], [[TMP1]] ; CHECK-NEXT: ret i1 [[R]] ; %x_lb_only = icmp ult i32 %x, 65536 diff --git a/llvm/test/Transforms/InstCombine/icmp-range.ll b/llvm/test/Transforms/InstCombine/icmp-range.ll index 8b690826a7bf..2db5bad17b19 100644 --- a/llvm/test/Transforms/InstCombine/icmp-range.ll +++ b/llvm/test/Transforms/InstCombine/icmp-range.ll @@ -140,7 +140,7 @@ define i1 @test_two_ranges(ptr nocapture readonly %arg1, ptr nocapture readonly ; CHECK-LABEL: @test_two_ranges( ; CHECK-NEXT: [[VAL1:%.*]] = load i32, ptr [[ARG1:%.*]], align 4, !range [[RNG4:![0-9]+]] ; CHECK-NEXT: [[VAL2:%.*]] = load i32, ptr [[ARG2:%.*]], align 4, 
!range [[RNG5:![0-9]+]] -; CHECK-NEXT: [[RVAL:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]] +; CHECK-NEXT: [[RVAL:%.*]] = icmp samesign ult i32 [[VAL2]], [[VAL1]] ; CHECK-NEXT: ret i1 [[RVAL]] ; %val1 = load i32, ptr %arg1, !range !5 @@ -152,7 +152,7 @@ define i1 @test_two_ranges(ptr nocapture readonly %arg1, ptr nocapture readonly ; Values' ranges overlap each other, so it can not be simplified. define i1 @test_two_attribute_ranges(i32 range(i32 5, 10) %arg1, i32 range(i32 8, 16) %arg2) { ; CHECK-LABEL: @test_two_attribute_ranges( -; CHECK-NEXT: [[RVAL:%.*]] = icmp ult i32 [[ARG2:%.*]], [[ARG1:%.*]] +; CHECK-NEXT: [[RVAL:%.*]] = icmp samesign ult i32 [[ARG2:%.*]], [[ARG1:%.*]] ; CHECK-NEXT: ret i1 [[RVAL]] ; %rval = icmp ult i32 %arg2, %arg1 @@ -215,7 +215,7 @@ define <2 x i1> @test_two_ranges_vec(ptr nocapture readonly %arg1, ptr nocapture ; CHECK-LABEL: @test_two_ranges_vec( ; CHECK-NEXT: [[VAL1:%.*]] = load <2 x i32>, ptr [[ARG1:%.*]], align 8, !range [[RNG4]] ; CHECK-NEXT: [[VAL2:%.*]] = load <2 x i32>, ptr [[ARG2:%.*]], align 8, !range [[RNG5]] -; CHECK-NEXT: [[RVAL:%.*]] = icmp ult <2 x i32> [[VAL2]], [[VAL1]] +; CHECK-NEXT: [[RVAL:%.*]] = icmp samesign ult <2 x i32> [[VAL2]], [[VAL1]] ; CHECK-NEXT: ret <2 x i1> [[RVAL]] ; %val1 = load <2 x i32>, ptr %arg1, !range !5 @@ -249,7 +249,7 @@ define <2 x i1> @test_two_ranges_vec_true(ptr nocapture readonly %arg1, ptr noca ; Values' ranges overlap each other, so it can not be simplified. define <2 x i1> @test_two_argument_ranges_vec(<2 x i32> range(i32 5, 10) %arg1, <2 x i32> range(i32 8, 16) %arg2) { ; CHECK-LABEL: @test_two_argument_ranges_vec( -; CHECK-NEXT: [[RVAL:%.*]] = icmp ult <2 x i32> [[ARG2:%.*]], [[ARG1:%.*]] +; CHECK-NEXT: [[RVAL:%.*]] = icmp samesign ult <2 x i32> [[ARG2:%.*]], [[ARG1:%.*]] ; CHECK-NEXT: ret <2 x i1> [[RVAL]] ; %rval = icmp ult <2 x i32> %arg2, %arg1 @@ -283,7 +283,7 @@ define i1 @test_two_return_attribute_ranges_not_simplified() { ; CHECK-LABEL: @test_two_return_attribute_ranges_not_simplified( ; CHECK-NEXT: [[VAL1:%.*]] = call range(i32 5, 10) i32 @create_range1() ; CHECK-NEXT: [[VAL2:%.*]] = call i32 @create_range2() -; CHECK-NEXT: [[RVAL:%.*]] = icmp ult i32 [[VAL2]], [[VAL1]] +; CHECK-NEXT: [[RVAL:%.*]] = icmp samesign ult i32 [[VAL2]], [[VAL1]] ; CHECK-NEXT: ret i1 [[RVAL]] ; %val1 = call range(i32 5, 10) i32 @create_range1() diff --git a/llvm/test/Transforms/InstCombine/icmp-shr.ll b/llvm/test/Transforms/InstCombine/icmp-shr.ll index 71b4f5a970c2..bdcba9ed1549 100644 --- a/llvm/test/Transforms/InstCombine/icmp-shr.ll +++ b/llvm/test/Transforms/InstCombine/icmp-shr.ll @@ -1072,7 +1072,7 @@ define <2 x i1> @lshr_pow2_ugt_vec(<2 x i8> %x) { define i1 @lshr_not_pow2_ugt(i8 %x) { ; CHECK-LABEL: @lshr_not_pow2_ugt( ; CHECK-NEXT: [[S:%.*]] = lshr i8 3, [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[S]], 1 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i8 [[S]], 1 ; CHECK-NEXT: ret i1 [[R]] ; %s = lshr i8 3, %x @@ -1095,7 +1095,7 @@ define i1 @lshr_pow2_ugt1(i8 %x) { define i1 @ashr_pow2_ugt(i8 %x) { ; CHECK-LABEL: @ashr_pow2_ugt( ; CHECK-NEXT: [[S:%.*]] = ashr exact i8 -128, [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[S]], -96 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i8 [[S]], -96 ; CHECK-NEXT: ret i1 [[R]] ; %s = ashr i8 128, %x @@ -1154,7 +1154,7 @@ define <2 x i1> @lshr_pow2_ult_vec(<2 x i8> %x) { define i1 @lshr_not_pow2_ult(i8 %x) { ; CHECK-LABEL: @lshr_not_pow2_ult( ; CHECK-NEXT: [[S:%.*]] = lshr i8 3, [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[S]], 2 +; CHECK-NEXT: [[R:%.*]] = icmp 
samesign ult i8 [[S]], 2 ; CHECK-NEXT: ret i1 [[R]] ; %s = lshr i8 3, %x @@ -1187,7 +1187,7 @@ define i1 @lshr_pow2_ult_smin(i8 %x) { define i1 @ashr_pow2_ult(i8 %x) { ; CHECK-LABEL: @ashr_pow2_ult( ; CHECK-NEXT: [[S:%.*]] = ashr exact i8 -128, [[X:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[S]], -96 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ult i8 [[S]], -96 ; CHECK-NEXT: ret i1 [[R]] ; %s = ashr i8 128, %x @@ -1631,7 +1631,7 @@ define i1 @slt_zero_ult_i1_fail1(i32 %a, i1 %b) { ; CHECK-LABEL: @slt_zero_ult_i1_fail1( ; CHECK-NEXT: [[CONV:%.*]] = zext i1 [[B:%.*]] to i32 ; CHECK-NEXT: [[CMP1:%.*]] = lshr i32 [[A:%.*]], 30 -; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 [[CMP1]], [[CONV]] +; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign ugt i32 [[CMP1]], [[CONV]] ; CHECK-NEXT: ret i1 [[CMP2]] ; %conv = zext i1 %b to i32 diff --git a/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll index ba47ed02edbd..c185e632b519 100644 --- a/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-uge-of-not-of-shl-allones-by-bits-and-val-to-icmp-eq-of-lshr-val-by-bits-and-0.ll @@ -101,7 +101,7 @@ define i1 @both(i8 %bits0, i8 %bits1) { ; CHECK-LABEL: @both( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[BITS0:%.*]] ; CHECK-NEXT: [[T2:%.*]] = shl nsw i8 -1, [[BITS1:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ule i8 [[T0]], [[T2]] +; CHECK-NEXT: [[R:%.*]] = icmp samesign ule i8 [[T0]], [[T2]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = shl i8 -1, %bits0 diff --git a/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll b/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll index 37aa85202e56..c1912e11b93a 100644 --- a/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll +++ b/llvm/test/Transforms/InstCombine/icmp-ult-of-not-of-shl-allones-by-bits-and-val-to-icmp-ne-of-lshr-val-by-bits-and-0.ll @@ -101,7 +101,7 @@ define i1 @both(i8 %bits0, i8 %bits1) { ; CHECK-LABEL: @both( ; CHECK-NEXT: [[T0:%.*]] = shl nsw i8 -1, [[BITS0:%.*]] ; CHECK-NEXT: [[T2:%.*]] = shl nsw i8 -1, [[BITS1:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[T0]], [[T2]] +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i8 [[T0]], [[T2]] ; CHECK-NEXT: ret i1 [[R]] ; %t0 = shl i8 -1, %bits0 diff --git a/llvm/test/Transforms/InstCombine/icmp-vscale.ll b/llvm/test/Transforms/InstCombine/icmp-vscale.ll index ae1be58938aa..a13e90456d63 100644 --- a/llvm/test/Transforms/InstCombine/icmp-vscale.ll +++ b/llvm/test/Transforms/InstCombine/icmp-vscale.ll @@ -94,7 +94,7 @@ define i1 @vscale_ule_max() vscale_range(4,8) { define i1 @vscale_ult_max() vscale_range(4,8) { ; CHECK-LABEL: @vscale_ult_max( ; CHECK-NEXT: [[VSCALE:%.*]] = call i16 @llvm.vscale.i16() -; CHECK-NEXT: [[RES:%.*]] = icmp ult i16 [[VSCALE]], 8 +; CHECK-NEXT: [[RES:%.*]] = icmp samesign ult i16 [[VSCALE]], 8 ; CHECK-NEXT: ret i1 [[RES]] ; %vscale = call i16 @llvm.vscale.i16() @@ -114,7 +114,7 @@ define i1 @vscale_uge_min() vscale_range(4,8) { define i1 @vscale_ugt_min() vscale_range(4,8) { ; CHECK-LABEL: @vscale_ugt_min( ; CHECK-NEXT: [[VSCALE:%.*]] = call i16 @llvm.vscale.i16() -; CHECK-NEXT: [[RES:%.*]] = icmp ugt i16 
[[VSCALE]], 4 +; CHECK-NEXT: [[RES:%.*]] = icmp samesign ugt i16 [[VSCALE]], 4 ; CHECK-NEXT: ret i1 [[RES]] ; %vscale = call i16 @llvm.vscale.i16() diff --git a/llvm/test/Transforms/InstCombine/icmp.ll b/llvm/test/Transforms/InstCombine/icmp.ll index 7cafb4885ff0..c695dc1cd69c 100644 --- a/llvm/test/Transforms/InstCombine/icmp.ll +++ b/llvm/test/Transforms/InstCombine/icmp.ll @@ -1457,7 +1457,7 @@ define <2 x i1> @test67vecinverse(<2 x i32> %x) { define i1 @test68(i32 %x) { ; CHECK-LABEL: @test68( ; CHECK-NEXT: [[AND:%.*]] = and i32 [[X:%.*]], 127 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[AND]], 30 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[AND]], 30 ; CHECK-NEXT: ret i1 [[CMP]] ; %and = and i32 %x, 127 @@ -2213,7 +2213,7 @@ define i1 @icmp_and_ashr_mixed_and_shiftout(i8 %x) { ; CHECK-LABEL: @icmp_and_ashr_mixed_and_shiftout( ; CHECK-NEXT: [[ASHR:%.*]] = ashr i8 [[X:%.*]], 4 ; CHECK-NEXT: [[AND:%.*]] = and i8 [[ASHR]], 31 -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[AND]], 8 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i8 [[AND]], 8 ; CHECK-NEXT: ret i1 [[CMP]] ; %ashr = ashr i8 %x, 4 @@ -5334,7 +5334,7 @@ define i1 @test_icmp_shl_sgt(i64 %x) { define i1 @pr94897(i32 range(i32 -2147483648, 0) %x) { ; CHECK-LABEL: @pr94897( -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[X:%.*]], -3 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[X:%.*]], -3 ; CHECK-NEXT: ret i1 [[CMP]] ; %shl = shl nsw i32 %x, 24 @@ -5364,7 +5364,7 @@ define i1 @icmp_and_inv_pow2_ne_0(i32 %A, i32 %B) { define i1 @icmp_and_inv_pow2_or_zero_ne_0(i32 %A, i32 %B) { ; CHECK-LABEL: @icmp_and_inv_pow2_or_zero_ne_0( ; CHECK-NEXT: [[POPCNT:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[A:%.*]]) -; CHECK-NEXT: [[COND:%.*]] = icmp ult i32 [[POPCNT]], 2 +; CHECK-NEXT: [[COND:%.*]] = icmp samesign ult i32 [[POPCNT]], 2 ; CHECK-NEXT: call void @llvm.assume(i1 [[COND]]) ; CHECK-NEXT: [[INV:%.*]] = xor i32 [[B:%.*]], -1 ; CHECK-NEXT: [[AND:%.*]] = and i32 [[A]], [[INV]] diff --git a/llvm/test/Transforms/InstCombine/icmp_sdiv_with_and_without_range.ll b/llvm/test/Transforms/InstCombine/icmp_sdiv_with_and_without_range.ll index baad241f8dad..beab27392507 100644 --- a/llvm/test/Transforms/InstCombine/icmp_sdiv_with_and_without_range.ll +++ b/llvm/test/Transforms/InstCombine/icmp_sdiv_with_and_without_range.ll @@ -19,7 +19,7 @@ define i1 @without_range(ptr %A) { define i1 @with_range(ptr %A) { ; CHECK-LABEL: @with_range( ; CHECK-NEXT: [[A_VAL:%.*]] = load i32, ptr [[A:%.*]], align 8, !range [[RNG0:![0-9]+]] -; CHECK-NEXT: [[C:%.*]] = icmp ult i32 [[A_VAL]], 2 +; CHECK-NEXT: [[C:%.*]] = icmp samesign ult i32 [[A_VAL]], 2 ; CHECK-NEXT: ret i1 [[C]] ; %A.val = load i32, ptr %A, align 8, !range !0 diff --git a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll index 79511cf9e666..1110bbc5403e 100644 --- a/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll +++ b/llvm/test/Transforms/InstCombine/indexed-gep-compares.ll @@ -40,7 +40,7 @@ define ptr @test1_nuw(ptr %A, i32 %Offset) { ; CHECK: bb: ; CHECK-NEXT: [[RHS_IDX:%.*]] = phi i32 [ [[RHS_ADD:%.*]], [[BB]] ], [ [[TMP_IDX]], [[ENTRY:%.*]] ] ; CHECK-NEXT: [[RHS_ADD]] = add nuw nsw i32 [[RHS_IDX]], 4 -; CHECK-NEXT: [[COND:%.*]] = icmp ugt i32 [[RHS_IDX]], 400 +; CHECK-NEXT: [[COND:%.*]] = icmp samesign ugt i32 [[RHS_IDX]], 400 ; CHECK-NEXT: br i1 [[COND]], label [[BB2:%.*]], label [[BB]] ; CHECK: bb2: ; CHECK-NEXT: [[RHS_PTR:%.*]] = getelementptr inbounds nuw i8, ptr [[A:%.*]], i32 [[RHS_IDX]] diff 
--git a/llvm/test/Transforms/InstCombine/ispow2.ll b/llvm/test/Transforms/InstCombine/ispow2.ll
index 348508769c58..7ace998556c7 100644
--- a/llvm/test/Transforms/InstCombine/ispow2.ll
+++ b/llvm/test/Transforms/InstCombine/ispow2.ll
@@ -4,7 +4,7 @@
 define i1 @is_pow2or0_negate_op(i32 %x) {
 ; CHECK-LABEL: @is_pow2or0_negate_op(
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], 2
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[TMP1]], 2
 ; CHECK-NEXT: ret i1 [[CMP]]
 ;
 %neg = sub i32 0, %x
@@ -16,7 +16,7 @@ define i1 @is_pow2or0_negate_op(i32 %x) {
 define <2 x i1> @is_pow2or0_negate_op_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @is_pow2or0_negate_op_vec(
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult <2 x i32> [[TMP1]], <i32 2, i32 2>
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult <2 x i32> [[TMP1]], <i32 2, i32 2>
 ; CHECK-NEXT: ret <2 x i1> [[CMP]]
 ;
 %neg = sub <2 x i32> zeroinitializer, %x
@@ -28,7 +28,7 @@ define <2 x i1> @is_pow2or0_negate_op_vec(<2 x i32> %x) {
 define i1 @is_pow2or0_decrement_op(i8 %x) {
 ; CHECK-LABEL: @is_pow2or0_decrement_op(
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[X:%.*]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[TMP1]], 2
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i8 [[TMP1]], 2
 ; CHECK-NEXT: ret i1 [[CMP]]
 ;
 %dec = add i8 %x, -1
@@ -40,7 +40,7 @@ define i1 @is_pow2or0_decrement_op(i8 %x) {
 define <2 x i1> @is_pow2or0_decrement_op_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @is_pow2or0_decrement_op_vec(
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[X:%.*]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult <2 x i8> [[TMP1]], <i8 2, i8 2>
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult <2 x i8> [[TMP1]], <i8 2, i8 2>
 ; CHECK-NEXT: ret <2 x i1> [[CMP]]
 ;
 %dec = add <2 x i8> %x, <i8 -1, i8 -1>
@@ -52,7 +52,7 @@ define <2 x i1> @is_pow2or0_decrement_op_vec(<2 x i8> %x) {
 define i1 @isnot_pow2or0_negate_op(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2or0_negate_op(
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP1]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[TMP1]], 1
 ; CHECK-NEXT: ret i1 [[CMP]]
 ;
 %neg = sub i32 0, %x
@@ -64,7 +64,7 @@ define i1 @isnot_pow2or0_negate_op(i32 %x) {
 define <2 x i1> @isnot_pow2or0_negate_op_vec(<2 x i32> %x) {
 ; CHECK-LABEL: @isnot_pow2or0_negate_op_vec(
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 33) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X:%.*]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt <2 x i32> [[TMP1]], <i32 1, i32 1>
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt <2 x i32> [[TMP1]], <i32 1, i32 1>
 ; CHECK-NEXT: ret <2 x i1> [[CMP]]
 ;
 %neg = sub <2 x i32> zeroinitializer, %x
@@ -76,7 +76,7 @@ define <2 x i1> @isnot_pow2or0_negate_op_vec(<2 x i32> %x) {
 define i1 @isnot_pow2or0_decrement_op(i8 %x) {
 ; CHECK-LABEL: @isnot_pow2or0_decrement_op(
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[X:%.*]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i8 [[TMP1]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i8 [[TMP1]], 1
 ; CHECK-NEXT: ret i1 [[CMP]]
 ;
 %dec = add i8 %x, -1
@@ -88,7 +88,7 @@ define i1 @isnot_pow2or0_decrement_op(i8 %x) {
 define <2 x i1> @isnot_pow2or0_decrement_op_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @isnot_pow2or0_decrement_op_vec(
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[X:%.*]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt <2 x i8> [[TMP1]], <i8 1, i8 1>
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt <2 x i8> [[TMP1]], <i8 1, i8 1>
 ; CHECK-NEXT: ret <2 x i1> [[CMP]]
 ;
 %dec = add <2 x i8> %x, <i8 -1, i8 -1>
@@ -101,7 +101,7 @@ define i1 @is_pow2or0_negate_op_commute1(i32 %p) {
 ; CHECK-LABEL: @is_pow2or0_negate_op_commute1(
 ; CHECK-NEXT: [[X:%.*]] = srem i32 42, [[P:%.*]]
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 7) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], 2
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[TMP1]], 2
 ; CHECK-NEXT: ret i1 [[CMP]]
 ;
 %x = srem i32 42, %p ; thwart complexity-based canonicalization
@@ -117,7 +117,7 @@ define i1 @isnot_pow2or0_negate_op_commute2(i32 %p) {
 ; CHECK-LABEL: @isnot_pow2or0_negate_op_commute2(
 ; CHECK-NEXT: [[X:%.*]] = urem i32 42, [[P:%.*]]
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 7) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP1]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[TMP1]], 1
 ; CHECK-NEXT: ret i1 [[CMP]]
 ;
 %x = urem i32 42, %p ; thwart complexity-based canonicalization
@@ -131,7 +131,7 @@ define i1 @isnot_pow2or0_negate_op_commute3(i32 %p) {
 ; CHECK-LABEL: @isnot_pow2or0_negate_op_commute3(
 ; CHECK-NEXT: [[X:%.*]] = urem i32 42, [[P:%.*]]
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 7) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[TMP1]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[TMP1]], 1
 ; CHECK-NEXT: ret i1 [[CMP]]
 ;
 %x = urem i32 42, %p ; thwart complexity-based canonicalization
@@ -148,7 +148,7 @@ define i1 @is_pow2or0_negate_op_extra_use1(i32 %x) {
 ; CHECK-NEXT: [[NEG:%.*]] = sub i32 0, [[X:%.*]]
 ; CHECK-NEXT: call void @use(i32 [[NEG]])
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[TMP1]], 2
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[TMP1]], 2
 ; CHECK-NEXT: ret i1 [[CMP]]
 ;
 %neg = sub i32 0, %x
@@ -198,7 +198,7 @@ define i1 @is_pow2_non_zero_ult_2(i32 %x) {
 ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
 ; CHECK-NEXT: call void @llvm.assume(i1 [[NOTZERO]])
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT: ret i1 [[CMP]]
 ;
 %notzero = icmp ne i32 %x, 0
@@ -228,7 +228,7 @@ define i1 @is_pow2_non_zero_ugt_1(i32 %x) {
 ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X:%.*]], 0
 ; CHECK-NEXT: call void @llvm.assume(i1 [[NOTZERO]])
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT: ret i1 [[CMP]]
 ;
 %notzero = icmp ne i32 %x, 0
@@ -272,7 +272,7 @@ declare void @use_i1(i1)
 define i1 @is_pow2_ctpop_extra_uses(i32 %x) {
 ; CHECK-LABEL: @is_pow2_ctpop_extra_uses(
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT: call void @use_i1(i1 [[CMP]])
 ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0
 ; CHECK-NEXT: call void @use_i1(i1 [[NOTZERO]])
@@ -291,7 +291,7 @@ define i1 @is_pow2_ctpop_extra_uses(i32 %x) {
 define i1 @is_pow2_ctpop_extra_uses_logical(i32 %x) {
 ; CHECK-LABEL: @is_pow2_ctpop_extra_uses_logical(
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT: [[CMP:%.*]] =
icmp ult i32 [[T0]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 2 ; CHECK-NEXT: call void @use_i1(i1 [[CMP]]) ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0 ; CHECK-NEXT: call void @use_i1(i1 [[NOTZERO]]) @@ -327,7 +327,7 @@ define <2 x i1> @is_pow2_ctpop_commute_vec(<2 x i8> %x) { define i1 @is_pow2_ctpop_wrong_cmp_op1(i32 %x) { ; CHECK-LABEL: @is_pow2_ctpop_wrong_cmp_op1( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[T0]], 3 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 3 ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0 ; CHECK-NEXT: [[R:%.*]] = and i1 [[NOTZERO]], [[CMP]] ; CHECK-NEXT: ret i1 [[R]] @@ -342,7 +342,7 @@ define i1 @is_pow2_ctpop_wrong_cmp_op1(i32 %x) { define i1 @is_pow2_ctpop_wrong_cmp_op1_logical(i32 %x) { ; CHECK-LABEL: @is_pow2_ctpop_wrong_cmp_op1_logical( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[T0]], 3 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 3 ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0 ; CHECK-NEXT: [[R:%.*]] = select i1 [[NOTZERO]], i1 [[CMP]], i1 false ; CHECK-NEXT: ret i1 [[R]] @@ -359,7 +359,7 @@ define i1 @is_pow2_ctpop_wrong_cmp_op1_logical(i32 %x) { define i1 @is_pow2_ctpop_wrong_cmp_op2(i32 %x) { ; CHECK-LABEL: @is_pow2_ctpop_wrong_cmp_op2( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[T0]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 2 ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X]], 1 ; CHECK-NEXT: [[R:%.*]] = and i1 [[NOTZERO]], [[CMP]] ; CHECK-NEXT: ret i1 [[R]] @@ -374,7 +374,7 @@ define i1 @is_pow2_ctpop_wrong_cmp_op2(i32 %x) { define i1 @is_pow2_ctpop_wrong_cmp_op2_logical(i32 %x) { ; CHECK-LABEL: @is_pow2_ctpop_wrong_cmp_op2_logical( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[T0]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 2 ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X]], 1 ; CHECK-NEXT: [[R:%.*]] = select i1 [[NOTZERO]], i1 [[CMP]], i1 false ; CHECK-NEXT: ret i1 [[R]] @@ -391,7 +391,7 @@ define i1 @is_pow2_ctpop_wrong_cmp_op2_logical(i32 %x) { define i1 @is_pow2_ctpop_wrong_pred1(i32 %x) { ; CHECK-LABEL: @is_pow2_ctpop_wrong_pred1( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 2 ; CHECK-NEXT: ret i1 [[CMP]] ; %t0 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -404,7 +404,7 @@ define i1 @is_pow2_ctpop_wrong_pred1(i32 %x) { define i1 @is_pow2_ctpop_wrong_pred1_logical(i32 %x) { ; CHECK-LABEL: @is_pow2_ctpop_wrong_pred1_logical( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 2 ; CHECK-NEXT: ret i1 [[CMP]] ; %t0 = tail call i32 @llvm.ctpop.i32(i32 %x) @@ -419,7 +419,7 @@ define i1 @is_pow2_ctpop_wrong_pred1_logical(i32 %x) { define i1 @is_pow2_ctpop_wrong_pred2(i32 %x) { ; CHECK-LABEL: @is_pow2_ctpop_wrong_pred2( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[T0]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult 
i32 [[T0]], 2 ; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[X]], 0 ; CHECK-NEXT: [[R:%.*]] = and i1 [[CMP2]], [[CMP]] ; CHECK-NEXT: ret i1 [[R]] @@ -434,7 +434,7 @@ define i1 @is_pow2_ctpop_wrong_pred2(i32 %x) { define i1 @is_pow2_ctpop_wrong_pred2_logical(i32 %x) { ; CHECK-LABEL: @is_pow2_ctpop_wrong_pred2_logical( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[T0]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[T0]], 2 ; CHECK-NEXT: [[CMP2:%.*]] = icmp sgt i32 [[X]], 0 ; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP2]], i1 [[CMP]], i1 false ; CHECK-NEXT: ret i1 [[R]] @@ -479,7 +479,7 @@ define i1 @isnot_pow2_ctpop_logical(i32 %x) { define i1 @isnot_pow2_ctpop_extra_uses(i32 %x) { ; CHECK-LABEL: @isnot_pow2_ctpop_extra_uses( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1 ; CHECK-NEXT: call void @use_i1(i1 [[CMP]]) ; CHECK-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[X]], 0 ; CHECK-NEXT: call void @use_i1(i1 [[ISZERO]]) @@ -498,7 +498,7 @@ define i1 @isnot_pow2_ctpop_extra_uses(i32 %x) { define i1 @isnot_pow2_ctpop_extra_uses_logical(i32 %x) { ; CHECK-LABEL: @isnot_pow2_ctpop_extra_uses_logical( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1 ; CHECK-NEXT: call void @use_i1(i1 [[CMP]]) ; CHECK-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[X]], 0 ; CHECK-NEXT: call void @use_i1(i1 [[ISZERO]]) @@ -534,7 +534,7 @@ define <2 x i1> @isnot_pow2_ctpop_commute_vec(<2 x i8> %x) { define i1 @isnot_pow2_ctpop_wrong_cmp_op1(i32 %x) { ; CHECK-LABEL: @isnot_pow2_ctpop_wrong_cmp_op1( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 2 ; CHECK-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[X]], 0 ; CHECK-NEXT: [[R:%.*]] = or i1 [[ISZERO]], [[CMP]] ; CHECK-NEXT: ret i1 [[R]] @@ -549,7 +549,7 @@ define i1 @isnot_pow2_ctpop_wrong_cmp_op1(i32 %x) { define i1 @isnot_pow2_ctpop_wrong_cmp_op1_logical(i32 %x) { ; CHECK-LABEL: @isnot_pow2_ctpop_wrong_cmp_op1_logical( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 2 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 2 ; CHECK-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[X]], 0 ; CHECK-NEXT: [[R:%.*]] = select i1 [[ISZERO]], i1 true, i1 [[CMP]] ; CHECK-NEXT: ret i1 [[R]] @@ -566,7 +566,7 @@ define i1 @isnot_pow2_ctpop_wrong_cmp_op1_logical(i32 %x) { define i1 @isnot_pow2_ctpop_wrong_cmp_op2(i32 %x) { ; CHECK-LABEL: @isnot_pow2_ctpop_wrong_cmp_op2( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1 ; CHECK-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[X]], 1 ; CHECK-NEXT: [[R:%.*]] = or i1 [[ISZERO]], [[CMP]] ; CHECK-NEXT: ret i1 [[R]] @@ -581,7 +581,7 @@ define i1 @isnot_pow2_ctpop_wrong_cmp_op2(i32 %x) { define i1 @isnot_pow2_ctpop_wrong_cmp_op2_logical(i32 %x) { ; CHECK-LABEL: @isnot_pow2_ctpop_wrong_cmp_op2_logical( ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: 
[[CMP:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[X]], 1
 ; CHECK-NEXT: [[R:%.*]] = select i1 [[ISZERO]], i1 true, i1 [[CMP]]
 ; CHECK-NEXT: ret i1 [[R]]
@@ -598,7 +598,7 @@ define i1 @isnot_pow2_ctpop_wrong_cmp_op2_logical(i32 %x) {
 define i1 @isnot_pow2_ctpop_wrong_pred2(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2_ctpop_wrong_pred2(
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[X]], 0
 ; CHECK-NEXT: [[R:%.*]] = or i1 [[CMP2]], [[CMP]]
 ; CHECK-NEXT: ret i1 [[R]]
@@ -613,7 +613,7 @@ define i1 @isnot_pow2_ctpop_wrong_pred2(i32 %x) {
 define i1 @isnot_pow2_ctpop_wrong_pred2_logical(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2_ctpop_wrong_pred2_logical(
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT: [[CMP2:%.*]] = icmp slt i32 [[X]], 0
 ; CHECK-NEXT: [[R:%.*]] = select i1 [[CMP2]], i1 true, i1 [[CMP]]
 ; CHECK-NEXT: ret i1 [[R]]
@@ -798,7 +798,7 @@ define <2 x i1> @isnot_pow2_decrement_op_vec(<2 x i8> %x) {
 define i1 @is_pow2or0_ctpop(i32 %x) {
 ; CHECK-LABEL: @is_pow2or0_ctpop(
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT: [[R:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT: [[R:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT: ret i1 [[R]]
 ;
 %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -811,7 +811,7 @@ define i1 @is_pow2or0_ctpop(i32 %x) {
 define i1 @is_pow2or0_ctpop_swap_cmp(i32 %x) {
 ; CHECK-LABEL: @is_pow2or0_ctpop_swap_cmp(
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT: [[R:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT: [[R:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT: ret i1 [[R]]
 ;
 %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -824,7 +824,7 @@ define i1 @is_pow2or0_ctpop_swap_cmp(i32 %x) {
 define i1 @is_pow2or0_ctpop_logical(i32 %x) {
 ; CHECK-LABEL: @is_pow2or0_ctpop_logical(
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT: [[R:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT: [[R:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT: ret i1 [[R]]
 ;
 %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -837,7 +837,7 @@ define i1 @is_pow2or0_ctpop_logical(i32 %x) {
 define <2 x i1> @is_pow2or0_ctpop_commute_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @is_pow2or0_ctpop_commute_vec(
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i8 0, 9) <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[X:%.*]])
-; CHECK-NEXT: [[R:%.*]] = icmp ult <2 x i8> [[T0]], <i8 2, i8 2>
+; CHECK-NEXT: [[R:%.*]] = icmp samesign ult <2 x i8> [[T0]], <i8 2, i8 2>
 ; CHECK-NEXT: ret <2 x i1> [[R]]
 ;
 %t0 = tail call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %x)
@@ -857,7 +857,7 @@ define i1 @is_pow2or0_ctpop_extra_uses(i32 %x) {
 ; CHECK-NEXT: call void @use_i1(i1 [[CMP]])
 ; CHECK-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[X]], 0
 ; CHECK-NEXT: call void @use_i1(i1 [[ISZERO]])
-; CHECK-NEXT: [[R:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT: [[R:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT: ret i1 [[R]]
 ;
 %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -878,7 +878,7 @@ define i1 @is_pow2or0_ctpop_logical_extra_uses(i32 %x) {
 ; CHECK-NEXT: call void @use_i1(i1 [[CMP]])
 ; CHECK-NEXT: [[ISZERO:%.*]] = icmp eq i32 [[X]], 0
 ; CHECK-NEXT: call void @use_i1(i1 [[ISZERO]])
-; CHECK-NEXT: [[R:%.*]] = icmp ult i32 [[T0]], 2
+; CHECK-NEXT: [[R:%.*]] = icmp samesign ult i32 [[T0]], 2
 ; CHECK-NEXT: ret i1 [[R]]
 ;
 %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -991,7 +991,7 @@ define <2 x i1> @is_pow2or0_ctpop_commute_vec_wrong_pred3(<2 x i8> %x) {
 define i1 @isnot_pow2nor0_ctpop(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2nor0_ctpop(
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT: ret i1 [[R]]
 ;
 %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -1004,7 +1004,7 @@ define i1 @isnot_pow2nor0_ctpop(i32 %x) {
 define i1 @isnot_pow2nor0_ctpop_swap_cmp(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2nor0_ctpop_swap_cmp(
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT: ret i1 [[R]]
 ;
 %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -1017,7 +1017,7 @@ define i1 @isnot_pow2nor0_ctpop_swap_cmp(i32 %x) {
 define i1 @isnot_pow2nor0_ctpop_logical(i32 %x) {
 ; CHECK-LABEL: @isnot_pow2nor0_ctpop_logical(
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT: ret i1 [[R]]
 ;
 %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -1030,7 +1030,7 @@ define i1 @isnot_pow2nor0_ctpop_logical(i32 %x) {
 define <2 x i1> @isnot_pow2nor0_ctpop_commute_vec(<2 x i8> %x) {
 ; CHECK-LABEL: @isnot_pow2nor0_ctpop_commute_vec(
 ; CHECK-NEXT: [[T0:%.*]] = tail call range(i8 0, 9) <2 x i8> @llvm.ctpop.v2i8(<2 x i8> [[X:%.*]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt <2 x i8> [[T0]], <i8 1, i8 1>
+; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt <2 x i8> [[T0]], <i8 1, i8 1>
 ; CHECK-NEXT: ret <2 x i1> [[R]]
 ;
 %t0 = tail call <2 x i8> @llvm.ctpop.v2i8(<2 x i8> %x)
@@ -1050,7 +1050,7 @@ define i1 @isnot_pow2nor0_ctpop_extra_uses(i32 %x) {
 ; CHECK-NEXT: call void @use_i1(i1 [[CMP]])
 ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0
 ; CHECK-NEXT: call void @use_i1(i1 [[NOTZERO]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT: ret i1 [[R]]
 ;
 %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -1071,7 +1071,7 @@ define i1 @isnot_pow2nor0_ctpop_logical_extra_uses(i32 %x) {
 ; CHECK-NEXT: call void @use_i1(i1 [[CMP]])
 ; CHECK-NEXT: [[NOTZERO:%.*]] = icmp ne i32 [[X]], 0
 ; CHECK-NEXT: call void @use_i1(i1 [[NOTZERO]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[T0]], 1
+; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i32 [[T0]], 1
 ; CHECK-NEXT: ret i1 [[R]]
 ;
 %t0 = tail call i32 @llvm.ctpop.i32(i32 %x)
@@ -1210,7 +1210,7 @@ define i1 @blsmsk_is_p2_or_z(i32 %xx, i32 %yy) {
 define i1 @blsmsk_isnt_p2_or_z(i32 %x) {
 ; CHECK-LABEL: @blsmsk_isnt_p2_or_z(
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]])
-; CHECK-NEXT: [[R:%.*]] = icmp ugt i32 [[TMP1]], 1
+; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i32 [[TMP1]], 1
 ; CHECK-NEXT: ret i1 [[R]]
 ;
 %xm1 = add i32 %x, -1
@@ -1223,7 +1223,7 @@ define i1 @blsmsk_is_p2_or_z_fail(i32 %xx, i32 %yy) {
 ; CHECK-LABEL: @blsmsk_is_p2_or_z_fail(
 ; CHECK-NEXT: [[X:%.*]] = or i32 [[XX:%.*]], [[YY:%.*]]
 ; CHECK-NEXT: [[TMP1:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X]])
-; CHECK-NEXT:
[[R:%.*]] = icmp ugt i32 [[TMP1]], 1 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i32 [[TMP1]], 1 ; CHECK-NEXT: ret i1 [[R]] ; %x = or i32 %xx, %yy @@ -1308,7 +1308,7 @@ define i1 @blsmsk_is_p2_or_z_ule_xy(i8 %xx, i8 %yy) { ; CHECK-LABEL: @blsmsk_is_p2_or_z_ule_xy( ; CHECK-NEXT: [[X:%.*]] = or i8 [[XX:%.*]], [[YY:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[X]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], 2 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ult i8 [[TMP1]], 2 ; CHECK-NEXT: ret i1 [[R]] ; %x = or i8 %xx, %yy @@ -1339,7 +1339,7 @@ define i1 @blsmsk_is_p2_or_z_uge_yx(i8 %xx, i8 %yy) { ; CHECK-LABEL: @blsmsk_is_p2_or_z_uge_yx( ; CHECK-NEXT: [[X:%.*]] = or i8 [[XX:%.*]], [[YY:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[X]]) -; CHECK-NEXT: [[R:%.*]] = icmp ult i8 [[TMP1]], 2 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ult i8 [[TMP1]], 2 ; CHECK-NEXT: ret i1 [[R]] ; %x = or i8 %xx, %yy @@ -1369,7 +1369,7 @@ define i1 @blsmsk_isnt_p2_or_z_ugt_xy(i8 %xx, i8 %yy) { ; CHECK-LABEL: @blsmsk_isnt_p2_or_z_ugt_xy( ; CHECK-NEXT: [[X:%.*]] = or i8 [[XX:%.*]], [[YY:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[X]]) -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP1]], 1 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i8 [[TMP1]], 1 ; CHECK-NEXT: ret i1 [[R]] ; %x = or i8 %xx, %yy @@ -1400,7 +1400,7 @@ define i1 @blsmsk_isnt_p2_or_z_ult_yx(i8 %xx, i8 %yy) { ; CHECK-LABEL: @blsmsk_isnt_p2_or_z_ult_yx( ; CHECK-NEXT: [[X:%.*]] = or i8 [[XX:%.*]], [[YY:%.*]] ; CHECK-NEXT: [[TMP1:%.*]] = call range(i8 0, 9) i8 @llvm.ctpop.i8(i8 [[X]]) -; CHECK-NEXT: [[R:%.*]] = icmp ugt i8 [[TMP1]], 1 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt i8 [[TMP1]], 1 ; CHECK-NEXT: ret i1 [[R]] ; %x = or i8 %xx, %yy @@ -1506,7 +1506,7 @@ define <2 x i1> @not_pow2_or_z_known_bits_fail_wrong_cmp(<2 x i32> %xin) { ; CHECK-LABEL: @not_pow2_or_z_known_bits_fail_wrong_cmp( ; CHECK-NEXT: [[X:%.*]] = or <2 x i32> [[XIN:%.*]], ; CHECK-NEXT: [[CNT:%.*]] = call range(i32 1, 33) <2 x i32> @llvm.ctpop.v2i32(<2 x i32> [[X]]) -; CHECK-NEXT: [[R:%.*]] = icmp ugt <2 x i32> [[CNT]], +; CHECK-NEXT: [[R:%.*]] = icmp samesign ugt <2 x i32> [[CNT]], ; CHECK-NEXT: ret <2 x i1> [[R]] ; %x = or <2 x i32> %xin, @@ -1550,7 +1550,7 @@ entry: define i1 @is_power2_or_zero_with_range(i32 %x) { ; CHECK-LABEL: @is_power2_or_zero_with_range( ; CHECK-NEXT: [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[RES:%.*]] = icmp ult i32 [[CTPOP]], 2 +; CHECK-NEXT: [[RES:%.*]] = icmp samesign ult i32 [[CTPOP]], 2 ; CHECK-NEXT: ret i1 [[RES]] ; %ctpop = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) @@ -1563,7 +1563,7 @@ define i1 @is_power2_or_zero_with_range(i32 %x) { define i1 @is_power2_or_zero_inv_with_range(i32 %x) { ; CHECK-LABEL: @is_power2_or_zero_inv_with_range( ; CHECK-NEXT: [[CTPOP:%.*]] = call range(i32 0, 33) i32 @llvm.ctpop.i32(i32 [[X:%.*]]) -; CHECK-NEXT: [[RES:%.*]] = icmp ugt i32 [[CTPOP]], 1 +; CHECK-NEXT: [[RES:%.*]] = icmp samesign ugt i32 [[CTPOP]], 1 ; CHECK-NEXT: ret i1 [[RES]] ; %ctpop = call range(i32 1, 33) i32 @llvm.ctpop.i32(i32 %x) diff --git a/llvm/test/Transforms/InstCombine/load-bitcast-select.ll b/llvm/test/Transforms/InstCombine/load-bitcast-select.ll index 4c5f94d53ada..83a34de768f2 100644 --- a/llvm/test/Transforms/InstCombine/load-bitcast-select.ll +++ b/llvm/test/Transforms/InstCombine/load-bitcast-select.ll @@ -10,7 +10,7 @@ define void @_Z3foov() { ; CHECK-NEXT: br label [[FOR_COND:%.*]] ; CHECK: 
for.cond: ; CHECK-NEXT: [[I_0:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY:%.*]] ] -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_0]], 1000 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[I_0]], 1000 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] ; CHECK: for.cond.cleanup: ; CHECK-NEXT: ret void diff --git a/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll b/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll index 89522a00d789..e1c441e9c0b4 100644 --- a/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll +++ b/llvm/test/Transforms/InstCombine/lshr-and-negC-icmpeq-zero.ll @@ -177,7 +177,7 @@ define i1 @scalar_lshr_and_negC_eq_extra_use_lshr_and(i32 %x, i32 %y, i32 %z, pt define i1 @scalar_i32_lshr_and_negC_eq_X_is_constant1(i32 %y) { ; CHECK-LABEL: @scalar_i32_lshr_and_negC_eq_X_is_constant1( ; CHECK-NEXT: [[LSHR:%.*]] = lshr i32 12345, [[Y:%.*]] -; CHECK-NEXT: [[R:%.*]] = icmp ult i32 [[LSHR]], 8 +; CHECK-NEXT: [[R:%.*]] = icmp samesign ult i32 [[LSHR]], 8 ; CHECK-NEXT: ret i1 [[R]] ; %lshr = lshr i32 12345, %y diff --git a/llvm/test/Transforms/InstCombine/memchr.ll b/llvm/test/Transforms/InstCombine/memchr.ll index 08435a5e0388..0cf85a2a1b55 100644 --- a/llvm/test/Transforms/InstCombine/memchr.ll +++ b/llvm/test/Transforms/InstCombine/memchr.ll @@ -124,7 +124,7 @@ define i1 @test11(i32 %C) { ; CHECK-LABEL: @test11( ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i16 ; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 255 -; CHECK-NEXT: [[MEMCHR_BOUNDS:%.*]] = icmp ult i16 [[TMP2]], 16 +; CHECK-NEXT: [[MEMCHR_BOUNDS:%.*]] = icmp samesign ult i16 [[TMP2]], 16 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i16 1, [[TMP2]] ; CHECK-NEXT: [[TMP4:%.*]] = and i16 [[TMP3]], 9217 ; CHECK-NEXT: [[MEMCHR_BITS:%.*]] = icmp ne i16 [[TMP4]], 0 diff --git a/llvm/test/Transforms/InstCombine/phi-known-bits-operand-order.ll b/llvm/test/Transforms/InstCombine/phi-known-bits-operand-order.ll index 6a0546887535..9ff093635ad7 100644 --- a/llvm/test/Transforms/InstCombine/phi-known-bits-operand-order.ll +++ b/llvm/test/Transforms/InstCombine/phi-known-bits-operand-order.ll @@ -20,7 +20,7 @@ define void @phi_recurrence_start_first() { ; CHECK-NEXT: br i1 [[COND_V2]], label [[FOR_COND11:%.*]], label [[FOR_COND26]] ; CHECK: for.cond11: ; CHECK-NEXT: [[I_1:%.*]] = phi i32 [ [[START]], [[IF_THEN]] ], [ [[STEP:%.*]], [[FOR_COND11]] ] -; CHECK-NEXT: [[CMP13:%.*]] = icmp ult i32 [[I_1]], 100 +; CHECK-NEXT: [[CMP13:%.*]] = icmp samesign ult i32 [[I_1]], 100 ; CHECK-NEXT: [[STEP]] = add nuw nsw i32 [[I_1]], 1 ; CHECK-NEXT: br i1 [[CMP13]], label [[FOR_COND11]], label [[WHILE_END]] ; CHECK: for.cond26: @@ -68,7 +68,7 @@ define void @phi_recurrence_step_first() { ; CHECK-NEXT: br i1 [[COND_V2]], label [[FOR_COND11:%.*]], label [[FOR_COND26]] ; CHECK: for.cond11: ; CHECK-NEXT: [[I_1:%.*]] = phi i32 [ [[STEP:%.*]], [[FOR_COND11]] ], [ [[START]], [[IF_THEN]] ] -; CHECK-NEXT: [[CMP13:%.*]] = icmp ult i32 [[I_1]], 100 +; CHECK-NEXT: [[CMP13:%.*]] = icmp samesign ult i32 [[I_1]], 100 ; CHECK-NEXT: [[STEP]] = add nuw nsw i32 [[I_1]], 1 ; CHECK-NEXT: br i1 [[CMP13]], label [[FOR_COND11]], label [[WHILE_END]] ; CHECK: for.cond26: diff --git a/llvm/test/Transforms/InstCombine/pr100298.ll b/llvm/test/Transforms/InstCombine/pr100298.ll index 6cf2a71bb916..eff67edd2b03 100644 --- a/llvm/test/Transforms/InstCombine/pr100298.ll +++ b/llvm/test/Transforms/InstCombine/pr100298.ll @@ -11,11 +11,11 @@ define i16 @pr100298() { ; CHECK-NEXT: [[INDVAR:%.*]] = phi i32 
[ -15, %[[ENTRY]] ], [ [[MASK:%.*]], %[[FOR_INC]] ] ; CHECK-NEXT: [[ADD:%.*]] = add nsw i32 [[INDVAR]], 9 ; CHECK-NEXT: [[MASK]] = and i32 [[ADD]], 65535 -; CHECK-NEXT: [[CMP1:%.*]] = icmp ugt i32 [[MASK]], 5 +; CHECK-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[MASK]], 5 ; CHECK-NEXT: br i1 [[CMP1]], label %[[FOR_INC]], label %[[FOR_END:.*]] ; CHECK: [[FOR_END]]: ; CHECK-NEXT: [[CONV:%.*]] = trunc i32 [[ADD]] to i16 -; CHECK-NEXT: [[CMP2:%.*]] = icmp ugt i32 [[MASK]], 3 +; CHECK-NEXT: [[CMP2:%.*]] = icmp samesign ugt i32 [[MASK]], 3 ; CHECK-NEXT: [[SHL:%.*]] = shl nuw i16 [[CONV]], 14 ; CHECK-NEXT: [[RES:%.*]] = select i1 [[CMP2]], i16 [[CONV]], i16 [[SHL]] ; CHECK-NEXT: ret i16 [[RES]] diff --git a/llvm/test/Transforms/InstCombine/pr63791.ll b/llvm/test/Transforms/InstCombine/pr63791.ll index 73a559f98926..78cc1130fb33 100644 --- a/llvm/test/Transforms/InstCombine/pr63791.ll +++ b/llvm/test/Transforms/InstCombine/pr63791.ll @@ -15,7 +15,7 @@ define void @y() { ; CHECK-NEXT: store i1 true, ptr poison, align 1 ; CHECK-NEXT: br i1 poison, label [[FOR_COND_I]], label [[FOR_COND5_PREHEADER_I]] ; CHECK: for.cond5.preheader.i: -; CHECK-NEXT: br i1 true, label [[FOR_COND1_LOOPEXIT_I:%.*]], label [[FOR_INC19_I:%.*]] +; CHECK-NEXT: br i1 false, label [[FOR_INC19_I:%.*]], label [[FOR_COND1_LOOPEXIT_I:%.*]] ; CHECK: for.inc19.i: ; CHECK-NEXT: br i1 poison, label [[FOR_INC19_I]], label [[FOR_COND1_LOOPEXIT_I]] ; diff --git a/llvm/test/Transforms/InstCombine/rem.ll b/llvm/test/Transforms/InstCombine/rem.ll index 2cf56dfd50a8..4262ef85553b 100644 --- a/llvm/test/Transforms/InstCombine/rem.ll +++ b/llvm/test/Transforms/InstCombine/rem.ll @@ -1045,7 +1045,7 @@ define <2 x i32> @PR62401(<2 x i1> %x, <2 x i32> %y) { define i16 @rem_pow2_or_zero(i16 %x, i16 %y) { ; CHECK-LABEL: @rem_pow2_or_zero( ; CHECK-NEXT: [[POPCNT:%.*]] = call range(i16 1, 17) i16 @llvm.ctpop.i16(i16 [[Y:%.*]]) -; CHECK-NEXT: [[COND:%.*]] = icmp ult i16 [[POPCNT]], 2 +; CHECK-NEXT: [[COND:%.*]] = icmp samesign ult i16 [[POPCNT]], 2 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[COND]]) ; CHECK-NEXT: [[TMP1:%.*]] = add i16 [[Y]], -1 ; CHECK-NEXT: [[REM:%.*]] = and i16 [[X:%.*]], [[TMP1]] @@ -1130,7 +1130,7 @@ define i64 @rem_pow2_or_zero_domcond(i64 %a, i64 %b) { ; CHECK-LABEL: @rem_pow2_or_zero_domcond( ; CHECK-NEXT: start: ; CHECK-NEXT: [[CPOP:%.*]] = call range(i64 0, 65) i64 @llvm.ctpop.i64(i64 [[B:%.*]]) -; CHECK-NEXT: [[COND:%.*]] = icmp ult i64 [[CPOP]], 2 +; CHECK-NEXT: [[COND:%.*]] = icmp samesign ult i64 [[CPOP]], 2 ; CHECK-NEXT: br i1 [[COND]], label [[BB1:%.*]], label [[BB2:%.*]] ; CHECK: bb1: ; CHECK-NEXT: [[TMP0:%.*]] = add i64 [[B]], -1 diff --git a/llvm/test/Transforms/InstCombine/remove-loop-phi-multiply-by-zero.ll b/llvm/test/Transforms/InstCombine/remove-loop-phi-multiply-by-zero.ll index 4123bc555789..56580a8b68c6 100644 --- a/llvm/test/Transforms/InstCombine/remove-loop-phi-multiply-by-zero.ll +++ b/llvm/test/Transforms/InstCombine/remove-loop-phi-multiply-by-zero.ll @@ -7,7 +7,7 @@ define double @test_mul_fast_flags(ptr %arr_d) { ; CHECK: for.body: ; CHECK-NEXT: [[I_02:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_02]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[I_02]], 999 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret double 0.000000e+00 @@ -37,7 +37,7 @@ define double @test_nsz_nnan_flags_enabled(ptr %arr_d) { ; CHECK: for.body: 
; CHECK-NEXT: [[I_02:%.*]] = phi i64 [ 0, [[ENTRY:%.*]] ], [ [[INC:%.*]], [[FOR_BODY]] ] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_02]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[I_02]], 999 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret double 0.000000e+00 @@ -71,7 +71,7 @@ define double @test_nnan_flag_enabled(ptr %arr_d) { ; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[MUL]] = fmul nnan double [[F_PROD_01]], [[TMP0]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_02]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[I_02]], 999 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret double [[MUL]] @@ -105,7 +105,7 @@ define double @test_ninf_flag_enabled(ptr %arr_d) { ; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[MUL]] = fmul ninf double [[F_PROD_01]], [[TMP0]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_02]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[I_02]], 999 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret double [[MUL]] @@ -139,7 +139,7 @@ define double @test_nsz_flag_enabled(ptr %arr_d) { ; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[MUL]] = fmul nsz double [[F_PROD_01]], [[TMP0]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_02]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[I_02]], 999 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret double [[MUL]] @@ -173,7 +173,7 @@ define double @test_phi_initalise_to_non_zero(ptr %arr_d) { ; CHECK-NEXT: [[TMP0:%.*]] = load double, ptr [[ARRAYIDX]], align 8 ; CHECK-NEXT: [[MUL]] = fmul fast double [[F_PROD_01]], [[TMP0]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_02]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[I_02]], 999 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret double [[MUL]] @@ -285,7 +285,7 @@ define i32 @test_int_phi_operands(ptr %arr_d) { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL]] = mul nsw i32 [[F_PROD_01]], [[TMP0]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_02]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[I_02]], 999 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret i32 [[MUL]] @@ -319,7 +319,7 @@ define i32 @test_int_phi_operands_initalise_to_non_zero(ptr %arr_d) { ; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX]], align 4 ; CHECK-NEXT: [[MUL]] = mul i32 [[F_PROD_01]], [[TMP0]] ; CHECK-NEXT: [[INC]] = add nuw nsw i64 [[I_02]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[I_02]], 999 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i64 [[I_02]], 999 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[END:%.*]] ; CHECK: end: ; CHECK-NEXT: ret i32 [[MUL]] diff --git a/llvm/test/Transforms/InstCombine/strchr-1.ll b/llvm/test/Transforms/InstCombine/strchr-1.ll index 0cedc3ad5181..50ef6abb2107 100644 --- a/llvm/test/Transforms/InstCombine/strchr-1.ll +++ 
b/llvm/test/Transforms/InstCombine/strchr-1.ll
@@ -86,7 +86,7 @@ define i1 @test_simplify7(i32 %C) {
 ; CHECK-LABEL: @test_simplify7(
 ; CHECK-NEXT: [[TMP1:%.*]] = trunc i32 [[C:%.*]] to i16
 ; CHECK-NEXT: [[TMP2:%.*]] = and i16 [[TMP1]], 255
-; CHECK-NEXT: [[MEMCHR_BOUNDS:%.*]] = icmp ult i16 [[TMP2]], 16
+; CHECK-NEXT: [[MEMCHR_BOUNDS:%.*]] = icmp samesign ult i16 [[TMP2]], 16
 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw i16 1, [[TMP2]]
 ; CHECK-NEXT: [[TMP4:%.*]] = and i16 [[TMP3]], 9217
 ; CHECK-NEXT: [[MEMCHR_BITS:%.*]] = icmp ne i16 [[TMP4]], 0
diff --git a/llvm/test/Transforms/InstCombine/sub.ll b/llvm/test/Transforms/InstCombine/sub.ll
index ba596e10e8b3..ff3f046607ec 100644
--- a/llvm/test/Transforms/InstCombine/sub.ll
+++ b/llvm/test/Transforms/InstCombine/sub.ll
@@ -2710,7 +2710,7 @@ if.else:
 define i64 @sub_infer_nuw_from_domcond_fold2(i32 range(i32 0, 2147483648) %x, i32 range(i32 0, 2147483648) %y) {
 ; CHECK-LABEL: @sub_infer_nuw_from_domcond_fold2(
-; CHECK-NEXT: [[COND_NOT:%.*]] = icmp ult i32 [[X:%.*]], [[Y:%.*]]
+; CHECK-NEXT: [[COND_NOT:%.*]] = icmp samesign ult i32 [[X:%.*]], [[Y:%.*]]
 ; CHECK-NEXT: br i1 [[COND_NOT]], label [[IF_ELSE:%.*]], label [[IF_THEN:%.*]]
 ; CHECK: if.then:
 ; CHECK-NEXT: [[SUB:%.*]] = sub nuw nsw i32 [[X]], [[Y]]
@@ -2734,7 +2734,7 @@ if.else:
 define i1 @sub_infer_nuw_from_domcond_fold3(i16 %xx, i32 range(i32 0, 12) %y) {
 ; CHECK-LABEL: @sub_infer_nuw_from_domcond_fold3(
 ; CHECK-NEXT: [[X:%.*]] = zext i16 [[XX:%.*]] to i32
-; CHECK-NEXT: [[COND:%.*]] = icmp ugt i32 [[Y:%.*]], [[X]]
+; CHECK-NEXT: [[COND:%.*]] = icmp samesign ugt i32 [[Y:%.*]], [[X]]
 ; CHECK-NEXT: br i1 [[COND]], label [[IF_THEN:%.*]], label [[IF_ELSE:%.*]]
 ; CHECK: if.then:
 ; CHECK-NEXT: ret i1 false
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
index c6fb1c25274d..2c8271cf978d 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-accesses.ll
@@ -119,7 +119,7 @@ define void @test_array_load2_i16_store2(i32 %C, i32 %D) #1 {
 ; CHECK: vector.body:
 ; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
-; CHECK-NEXT: [[OFFSET_IDX:%.+]] = shl i64 [[INDEX]], 1
+; CHECK-NEXT: [[OFFSET_IDX:%.*]] = shl i64 [[INDEX]], 1
 ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds [1024 x i16], ptr @AB_i16, i64 0, <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT: [[WIDE_MASKED_GATHER:%.*]] = call <vscale x 4 x i16> @llvm.masked.gather.nxv4i16.nxv4p0(<vscale x 4 x ptr> [[TMP6]], i32 2, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer), <vscale x 4 x i16> poison)
 ; CHECK-NEXT: [[TMP7:%.*]] = or disjoint <vscale x 4 x i64> [[VEC_IND]], shufflevector (<vscale x 4 x i64> insertelement (<vscale x 4 x i64> poison, i64 1, i64 0), <vscale x 4 x i64> poison, <vscale x 4 x i32> zeroinitializer)
@@ -542,7 +542,7 @@ define void @even_load_dynamic_tc(ptr noalias nocapture readonly %A, ptr noalias
 ; CHECK-NEXT: [[TMP1:%.*]] = lshr i64 [[TMP0]], 1
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP3:%.*]] = shl nuw nsw i64 [[TMP2]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT_NOT:%.*]] = icmp ult i64 [[TMP1]], [[TMP3]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT_NOT:%.*]] = icmp samesign ult i64 [[TMP1]], [[TMP3]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT_NOT]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP4:%.*]] = add nuw i64 [[TMP1]], 1
@@ -872,7 +872,7 @@ define void @PR27626_0(ptr %p, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -957,7 +957,7 @@ define i32 @PR27626_1(ptr %p, i64 %n) #1 {
 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -1050,7 +1050,7 @@ define void @PR27626_2(ptr %p, i64 %n, i32 %z) #1 {
 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -1138,7 +1138,7 @@ define i32 @PR27626_3(ptr %p, i64 %n, i32 %z) #1 {
 ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1)
 ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp ugt i64 [[SMAX]], [[TMP1]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK_NOT:%.*]] = icmp samesign ugt i64 [[SMAX]], [[TMP1]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_NOT]], label [[VECTOR_PH:%.*]], label [[SCALAR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64()
@@ -1242,7 +1242,7 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[TMP2]], [[TMP4]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
@@ -1268,7 +1268,7 @@ define void @PR27626_4(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT: [[VEC_IND:%.*]] = phi <vscale x 4 x i64> [ [[TMP9]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; CHECK-NEXT: [[TMP13:%.*]] = getelementptr inbounds i32, ptr [[A:%.*]], <vscale x 4 x i64> [[VEC_IND]]
 ; CHECK-NEXT: call void @llvm.masked.scatter.nxv4i32.nxv4p0(<vscale x 4 x i32> [[BROADCAST_SPLAT]], <vscale x 4 x ptr> [[TMP13]], i32 4, <vscale x 4 x i1> shufflevector (<vscale x 4 x i1> insertelement (<vscale x 4 x i1> poison, i1 true, i64 0), <vscale x 4 x i1> poison, <vscale x 4 x i32> zeroinitializer))
-; CHECK-NEXT: [[P:%.+]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0
+; CHECK-NEXT: [[P:%.*]] = extractelement <vscale x 4 x ptr> [[TMP13]], i64 0
 ; CHECK-NEXT: [[INTERLEAVED_VEC:%.*]] = call <vscale x 8 x i32> @llvm.vector.interleave2.nxv8i32(<vscale x 4 x i32> [[BROADCAST_SPLAT2]], <vscale x 4 x i32> [[BROADCAST_SPLAT4]])
 ; CHECK-NEXT: store <vscale x 8 x i32> [[INTERLEAVED_VEC]], ptr [[P]], align 4
 ; CHECK-NEXT: [[INDEX_NEXT]] = add nuw i64 [[INDEX]], [[TMP7]]
@@ -1333,7 +1333,7 @@ define void @PR27626_5(ptr %a, i32 %x, i32 %y, i32 %z, i64 %n) #1 {
 ; CHECK-NEXT: [[TMP2:%.*]] = add nuw nsw i64 [[TMP1]], 1
 ; CHECK-NEXT: [[TMP3:%.*]] = call i64 @llvm.vscale.i64()
 ; CHECK-NEXT: [[TMP4:%.*]] = shl nuw nsw i64 [[TMP3]], 2
-; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[TMP2]], [[TMP4]]
+; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[TMP2]], [[TMP4]]
 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]]
 ; CHECK: vector.ph:
 ; CHECK-NEXT: [[TMP5:%.*]] = call i64 @llvm.vscale.i64()
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
index baec7daa463d..26ac9c3dead7 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-interleaved-masked-accesses.ll
@@ -35,17 +35,17 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4
 ; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]]
 ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]]
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
-; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP20]], i64 0
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4
+; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call <vscale x 16 x i32> @llvm.stepvector.nxv16i32()
+; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[TMP5]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[DOTSPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement <vscale x 16 x i32> poison, i32 [[CONV]], i64 0
 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector <vscale x 16 x i32> [[BROADCAST_SPLATINSERT]], <vscale x 16 x i32> poison, <vscale x 16 x i32> zeroinitializer
 ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]]
 ; SCALAR_TAIL_FOLDING: vector.body:
 ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ]
-; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
+; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi <vscale x 16 x i32> [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ]
 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt <vscale x 16 x i32> [[VEC_IND]], [[BROADCAST_SPLAT]]
 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 1
 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64
@@ -55,17 +55,17 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no
 ; SCALAR_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { <vscale x 16 x i8>, <vscale x 16 x i8> } @llvm.vector.deinterleave2.nxv32i8(<vscale x 32 x i8> [[WIDE_MASKED_VEC]])
 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 0
 ; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { <vscale x 16 x i8>, <vscale x 16 x i8> } [[STRIDED_VEC]], 1
-; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] =
call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP8]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] -; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = sext i32 [[TMP8]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sub zeroinitializer, [[TMP13]] +; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP13]], [[TMP16]]) ; SCALAR_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) -; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, [[INTERLEAVED_MASK1]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP20]] +; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, [[INTERLEAVED_MASK1]]) +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: ; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -74,24 +74,24 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: for.body: ; SCALAR_TAIL_FOLDING-NEXT: [[IX_024:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp ugt i32 [[IX_024]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_024]], [[CONV]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; SCALAR_TAIL_FOLDING: if.then: ; SCALAR_TAIL_FOLDING-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_024]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = zext nneg i32 [[MUL]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP22]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = zext nneg i32 [[MUL]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP18]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 ; SCALAR_TAIL_FOLDING-NEXT: [[ADD:%.*]] = or disjoint i32 [[MUL]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP24:%.*]] = zext nneg i32 [[ADD]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP24]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP25:%.*]] = load i8, ptr 
[[ARRAYIDX4]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP23]], i8 [[TMP25]]) -; SCALAR_TAIL_FOLDING-NEXT: [[TMP26:%.*]] = zext nneg i32 [[MUL]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP26]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = zext nneg i32 [[ADD]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX4:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 [[TMP20]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = load i8, ptr [[ARRAYIDX4]], align 1 +; SCALAR_TAIL_FOLDING-NEXT: [[SPEC_SELECT_I:%.*]] = call i8 @llvm.smax.i8(i8 [[TMP19]], i8 [[TMP21]]) +; SCALAR_TAIL_FOLDING-NEXT: [[TMP22:%.*]] = zext nneg i32 [[MUL]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX6:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP22]] ; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SPEC_SELECT_I]], ptr [[ARRAYIDX6]], align 1 ; SCALAR_TAIL_FOLDING-NEXT: [[SUB:%.*]] = sub i8 0, [[SPEC_SELECT_I]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP27:%.*]] = zext nneg i32 [[ADD]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP27]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP23:%.*]] = zext nneg i32 [[ADD]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX11:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP23]] ; SCALAR_TAIL_FOLDING-NEXT: store i8 [[SUB]], ptr [[ARRAYIDX11]], align 1 ; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] ; SCALAR_TAIL_FOLDING: for.inc: @@ -107,14 +107,14 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; PREDICATED_TAIL_FOLDING: vector.ph: ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP20:%.*]] = shl i32 [[TMP19]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP20]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -122,29 +122,29 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: 
[[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl i32 [[INDEX]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = sext i32 [[TMP7]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP8]] -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP9]], i32 1, [[INTERLEAVED_MASK]], poison) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP6]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = shl i32 [[INDEX]], 1 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr i8, ptr [[P]], i64 [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[WIDE_MASKED_VEC:%.*]] = call @llvm.masked.load.nxv32i8.p0(ptr [[TMP10]], i32 1, [[INTERLEAVED_MASK]], poison) ; PREDICATED_TAIL_FOLDING-NEXT: [[STRIDED_VEC:%.*]] = call { , } @llvm.vector.deinterleave2.nxv32i8( [[WIDE_MASKED_VEC]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = extractvalue { , } [[STRIDED_VEC]], 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = extractvalue { , } [[STRIDED_VEC]], 1 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sext i32 [[TMP7]] to i64 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP16]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = sub zeroinitializer, [[TMP14]] -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP14]], [[TMP15]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP10]], [[TMP10]]) -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP17]], i32 1, [[INTERLEAVED_MASK1]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP20]] -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = call @llvm.smax.nxv16i8( [[TMP11]], [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = sext i32 [[TMP8]] to i64 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr i8, ptr [[Q]], i64 [[TMP14]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = sub zeroinitializer, [[TMP13]] +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_VEC:%.*]] = call @llvm.vector.interleave2.nxv32i8( [[TMP13]], [[TMP16]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INTERLEAVED_MASK1:%.*]] = call @llvm.vector.interleave2.nxv32i1( [[TMP7]], [[TMP7]]) +; PREDICATED_TAIL_FOLDING-NEXT: call void 
@llvm.masked.store.nxv32i8.p0( [[INTERLEAVED_VEC]], ptr [[TMP15]], i32 1, [[INTERLEAVED_MASK1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP21:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP21]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP0:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: @@ -219,17 +219,17 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP15]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg [[TMP7]] to ; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP8]] @@ -239,10 +239,10 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg [[TMP11]] to ; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP12]] ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP13]], i32 1, [[TMP10]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP15]] 
+; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP14]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP4:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: ; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -252,15 +252,15 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING: for.body: ; SCALAR_TAIL_FOLDING-NEXT: [[IX_012:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] ; SCALAR_TAIL_FOLDING-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_012]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg i32 [[MUL]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP17]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = zext nneg i32 [[MUL]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP15]] ; SCALAR_TAIL_FOLDING-NEXT: store i8 1, ptr [[ARRAYIDX]], align 1 -; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp ugt i32 [[IX_012]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_012]], [[CONV]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; SCALAR_TAIL_FOLDING: if.then: ; SCALAR_TAIL_FOLDING-NEXT: [[ADD:%.*]] = or disjoint i32 [[MUL]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = zext nneg i32 [[ADD]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP18]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = zext nneg i32 [[ADD]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP16]] ; SCALAR_TAIL_FOLDING-NEXT: store i8 2, ptr [[ARRAYIDX3]], align 1 ; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] ; SCALAR_TAIL_FOLDING: for.inc: @@ -276,14 +276,14 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING-NEXT: br i1 false, label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; PREDICATED_TAIL_FOLDING: vector.ph: ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = shl i32 [[TMP14]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: 
[[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP15]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -291,22 +291,22 @@ define dso_local void @masked_strided2(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = zext nneg [[TMP6]] to ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP7]] ; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP8]], i32 1, [[ACTIVE_LANE_MASK]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = or disjoint [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = zext nneg [[TMP10]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP11]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP12]], i32 1, [[TMP13]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP15]] -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP9]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = or disjoint [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = zext nneg [[TMP11]] to +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP12]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP13]], i32 1, [[TMP10]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) ; PREDICATED_TAIL_FOLDING-NEXT: 
[[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP14]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP4:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: @@ -377,10 +377,10 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 ; SCALAR_TAIL_FOLDING-NEXT: [[N_MOD_VF:%.*]] = urem i32 1024, [[TMP3]] ; SCALAR_TAIL_FOLDING-NEXT: [[N_VEC:%.*]] = sub nuw nsw i32 1024, [[N_MOD_VF]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = call i32 @llvm.vscale.i32() -; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = shl i32 [[TMP15]], 4 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call @llvm.stepvector.nxv16i32() -; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP16]], i64 0 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.vscale.i32() +; SCALAR_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = shl i32 [[TMP4]], 4 +; SCALAR_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = call @llvm.stepvector.nxv16i32() +; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP5]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; SCALAR_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -389,7 +389,7 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: br label [[VECTOR_BODY:%.*]] ; SCALAR_TAIL_FOLDING: vector.body: ; SCALAR_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] -; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP4]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP6]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; SCALAR_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] ; SCALAR_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext nneg [[TMP7]] to @@ -400,10 +400,10 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg [[TMP12]] to ; SCALAR_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP13]] ; SCALAR_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP14]], i32 1, [[TMP11]]) -; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP16]] +; SCALAR_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add nuw i32 [[INDEX]], [[TMP5]] ; SCALAR_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] -; 
SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP17]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = icmp eq i32 [[INDEX_NEXT]], [[N_VEC]] +; SCALAR_TAIL_FOLDING-NEXT: br i1 [[TMP15]], label [[MIDDLE_BLOCK:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP6:![0-9]+]] ; SCALAR_TAIL_FOLDING: middle.block: ; SCALAR_TAIL_FOLDING-NEXT: [[CMP_N:%.*]] = icmp eq i32 [[N_MOD_VF]], 0 ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP_N]], label [[FOR_END:%.*]], label [[SCALAR_PH]] @@ -413,20 +413,20 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; SCALAR_TAIL_FOLDING: for.body: ; SCALAR_TAIL_FOLDING-NEXT: [[IX_018:%.*]] = phi i32 [ [[BC_RESUME_VAL]], [[SCALAR_PH]] ], [ [[INC:%.*]], [[FOR_INC:%.*]] ] ; SCALAR_TAIL_FOLDING-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_018]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp ugt i32 [[IX_018]], [[CONV]] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_018]], [[CONV]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[IF_END:%.*]] ; SCALAR_TAIL_FOLDING: if.then: -; SCALAR_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = zext nneg i32 [[MUL]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP18]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = zext nneg i32 [[MUL]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP16]] ; SCALAR_TAIL_FOLDING-NEXT: store i8 1, ptr [[ARRAYIDX]], align 1 ; SCALAR_TAIL_FOLDING-NEXT: br label [[IF_END]] ; SCALAR_TAIL_FOLDING: if.end: -; SCALAR_TAIL_FOLDING-NEXT: [[CMP4:%.*]] = icmp ugt i32 [[IX_018]], [[CONV3]] +; SCALAR_TAIL_FOLDING-NEXT: [[CMP4:%.*]] = icmp samesign ugt i32 [[IX_018]], [[CONV3]] ; SCALAR_TAIL_FOLDING-NEXT: br i1 [[CMP4]], label [[IF_THEN6:%.*]], label [[FOR_INC]] ; SCALAR_TAIL_FOLDING: if.then6: ; SCALAR_TAIL_FOLDING-NEXT: [[ADD:%.*]] = or disjoint i32 [[MUL]], 1 -; SCALAR_TAIL_FOLDING-NEXT: [[TMP19:%.*]] = zext nneg i32 [[ADD]] to i64 -; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP19]] +; SCALAR_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = zext nneg i32 [[ADD]] to i64 +; SCALAR_TAIL_FOLDING-NEXT: [[ARRAYIDX7:%.*]] = getelementptr inbounds i8, ptr [[Q]], i64 [[TMP17]] ; SCALAR_TAIL_FOLDING-NEXT: store i8 2, ptr [[ARRAYIDX7]], align 1 ; SCALAR_TAIL_FOLDING-NEXT: br label [[FOR_INC]] ; SCALAR_TAIL_FOLDING: for.inc: @@ -443,14 +443,14 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.ph: ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV3:%.*]] = zext i8 [[GUARD2]] to i32 ; PREDICATED_TAIL_FOLDING-NEXT: [[CONV:%.*]] = zext i8 [[GUARD1]] to i32 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = call i32 @llvm.vscale.i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP17:%.*]] = shl i32 [[TMP16]], 4 ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP0:%.*]] = call i32 @llvm.vscale.i32() ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP1:%.*]] = shl i32 [[TMP0]], 4 -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP1]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP2:%.*]] = call i32 @llvm.vscale.i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = shl i32 [[TMP2]], 4 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP4:%.*]] = call i32 @llvm.usub.sat.i32(i32 1024, i32 [[TMP3]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_ENTRY:%.*]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 0, i32 1024) -; 
PREDICATED_TAIL_FOLDING-NEXT: [[TMP3:%.*]] = call @llvm.stepvector.nxv16i32() -; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP17]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP5:%.*]] = call @llvm.stepvector.nxv16i32() +; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLATINSERT:%.*]] = insertelement poison, i32 [[TMP1]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[DOTSPLAT:%.*]] = shufflevector [[DOTSPLATINSERT]], poison, zeroinitializer ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLATINSERT:%.*]] = insertelement poison, i32 [[CONV]], i64 0 ; PREDICATED_TAIL_FOLDING-NEXT: [[BROADCAST_SPLAT:%.*]] = shufflevector [[BROADCAST_SPLATINSERT]], poison, zeroinitializer @@ -460,24 +460,24 @@ define dso_local void @masked_strided3(ptr noalias nocapture readnone %p, ptr no ; PREDICATED_TAIL_FOLDING: vector.body: ; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX:%.*]] = phi i32 [ 0, [[VECTOR_PH]] ], [ [[INDEX_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK:%.*]] = phi [ [[ACTIVE_LANE_MASK_ENTRY]], [[VECTOR_PH]] ], [ [[ACTIVE_LANE_MASK_NEXT:%.*]], [[VECTOR_BODY]] ] -; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP3]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] +; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND:%.*]] = phi [ [[TMP5]], [[VECTOR_PH]] ], [ [[VEC_IND_NEXT:%.*]], [[VECTOR_BODY]] ] ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP6:%.*]] = shl nuw nsw [[VEC_IND]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP7:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = zext nneg [[TMP6]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP8]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP9]], i32 1, [[TMP10]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP8:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP7]], zeroinitializer +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP9:%.*]] = zext nneg [[TMP6]] to +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP10:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP9]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 1, i64 0), poison, zeroinitializer), [[TMP10]], i32 1, [[TMP8]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[TMP11:%.*]] = icmp ugt [[VEC_IND]], [[BROADCAST_SPLAT2]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = or disjoint [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = zext nneg [[TMP12]] to -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP13]] -; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP14]], i32 1, [[TMP15]]) -; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP17]] -; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP2]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP12:%.*]] = select [[ACTIVE_LANE_MASK]], [[TMP11]], zeroinitializer 
+; PREDICATED_TAIL_FOLDING-NEXT: [[TMP13:%.*]] = or disjoint [[TMP6]], shufflevector ( insertelement ( poison, i32 1, i64 0), poison, zeroinitializer) +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP14:%.*]] = zext nneg [[TMP13]] to +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP15:%.*]] = getelementptr inbounds i8, ptr [[Q]], [[TMP14]] +; PREDICATED_TAIL_FOLDING-NEXT: call void @llvm.masked.scatter.nxv16i8.nxv16p0( shufflevector ( insertelement ( poison, i8 2, i64 0), poison, zeroinitializer), [[TMP15]], i32 1, [[TMP12]]) +; PREDICATED_TAIL_FOLDING-NEXT: [[INDEX_NEXT]] = add i32 [[INDEX]], [[TMP1]] +; PREDICATED_TAIL_FOLDING-NEXT: [[ACTIVE_LANE_MASK_NEXT]] = call @llvm.get.active.lane.mask.nxv16i1.i32(i32 [[INDEX]], i32 [[TMP4]]) ; PREDICATED_TAIL_FOLDING-NEXT: [[VEC_IND_NEXT]] = add [[VEC_IND]], [[DOTSPLAT]] -; PREDICATED_TAIL_FOLDING-NEXT: [[TMP18:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 -; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP18]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]] +; PREDICATED_TAIL_FOLDING-NEXT: [[TMP16:%.*]] = extractelement [[ACTIVE_LANE_MASK_NEXT]], i64 0 +; PREDICATED_TAIL_FOLDING-NEXT: br i1 [[TMP16]], label [[VECTOR_BODY]], label [[MIDDLE_BLOCK:%.*]], !llvm.loop [[LOOP6:![0-9]+]] ; PREDICATED_TAIL_FOLDING: middle.block: ; PREDICATED_TAIL_FOLDING-NEXT: br i1 true, label [[FOR_END:%.*]], label [[SCALAR_PH]] ; PREDICATED_TAIL_FOLDING: scalar.ph: diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll index 15819070f1e1..fd35fdee1613 100644 --- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll +++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-widen-phi.ll @@ -223,7 +223,7 @@ define i32 @pointer_iv_mixed(ptr noalias %a, ptr noalias %b, i64 %n) #0 { ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N:%.*]], i64 1) ; CHECK-NEXT: [[TMP0:%.*]] = call i64 @llvm.vscale.i64() ; CHECK-NEXT: [[TMP1:%.*]] = shl nuw nsw i64 [[TMP0]], 1 -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i64 [[SMAX]], [[TMP1]] +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i64 [[SMAX]], [[TMP1]] ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[SCALAR_PH:%.*]], label [[VECTOR_PH:%.*]] ; CHECK: vector.ph: ; CHECK-NEXT: [[TMP2:%.*]] = call i64 @llvm.vscale.i64() diff --git a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll index 968058134690..d25b03943fa2 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/x86-interleaved-accesses-masked-group.ll @@ -153,7 +153,7 @@ define dso_local void @masked_strided1(ptr noalias nocapture readonly %p, ptr no ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[TMP4]], label [[FOR_BODY:%.*]], label [[VECTOR_BODY]], !llvm.loop [[LOOP0:![0-9]+]] ; ENABLED_MASKED_STRIDED: for.body: ; ENABLED_MASKED_STRIDED-NEXT: [[IX_09:%.*]] = phi i32 [ [[INC:%.*]], [[FOR_INC:%.*]] ], [ 1016, [[VECTOR_BODY]] ] -; ENABLED_MASKED_STRIDED-NEXT: [[CMP1:%.*]] = icmp ugt i32 [[IX_09]], [[CONV]] +; ENABLED_MASKED_STRIDED-NEXT: [[CMP1:%.*]] = icmp samesign ugt i32 [[IX_09]], [[CONV]] ; ENABLED_MASKED_STRIDED-NEXT: br i1 [[CMP1]], label [[IF_THEN:%.*]], label [[FOR_INC]] ; ENABLED_MASKED_STRIDED: if.then: ; ENABLED_MASKED_STRIDED-NEXT: [[MUL:%.*]] = shl nuw nsw i32 [[IX_09]], 1 diff --git a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll 
b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll index 2878786cb972..54c08b47598e 100644 --- a/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll +++ b/llvm/test/Transforms/LoopVectorize/interleaved-accesses.ll @@ -113,7 +113,7 @@ define void @test_struct_array_load3_store3() { ; CHECK-NEXT: [[STRIDED_VEC2:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> ; CHECK-NEXT: [[STRIDED_VEC3:%.*]] = shufflevector <12 x i32> [[WIDE_VEC]], <12 x i32> poison, <4 x i32> ; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[STRIDED_VEC]], -; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds [1024 x %struct.ST3], ptr @S, i64 0, i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = add nsw <4 x i32> [[STRIDED_VEC2]], ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[STRIDED_VEC3]], ; CHECK-NEXT: [[TMP4:%.*]] = shufflevector <4 x i32> [[TMP0]], <4 x i32> [[TMP1]], <8 x i32> @@ -268,7 +268,7 @@ define void @test_struct_store4(ptr noalias nocapture readonly %A, ptr noalias n ; CHECK-NEXT: [[NEXT_GEP:%.*]] = getelementptr i8, ptr [[A:%.*]], i64 [[OFFSET_IDX]] ; CHECK-NEXT: [[WIDE_LOAD:%.*]] = load <4 x i32>, ptr [[NEXT_GEP]], align 4 ; CHECK-NEXT: [[TMP0:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], -; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[B:%.*]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP4:%.*]] = getelementptr inbounds [[STRUCT_ST4:%.*]], ptr [[B:%.*]], i64 [[INDEX]], i32 0 ; CHECK-NEXT: [[TMP1:%.*]] = shl nsw <4 x i32> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP2:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], ; CHECK-NEXT: [[TMP3:%.*]] = add nsw <4 x i32> [[WIDE_LOAD]], @@ -443,7 +443,7 @@ define void @even_load_static_tc(ptr noalias nocapture readonly %A, ptr noalias ; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[B]], i64 [[TMP1]] ; CHECK-NEXT: store i32 [[MUL]], ptr [[ARRAYIDX2]], align 4 ; CHECK-NEXT: [[INDVARS_IV_NEXT]] = add nuw nsw i64 [[INDVARS_IV]], 2 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i64 [[INDVARS_IV]], 1022 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i64 [[INDVARS_IV]], 1022 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]], !llvm.loop [[LOOP13:![0-9]+]] ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll index db0656da579f..ec46ef959ce6 100644 --- a/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll +++ b/llvm/test/Transforms/PhaseOrdering/AArch64/matrix-extract-insert.ll @@ -10,13 +10,13 @@ define void @matrix_extract_insert_scalar(i32 %i, i32 %k, i32 %j, ptr nonnull al ; CHECK-NEXT: [[CONV1:%.*]] = zext i32 [[J:%.*]] to i64 ; CHECK-NEXT: [[TMP0:%.*]] = mul nuw nsw i64 [[CONV1]], 15 ; CHECK-NEXT: [[TMP1:%.*]] = add nuw nsw i64 [[TMP0]], [[CONV]] -; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i64 [[TMP1]], 225 +; CHECK-NEXT: [[TMP2:%.*]] = icmp samesign ult i64 [[TMP1]], 225 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]]) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <225 x double>, ptr [[A:%.*]], i64 0, i64 [[TMP1]] ; CHECK-NEXT: [[MATRIXEXT:%.*]] = load double, ptr [[TMP3]], align 8 ; CHECK-NEXT: [[CONV2:%.*]] = zext i32 [[I:%.*]] to i64 ; CHECK-NEXT: [[TMP4:%.*]] = add nuw nsw i64 [[TMP0]], [[CONV2]] -; CHECK-NEXT: [[TMP5:%.*]] = icmp ult i64 [[TMP4]], 225 +; CHECK-NEXT: [[TMP5:%.*]] = icmp samesign ult i64 [[TMP4]], 225 ; CHECK-NEXT: tail call void 
@llvm.assume(i1 [[TMP5]]) ; CHECK-NEXT: [[TMP6:%.*]] = getelementptr inbounds <225 x double>, ptr [[B:%.*]], i64 0, i64 [[TMP4]] ; CHECK-NEXT: [[MATRIXEXT4:%.*]] = load double, ptr [[TMP6]], align 8 @@ -94,7 +94,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: [[TMP2:%.*]] = icmp ult i32 [[I]], 225 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP2]]) ; CHECK-NEXT: [[TMP3:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[CONV6]] -; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp ult i32 [[I]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK:%.*]] = icmp samesign ult i32 [[I]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK]], label [[FOR_BODY4_US_PREHEADER:%.*]], label [[VECTOR_MEMCHECK:%.*]] ; CHECK: vector.memcheck: ; CHECK-NEXT: [[BOUND0:%.*]] = icmp ult ptr [[B]], [[SCEVGEP20]] @@ -151,7 +151,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: br label [[FOR_BODY4_US:%.*]] ; CHECK: for.body4.us: ; CHECK-NEXT: [[INDVARS_IV:%.*]] = phi i64 [ [[INDVARS_IV_NEXT:%.*]], [[FOR_BODY4_US]] ], [ [[INDVARS_IV_PH]], [[FOR_BODY4_US_PREHEADER]] ] -; CHECK-NEXT: [[TMP27:%.*]] = icmp ult i64 [[INDVARS_IV]], 225 +; CHECK-NEXT: [[TMP27:%.*]] = icmp samesign ult i64 [[INDVARS_IV]], 225 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP27]]) ; CHECK-NEXT: [[TMP28:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[INDVARS_IV]] ; CHECK-NEXT: [[MATRIXEXT_US:%.*]] = load double, ptr [[TMP28]], align 8 @@ -166,10 +166,10 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US]], label [[FOR_BODY4_US]], !llvm.loop [[LOOP10:![0-9]+]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us: ; CHECK-NEXT: [[TMP30:%.*]] = add nuw nsw i64 [[CONV6]], 15 -; CHECK-NEXT: [[TMP31:%.*]] = icmp ult i32 [[I]], 210 +; CHECK-NEXT: [[TMP31:%.*]] = icmp samesign ult i32 [[I]], 210 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP31]]) ; CHECK-NEXT: [[TMP32:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP30]] -; CHECK-NEXT: [[MIN_ITERS_CHECK_1:%.*]] = icmp ult i32 [[I]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK_1:%.*]] = icmp samesign ult i32 [[I]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_1]], label [[FOR_BODY4_US_PREHEADER_1:%.*]], label [[VECTOR_MEMCHECK_1:%.*]] ; CHECK: vector.memcheck.1: ; CHECK-NEXT: [[BOUND0_1:%.*]] = icmp ult ptr [[B]], [[SCEVGEP20]] @@ -228,7 +228,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: for.body4.us.1: ; CHECK-NEXT: [[INDVARS_IV_1:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_1:%.*]], [[FOR_BODY4_US_1]] ], [ [[INDVARS_IV_PH_1]], [[FOR_BODY4_US_PREHEADER_1]] ] ; CHECK-NEXT: [[TMP57:%.*]] = add nuw nsw i64 [[INDVARS_IV_1]], 15 -; CHECK-NEXT: [[TMP58:%.*]] = icmp ult i64 [[INDVARS_IV_1]], 210 +; CHECK-NEXT: [[TMP58:%.*]] = icmp samesign ult i64 [[INDVARS_IV_1]], 210 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP58]]) ; CHECK-NEXT: [[TMP59:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP57]] ; CHECK-NEXT: [[MATRIXEXT_US_1:%.*]] = load double, ptr [[TMP59]], align 8 @@ -243,10 +243,10 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: br i1 [[EXITCOND_NOT_1]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_1]], label [[FOR_BODY4_US_1]], !llvm.loop [[LOOP10]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.1: ; CHECK-NEXT: [[TMP61:%.*]] = add 
nuw nsw i64 [[CONV6]], 30 -; CHECK-NEXT: [[TMP62:%.*]] = icmp ult i32 [[I]], 195 +; CHECK-NEXT: [[TMP62:%.*]] = icmp samesign ult i32 [[I]], 195 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP62]]) ; CHECK-NEXT: [[TMP63:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP61]] -; CHECK-NEXT: [[MIN_ITERS_CHECK_2:%.*]] = icmp ult i32 [[I]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK_2:%.*]] = icmp samesign ult i32 [[I]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_2]], label [[FOR_BODY4_US_PREHEADER_2:%.*]], label [[VECTOR_MEMCHECK_2:%.*]] ; CHECK: vector.memcheck.2: ; CHECK-NEXT: [[BOUND0_2:%.*]] = icmp ult ptr [[B]], [[SCEVGEP20]] @@ -305,7 +305,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: for.body4.us.2: ; CHECK-NEXT: [[INDVARS_IV_2:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_2:%.*]], [[FOR_BODY4_US_2]] ], [ [[INDVARS_IV_PH_2]], [[FOR_BODY4_US_PREHEADER_2]] ] ; CHECK-NEXT: [[TMP88:%.*]] = add nuw nsw i64 [[INDVARS_IV_2]], 30 -; CHECK-NEXT: [[TMP89:%.*]] = icmp ult i64 [[INDVARS_IV_2]], 195 +; CHECK-NEXT: [[TMP89:%.*]] = icmp samesign ult i64 [[INDVARS_IV_2]], 195 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP89]]) ; CHECK-NEXT: [[TMP90:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP88]] ; CHECK-NEXT: [[MATRIXEXT_US_2:%.*]] = load double, ptr [[TMP90]], align 8 @@ -320,10 +320,10 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK-NEXT: br i1 [[EXITCOND_NOT_2]], label [[FOR_COND1_FOR_COND_CLEANUP3_CRIT_EDGE_US_2]], label [[FOR_BODY4_US_2]], !llvm.loop [[LOOP10]] ; CHECK: for.cond1.for.cond.cleanup3_crit_edge.us.2: ; CHECK-NEXT: [[TMP92:%.*]] = add nuw nsw i64 [[CONV6]], 45 -; CHECK-NEXT: [[TMP93:%.*]] = icmp ult i32 [[I]], 180 +; CHECK-NEXT: [[TMP93:%.*]] = icmp samesign ult i32 [[I]], 180 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP93]]) ; CHECK-NEXT: [[TMP94:%.*]] = getelementptr inbounds <225 x double>, ptr [[B]], i64 0, i64 [[TMP92]] -; CHECK-NEXT: [[MIN_ITERS_CHECK_3:%.*]] = icmp ult i32 [[I]], 4 +; CHECK-NEXT: [[MIN_ITERS_CHECK_3:%.*]] = icmp samesign ult i32 [[I]], 4 ; CHECK-NEXT: br i1 [[MIN_ITERS_CHECK_3]], label [[FOR_BODY4_US_PREHEADER_3:%.*]], label [[VECTOR_MEMCHECK_3:%.*]] ; CHECK: vector.memcheck.3: ; CHECK-NEXT: [[BOUND0_3:%.*]] = icmp ult ptr [[B]], [[SCEVGEP20]] @@ -382,7 +382,7 @@ define void @matrix_extract_insert_loop(i32 %i, ptr nonnull align 8 dereferencea ; CHECK: for.body4.us.3: ; CHECK-NEXT: [[INDVARS_IV_3:%.*]] = phi i64 [ [[INDVARS_IV_NEXT_3:%.*]], [[FOR_BODY4_US_3]] ], [ [[INDVARS_IV_PH_3]], [[FOR_BODY4_US_PREHEADER_3]] ] ; CHECK-NEXT: [[TMP119:%.*]] = add nuw nsw i64 [[INDVARS_IV_3]], 45 -; CHECK-NEXT: [[TMP120:%.*]] = icmp ult i64 [[INDVARS_IV_3]], 180 +; CHECK-NEXT: [[TMP120:%.*]] = icmp samesign ult i64 [[INDVARS_IV_3]], 180 ; CHECK-NEXT: tail call void @llvm.assume(i1 [[TMP120]]) ; CHECK-NEXT: [[TMP121:%.*]] = getelementptr inbounds <225 x double>, ptr [[A]], i64 0, i64 [[TMP119]] ; CHECK-NEXT: [[MATRIXEXT_US_3:%.*]] = load double, ptr [[TMP121]], align 8 diff --git a/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll b/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll index 397c2571b3b6..eb5e279947ec 100644 --- a/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll +++ b/llvm/test/Transforms/PhaseOrdering/X86/ctlz-loop.ll @@ -32,7 +32,7 @@ define i32 @ctlz_loop_with_abs(i32 %n) { ; CHECK-NEXT: [[I_02:%.*]] = phi i32 [ [[INC:%.*]], [[WHILE_BODY]] ], [ 0, [[WHILE_BODY_PREHEADER]] ] ; CHECK-NEXT: [[TMP1]] = lshr i32 
[[N_ADDR_03]], 1 ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_02]], 1 -; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp ult i32 [[N_ADDR_03]], 2 +; CHECK-NEXT: [[TOBOOL_NOT:%.*]] = icmp samesign ult i32 [[N_ADDR_03]], 2 ; CHECK-NEXT: br i1 [[TOBOOL_NOT]], label [[WHILE_END]], label [[WHILE_BODY]] ; CHECK: while.end: ; CHECK-NEXT: [[I_0_LCSSA:%.*]] = phi i32 [ 0, [[ENTRY:%.*]] ], [ [[INC]], [[WHILE_BODY]] ] diff --git a/llvm/test/Transforms/PhaseOrdering/gvn-replacement-vs-hoist.ll b/llvm/test/Transforms/PhaseOrdering/gvn-replacement-vs-hoist.ll index 522ebf9dcc04..862f40a9ae2e 100644 --- a/llvm/test/Transforms/PhaseOrdering/gvn-replacement-vs-hoist.ll +++ b/llvm/test/Transforms/PhaseOrdering/gvn-replacement-vs-hoist.ll @@ -26,7 +26,7 @@ define void @test(ptr noundef %a, i32 noundef %beam) { ; CHECK-NEXT: br label [[FOR_INC]] ; CHECK: for.inc: ; CHECK-NEXT: [[INC]] = add nuw nsw i32 [[I_06]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[I_06]], 9999 +; CHECK-NEXT: [[CMP:%.*]] = icmp samesign ult i32 [[I_06]], 9999 ; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[FOR_COND_CLEANUP:%.*]] ; entry: -- GitLab From 388d7f144880dcd85ff31f06793304405a9f44b6 Mon Sep 17 00:00:00 2001 From: Mikhnenko Sasha <78651190+4JustMe4@users.noreply.github.com> Date: Thu, 17 Oct 2024 15:44:45 +0300 Subject: [PATCH 247/329] Fix inconsistent docs for AST matcher methods (#112190) [Here](https://github.com/llvm/llvm-project/blob/6a98c4a1602591c942f01dceb3aa29ffd4cf1e5b/clang/include/clang/ASTMatchers/ASTMatchers.h#L4188-L4203) and [here](https://github.com/llvm/llvm-project/blob/6a98c4a1602591c942f01dceb3aa29ffd4cf1e5b/clang/include/clang/ASTMatchers/ASTMatchers.h#L3679-L3695) we can see similar code samples and the same example: ``` cxxMemberCallExpr(on(callExpr())) ``` In the first case, it is [written](https://github.com/llvm/llvm-project/blob/6a98c4a1602591c942f01dceb3aa29ffd4cf1e5b/clang/include/clang/ASTMatchers/ASTMatchers.h#L4201) that the expression is not matched: ``` /// cxxMemberCallExpr(on(callExpr())) /// does not match `(g()).m()`, because the parens are not ignored. ``` In the second case, it is [written](https://github.com/llvm/llvm-project/blob/6a98c4a1602591c942f01dceb3aa29ffd4cf1e5b/clang/include/clang/ASTMatchers/ASTMatchers.h#L3693) that the expression is matched: ``` /// cxxMemberCallExpr(on(callExpr())) /// matches `(g()).m()`. ``` I believe the parens are in fact ignored, so this patch brings the two descriptions into agreement. --- clang/docs/LibASTMatchersReference.html | 4 ++-- clang/include/clang/ASTMatchers/ASTMatchers.h | 4 ++-- 2 files changed, 4 insertions(+), 4 deletions(-) diff --git a/clang/docs/LibASTMatchersReference.html b/clang/docs/LibASTMatchersReference.html index a16b9c44ef0e..c6307954d7f1 100644 --- a/clang/docs/LibASTMatchersReference.html +++ b/clang/docs/LibASTMatchersReference.html @@ -7239,9 +7239,9 @@ Given void z(Y y, X x) { y.m(); x.m(); x.g(); (g()).m(); } cxxMemberCallExpr(onImplicitObjectArgument(hasType( cxxRecordDecl(hasName("Y"))))) - matches `y.m()`, `x.m()` and (g()).m(), but not `x.g()`. + matches `y.m()`, `x.m()` and (`g()).m()`, but not `x.g()`). cxxMemberCallExpr(on(callExpr())) - does not match `(g()).m()`, because the parens are not ignored. + only matches `(g()).m()` (the parens are ignored). FIXME: Overload to allow directly matching types?
diff --git a/clang/include/clang/ASTMatchers/ASTMatchers.h b/clang/include/clang/ASTMatchers/ASTMatchers.h index f1c72efc2387..54e484d41fb1 100644 --- a/clang/include/clang/ASTMatchers/ASTMatchers.h +++ b/clang/include/clang/ASTMatchers/ASTMatchers.h @@ -4197,9 +4197,9 @@ AST_MATCHER_P_OVERLOAD(QualType, references, internal::Matcher, /// \endcode /// cxxMemberCallExpr(onImplicitObjectArgument(hasType( /// cxxRecordDecl(hasName("Y"))))) -/// matches `y.m()`, `x.m()` and (g()).m(), but not `x.g()`. +/// matches `y.m()`, `x.m()` and (`g()).m()`, but not `x.g()`). /// cxxMemberCallExpr(on(callExpr())) -/// does not match `(g()).m()`, because the parens are not ignored. +/// only matches `(g()).m()` (the parens are ignored). /// /// FIXME: Overload to allow directly matching types? AST_MATCHER_P(CXXMemberCallExpr, onImplicitObjectArgument, -- GitLab From ea796e5237afbbef396b21ac04d4f32557c8db61 Mon Sep 17 00:00:00 2001 From: VladiKrapp-Arm Date: Thu, 17 Oct 2024 13:53:22 +0100 Subject: [PATCH 248/329] [ARM] Prefer MUL to MULS on some implementations (#112540) MULS adversely affects performance on many implementations. Where this is the case, we prefer not to shrink MUL to MULS. --- llvm/lib/Target/ARM/ARMFeatures.td | 7 +++ llvm/lib/Target/ARM/ARMProcessors.td | 1 + llvm/lib/Target/ARM/Thumb2SizeReduction.cpp | 3 + llvm/test/CodeGen/Thumb2/avoidmuls.mir | 67 +++------------------ 4 files changed, 21 insertions(+), 57 deletions(-) diff --git a/llvm/lib/Target/ARM/ARMFeatures.td b/llvm/lib/Target/ARM/ARMFeatures.td index 3a2188adbec3..bb437698296c 100644 --- a/llvm/lib/Target/ARM/ARMFeatures.td +++ b/llvm/lib/Target/ARM/ARMFeatures.td @@ -398,6 +398,13 @@ def FeatureAvoidPartialCPSR : SubtargetFeature<"avoid-partial-cpsr", "AvoidCPSRPartialUpdate", "true", "Avoid CPSR partial update for OOO execution">; +/// FeatureAvoidMULS - If true, codegen will avoid using the MULS instruction, +/// preferring the Thumb2 MUL, which doesn't set flags. +def FeatureAvoidMULS : SubtargetFeature<"avoid-muls", + "AvoidMULS", "true", + "Avoid MULS instructions for M class cores">; + + /// Disable +1 predication cost for instructions updating CPSR. /// Enabled for Cortex-A57. /// True if disable +1 predication cost for instructions updating CPSR. Enabled for Cortex-A57. diff --git a/llvm/lib/Target/ARM/ARMProcessors.td b/llvm/lib/Target/ARM/ARMProcessors.td index 08f62d12f4a9..b94a5fc16146 100644 --- a/llvm/lib/Target/ARM/ARMProcessors.td +++ b/llvm/lib/Target/ARM/ARMProcessors.td @@ -360,6 +360,7 @@ def : ProcessorModel<"cortex-m33", CortexM4Model, [ARMv8mMainline, FeatureHasSlowFPVFMx, FeatureUseMISched, FeatureHasNoBranchPredictor, + FeatureAvoidMULS, FeatureFixCMSE_CVE_2021_35465]>; def : ProcessorModel<"star-mc1", CortexM4Model, [ARMv8mMainline, diff --git a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp index f572af986007..f4a9915a78b9 100644 --- a/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp +++ b/llvm/lib/Target/ARM/Thumb2SizeReduction.cpp @@ -755,6 +755,9 @@ Thumb2SizeReduce::ReduceTo2Addr(MachineBasicBlock &MBB, MachineInstr *MI, Register Reg1 = MI->getOperand(1).getReg(); // t2MUL is "special". The tied source operand is second, not first. if (MI->getOpcode() == ARM::t2MUL) { + // MULS can be slower than MUL + if (!MinimizeSize && STI->avoidMULS()) + return false; Register Reg2 = MI->getOperand(2).getReg(); // Early exit if the regs aren't all low regs.
if (!isARMLowRegister(Reg0) || !isARMLowRegister(Reg1) diff --git a/llvm/test/CodeGen/Thumb2/avoidmuls.mir b/llvm/test/CodeGen/Thumb2/avoidmuls.mir index 8d5567482d5c..865152068fdf 100644 --- a/llvm/test/CodeGen/Thumb2/avoidmuls.mir +++ b/llvm/test/CodeGen/Thumb2/avoidmuls.mir @@ -1,67 +1,20 @@ -# RUN: llc -run-pass=thumb2-reduce-size %s -o - | FileCheck %s +# RUN: llc -mtriple=thumbv7m-none-eabi -mcpu=cortex-m33 -run-pass=thumb2-reduce-size %s -o - | FileCheck %s --check-prefix=MUL +# RUN: llc -mtriple=thumbv7m-none-eabi --run-pass=thumb2-reduce-size %s -o - | FileCheck %s --check-prefix=MULS ---- | - target datalayout = "e-m:e-p:32:32-i64:64-v128:64:128-a:0:32-n32-S64" - target triple = "thumbv8m.main-arm-none-eabi" - - ; Function Attrs: norecurse nounwind readnone - define i32 @test(i32 %x, i32 %y) local_unnamed_addr #0 { - entry: - %cmp6 = icmp sgt i32 %y, 0 - br i1 %cmp6, label %for.body.preheader, label %for.cond.cleanup - - for.body.preheader: ; preds = %entry - br label %for.body - - for.cond.cleanup: ; preds = %for.body, %entry - %sum.0.lcssa = phi i32 [ 1, %entry ], [ %mul, %for.body ] - ret i32 %sum.0.lcssa - - for.body: ; preds = %for.body, %for.body.preheader - %lsr.iv1 = phi i32 [ %lsr.iv.next2, %for.body ], [ %x, %for.body.preheader ] - %lsr.iv = phi i32 [ %lsr.iv.next, %for.body ], [ %y, %for.body.preheader ] - %sum.07 = phi i32 [ %mul, %for.body ], [ 1, %for.body.preheader ] - %mul = mul nsw i32 %lsr.iv1, %sum.07 - %lsr.iv.next = add i32 %lsr.iv, -1 - %lsr.iv.next2 = add i32 %lsr.iv1, 1 - %exitcond = icmp eq i32 %lsr.iv.next, 0 - br i1 %exitcond, label %for.cond.cleanup, label %for.body - } - - attributes #0 = { norecurse nounwind readnone "correctly-rounded-divide-sqrt-fp-math"="false" "disable-tail-calls"="false" "less-precise-fpmad"="false" "no-frame-pointer-elim"="false" "no-infs-fp-math"="false" "no-jump-tables"="false" "no-nans-fp-math"="false" "no-signed-zeros-fp-math"="false" "no-trapping-math"="false" "stack-protector-buffer-size"="8" "target-cpu"="cortex-m33" "target-features"="-d32,+dsp,+fp-armv8,-fp64,+hwdiv,+strict-align,+thumb-mode,-crc,-dotprod,-hwdiv-arm,-ras" "unsafe-fp-math"="false" "use-soft-float"="false" } - -... --- name: test -tracksRegLiveness: true -liveins: - - { reg: '$r0', virtual-reg: '' } - - { reg: '$r1', virtual-reg: '' } body: | - bb.0.entry: - successors: %bb.1.for.body, %bb.2.for.cond.cleanup - liveins: $r0, $r1 - + bb.0: $r2 = tMOVr $r0, 14, _ $r0 = t2MOVi 1, 14, _, _ - t2CMPri $r1, 1, 14, _, implicit-def $cpsr - t2Bcc %bb.2.for.cond.cleanup, 11, killed $cpsr - - bb.1.for.body: - successors: %bb.2.for.cond.cleanup, %bb.1.for.body - liveins: $r0, $r1, $r2 - $r0 = t2MUL $r2, killed $r0, 14, _ - $r2 = t2ADDri killed $r2, 1, 14, _, _ - $r1 = t2SUBri killed $r1, 1, 14, _, def $cpsr - t2Bcc %bb.1.for.body, 1, killed $cpsr - - bb.2.for.cond.cleanup: - liveins: $r0 - tBX_RET 14, _, implicit $r0 ... 
-# CHECK-LABEL: test -# CHECK: tMUL -# CHECK-NOT: t2MUL +# MUL-LABEL: test +# MUL: t2MUL +# MUL-NOT: tMUL + +# MULS-LABEL: test +# MULS: tMUL +# MULS-NOT: t2MUL \ No newline at end of file -- GitLab From d9cd6072000488a80ba1c602f16a65055c594e0f Mon Sep 17 00:00:00 2001 From: Nikita Popov Date: Thu, 17 Oct 2024 14:56:30 +0200 Subject: [PATCH 249/329] [InstCombine] Add tests for #110919 (NFC) --- .../Transforms/InstCombine/eq-of-parts.ll | 100 ++++++++++++++++++ 1 file changed, 100 insertions(+) diff --git a/llvm/test/Transforms/InstCombine/eq-of-parts.ll b/llvm/test/Transforms/InstCombine/eq-of-parts.ll index 217e37b85933..afe5d6af1fcd 100644 --- a/llvm/test/Transforms/InstCombine/eq-of-parts.ll +++ b/llvm/test/Transforms/InstCombine/eq-of-parts.ll @@ -1438,3 +1438,103 @@ define i1 @ne_optimized_highbits_cmp_todo_overlapping(i32 %x, i32 %y) { %r = or i1 %cmp_hi, %cmp_lo ret i1 %r } + +define i1 @and_trunc_i1(i8 %a1, i8 %a2) { +; CHECK-LABEL: @and_trunc_i1( +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[XOR]], 2 +; CHECK-NEXT: [[LOBIT:%.*]] = trunc i8 [[XOR]] to i1 +; CHECK-NEXT: [[LOBIT_INV:%.*]] = xor i1 [[LOBIT]], true +; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[LOBIT_INV]] +; CHECK-NEXT: ret i1 [[AND]] +; + %xor = xor i8 %a1, %a2 + %cmp = icmp ult i8 %xor, 2 + %lobit = trunc i8 %xor to i1 + %lobit.inv = xor i1 %lobit, true + %and = and i1 %cmp, %lobit.inv + ret i1 %and +} + +define i1 @and_trunc_i1_wrong_const(i8 %a1, i8 %a2) { +; CHECK-LABEL: @and_trunc_i1_wrong_const( +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[XOR]], 4 +; CHECK-NEXT: [[LOBIT:%.*]] = trunc i8 [[XOR]] to i1 +; CHECK-NEXT: [[LOBIT_INV:%.*]] = xor i1 [[LOBIT]], true +; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[LOBIT_INV]] +; CHECK-NEXT: ret i1 [[AND]] +; + %xor = xor i8 %a1, %a2 + %cmp = icmp ult i8 %xor, 4 + %lobit = trunc i8 %xor to i1 + %lobit.inv = xor i1 %lobit, true + %and = and i1 %cmp, %lobit.inv + ret i1 %and +} + +define i1 @and_trunc_i1_wrong_operands(i8 %a1, i8 %a2, i8 %a3) { +; CHECK-LABEL: @and_trunc_i1_wrong_operands( +; CHECK-NEXT: [[XOR:%.*]] = xor i8 [[A1:%.*]], [[A2:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i8 [[XOR]], 2 +; CHECK-NEXT: [[XOR2:%.*]] = xor i8 [[A1]], [[A3:%.*]] +; CHECK-NEXT: [[LOBIT:%.*]] = trunc i8 [[XOR2]] to i1 +; CHECK-NEXT: [[LOBIT_INV:%.*]] = xor i1 [[LOBIT]], true +; CHECK-NEXT: [[AND:%.*]] = and i1 [[CMP]], [[LOBIT_INV]] +; CHECK-NEXT: ret i1 [[AND]] +; + %xor = xor i8 %a1, %a2 + %cmp = icmp ult i8 %xor, 2 + %xor2 = xor i8 %a1, %a3 + %lobit = trunc i8 %xor2 to i1 + %lobit.inv = xor i1 %lobit, true + %and = and i1 %cmp, %lobit.inv + ret i1 %and +} + +define i1 @or_trunc_i1(i64 %a1, i64 %a2) { +; CHECK-LABEL: @or_trunc_i1( +; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[A2:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[XOR]], 1 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[XOR]] to i1 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP]], [[TRUNC]] +; CHECK-NEXT: ret i1 [[OR]] +; + %xor = xor i64 %a2, %a1 + %cmp = icmp ugt i64 %xor, 1 + %trunc = trunc i64 %xor to i1 + %or = or i1 %cmp, %trunc + ret i1 %or +} + +define i1 @or_trunc_i1_wrong_const(i64 %a1, i64 %a2) { +; CHECK-LABEL: @or_trunc_i1_wrong_const( +; CHECK-NEXT: [[XOR:%.*]] = xor i64 [[A2:%.*]], [[A1:%.*]] +; CHECK-NEXT: [[CMP:%.*]] = icmp ugt i64 [[XOR]], 2 +; CHECK-NEXT: [[TRUNC:%.*]] = trunc i64 [[XOR]] to i1 +; CHECK-NEXT: [[OR:%.*]] = or i1 [[CMP]], [[TRUNC]] +; CHECK-NEXT: ret i1 [[OR]] +; + %xor = xor i64 
%a2, %a1
+  %cmp = icmp ugt i64 %xor, 2
+  %trunc = trunc i64 %xor to i1
+  %or = or i1 %cmp, %trunc
+  ret i1 %or
+}
+
+define i1 @or_trunc_i1_wrong_operands(i64 %a1, i64 %a2, i64 %a3) {
+; CHECK-LABEL: @or_trunc_i1_wrong_operands(
+; CHECK-NEXT:    [[XOR:%.*]] = xor i64 [[A2:%.*]], [[A1:%.*]]
+; CHECK-NEXT:    [[CMP:%.*]] = icmp ugt i64 [[XOR]], 1
+; CHECK-NEXT:    [[XOR2:%.*]] = xor i64 [[A3:%.*]], [[A1]]
+; CHECK-NEXT:    [[TRUNC:%.*]] = trunc i64 [[XOR2]] to i1
+; CHECK-NEXT:    [[OR:%.*]] = or i1 [[CMP]], [[TRUNC]]
+; CHECK-NEXT:    ret i1 [[OR]]
+;
+  %xor = xor i64 %a2, %a1
+  %cmp = icmp ugt i64 %xor, 1
+  %xor2 = xor i64 %a3, %a1
+  %trunc = trunc i64 %xor2 to i1
+  %or = or i1 %cmp, %trunc
+  ret i1 %or
+}
--
GitLab

From c1047ba8366a447b61f845048a5f287dae24d9d0 Mon Sep 17 00:00:00 2001
From: Prashant Kumar
Date: Thu, 17 Oct 2024 18:32:03 +0530
Subject: [PATCH 250/329] [MLIR] Enable pattern only for scf.forall op
 (#110230)

The init args shape might change in the loop body, so the pattern
doesn't hold for general loop-like ops; restrict it to scf.forall.
---
 .../ResolveShapedTypeResultDims.cpp | 23 +++++++++++++++----
 1 file changed, 19 insertions(+), 4 deletions(-)

diff --git a/mlir/lib/Dialect/MemRef/Transforms/ResolveShapedTypeResultDims.cpp b/mlir/lib/Dialect/MemRef/Transforms/ResolveShapedTypeResultDims.cpp
index fb2921fec9f7..792e72291830 100644
--- a/mlir/lib/Dialect/MemRef/Transforms/ResolveShapedTypeResultDims.cpp
+++ b/mlir/lib/Dialect/MemRef/Transforms/ResolveShapedTypeResultDims.cpp
@@ -18,6 +18,7 @@
 #include "mlir/Dialect/Arith/Utils/Utils.h"
 #include "mlir/Dialect/MemRef/IR/MemRef.h"
 #include "mlir/Dialect/MemRef/Transforms/Transforms.h"
+#include "mlir/Dialect/SCF/IR/SCF.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/Interfaces/InferTypeOpInterface.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
@@ -131,11 +132,25 @@ struct IterArgsToInitArgs : public OpRewritePattern<tensor::DimOp> {
     auto blockArg = dyn_cast<BlockArgument>(dimOp.getSource());
     if (!blockArg)
       return failure();
-    auto loopLikeOp =
-        dyn_cast<LoopLikeOpInterface>(blockArg.getParentBlock()->getParentOp());
-    if (!loopLikeOp)
+    // TODO: Enable this for LoopLikeOpInterface. Restricting to scf.forall
+    // because the init args shape might change in an scf.for loop body.
+    // For e.g.:
+    // ```
+    // %0 = tensor.empty(%c1) : tensor<?xf32>
+    // %r = scf.for %iv = %c0 to %c10 step %c1 iter_args(%arg0 = %0) ->
+    //     tensor<?xf32> {
+    //   %1 = tensor.dim %arg0, %c0 : tensor<?xf32>
+    //   %2 = arith.addi %c1, %1 : index
+    //   %3 = tensor.empty(%2) : tensor<?xf32>
+    //   scf.yield %3 : tensor<?xf32>
+    // }
+    //
+    // ```
+    auto forAllOp =
+        dyn_cast<scf::ForallOp>(blockArg.getParentBlock()->getParentOp());
+    if (!forAllOp)
       return failure();
-    Value initArg = loopLikeOp.getTiedLoopInit(blockArg)->get();
+    Value initArg = forAllOp.getTiedLoopInit(blockArg)->get();
     rewriter.modifyOpInPlace(
         dimOp, [&]() { dimOp.getSourceMutable().assign(initArg); });
     return success();
--
GitLab

From 51b4ada4588ecb3044b57c325a59aedcc19d7084 Mon Sep 17 00:00:00 2001
From: Matt Arsenault
Date: Thu, 17 Oct 2024 17:10:45 +0400
Subject: [PATCH 251/329] clang/AMDGPU: Set noalias.addrspace metadata on
 atomicrmw (#102462)

---
 clang/include/clang/AST/Expr.h              |  11 +
 clang/include/clang/Basic/LangOptions.h     |   8 +
 clang/lib/CodeGen/CGAtomic.cpp              |   9 +-
 clang/lib/CodeGen/CodeGenFunction.h         |   3 +-
 clang/lib/CodeGen/TargetInfo.h              |   4 +-
 clang/lib/CodeGen/Targets/AMDGPU.cpp        |  36 ++-
 clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu |  94 +++++--
 clang/test/CodeGenCUDA/atomic-ops.cu        | 286 ++++++++++----------
 clang/test/CodeGenOpenCL/atomic-ops.cl      | 116 ++++----
 9 files changed, 339 insertions(+), 228 deletions(-)

diff --git a/clang/include/clang/AST/Expr.h b/clang/include/clang/AST/Expr.h
index cbe62411d11b..466c65a9685a 100644
--- a/clang/include/clang/AST/Expr.h
+++ b/clang/include/clang/AST/Expr.h
@@ -6777,6 +6777,17 @@ public:
            getOp() <= AO__opencl_atomic_store;
   }
 
+  bool isHIP() const {
+    return Op >= AO__hip_atomic_compare_exchange_strong &&
+           Op <= AO__hip_atomic_store;
+  }
+
+  /// Return true if atomic operations targeting allocations in private memory
+  /// are undefined.
+  bool threadPrivateMemoryAtomicsAreUndefined() const {
+    return isOpenCL() || isHIP();
+  }
+
   SourceLocation getBuiltinLoc() const { return BuiltinLoc; }
   SourceLocation getRParenLoc() const { return RParenLoc; }
 
diff --git a/clang/include/clang/Basic/LangOptions.h b/clang/include/clang/Basic/LangOptions.h
index e2d4206c72cc..949c8f5d448b 100644
--- a/clang/include/clang/Basic/LangOptions.h
+++ b/clang/include/clang/Basic/LangOptions.h
@@ -698,6 +698,14 @@ public:
     return ConvergentFunctions;
   }
 
+  /// Return true if atomicrmw operations targeting allocations in private
+  /// memory are undefined.
+  bool threadPrivateMemoryAtomicsAreUndefined() const {
+    // Should be false for OpenMP.
+    // TODO: Should this be true for SYCL?
+    return OpenCL || CUDA;
+  }
+
   /// Return the OpenCL C or C++ version as a VersionTuple.
   VersionTuple getOpenCLVersionTuple() const;
 
diff --git a/clang/lib/CodeGen/CGAtomic.cpp b/clang/lib/CodeGen/CGAtomic.cpp
index a2a87e012b8b..f8736695acf1 100644
--- a/clang/lib/CodeGen/CGAtomic.cpp
+++ b/clang/lib/CodeGen/CGAtomic.cpp
@@ -389,6 +389,7 @@ static void emitAtomicCmpXchg(CodeGenFunction &CGF, AtomicExpr *E, bool IsWeak,
       Ptr, Expected, Desired, SuccessOrder, FailureOrder, Scope);
   Pair->setVolatile(E->isVolatile());
   Pair->setWeak(IsWeak);
+  CGF.getTargetHooks().setTargetAtomicMetadata(CGF, *Pair, E);
 
   // Cmp holds the result of the compare-exchange operation: true on success,
   // false on failure.
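For context, a minimal HIP sketch of what this change affects at the source
level (distilled from the tests updated below; the function name and the
metadata id `!0` are illustrative, not part of the patch):

```cpp
// A scoped HIP atomic on a generic pointer now carries !noalias.addrspace
// metadata excluding the AMDGPU private address space (5), i.e. the range
// [5, 6), so backends may assume the pointer never addresses
// thread-private stack memory. Emitted IR (sketch):
//   atomicrmw add ptr %p, i32 1 syncscope("agent-one-as") monotonic,
//       align 4, !noalias.addrspace !0
//   !0 = !{i32 5, i32 6}
__device__ int inc_agent(int *p) {
  return __hip_atomic_fetch_add(p, 1, __ATOMIC_RELAXED,
                                __HIP_MEMORY_SCOPE_AGENT);
}
```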
@@ -727,7 +728,7 @@ static void EmitAtomicOp(CodeGenFunction &CGF, AtomicExpr *E, Address Dest,
 
   llvm::Value *LoadVal1 = CGF.Builder.CreateLoad(Val1);
   llvm::AtomicRMWInst *RMWI =
-      CGF.emitAtomicRMWInst(Op, Ptr, LoadVal1, Order, Scope);
+      CGF.emitAtomicRMWInst(Op, Ptr, LoadVal1, Order, Scope, E);
   RMWI->setVolatile(E->isVolatile());
 
   // For __atomic_*_fetch operations, perform the operation again to
@@ -2048,11 +2049,11 @@ std::pair<RValue, llvm::Value *> CodeGenFunction::EmitAtomicCompareExchange(
 llvm::AtomicRMWInst *
 CodeGenFunction::emitAtomicRMWInst(llvm::AtomicRMWInst::BinOp Op, Address Addr,
                                    llvm::Value *Val, llvm::AtomicOrdering Order,
-                                   llvm::SyncScope::ID SSID) {
-
+                                   llvm::SyncScope::ID SSID,
+                                   const AtomicExpr *AE) {
   llvm::AtomicRMWInst *RMW =
       Builder.CreateAtomicRMW(Op, Addr, Val, Order, SSID);
-  getTargetHooks().setTargetAtomicMetadata(*this, *RMW);
+  getTargetHooks().setTargetAtomicMetadata(*this, *RMW, AE);
   return RMW;
 }
 
diff --git a/clang/lib/CodeGen/CodeGenFunction.h b/clang/lib/CodeGen/CodeGenFunction.h
index 9ba0ed02a564..5f203fe0b128 100644
--- a/clang/lib/CodeGen/CodeGenFunction.h
+++ b/clang/lib/CodeGen/CodeGenFunction.h
@@ -4166,7 +4166,8 @@ public:
   llvm::AtomicRMWInst *emitAtomicRMWInst(
       llvm::AtomicRMWInst::BinOp Op, Address Addr, llvm::Value *Val,
      llvm::AtomicOrdering Order = llvm::AtomicOrdering::SequentiallyConsistent,
-      llvm::SyncScope::ID SSID = llvm::SyncScope::System);
+      llvm::SyncScope::ID SSID = llvm::SyncScope::System,
+      const AtomicExpr *AE = nullptr);
 
   void EmitAtomicUpdate(LValue LVal, llvm::AtomicOrdering AO,
                         const llvm::function_ref<RValue(RValue)> &UpdateOp,
diff --git a/clang/lib/CodeGen/TargetInfo.h b/clang/lib/CodeGen/TargetInfo.h
index 3e503538b2b1..373f8b8a80fd 100644
--- a/clang/lib/CodeGen/TargetInfo.h
+++ b/clang/lib/CodeGen/TargetInfo.h
@@ -336,7 +336,9 @@ public:
 
   /// Allow the target to apply other metadata to an atomic instruction
   virtual void setTargetAtomicMetadata(CodeGenFunction &CGF,
-                                       llvm::AtomicRMWInst &RMW) const {}
+                                       llvm::Instruction &AtomicInst,
+                                       const AtomicExpr *Expr = nullptr) const {
+  }
 
   /// Interface class for filling custom fields of a block literal for OpenCL.
   class TargetOpenCLBlockHelper {
diff --git a/clang/lib/CodeGen/Targets/AMDGPU.cpp b/clang/lib/CodeGen/Targets/AMDGPU.cpp
index b852dcffb295..56ad0503a11a 100644
--- a/clang/lib/CodeGen/Targets/AMDGPU.cpp
+++ b/clang/lib/CodeGen/Targets/AMDGPU.cpp
@@ -9,6 +9,7 @@
 #include "ABIInfoImpl.h"
 #include "TargetInfo.h"
 #include "clang/Basic/TargetOptions.h"
+#include "llvm/Support/AMDGPUAddrSpace.h"
 
 using namespace clang;
 using namespace clang::CodeGen;
@@ -312,7 +313,8 @@ public:
                                          llvm::AtomicOrdering Ordering,
                                          llvm::LLVMContext &Ctx) const override;
   void setTargetAtomicMetadata(CodeGenFunction &CGF,
-                               llvm::AtomicRMWInst &RMW) const override;
+                               llvm::Instruction &AtomicInst,
+                               const AtomicExpr *Expr = nullptr) const override;
   llvm::Value *createEnqueuedBlockKernel(CodeGenFunction &CGF,
                                          llvm::Function *BlockInvokeFunc,
                                          llvm::Type *BlockTy) const override;
@@ -546,19 +548,39 @@ AMDGPUTargetCodeGenInfo::getLLVMSyncScopeID(const LangOptions &LangOpts,
 }
 
 void AMDGPUTargetCodeGenInfo::setTargetAtomicMetadata(
-    CodeGenFunction &CGF, llvm::AtomicRMWInst &RMW) const {
-  if (!CGF.getTarget().allowAMDGPUUnsafeFPAtomics())
+    CodeGenFunction &CGF, llvm::Instruction &AtomicInst,
+    const AtomicExpr *AE) const {
+  auto *RMW = dyn_cast<llvm::AtomicRMWInst>(&AtomicInst);
+  auto *CmpX = dyn_cast<llvm::AtomicCmpXchgInst>(&AtomicInst);
+
+  // OpenCL and old style HIP atomics consider atomics targeting thread private
+  // memory to be undefined.
+ // + // TODO: This is probably undefined for atomic load/store, but there's not + // much direct codegen benefit to knowing this. + if (((RMW && RMW->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS) || + (CmpX && + CmpX->getPointerAddressSpace() == llvm::AMDGPUAS::FLAT_ADDRESS)) && + AE && AE->threadPrivateMemoryAtomicsAreUndefined()) { + llvm::MDBuilder MDHelper(CGF.getLLVMContext()); + llvm::MDNode *ASRange = MDHelper.createRange( + llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS), + llvm::APInt(32, llvm::AMDGPUAS::PRIVATE_ADDRESS + 1)); + AtomicInst.setMetadata(llvm::LLVMContext::MD_noalias_addrspace, ASRange); + } + + if (!RMW || !CGF.getTarget().allowAMDGPUUnsafeFPAtomics()) return; // TODO: Introduce new, more controlled options that also work for integers, // and deprecate allowAMDGPUUnsafeFPAtomics. - llvm::AtomicRMWInst::BinOp RMWOp = RMW.getOperation(); + llvm::AtomicRMWInst::BinOp RMWOp = RMW->getOperation(); if (llvm::AtomicRMWInst::isFPOperation(RMWOp)) { llvm::MDNode *Empty = llvm::MDNode::get(CGF.getLLVMContext(), {}); - RMW.setMetadata("amdgpu.no.fine.grained.memory", Empty); + RMW->setMetadata("amdgpu.no.fine.grained.memory", Empty); - if (RMWOp == llvm::AtomicRMWInst::FAdd && RMW.getType()->isFloatTy()) - RMW.setMetadata("amdgpu.ignore.denormal.mode", Empty); + if (RMWOp == llvm::AtomicRMWInst::FAdd && RMW->getType()->isFloatTy()) + RMW->setMetadata("amdgpu.ignore.denormal.mode", Empty); } } diff --git a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu index 8bf8241e343e..efe75be8488b 100644 --- a/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu +++ b/clang/test/CodeGenCUDA/amdgpu-atomic-ops.cu @@ -26,15 +26,19 @@ __global__ void ffp1(float *p) { // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4{{$}} // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 4{{$}} // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4{{$}} - // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4{{$}} - // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}} + // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE:[0-9]+]]{{$}} + // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE:[0-9]+]], !amdgpu.no.fine.grained.memory !{{[0-9]+}}, !amdgpu.ignore.denormal.mode !{{[0-9]+$}} + // UNSAFEIR: 
atomicrmw fsub ptr {{.*}} monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // SAFE: _Z4ffp1Pf // SAFE: global_atomic_cmpswap @@ -56,6 +60,9 @@ __global__ void ffp1(float *p) { __atomic_fetch_sub(p, 1.0f, memory_order_relaxed); __atomic_fetch_max(p, 1.0f, memory_order_relaxed); __atomic_fetch_min(p, 1.0f, memory_order_relaxed); + + __hip_atomic_fetch_add(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_sub(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); __hip_atomic_fetch_min(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); } @@ -66,15 +73,19 @@ __global__ void ffp2(double *p) { // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}} // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}} // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}} - // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8{{$}} - // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}} + // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // SAFE-LABEL: @_Z4ffp2Pd // SAFE: global_atomic_cmpswap_b64 @@ -95,8 +106,10 @@ __global__ void ffp2(double *p) { __atomic_fetch_sub(p, 1.0, memory_order_relaxed); __atomic_fetch_max(p, 1.0, 
memory_order_relaxed); __atomic_fetch_min(p, 1.0, memory_order_relaxed); - __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); - __hip_atomic_fetch_min(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); + __hip_atomic_fetch_add(p, 1.0, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_sub(p, 1.0, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); + __hip_atomic_fetch_max(p, 1.0, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_min(p, 1.0, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); } // long double is the same as double for amdgcn. @@ -106,15 +119,19 @@ __global__ void ffp3(long double *p) { // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}} // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8{{$}} // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8{{$}} - // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8{{$}} - // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}} + // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // SAFE-LABEL: @_Z4ffp3Pe // SAFE: global_atomic_cmpswap_b64 @@ -132,8 +149,10 @@ __global__ void ffp3(long double *p) { __atomic_fetch_sub(p, 1.0L, memory_order_relaxed); __atomic_fetch_max(p, 1.0L, memory_order_relaxed); __atomic_fetch_min(p, 1.0L, memory_order_relaxed); - __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); - __hip_atomic_fetch_min(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); + __hip_atomic_fetch_add(p, 1.0L, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_sub(p, 1.0L, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); + 
__hip_atomic_fetch_max(p, 1.0L, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_min(p, 1.0L, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); } __device__ double ffp4(double *p, float f) { @@ -141,7 +160,11 @@ __device__ double ffp4(double *p, float f) { // CHECK: fpext float {{.*}} to double // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}} // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - return __atomic_fetch_sub(p, f, memory_order_relaxed); + + // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + __atomic_fetch_sub(p, f, memory_order_relaxed); + return __hip_atomic_fetch_sub(p, f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); } __device__ double ffp5(double *p, int i) { @@ -149,7 +172,11 @@ __device__ double ffp5(double *p, int i) { // CHECK: sitofp i32 {{.*}} to double // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8{{$}} // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - return __atomic_fetch_sub(p, i, memory_order_relaxed); + __atomic_fetch_sub(p, i, memory_order_relaxed); + + // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 8, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + return __hip_atomic_fetch_sub(p, i, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); } __global__ void ffp6(_Float16 *p) { @@ -158,15 +185,19 @@ __global__ void ffp6(_Float16 *p) { // SAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2{{$}} // SAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 2{{$}} // SAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2{{$}} - // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2{{$}} - // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2{{$}} + // SAFEIR: atomicrmw fadd ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fsub ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} + // SAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmax ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // UNSAFEIR: atomicrmw fmin ptr {{.*}} monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} - // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fadd ptr {{.*}} monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fsub ptr {{.*}} monotonic, align 2, !noalias.addrspace 
![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmax ptr {{.*}} syncscope("agent-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} + // UNSAFEIR: atomicrmw fmin ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 2, !noalias.addrspace ![[$NO_PRIVATE]], !amdgpu.no.fine.grained.memory !{{[0-9]+$}} // SAFE: _Z4ffp6PDF16 // SAFE: global_atomic_cmpswap @@ -187,6 +218,25 @@ __global__ void ffp6(_Float16 *p) { __atomic_fetch_sub(p, 1.0, memory_order_relaxed); __atomic_fetch_max(p, 1.0, memory_order_relaxed); __atomic_fetch_min(p, 1.0, memory_order_relaxed); + + __hip_atomic_fetch_add(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); + __hip_atomic_fetch_sub(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); __hip_atomic_fetch_max(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_AGENT); __hip_atomic_fetch_min(p, 1.0f, memory_order_relaxed, __HIP_MEMORY_SCOPE_WORKGROUP); } + +// CHECK-LABEL: @_Z12test_cmpxchgPiii +// CHECK: cmpxchg ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} acquire acquire, align 4{{$}} +// CHECK: cmpxchg weak ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} acquire acquire, align 4{{$}} +// CHECK: cmpxchg ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} syncscope("workgroup-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} +// CHECK: cmpxchg weak ptr %{{.+}}, i32 %{{.+}}, i32 %{{.+}} syncscope("workgroup-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NO_PRIVATE]]{{$}} +__device__ int test_cmpxchg(int *ptr, int cmp, int desired) { + bool flag = __atomic_compare_exchange(ptr, &cmp, &desired, 0, memory_order_acquire, memory_order_acquire); + flag = __atomic_compare_exchange_n(ptr, &cmp, desired, 1, memory_order_acquire, memory_order_acquire); + flag = __hip_atomic_compare_exchange_strong(ptr, &cmp, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + flag = __hip_atomic_compare_exchange_weak(ptr, &cmp, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); + return flag; +} + +// SAFEIR: ![[$NO_PRIVATE]] = !{i32 5, i32 6} +// UNSAFEIR: ![[$NO_PRIVATE]] = !{i32 5, i32 6} diff --git a/clang/test/CodeGenCUDA/atomic-ops.cu b/clang/test/CodeGenCUDA/atomic-ops.cu index fbc042caa809..1accd1712bec 100644 --- a/clang/test/CodeGenCUDA/atomic-ops.cu +++ b/clang/test/CodeGenCUDA/atomic-ops.cu @@ -1,19 +1,19 @@ -// RUN: %clang_cc1 -x hip -std=c++11 -triple amdgcn -fcuda-is-device -emit-llvm %s -o - | FileCheck %s +// RUN: %clang_cc1 -x hip -std=c++11 -triple amdgcn -fcuda-is-device -emit-llvm %s -o - | FileCheck -enable-var-scope %s #include "Inputs/cuda.h" // CHECK-LABEL: @_Z24atomic32_op_singlethreadPiii -// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 4 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw min ptr 
{{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: load atomic i32, ptr {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4 -// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 4 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK:[0-9]+]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: load atomic i32, ptr {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4{{$}} +// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 4{{$}} __device__ int atomic32_op_singlethread(int *ptr, int val, int desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); @@ -31,8 +31,8 @@ __device__ int atomic32_op_singlethread(int *ptr, int val, int desired) { } // CHECK-LABEL: @_Z25atomicu32_op_singlethreadPjjj -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} __device__ unsigned int atomicu32_op_singlethread(unsigned int *ptr, unsigned int val, unsigned int desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, 
__HIP_MEMORY_SCOPE_SINGLETHREAD); @@ -40,18 +40,18 @@ __device__ unsigned int atomicu32_op_singlethread(unsigned int *ptr, unsigned in } // CHECK-LABEL: @_Z21atomic32_op_wavefrontPiii -// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 4 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: load atomic i32, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4 -// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 4 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: load atomic i32, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4{{$}} +// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 4{{$}} __device__ int atomic32_op_wavefront(int *ptr, int val, int desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); @@ -69,8 +69,8 @@ __device__ int 
atomic32_op_wavefront(int *ptr, int val, int desired) { } // CHECK-LABEL: @_Z22atomicu32_op_wavefrontPjjj -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} __device__ unsigned int atomicu32_op_wavefront(unsigned int *ptr, unsigned int val, unsigned int desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); @@ -78,17 +78,17 @@ __device__ unsigned int atomicu32_op_wavefront(unsigned int *ptr, unsigned int v } // CHECK-LABEL: @_Z21atomic32_op_workgroupPiii -// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 4 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 4 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace 
![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}} __device__ int atomic32_op_workgroup(int *ptr, int val, int desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); @@ -105,8 +105,8 @@ __device__ int atomic32_op_workgroup(int *ptr, int val, int desired) { } // CHECK-LABEL: @_Z22atomicu32_op_workgroupPjjj -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} __device__ unsigned int atomicu32_op_workgroup(unsigned int *ptr, unsigned int val, unsigned int desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); @@ -114,17 +114,17 @@ __device__ unsigned int atomicu32_op_workgroup(unsigned int *ptr, unsigned int v } // CHECK-LABEL: @_Z17atomic32_op_agentPiii -// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 4 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 4 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: 
atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 4{{$}} __device__ int atomic32_op_agent(int *ptr, int val, int desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -141,8 +141,8 @@ __device__ int atomic32_op_agent(int *ptr, int val, int desired) { } // CHECK-LABEL: @_Z18atomicu32_op_agentPjjj -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} __device__ unsigned int atomicu32_op_agent(unsigned int *ptr, unsigned int val, unsigned int desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -150,18 +150,18 @@ __device__ unsigned int atomicu32_op_agent(unsigned int *ptr, unsigned int val, } // CHECK-LABEL: @_Z18atomic32_op_systemPiii -// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 4 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: load i32, ptr %{{.*}}, align 4 -// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 4 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i32 {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 4, !noalias.addrspace 
![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: load i32, ptr %{{.*}}, align 4{{$}} +// CHECK: store atomic i32 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 4{{$}} __device__ int atomic32_op_system(int *ptr, int val, int desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); @@ -179,8 +179,8 @@ __device__ int atomic32_op_system(int *ptr, int val, int desired) { } // CHECK-LABEL: @_Z19atomicu32_op_systemPjjj -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i32 {{%[0-9]+}} syncscope("one-as") monotonic, align 4, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} __device__ unsigned int atomicu32_op_system(unsigned int *ptr, unsigned int val, unsigned int desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); @@ -188,17 +188,17 @@ __device__ unsigned int atomicu32_op_system(unsigned int *ptr, unsigned int val, } // CHECK-LABEL: @_Z24atomic64_op_singlethreadPxS_xx -// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 8 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw 
min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8{{$}} __device__ long long atomic64_op_singlethread(long long *ptr, long long *ptr2, long long val, long long desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); @@ -215,10 +215,10 @@ __device__ long long atomic64_op_singlethread(long long *ptr, long long *ptr2, l } // CHECK-LABEL: @_Z25atomicu64_op_singlethreadPyS_yy -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") -// CHECK: load atomic i64, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8 -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8 +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("singlethread-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: load atomic i64, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("singlethread-one-as") monotonic, align 8{{$}} __device__ unsigned long long 
atomicu64_op_singlethread(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SINGLETHREAD); @@ -228,18 +228,18 @@ __device__ unsigned long long atomicu64_op_singlethread(unsigned long long *ptr, } // CHECK-LABEL: @_Z21atomic64_op_wavefrontPxS_xx -// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 8 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: load atomic i64, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8 -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 8 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: load atomic i64, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 8{{$}} __device__ long long atomic64_op_wavefront(long long *ptr, long long *ptr2, long long val, 
long long desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); @@ -257,10 +257,10 @@ __device__ long long atomic64_op_wavefront(long long *ptr, long long *ptr2, long } // CHECK-LABEL: @_Z22atomicu64_op_wavefrontPyS_yy -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") -// CHECK: load atomic i64, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8 -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 8 +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: load atomic i64, ptr {{%[0-9]+}} syncscope("wavefront-one-as") monotonic, align 8{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 8{{$}} __device__ unsigned long long atomicu64_op_wavefront(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WAVEFRONT); @@ -270,17 +270,17 @@ __device__ unsigned long long atomicu64_op_wavefront(unsigned long long *ptr, un } // CHECK-LABEL: @_Z21atomic64_op_workgroupPxS_xx -// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 8 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 8 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} 
+// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}} __device__ long long atomic64_op_workgroup(long long *ptr, long long *ptr2, long long val, long long desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); @@ -297,9 +297,9 @@ __device__ long long atomic64_op_workgroup(long long *ptr, long long *ptr2, long } // CHECK-LABEL: @_Z22atomicu64_op_workgroupPyS_yy -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 8 +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}} __device__ unsigned long long atomicu64_op_workgroup(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_WORKGROUP); @@ -308,17 +308,17 @@ __device__ unsigned long long atomicu64_op_workgroup(unsigned long long *ptr, un } // CHECK-LABEL: @_Z17atomic64_op_agentPxS_xx -// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 8 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw min ptr 
{{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 8 +// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 8{{$}} __device__ long long atomic64_op_agent(long long *ptr, long long *ptr2, long long val, long long desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -335,9 +335,9 @@ __device__ long long atomic64_op_agent(long long *ptr, long long *ptr2, long lon } // CHECK-LABEL: @_Z18atomicu64_op_agentPyS_yy -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 8 +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("agent-one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 8{{$}} __device__ unsigned long long atomicu64_op_agent(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_AGENT); @@ -346,18 +346,18 @@ __device__ unsigned long long 
atomicu64_op_agent(unsigned long long *ptr, unsign } // CHECK-LABEL: @_Z18atomic64_op_systemPxS_xx -// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 8 -// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") +// CHECK: cmpxchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: cmpxchg weak ptr {{%[0-9]+}}, i64 {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xchg ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw add ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw sub ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw and ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw or ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw xor ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw min ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} +// CHECK: atomicrmw max ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} // CHECK: load i64, ptr %{{.*}}, align 8 -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 8 +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 8{{$}} __device__ long long atomic64_op_system(long long *ptr, long long *ptr2, long long val, long long desired) { bool flag = __hip_atomic_compare_exchange_strong(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); flag = __hip_atomic_compare_exchange_weak(ptr, &val, desired, __ATOMIC_RELAXED, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); @@ -375,10 +375,10 @@ __device__ long long atomic64_op_system(long long *ptr, long long *ptr2, long lo } // CHECK-LABEL: @_Z19atomicu64_op_systemPyS_yy -// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") -// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") +// CHECK: atomicrmw umin ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} 
+// CHECK: atomicrmw umax ptr {{%[0-9]+}}, i64 {{%[0-9]+}} syncscope("one-as") monotonic, align 8, !noalias.addrspace ![[$NOALIAS_ADDRSPACE_STACK]]{{$}} // CHECK: load i64, ptr %{{.*}}, align 8 -// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 8 +// CHECK: store atomic i64 %{{.*}}, ptr %{{.*}} syncscope("one-as") monotonic, align 8{{$}} __device__ unsigned long long atomicu64_op_system(unsigned long long *ptr, unsigned long long *ptr2, unsigned long long val, unsigned long long desired) { val = __hip_atomic_fetch_min(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); val = __hip_atomic_fetch_max(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); @@ -386,3 +386,5 @@ __device__ unsigned long long atomicu64_op_system(unsigned long long *ptr, unsig __hip_atomic_store(ptr, val, __ATOMIC_RELAXED, __HIP_MEMORY_SCOPE_SYSTEM); return val; } + +// [[$NOALIAS_ADDRSPACE_STACK]] = !{i32 5, i32 6} diff --git a/clang/test/CodeGenOpenCL/atomic-ops.cl b/clang/test/CodeGenOpenCL/atomic-ops.cl index 5e2de38ac3d3..9c1775cc0430 100644 --- a/clang/test/CodeGenOpenCL/atomic-ops.cl +++ b/clang/test/CodeGenOpenCL/atomic-ops.cl @@ -37,58 +37,58 @@ atomic_int j; void fi1(atomic_int *i) { // CHECK-LABEL: @fi1 - // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4{{$}} int x = __opencl_atomic_load(i, memory_order_seq_cst, memory_scope_work_group); - // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} syncscope("agent") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} syncscope("agent") seq_cst, align 4{{$}} x = __opencl_atomic_load(i, memory_order_seq_cst, memory_scope_device); - // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} seq_cst, align 4{{$}} x = __opencl_atomic_load(i, memory_order_seq_cst, memory_scope_all_svm_devices); - // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} syncscope("wavefront") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{[.0-9A-Z_a-z]+}} syncscope("wavefront") seq_cst, align 4{{$}} x = __opencl_atomic_load(i, memory_order_seq_cst, memory_scope_sub_group); } void fi2(atomic_int *i) { // CHECK-LABEL: @fi2 - // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4{{$}} __opencl_atomic_store(i, 1, memory_order_seq_cst, memory_scope_work_group); } void test_addr(global atomic_int *ig, private atomic_int *ip, local atomic_int *il) { // CHECK-LABEL: @test_addr - // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr addrspace(1) %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr addrspace(1) %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4{{$}} __opencl_atomic_store(ig, 1, memory_order_seq_cst, memory_scope_work_group); - // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr addrspace(5) %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr addrspace(5) %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4{{$}} __opencl_atomic_store(ip, 1, memory_order_seq_cst, memory_scope_work_group); - // CHECK: store atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr addrspace(3) %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: store 
atomic i32 %{{[.0-9A-Z_a-z]+}}, ptr addrspace(3) %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4{{$}} __opencl_atomic_store(il, 1, memory_order_seq_cst, memory_scope_work_group); } void fi3(atomic_int *i, atomic_uint *ui) { // CHECK-LABEL: @fi3 - // CHECK: atomicrmw and ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: atomicrmw and ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE:![0-9]+]]{{$}} int x = __opencl_atomic_fetch_and(i, 1, memory_order_seq_cst, memory_scope_work_group); - // CHECK: atomicrmw min ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: atomicrmw min ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} x = __opencl_atomic_fetch_min(i, 1, memory_order_seq_cst, memory_scope_work_group); - // CHECK: atomicrmw max ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: atomicrmw max ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} x = __opencl_atomic_fetch_max(i, 1, memory_order_seq_cst, memory_scope_work_group); - // CHECK: atomicrmw umin ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: atomicrmw umin ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} x = __opencl_atomic_fetch_min(ui, 1, memory_order_seq_cst, memory_scope_work_group); - // CHECK: atomicrmw umax ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4 + // CHECK: atomicrmw umax ptr %{{[.0-9A-Z_a-z]+}}, i32 %{{[.0-9A-Z_a-z]+}} syncscope("workgroup") seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} x = __opencl_atomic_fetch_max(ui, 1, memory_order_seq_cst, memory_scope_work_group); } bool fi4(atomic_int *i) { // CHECK-LABEL: @fi4( - // CHECK: [[PAIR:%[.0-9A-Z_a-z]+]] = cmpxchg ptr [[PTR:%[.0-9A-Z_a-z]+]], i32 [[EXPECTED:%[.0-9A-Z_a-z]+]], i32 [[DESIRED:%[.0-9A-Z_a-z]+]] syncscope("workgroup-one-as") acquire acquire, align 4 + // CHECK: [[PAIR:%[.0-9A-Z_a-z]+]] = cmpxchg ptr [[PTR:%[.0-9A-Z_a-z]+]], i32 [[EXPECTED:%[.0-9A-Z_a-z]+]], i32 [[DESIRED:%[.0-9A-Z_a-z]+]] syncscope("workgroup-one-as") acquire acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: [[OLD:%[.0-9A-Z_a-z]+]] = extractvalue { i32, i1 } [[PAIR]], 0 // CHECK: [[CMP:%[.0-9A-Z_a-z]+]] = extractvalue { i32, i1 } [[PAIR]], 1 // CHECK: br i1 [[CMP]], label %[[STORE_EXPECTED:[.0-9A-Z_a-z]+]], label %[[CONTINUE:[.0-9A-Z_a-z]+]] @@ -105,16 +105,16 @@ void fi5(atomic_int *i, int scope) { // CHECK-NEXT: i32 4, label %[[opencl_subgroup:.*]] // CHECK-NEXT: ] // CHECK: [[opencl_workgroup]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup") seq_cst, align 4{{$}} // CHECK: br label %[[continue:.*]] // CHECK: [[opencl_device]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent") seq_cst, align 4{{$}} // CHECK: br label %[[continue]] // CHECK: [[opencl_allsvmdevices]]: // CHECK: load atomic i32, ptr %{{.*}} seq_cst, align 4 // CHECK: br label %[[continue]] // CHECK: [[opencl_subgroup]]: - // CHECK: load atomic i32, ptr 
%{{.*}} syncscope("wavefront") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront") seq_cst, align 4{{$}} // CHECK: br label %[[continue]] // CHECK: [[continue]]: int x = __opencl_atomic_load(i, memory_order_seq_cst, scope); @@ -146,35 +146,35 @@ void fi6(atomic_int *i, int order, int scope) { // CHECK-NEXT: i32 4, label %[[SEQ_SUB:.*]] // CHECK-NEXT: ] // CHECK: [[MON_WG]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}} // CHECK: [[MON_DEV]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent-one-as") monotonic, align 4{{$}} // CHECK: [[MON_ALL]]: - // CHECK: load atomic i32, ptr %{{.*}} monotonic, align 4 + // CHECK: load atomic i32, ptr %{{.*}} monotonic, align 4{{$}} // CHECK: [[MON_SUB]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront-one-as") monotonic, align 4{{$}} // CHECK: [[ACQ_WG]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup-one-as") acquire, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup-one-as") acquire, align 4{{$}} // CHECK: [[ACQ_DEV]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent-one-as") acquire, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent-one-as") acquire, align 4{{$}} // CHECK: [[ACQ_ALL]]: - // CHECK: load atomic i32, ptr %{{.*}} acquire, align 4 + // CHECK: load atomic i32, ptr %{{.*}} acquire, align 4{{$}} // CHECK: [[ACQ_SUB]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront-one-as") acquire, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront-one-as") acquire, align 4{{$}} // CHECK: [[SEQ_WG]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("workgroup") seq_cst, align 4{{$}} // CHECK: [[SEQ_DEV]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("agent") seq_cst, align 4{{$}} // CHECK: [[SEQ_ALL]]: - // CHECK: load atomic i32, ptr %{{.*}} seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{.*}} seq_cst, align 4{{$}} // CHECK: [[SEQ_SUB]]: - // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront") seq_cst, align 4 + // CHECK: load atomic i32, ptr %{{.*}} syncscope("wavefront") seq_cst, align 4{{$}} int x = __opencl_atomic_load(i, order, scope); } float ff1(global atomic_float *d) { // CHECK-LABEL: @ff1 - // CHECK: load atomic i32, ptr addrspace(1) {{.*}} syncscope("workgroup-one-as") monotonic, align 4 + // CHECK: load atomic i32, ptr addrspace(1) {{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}} return __opencl_atomic_load(d, memory_order_relaxed, memory_scope_work_group); } @@ -186,19 +186,31 @@ void ff2(atomic_float *d) { float ff3(atomic_float *d) { // CHECK-LABEL: @ff3 - // CHECK: atomicrmw xchg ptr {{.*}} syncscope("workgroup") seq_cst, align 4 + // CHECK: atomicrmw xchg ptr {{.*}} syncscope("workgroup") seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} return __opencl_atomic_exchange(d, 2, memory_order_seq_cst, memory_scope_work_group); } float ff4(global atomic_float *d, float a) { // CHECK-LABEL: @ff4 - // CHECK: atomicrmw fadd ptr addrspace(1) {{.*}} syncscope("workgroup-one-as") monotonic + // CHECK: 
atomicrmw fadd ptr addrspace(1) {{.*}} syncscope("workgroup-one-as") monotonic, align 4{{$}} return __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group); } float ff5(global atomic_double *d, double a) { // CHECK-LABEL: @ff5 - // CHECK: atomicrmw fadd ptr addrspace(1) {{.*}} syncscope("workgroup-one-as") monotonic + // CHECK: atomicrmw fadd ptr addrspace(1) {{.*}} syncscope("workgroup-one-as") monotonic, align 8{{$}} + return __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group); +} + +float ff4_generic(atomic_float *d, float a) { + // CHECK-LABEL: @ff4_generic + // CHECK: atomicrmw fadd ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} + return __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group); +} + +float ff5_generic(atomic_double *d, double a) { + // CHECK-LABEL: @ff5_generic + // CHECK: atomicrmw fadd ptr {{.*}} syncscope("workgroup-one-as") monotonic, align 8, !noalias.addrspace [[$NOPRIVATE]]{{$}} return __opencl_atomic_fetch_add(d, a, memory_order_relaxed, memory_scope_work_group); } @@ -215,10 +227,10 @@ void atomic_init_foo() // CHECK-LABEL: @failureOrder void failureOrder(atomic_int *ptr, int *ptr2) { - // CHECK: cmpxchg ptr {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z_.]+}} syncscope("workgroup-one-as") acquire monotonic, align 4 + // CHECK: cmpxchg ptr {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z_.]+}} syncscope("workgroup-one-as") acquire monotonic, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} __opencl_atomic_compare_exchange_strong(ptr, ptr2, 43, memory_order_acquire, memory_order_relaxed, memory_scope_work_group); - // CHECK: cmpxchg weak ptr {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z_.]+}} syncscope("workgroup") seq_cst acquire, align 4 + // CHECK: cmpxchg weak ptr {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z._]+}}, i32 {{%[0-9A-Za-z_.]+}} syncscope("workgroup") seq_cst acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} __opencl_atomic_compare_exchange_weak(ptr, ptr2, 43, memory_order_seq_cst, memory_order_acquire, memory_scope_work_group); } @@ -268,63 +280,63 @@ void generalFailureOrder(atomic_int *ptr, int *ptr2, int success, int fail) { // CHECK-NEXT: ] // CHECK: [[MONOTONIC_MONOTONIC]] - // CHECK: cmpxchg {{.*}} monotonic monotonic, align 4 + // CHECK: cmpxchg {{.*}} monotonic monotonic, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[MONOTONIC_ACQUIRE]] - // CHECK: cmpxchg {{.*}} monotonic acquire, align 4 + // CHECK: cmpxchg {{.*}} monotonic acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[MONOTONIC_SEQCST]] - // CHECK: cmpxchg {{.*}} monotonic seq_cst, align 4 + // CHECK: cmpxchg {{.*}} monotonic seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[ACQUIRE_MONOTONIC]] - // CHECK: cmpxchg {{.*}} acquire monotonic, align 4 + // CHECK: cmpxchg {{.*}} acquire monotonic, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[ACQUIRE_ACQUIRE]] - // CHECK: cmpxchg {{.*}} acquire acquire, align 4 + // CHECK: cmpxchg {{.*}} acquire acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[ACQUIRE_SEQCST]] - // CHECK: cmpxchg {{.*}} acquire seq_cst, align 4 + // CHECK: cmpxchg {{.*}} acquire seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[RELEASE_MONOTONIC]] - // CHECK: cmpxchg {{.*}} release monotonic, align 4 + // CHECK: 
cmpxchg {{.*}} release monotonic, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[RELEASE_ACQUIRE]] - // CHECK: cmpxchg {{.*}} release acquire, align 4 + // CHECK: cmpxchg {{.*}} release acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[RELEASE_SEQCST]] - // CHECK: cmpxchg {{.*}} release seq_cst, align 4 + // CHECK: cmpxchg {{.*}} release seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[ACQREL_MONOTONIC]] - // CHECK: cmpxchg {{.*}} acq_rel monotonic, align 4 + // CHECK: cmpxchg {{.*}} acq_rel monotonic, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[ACQREL_ACQUIRE]] - // CHECK: cmpxchg {{.*}} acq_rel acquire, align 4 + // CHECK: cmpxchg {{.*}} acq_rel acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[ACQREL_SEQCST]] - // CHECK: cmpxchg {{.*}} acq_rel seq_cst, align 4 + // CHECK: cmpxchg {{.*}} acq_rel seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[SEQCST_MONOTONIC]] - // CHECK: cmpxchg {{.*}} seq_cst monotonic, align 4 + // CHECK: cmpxchg {{.*}} seq_cst monotonic, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[SEQCST_ACQUIRE]] - // CHECK: cmpxchg {{.*}} seq_cst acquire, align 4 + // CHECK: cmpxchg {{.*}} seq_cst acquire, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br // CHECK: [[SEQCST_SEQCST]] - // CHECK: cmpxchg {{.*}} seq_cst seq_cst, align 4 + // CHECK: cmpxchg {{.*}} seq_cst seq_cst, align 4, !noalias.addrspace [[$NOPRIVATE]]{{$}} // CHECK: br } @@ -334,7 +346,7 @@ int test_volatile(volatile atomic_int *i) { // CHECK-NEXT: %[[atomicdst:.*]] = alloca i32 // CHECK-NEXT: store ptr %i, ptr addrspace(5) %[[i_addr]] // CHECK-NEXT: %[[addr:.*]] = load ptr, ptr addrspace(5) %[[i_addr]] - // CHECK-NEXT: %[[res:.*]] = load atomic volatile i32, ptr %[[addr]] syncscope("workgroup") seq_cst, align 4 + // CHECK-NEXT: %[[res:.*]] = load atomic volatile i32, ptr %[[addr]] syncscope("workgroup") seq_cst, align 4{{$}} // CHECK-NEXT: store i32 %[[res]], ptr addrspace(5) %[[atomicdst]] // CHECK-NEXT: %[[retval:.*]] = load i32, ptr addrspace(5) %[[atomicdst]] // CHECK-NEXT: ret i32 %[[retval]] @@ -342,3 +354,5 @@ int test_volatile(volatile atomic_int *i) { } #endif + +// CHECK: [[$NOPRIVATE]] = !{i32 5, i32 6} -- GitLab From 3f17da1f45dfcafebff1ef7fba031eae86ce1720 Mon Sep 17 00:00:00 2001 From: Simon Pilgrim Date: Thu, 17 Oct 2024 14:18:39 +0100 Subject: [PATCH 252/329] [X86] Regenerate test checks with vpternlog comments --- llvm/test/CodeGen/X86/combine-or-shuffle.ll | 4 ++-- llvm/test/CodeGen/X86/psubus.ll | 6 +++--- llvm/test/CodeGen/X86/sat-add.ll | 24 ++++++++++----------- 3 files changed, 17 insertions(+), 17 deletions(-) diff --git a/llvm/test/CodeGen/X86/combine-or-shuffle.ll b/llvm/test/CodeGen/X86/combine-or-shuffle.ll index 175d21a4f706..55b1cdeddb85 100644 --- a/llvm/test/CodeGen/X86/combine-or-shuffle.ll +++ b/llvm/test/CodeGen/X86/combine-or-shuffle.ll @@ -808,7 +808,7 @@ define <2 x i64> @or_and_v2i64(<2 x i64> %a0) { ; AVX512-LABEL: or_and_v2i64: ; AVX512: # %bb.0: ; AVX512-NEXT: vpbroadcastq {{.*#+}} xmm1 = [7,7] -; AVX512-NEXT: vpternlogq $200, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to2}, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm1 & (xmm0 | mem) ; AVX512-NEXT: retq %1 = and <2 x i64> %a0, %2 = or <2 x i64> %1, @@ -837,7 +837,7 @@ define <4 x i32> @or_and_v4i32(<4 x i32> %a0) { ; AVX512-LABEL: or_and_v4i32: ; AVX512: # %bb.0: ; AVX512-NEXT: vpmovsxbd 
{{.*#+}} xmm1 = [3,3,15,7] -; AVX512-NEXT: vpternlogd $200, {{\.?LCPI[0-9]+_[0-9]+}}(%rip), %xmm1, %xmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 | mem) ; AVX512-NEXT: retq %1 = and <4 x i32> %a0, %2 = or <4 x i32> %1, diff --git a/llvm/test/CodeGen/X86/psubus.ll b/llvm/test/CodeGen/X86/psubus.ll index 4d220c43dc47..be8adf697d5c 100644 --- a/llvm/test/CodeGen/X86/psubus.ll +++ b/llvm/test/CodeGen/X86/psubus.ll @@ -146,7 +146,7 @@ define <4 x i32> @ashr_xor_and_custom(<4 x i32> %x) nounwind { ; AVX512-LABEL: ashr_xor_and_custom: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX512-NEXT: vpternlogd $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem) ; AVX512-NEXT: retq %signsplat = ashr <4 x i32> %x, %flipsign = xor <4 x i32> %x, @@ -187,7 +187,7 @@ define <4 x i32> @ashr_add_and_custom(<4 x i32> %x) nounwind { ; AVX512-LABEL: ashr_add_and_custom: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX512-NEXT: vpternlogd $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem) ; AVX512-NEXT: retq %signsplat = ashr <4 x i32> %x, %flipsign = add <4 x i32> %x, @@ -230,7 +230,7 @@ define <4 x i32> @usubsat_custom(<4 x i32> %x) nounwind { ; AVX512-LABEL: usubsat_custom: ; AVX512: # %bb.0: ; AVX512-NEXT: vpsrad $31, %xmm0, %xmm1 -; AVX512-NEXT: vpternlogd $72, {{\.?LCPI[0-9]+_[0-9]+}}(%rip){1to4}, %xmm1, %xmm0 +; AVX512-NEXT: vpternlogd {{.*#+}} xmm0 = xmm1 & (xmm0 ^ mem) ; AVX512-NEXT: retq %res = call <4 x i32> @llvm.usub.sat.v4i32(<4 x i32> %x, <4 x i32> ) ret <4 x i32> %res diff --git a/llvm/test/CodeGen/X86/sat-add.ll b/llvm/test/CodeGen/X86/sat-add.ll index f78b57d895ee..949902a5ebc4 100644 --- a/llvm/test/CodeGen/X86/sat-add.ll +++ b/llvm/test/CodeGen/X86/sat-add.ll @@ -820,7 +820,7 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_min(<16 x i8> %x, <16 x i8> ; AVX512-LABEL: unsigned_sat_variable_v16i8_using_min: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512-NEXT: vpminub %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -875,10 +875,10 @@ define <16 x i8> @unsigned_sat_variable_v16i8_using_cmp_notval(<16 x i8> %x, <16 ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpaddb %xmm1, %xmm0, %xmm3 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = ~xmm1 ; AVX512-NEXT: vpminub %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqb %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm3, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm3 | (xmm0 ^ xmm2) ; AVX512-NEXT: retq %noty = xor <16 x i8> %y, %a = add <16 x i8> %x, %y @@ -917,7 +917,7 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_min(<8 x i16> %x, <8 x i16> ; AVX512-LABEL: unsigned_sat_variable_v8i16_using_min: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512-NEXT: vpminuw %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -982,10 +982,10 @@ define <8 x i16> @unsigned_sat_variable_v8i16_using_cmp_notval(<8 x i16> %x, <8 ; AVX512: # %bb.0: ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpaddw %xmm1, %xmm0, %xmm3 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm1 +; 
AVX512-NEXT: vpternlogq {{.*#+}} xmm1 = ~xmm1 ; AVX512-NEXT: vpminuw %xmm1, %xmm0, %xmm1 ; AVX512-NEXT: vpcmpeqw %xmm1, %xmm0, %xmm0 -; AVX512-NEXT: vpternlogq $222, %xmm2, %xmm3, %xmm0 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm0 = xmm3 | (xmm0 ^ xmm2) ; AVX512-NEXT: retq %noty = xor <8 x i16> %y, %a = add <8 x i16> %x, %y @@ -1029,7 +1029,7 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_min(<4 x i32> %x, <4 x i32> ; AVX512-LABEL: unsigned_sat_variable_v4i32_using_min: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -1070,7 +1070,7 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_sum(<4 x i32> %x, <4 x i ; AVX512-LABEL: unsigned_sat_variable_v4i32_using_cmp_sum: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512-NEXT: vpminud %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -1117,7 +1117,7 @@ define <4 x i32> @unsigned_sat_variable_v4i32_using_cmp_notval(<4 x i32> %x, <4 ; AVX512-LABEL: unsigned_sat_variable_v4i32_using_cmp_notval: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm3 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm3 = ~xmm3 ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpcmpleud %xmm3, %xmm0, %k1 ; AVX512-NEXT: vpaddd %xmm1, %xmm0, %xmm2 {%k1} @@ -1202,7 +1202,7 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_min(<2 x i64> %x, <2 x i64> ; AVX512-LABEL: unsigned_sat_variable_v2i64_using_min: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512-NEXT: vpminuq %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -1272,7 +1272,7 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_sum(<2 x i64> %x, <2 x i ; AVX512-LABEL: unsigned_sat_variable_v2i64_using_cmp_sum: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm2 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm2 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm2 = ~xmm2 ; AVX512-NEXT: vpminuq %xmm2, %xmm0, %xmm0 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm0 ; AVX512-NEXT: retq @@ -1339,7 +1339,7 @@ define <2 x i64> @unsigned_sat_variable_v2i64_using_cmp_notval(<2 x i64> %x, <2 ; AVX512-LABEL: unsigned_sat_variable_v2i64_using_cmp_notval: ; AVX512: # %bb.0: ; AVX512-NEXT: vmovdqa %xmm1, %xmm3 -; AVX512-NEXT: vpternlogq $15, %xmm1, %xmm1, %xmm3 +; AVX512-NEXT: vpternlogq {{.*#+}} xmm3 = ~xmm3 ; AVX512-NEXT: vpcmpeqd %xmm2, %xmm2, %xmm2 ; AVX512-NEXT: vpcmpleuq %xmm3, %xmm0, %k1 ; AVX512-NEXT: vpaddq %xmm1, %xmm0, %xmm2 {%k1} -- GitLab From f9d07890640434a4be0e7f651dd295478598b36d Mon Sep 17 00:00:00 2001 From: Qiongsi Wu <274595+qiongsiwu@users.noreply.github.com> Date: Thu, 17 Oct 2024 09:32:10 -0400 Subject: [PATCH 253/329] [PGO] Initialize GCOV Writeout and Reset Functions in the Runtime on AIX (#108570) This PR registers the writeout and reset functions for `gcov` for all modules in the PGO runtime, instead of registering them using global constructors in each module. The change is made for AIX only, but the same mechanism works on Linux on Power. 
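For orientation, here is a condensed sketch of the mechanism, simplified from the actual `GCDAProfiling.c` and `InstrProfData.inc` changes below. It is illustrative only: the real struct members are `IntPtrT` values that the runtime casts to function pointers, and the constructor name `gcov_initialize_sketch` is a placeholder for the patch's `__llvm_profile_gcov_initialize`.

```c
/* Condensed sketch only -- see the diff below for the real code. Each
 * instrumented module emits one { writeout, reset } entry into the
 * __llvm_covinit section; the linker concatenates these entries into a
 * single section in the final binary. */
typedef void (*fn_ptr)(void);

typedef struct {
  fn_ptr WriteoutFunction; /* IntPtrT in the actual InstrProfData.inc */
  fn_ptr ResetFunction;
} __llvm_gcov_init_func_struct;

/* Provided by the profile runtime: bounds of the aggregated section,
 * and the existing registration hook for a writeout/reset pair. */
const __llvm_gcov_init_func_struct *__llvm_profile_begin_covinit(void);
const __llvm_gcov_init_func_struct *__llvm_profile_end_covinit(void);
void llvm_gcov_init(fn_ptr wfn, fn_ptr rfn);

/* Runtime constructor: walk the aggregated section and register every
 * module's writeout/reset pair. */
__attribute__((constructor)) static void gcov_initialize_sketch(void) {
  for (const __llvm_gcov_init_func_struct *P = __llvm_profile_begin_covinit();
       P != __llvm_profile_end_covinit(); ++P)
    if (P->WriteoutFunction && P->ResetFunction)
      llvm_gcov_init(P->WriteoutFunction, P->ResetFunction);
}
```

After layout, this loop visits exactly one entry per instrumented module — the same pairs that the per-module global constructors used to register one by one.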
When registering such functions using global constructors in each module without `-ffunction-sections`, the AIX linker cannot garbage-collect unused undefined symbols, because such symbols are grouped in the same section as the `__sinit` symbol. Keeping such undefined symbols causes link errors (see the test case https://github.com/llvm/llvm-project/pull/108570/files#diff-500a7e1ba871e1b6b61b523700d5e30987900002add306e1b5e4972cf6d5a4f1R1 for this scenario). This PR implements the initialization in the runtime instead, avoiding the introduction of `__sinit` into each module.

The implementation adds a new global variable, `__llvm_covinit_functions`, to each module. This global holds the function pointers to the `Writeout` and `Reset` functions and is placed in the named section `__llvm_covinit`. The linker aggregates the `__llvm_covinit` sections from all modules into one single named section in the final binary. The pair of functions

```
const __llvm_gcov_init_func_struct *__llvm_profile_begin_covinit();
const __llvm_gcov_init_func_struct *__llvm_profile_end_covinit();
```

return the start and end addresses of this named section in the final binary. They are used in the function

```
__llvm_profile_gcov_initialize()
```

(a constructor function in the runtime), so the runtime knows the addresses of all the `Writeout` and `Reset` functions from all the modules.

One notable implementation detail specific to AIX: to preserve the `__llvm_covinit` section from the linker's garbage collection, a `.ref` pseudo-instruction is inserted into it, referring to the section that contains the `__llvm_gcov_ctr` variables used by the instrumented code. The `__llvm_gcov_ctr` variables did not previously belong to a named section; this PR places them in the `__llvm_gcov_ctr_section` named section so that the `.ref` pseudo-instruction in the `__llvm_covinit` section can refer to them.
---
 clang/test/CodeGen/code-coverage.c            | 23 ++-
 compiler-rt/include/profile/InstrProfData.inc | 23 +++
 compiler-rt/lib/profile/GCDAProfiling.c       | 19 +++
 compiler-rt/lib/profile/InstrProfiling.h      | 11 ++
 .../lib/profile/InstrProfilingPlatformAIX.c   |  5 +-
 .../lib/profile/InstrProfilingPlatformLinux.c | 16 ++
 .../test/profile/AIX/gcov-undef-sym.test      | 52 +++++++
 .../llvm/ProfileData/InstrProfData.inc        | 23 +++
 llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp     | 28 ++++
 .../Instrumentation/GCOVProfiling.cpp         | 47 +++++-
 .../test/CodeGen/PowerPC/gcov_ctr_ref_init.ll | 138 ++++++++++++++++++
 .../GCOVProfiling/kcfi-normalize.ll           | 15 +-
 llvm/test/Transforms/GCOVProfiling/kcfi.ll    | 15 +-
 .../Transforms/GCOVProfiling/module-flags.ll  | 13 +-
 14 files changed, 408 insertions(+), 20 deletions(-)
 create mode 100644 compiler-rt/test/profile/AIX/gcov-undef-sym.test
 create mode 100644 llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll

diff --git a/clang/test/CodeGen/code-coverage.c b/clang/test/CodeGen/code-coverage.c
index d7994bab35d8..4e3364df2178 100644
--- a/clang/test/CodeGen/code-coverage.c
+++ b/clang/test/CodeGen/code-coverage.c
@@ -3,12 +3,18 @@
 /// 4.7 enables cfg_checksum.
 /// 4.8 (default, compatible with gcov 7) emits the exit block the second.
// RUN: rm -rf %t && mkdir %t && cd %t -// RUN: %clang_cc1 -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='304*' %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,304 %s -// RUN: %clang_cc1 -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='407*' %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,407 %s -// RUN: %clang_cc1 -emit-llvm -disable-red-zone -coverage-data-file=/dev/null %s -o - | \ -// RUN: FileCheck --check-prefixes=CHECK,408 %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='304*' %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,304 %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='407*' %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,407 %s +// RUN: %clang_cc1 -triple x86_64-unknown-unknown -emit-llvm -disable-red-zone -coverage-data-file=/dev/null %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-CTOR-INIT,408 %s +// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='304*' %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,304 %s +// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null -coverage-version='407*' %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,407 %s +// RUN: %clang_cc1 -triple powerpc64-ibm-aix -emit-llvm -disable-red-zone -coverage-data-file=/dev/null %s -o - | \ +// RUN: FileCheck --check-prefixes=CHECK,CHECK-RT-INIT,408 %s // RUN: %clang_cc1 -emit-llvm -disable-red-zone -coverage-notes-file=aaa.gcno -coverage-data-file=bbb.gcda -debug-info-kind=limited -dwarf-version=4 %s -o - | FileCheck %s --check-prefix GCOV_FILE_INFO @@ -49,10 +55,13 @@ int test2(int b) { /// 0x3430382a '4' '0' '8' '*' // 408-SAME: i32 875575338 +// Check for gcov initialization function pointers. +// CHECK-RT-INIT: @__llvm_covinit_functions = private constant { ptr, ptr } { ptr @__llvm_gcov_writeout, ptr @__llvm_gcov_reset }, section "__llvm_covinit" + // Check that the noredzone flag is set on the generated functions. // CHECK: void @__llvm_gcov_writeout() unnamed_addr [[NRZ:#[0-9]+]] -// CHECK: void @__llvm_gcov_init() unnamed_addr [[NRZ]] +// CHECK-CTOR-INIT: void @__llvm_gcov_init() unnamed_addr [[NRZ]] // CHECK: attributes [[NRZ]] = { {{.*}}noredzone{{.*}} } diff --git a/compiler-rt/include/profile/InstrProfData.inc b/compiler-rt/include/profile/InstrProfData.inc index b9df3266fbcf..c66b0465a0b5 100644 --- a/compiler-rt/include/profile/InstrProfData.inc +++ b/compiler-rt/include/profile/InstrProfData.inc @@ -303,6 +303,18 @@ COVMAP_HEADER(uint32_t, Int32Ty, Version, \ #undef COVMAP_HEADER /* COVMAP_HEADER end. 
*/ +/* COVINIT_FUNC start */ +#ifndef COVINIT_FUNC +#define COVINIT_FUNC(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_DATA_DEFINED +#endif +COVINIT_FUNC(IntPtrT, llvm::PointerType::getUnqual(Ctx), WriteoutFunction, \ + WriteoutF) +COVINIT_FUNC(IntPtrT, llvm::PointerType::getUnqual(Ctx), ResetFunction, \ + ResetF) +#undef COVINIT_FUNC +/* COVINIT_FUNC end */ #ifdef INSTR_PROF_SECT_ENTRY #define INSTR_PROF_DATA_DEFINED @@ -345,6 +357,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_covdata, \ INSTR_PROF_SECT_ENTRY(IPSK_covname, \ INSTR_PROF_QUOTE(INSTR_PROF_COVNAME_COMMON), \ INSTR_PROF_COVNAME_COFF, "__LLVM_COV,") +INSTR_PROF_SECT_ENTRY(IPSK_covinit, \ + INSTR_PROF_QUOTE(INSTR_PROF_COVINIT_COMMON), \ + INSTR_PROF_COVINIT_COFF, "__LLVM_COV,") #undef INSTR_PROF_SECT_ENTRY #endif @@ -761,6 +776,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_COVDATA_COMMON __llvm_covdata #define INSTR_PROF_COVNAME_COMMON __llvm_covnames #define INSTR_PROF_ORDERFILE_COMMON __llvm_orderfile +#define INSTR_PROF_COVINIT_COMMON __llvm_covinit + /* Windows section names. Because these section names contain dollar characters, * they must be quoted. */ @@ -781,6 +798,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_COVNAME_COFF ".lcovn" #define INSTR_PROF_ORDERFILE_COFF ".lorderfile$M" +// FIXME: Placeholder for Windows. Windows currently does not initialize +// the GCOV functions in the runtime. +#define INSTR_PROF_COVINIT_COFF ".lcovd$M" + #ifdef _WIN32 /* Runtime section names and name strings. */ #define INSTR_PROF_DATA_SECT_NAME INSTR_PROF_DATA_COFF @@ -800,6 +821,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_COVDATA_SECT_NAME INSTR_PROF_COVDATA_COFF #define INSTR_PROF_COVNAME_SECT_NAME INSTR_PROF_COVNAME_COFF #define INSTR_PROF_ORDERFILE_SECT_NAME INSTR_PROF_ORDERFILE_COFF +#define INSTR_PROF_COVINIT_SECT_NAME INSTR_PROF_COVINIT_COFF #else /* Runtime section names and name strings. */ #define INSTR_PROF_DATA_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_DATA_COMMON) @@ -821,6 +843,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* Order file instrumentation. 
*/ #define INSTR_PROF_ORDERFILE_SECT_NAME \ INSTR_PROF_QUOTE(INSTR_PROF_ORDERFILE_COMMON) +#define INSTR_PROF_COVINIT_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_COVINIT_COMMON) #endif #define INSTR_PROF_ORDERFILE_BUFFER_NAME _llvm_order_file_buffer diff --git a/compiler-rt/lib/profile/GCDAProfiling.c b/compiler-rt/lib/profile/GCDAProfiling.c index f67d95d21a7b..ac01805e70ad 100644 --- a/compiler-rt/lib/profile/GCDAProfiling.c +++ b/compiler-rt/lib/profile/GCDAProfiling.c @@ -624,6 +624,25 @@ void llvm_gcov_init(fn_ptr wfn, fn_ptr rfn) { } } +#if defined(_AIX) +COMPILER_RT_VISIBILITY __attribute__((constructor)) void +__llvm_profile_gcov_initialize() { + const __llvm_gcov_init_func_struct *InitFuncStart = + __llvm_profile_begin_covinit(); + const __llvm_gcov_init_func_struct *InitFuncEnd = + __llvm_profile_end_covinit(); + + for (const __llvm_gcov_init_func_struct *Ptr = InitFuncStart; + Ptr != InitFuncEnd; ++Ptr) { + fn_ptr wfn = (fn_ptr)Ptr->WriteoutFunction; + fn_ptr rfn = (fn_ptr)Ptr->ResetFunction; + if (!(wfn && rfn)) + continue; + llvm_gcov_init(wfn, rfn); + } +} +#endif + void __gcov_dump(void) { for (struct fn_node *f = writeout_fn_list.head; f; f = f->next) f->fn(); diff --git a/compiler-rt/lib/profile/InstrProfiling.h b/compiler-rt/lib/profile/InstrProfiling.h index 9e43fd7c4789..7f0c0c194dc9 100644 --- a/compiler-rt/lib/profile/InstrProfiling.h +++ b/compiler-rt/lib/profile/InstrProfiling.h @@ -54,6 +54,12 @@ typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) VTableProfData { #include "profile/InstrProfData.inc" } VTableProfData; +typedef struct COMPILER_RT_ALIGNAS(INSTR_PROF_DATA_ALIGNMENT) + __llvm_gcov_init_func_struct { +#define COVINIT_FUNC(Type, LLVMType, Name, Initializer) Type Name; +#include "profile/InstrProfData.inc" +} __llvm_gcov_init_func_struct; + /*! * \brief Return 1 if profile counters are continuously synced to the raw * profile via an mmap(). This is in contrast to the default mode, in which @@ -208,6 +214,9 @@ void __llvm_profile_initialize_file(void); /*! \brief Initialize the profile runtime. */ void __llvm_profile_initialize(void); +/*! \brief Initialize the gcov profile runtime. */ +void __llvm_profile_gcov_initialize(void); + /*! * \brief Return path prefix (excluding the base filename) of the profile data. * This is useful for users using \c -fprofile-generate=./path_prefix who do @@ -324,4 +333,6 @@ COMPILER_RT_VISIBILITY extern uint64_t */ extern char INSTR_PROF_PROFILE_NAME_VAR[1]; /* __llvm_profile_filename. 
*/ +const __llvm_gcov_init_func_struct *__llvm_profile_begin_covinit(); +const __llvm_gcov_init_func_struct *__llvm_profile_end_covinit(); #endif /* PROFILE_INSTRPROFILING_H_ */ diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c b/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c index b9d51b698b41..651f8785d0b9 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformAIX.c @@ -202,6 +202,8 @@ static int dummy_vname[0] COMPILER_RT_SECTION( COMPILER_RT_SEG INSTR_PROF_VNAME_SECT_NAME); static int dummy_vtab[0] COMPILER_RT_SECTION( COMPILER_RT_SEG INSTR_PROF_VTAB_SECT_NAME); +static int dummy_covinit_funcs[0] COMPILER_RT_SECTION( + COMPILER_RT_SEG INSTR_PROF_COVINIT_SECT_NAME); // To avoid GC'ing of the dummy variables by the linker, reference them in an // array and reference the array in the runtime registration code @@ -214,7 +216,8 @@ COMPILER_RT_VISIBILITY void *__llvm_profile_keep[] = {(void *)&dummy_cnts, (void *)&dummy_bits, (void *)&dummy_data, (void *)&dummy_name, (void *)&dummy_vnds, (void *)&dummy_orderfile, - (void *)&dummy_vname, (void *)&dummy_vtab}; + (void *)&dummy_vname, (void *)&dummy_vtab, + (void *)&dummy_covinit_funcs}; #ifdef __GNUC__ #pragma GCC diagnostic pop #endif diff --git a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c index 02f23379ce98..e2c06d51e0c6 100644 --- a/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c +++ b/compiler-rt/lib/profile/InstrProfilingPlatformLinux.c @@ -35,6 +35,8 @@ #define PROF_ORDERFILE_START INSTR_PROF_SECT_START(INSTR_PROF_ORDERFILE_COMMON) #define PROF_VNODES_START INSTR_PROF_SECT_START(INSTR_PROF_VNODES_COMMON) #define PROF_VNODES_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_VNODES_COMMON) +#define PROF_COVINIT_START INSTR_PROF_SECT_START(INSTR_PROF_COVINIT_COMMON) +#define PROF_COVINIT_STOP INSTR_PROF_SECT_STOP(INSTR_PROF_COVINIT_COMMON) /* Declare section start and stop symbols for various sections * generated by compiler instrumentation. 
@@ -56,6 +58,10 @@ extern char PROF_NAME_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern char PROF_NAME_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern ValueProfNode PROF_VNODES_START COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; extern ValueProfNode PROF_VNODES_STOP COMPILER_RT_VISIBILITY COMPILER_RT_WEAK; +extern __llvm_gcov_init_func_struct PROF_COVINIT_START COMPILER_RT_VISIBILITY + COMPILER_RT_WEAK; +extern __llvm_gcov_init_func_struct PROF_COVINIT_STOP COMPILER_RT_VISIBILITY + COMPILER_RT_WEAK; COMPILER_RT_VISIBILITY const __llvm_profile_data * __llvm_profile_begin_data(void) { @@ -110,6 +116,16 @@ COMPILER_RT_VISIBILITY ValueProfNode *__llvm_profile_end_vnodes(void) { COMPILER_RT_VISIBILITY ValueProfNode *CurrentVNode = &PROF_VNODES_START; COMPILER_RT_VISIBILITY ValueProfNode *EndVNode = &PROF_VNODES_STOP; +COMPILER_RT_VISIBILITY const __llvm_gcov_init_func_struct * +__llvm_profile_begin_covinit() { + return &PROF_COVINIT_START; +} + +COMPILER_RT_VISIBILITY const __llvm_gcov_init_func_struct * +__llvm_profile_end_covinit() { + return &PROF_COVINIT_STOP; +} + #ifdef NT_GNU_BUILD_ID static size_t RoundUp(size_t size, size_t align) { return (size + align - 1) & ~(align - 1); diff --git a/compiler-rt/test/profile/AIX/gcov-undef-sym.test b/compiler-rt/test/profile/AIX/gcov-undef-sym.test new file mode 100644 index 000000000000..db9053952d95 --- /dev/null +++ b/compiler-rt/test/profile/AIX/gcov-undef-sym.test @@ -0,0 +1,52 @@ +// The undefined symbol should not cause link errors, and we should +// obtain the expected coverage report. + +// Test the --coverage option. +RUN: rm -rf %t0 && split-file %s %t0 && cd %t0 +RUN: %clang bar.c main.c undef.c --coverage -c +RUN: ar -X32_64 -rv libfoo.a undef.o bar.o +RUN: %clang main.o -L. -lfoo --coverage -o main.exe +RUN: %run ./main.exe +RUN: llvm-cov gcov -t main.gcda | FileCheck --check-prefix=MAIN %s +RUN: llvm-cov gcov -t bar.gcda | FileCheck --check-prefix=BAR %s + +// Test the pgogen -fprofile-arcs -ftest-coverage option combination. +RUN: rm -rf %t1 && split-file %s %t1 && cd %t1 +RUN: %clang_pgogen bar.c main.c undef.c -fprofile-arcs -ftest-coverage -c +RUN: ar -X32_64 -rv libfoo.a undef.o bar.o +RUN: %clang_pgogen main.o -L. -lfoo -fprofile-generate -fprofile-arcs -ftest-coverage -o main.exe +RUN: %run ./main.exe +RUN: llvm-cov gcov -t main.gcda | FileCheck --check-prefix=MAIN %s +RUN: llvm-cov gcov -t bar.gcda | FileCheck --check-prefix=BAR %s + +// Test the pgogen -Wl,-bcdtors:mbr option combination. +RUN: rm -rf %t2 && split-file %s %t2 && cd %t2 +RUN: %clang_pgogen bar.c main.c undef.c -fprofile-arcs -ftest-coverage -c +RUN: ar -X32_64 -rv libfoo.a undef.o bar.o +RUN: %clang_pgogen main.o -L. 
-lfoo -fprofile-generate -fprofile-arcs -ftest-coverage -Wl,-bcdtors:mbr -o main.exe +RUN: %run ./main.exe +RUN: llvm-cov gcov -t main.gcda | FileCheck --check-prefix=MAIN %s +RUN: llvm-cov gcov -t bar.gcda | FileCheck --check-prefix=BAR %s + +MAIN: 1: 2:int main() { +MAIN: 1: 3: return bar(); +BAR: 1: 1:int bar() { +BAR: 1: 2: return 0; + +//--- main.c +int bar(); +int main() { + return bar(); +} + + +//--- bar.c +int bar() { + return 0; +} + +//--- undef.c +void undef_func(); +void foo() { + undef_func(); +} diff --git a/llvm/include/llvm/ProfileData/InstrProfData.inc b/llvm/include/llvm/ProfileData/InstrProfData.inc index b9df3266fbcf..c66b0465a0b5 100644 --- a/llvm/include/llvm/ProfileData/InstrProfData.inc +++ b/llvm/include/llvm/ProfileData/InstrProfData.inc @@ -303,6 +303,18 @@ COVMAP_HEADER(uint32_t, Int32Ty, Version, \ #undef COVMAP_HEADER /* COVMAP_HEADER end. */ +/* COVINIT_FUNC start */ +#ifndef COVINIT_FUNC +#define COVINIT_FUNC(Type, LLVMType, Name, Initializer) +#else +#define INSTR_PROF_DATA_DEFINED +#endif +COVINIT_FUNC(IntPtrT, llvm::PointerType::getUnqual(Ctx), WriteoutFunction, \ + WriteoutF) +COVINIT_FUNC(IntPtrT, llvm::PointerType::getUnqual(Ctx), ResetFunction, \ + ResetF) +#undef COVINIT_FUNC +/* COVINIT_FUNC end */ #ifdef INSTR_PROF_SECT_ENTRY #define INSTR_PROF_DATA_DEFINED @@ -345,6 +357,9 @@ INSTR_PROF_SECT_ENTRY(IPSK_covdata, \ INSTR_PROF_SECT_ENTRY(IPSK_covname, \ INSTR_PROF_QUOTE(INSTR_PROF_COVNAME_COMMON), \ INSTR_PROF_COVNAME_COFF, "__LLVM_COV,") +INSTR_PROF_SECT_ENTRY(IPSK_covinit, \ + INSTR_PROF_QUOTE(INSTR_PROF_COVINIT_COMMON), \ + INSTR_PROF_COVINIT_COFF, "__LLVM_COV,") #undef INSTR_PROF_SECT_ENTRY #endif @@ -761,6 +776,8 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_COVDATA_COMMON __llvm_covdata #define INSTR_PROF_COVNAME_COMMON __llvm_covnames #define INSTR_PROF_ORDERFILE_COMMON __llvm_orderfile +#define INSTR_PROF_COVINIT_COMMON __llvm_covinit + /* Windows section names. Because these section names contain dollar characters, * they must be quoted. */ @@ -781,6 +798,10 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_COVNAME_COFF ".lcovn" #define INSTR_PROF_ORDERFILE_COFF ".lorderfile$M" +// FIXME: Placeholder for Windows. Windows currently does not initialize +// the GCOV functions in the runtime. +#define INSTR_PROF_COVINIT_COFF ".lcovd$M" + #ifdef _WIN32 /* Runtime section names and name strings. */ #define INSTR_PROF_DATA_SECT_NAME INSTR_PROF_DATA_COFF @@ -800,6 +821,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, #define INSTR_PROF_COVDATA_SECT_NAME INSTR_PROF_COVDATA_COFF #define INSTR_PROF_COVNAME_SECT_NAME INSTR_PROF_COVNAME_COFF #define INSTR_PROF_ORDERFILE_SECT_NAME INSTR_PROF_ORDERFILE_COFF +#define INSTR_PROF_COVINIT_SECT_NAME INSTR_PROF_COVINIT_COFF #else /* Runtime section names and name strings. */ #define INSTR_PROF_DATA_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_DATA_COMMON) @@ -821,6 +843,7 @@ serializeValueProfDataFrom(ValueProfRecordClosure *Closure, /* Order file instrumentation. 
*/ #define INSTR_PROF_ORDERFILE_SECT_NAME \ INSTR_PROF_QUOTE(INSTR_PROF_ORDERFILE_COMMON) +#define INSTR_PROF_COVINIT_SECT_NAME INSTR_PROF_QUOTE(INSTR_PROF_COVINIT_COMMON) #endif #define INSTR_PROF_ORDERFILE_BUFFER_NAME _llvm_order_file_buffer diff --git a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp index 9528aea34979..b5a6c1c6e01d 100644 --- a/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp +++ b/llvm/lib/Target/PowerPC/PPCAsmPrinter.cpp @@ -290,6 +290,8 @@ public: void emitPGORefs(Module &M); + void emitGCOVRefs(); + void emitEndOfAsmFile(Module &) override; void emitLinkage(const GlobalValue *GV, MCSymbol *GVSym) const override; @@ -2962,6 +2964,31 @@ void PPCAIXAsmPrinter::emitPGORefs(Module &M) { } } +void PPCAIXAsmPrinter::emitGCOVRefs() { + if (!OutContext.hasXCOFFSection( + "__llvm_gcov_ctr_section", + XCOFF::CsectProperties(XCOFF::XMC_RW, XCOFF::XTY_SD))) + return; + + MCSection *CtrSection = OutContext.getXCOFFSection( + "__llvm_gcov_ctr_section", SectionKind::getData(), + XCOFF::CsectProperties(XCOFF::XMC_RW, XCOFF::XTY_SD), + /*MultiSymbolsAllowed*/ true); + + OutStreamer->switchSection(CtrSection); + const XCOFF::StorageMappingClass MappingClass = + TM.Options.XCOFFReadOnlyPointers ? XCOFF::XMC_RO : XCOFF::XMC_RW; + if (OutContext.hasXCOFFSection( + "__llvm_covinit", + XCOFF::CsectProperties(MappingClass, XCOFF::XTY_SD))) { + const char *SymbolStr = TM.Options.XCOFFReadOnlyPointers + ? "__llvm_covinit[RO]" + : "__llvm_covinit[RW]"; + MCSymbol *S = OutContext.getOrCreateSymbol(SymbolStr); + OutStreamer->emitXCOFFRefDirective(S); + } +} + void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { // If there are no functions and there are no toc-data definitions in this // module, we will never need to reference the TOC base. @@ -2969,6 +2996,7 @@ void PPCAIXAsmPrinter::emitEndOfAsmFile(Module &M) { return; emitPGORefs(M); + emitGCOVRefs(); // Switch to section to emit TOC base. 
 OutStreamer->switchSection(getObjFileLowering().getTOCBaseSection());
diff --git a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
index a409f6150a71..2ea89be40a3d 100644
--- a/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
+++ b/llvm/lib/Transforms/Instrumentation/GCOVProfiling.cpp
@@ -29,6 +29,7 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicInst.h"
 #include "llvm/IR/Module.h"
+#include "llvm/ProfileData/InstrProf.h"
 #include "llvm/Support/CRC.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
@@ -121,8 +122,11 @@ private:
   Function *createInternalFunction(FunctionType *FTy, StringRef Name,
                                    StringRef MangledType = "");
+
   void emitGlobalConstructor(
       SmallVectorImpl<std::pair<GlobalVariable *, MDNode *>> &CountersBySP);
+  void emitModuleInitFunctionPtrs(
+      SmallVectorImpl<std::pair<GlobalVariable *, MDNode *>> &CountersBySP);
   bool isFunctionInstrumented(const Function &F);
   std::vector<Regex> createRegexesFromString(StringRef RegexesStr);
@@ -913,6 +917,9 @@ bool GCOVProfiler::emitProfileNotes(
       GlobalVariable *Counters = new GlobalVariable(
           *M, CounterTy, false, GlobalValue::InternalLinkage,
           Constant::getNullValue(CounterTy), "__llvm_gcov_ctr");
+      const llvm::Triple &Triple = llvm::Triple(M->getTargetTriple());
+      if (Triple.getObjectFormat() == llvm::Triple::XCOFF)
+        Counters->setSection("__llvm_gcov_ctr_section");
       CountersBySP.emplace_back(Counters, SP);

       for (size_t I : llvm::seq<size_t>(0, Measured)) {
@@ -979,7 +986,11 @@ bool GCOVProfiler::emitProfileNotes(
     }

     if (EmitGCDA) {
-      emitGlobalConstructor(CountersBySP);
+      const llvm::Triple &Triple = llvm::Triple(M->getTargetTriple());
+      if (Triple.getObjectFormat() == llvm::Triple::XCOFF)
+        emitModuleInitFunctionPtrs(CountersBySP);
+      else
+        emitGlobalConstructor(CountersBySP);
       EmitGCDA = false;
     }
   }
@@ -1028,6 +1039,40 @@ void GCOVProfiler::emitGlobalConstructor(
     SmallVectorImpl<std::pair<GlobalVariable *, MDNode *>> &CountersBySP) {
   appendToGlobalCtors(*M, F, 0);
 }

+void GCOVProfiler::emitModuleInitFunctionPtrs(
+    SmallVectorImpl<std::pair<GlobalVariable *, MDNode *>> &CountersBySP) {
+  Function *WriteoutF = insertCounterWriteout(CountersBySP);
+  Function *ResetF = insertReset(CountersBySP);
+
+  // Instead of creating a function call and adding it to the constructors list,
+  // create a global variable in the __llvm_covinit section so the functions
+  // can be registered by a constructor in the runtime.
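+  //
+  // For illustration, the resulting global is laid out as
+  //   { IntPtrT WriteoutFunction, IntPtrT ResetFunction }
+  // (the COVINIT_FUNC fields from InstrProfData.inc); the runtime walks the
+  // entries between __llvm_profile_begin_covinit() and
+  // __llvm_profile_end_covinit() to register each module's two functions.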
+ + auto &Ctx = M->getContext(); + + Type *InitFuncDataTy[] = { +#define COVINIT_FUNC(Type, LLVMType, Name, Init) LLVMType, +#include "llvm/ProfileData/InstrProfData.inc" + }; + + auto STy = StructType::get(Ctx, ArrayRef(InitFuncDataTy)); + + Constant *InitFuncPtrs[] = { +#define COVINIT_FUNC(Type, LLVMType, Name, Init) Init, +#include "llvm/ProfileData/InstrProfData.inc" + }; + + auto *CovInitGV = + new GlobalVariable(*M, STy, false, GlobalValue::PrivateLinkage, nullptr, + "__llvm_covinit_functions"); + CovInitGV->setInitializer(ConstantStruct::get(STy, InitFuncPtrs)); + CovInitGV->setVisibility(GlobalValue::VisibilityTypes::DefaultVisibility); + CovInitGV->setSection(getInstrProfSectionName( + IPSK_covinit, Triple(M->getTargetTriple()).getObjectFormat())); + CovInitGV->setAlignment(Align(INSTR_PROF_DATA_ALIGNMENT)); + CovInitGV->setConstant(true); +} + FunctionCallee GCOVProfiler::getStartFileFunc(const TargetLibraryInfo *TLI) { Type *Args[] = { PointerType::getUnqual(*Ctx), // const char *orig_filename diff --git a/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll b/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll new file mode 100644 index 000000000000..4710d5c14e5b --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/gcov_ctr_ref_init.ll @@ -0,0 +1,138 @@ +; Tests if the __llvm_gcov_ctr section contains a .ref pseudo-op +; referring to the __llvm_covinit section. +; RUN: llc < %s | FileCheck --check-prefixes=CHECK,CHECK-RW %s +; RUN: llc -mxcoff-roptr < %s | FileCheck --check-prefixes=CHECK,CHECK-RO %s + +target datalayout = "E-m:a-p:32:32-Fi32-i64:64-n32" +target triple = "powerpc-ibm-aix" + +; CHECK-RW: .csect __llvm_covinit[RW],3 +; CHECK-RO: .csect __llvm_covinit[RO],3 +; CHECK-NEXT: .align 3 # @__llvm_covinit_functions +; CHECK-NEXT: L..__llvm_covinit_functions: +; CHECK-NEXT: .vbyte 4, __llvm_gcov_writeout[DS] +; CHECK-NEXT: .vbyte 4, __llvm_gcov_reset[DS] +; CHECK: .csect __llvm_gcov_ctr_section[RW],3 +; CHECK-NEXT: .lglobl __llvm_gcov_ctr # @_MergedGlobals +; CHECK-NEXT: .lglobl __llvm_gcov_ctr.1 +; CHECK-NEXT: .align 3 +; CHECK-NEXT: L.._MergedGlobals: +; CHECK-NEXT: __llvm_gcov_ctr: +; CHECK-NEXT: .space 16 +; CHECK-NEXT: __llvm_gcov_ctr.1: +; CHECK-NEXT: .extern .llvm_gcda_start_file[PR] +; CHECK-NEXT: .extern .llvm_gcda_emit_function[PR] +; CHECK-NEXT: .extern .llvm_gcda_emit_arcs[PR] +; CHECK-NEXT: .extern .llvm_gcda_summary_info[PR] +; CHECK-NEXT: .extern .llvm_gcda_end_file[PR] +; CHECK-RW-NEXT: .ref __llvm_covinit[RW] +; CHECK-RO-NEXT: .ref __llvm_covinit[RO] + +%emit_function_args_ty = type { i32, i32, i32 } +%emit_arcs_args_ty = type { i32, ptr } +%file_info = type { %start_file_args_ty, i32, ptr, ptr } +%start_file_args_ty = type { ptr, i32, i32 } + +@__llvm_gcov_ctr = internal global [1 x i64] zeroinitializer, section "__llvm_gcov_ctr_section" +@__llvm_gcov_ctr.1 = internal global [1 x i64] zeroinitializer, section "__llvm_gcov_ctr_section" +@0 = private unnamed_addr constant [10 x i8] c"test.gcda\00", align 1 +@__llvm_internal_gcov_emit_function_args.0 = internal unnamed_addr constant [2 x %emit_function_args_ty] [%emit_function_args_ty { i32 0, i32 1961870044, i32 -801444649 }, %emit_function_args_ty { i32 1, i32 1795396728, i32 -801444649 }] +@__llvm_internal_gcov_emit_arcs_args.0 = internal unnamed_addr constant [2 x %emit_arcs_args_ty] [%emit_arcs_args_ty { i32 1, ptr @__llvm_gcov_ctr }, %emit_arcs_args_ty { i32 1, ptr @__llvm_gcov_ctr.1 }] +@__llvm_internal_gcov_emit_file_info = internal unnamed_addr constant [1 x %file_info] [%file_info { %start_file_args_ty { ptr @0, 
i32 875575338, i32 -801444649 }, i32 2, ptr @__llvm_internal_gcov_emit_function_args.0, ptr @__llvm_internal_gcov_emit_arcs_args.0 }] +@__llvm_covinit_functions = private constant { ptr, ptr } { ptr @__llvm_gcov_writeout, ptr @__llvm_gcov_reset }, section "__llvm_covinit", align 8 + +define i32 @bar() { +entry: + %gcov_ctr = load i64, ptr @__llvm_gcov_ctr, align 8 + %0 = add i64 %gcov_ctr, 1 + store i64 %0, ptr @__llvm_gcov_ctr, align 8 + ret i32 1 +} + +define i32 @main() { +entry: + %gcov_ctr = load i64, ptr @__llvm_gcov_ctr.1, align 8 + %0 = add i64 %gcov_ctr, 1 + store i64 %0, ptr @__llvm_gcov_ctr.1, align 8 + %retval = alloca i32, align 4 + store i32 0, ptr %retval, align 4 + %call = call i32 @bar() + %sub = sub nsw i32 %call, 1 + ret i32 %sub +} + +define internal void @__llvm_gcov_writeout() unnamed_addr { +entry: + br label %file.loop.header + +file.loop.header: ; preds = %file.loop.latch, %entry + %file_idx = phi i32 [ 0, %entry ], [ %next_file_idx, %file.loop.latch ] + %0 = getelementptr inbounds [1 x %file_info], ptr @__llvm_internal_gcov_emit_file_info, i32 0, i32 %file_idx + %start_file_args = getelementptr inbounds nuw %file_info, ptr %0, i32 0, i32 0 + %1 = getelementptr inbounds nuw %start_file_args_ty, ptr %start_file_args, i32 0, i32 0 + %filename = load ptr, ptr %1, align 4 + %2 = getelementptr inbounds nuw %start_file_args_ty, ptr %start_file_args, i32 0, i32 1 + %version = load i32, ptr %2, align 4 + %3 = getelementptr inbounds nuw %start_file_args_ty, ptr %start_file_args, i32 0, i32 2 + %stamp = load i32, ptr %3, align 4 + call void @llvm_gcda_start_file(ptr %filename, i32 %version, i32 %stamp) + %4 = getelementptr inbounds nuw %file_info, ptr %0, i32 0, i32 1 + %num_ctrs = load i32, ptr %4, align 4 + %5 = getelementptr inbounds nuw %file_info, ptr %0, i32 0, i32 2 + %emit_function_args = load ptr, ptr %5, align 4 + %6 = getelementptr inbounds nuw %file_info, ptr %0, i32 0, i32 3 + %emit_arcs_args = load ptr, ptr %6, align 4 + %7 = icmp slt i32 0, %num_ctrs + br i1 %7, label %counter.loop.header, label %file.loop.latch + +counter.loop.header: ; preds = %counter.loop.header, %file.loop.header + %ctr_idx = phi i32 [ 0, %file.loop.header ], [ %15, %counter.loop.header ] + %8 = getelementptr inbounds %emit_function_args_ty, ptr %emit_function_args, i32 %ctr_idx + %9 = getelementptr inbounds nuw %emit_function_args_ty, ptr %8, i32 0, i32 0 + %ident = load i32, ptr %9, align 4 + %10 = getelementptr inbounds nuw %emit_function_args_ty, ptr %8, i32 0, i32 1 + %func_checkssum = load i32, ptr %10, align 4 + %11 = getelementptr inbounds nuw %emit_function_args_ty, ptr %8, i32 0, i32 2 + %cfg_checksum = load i32, ptr %11, align 4 + call void @llvm_gcda_emit_function(i32 %ident, i32 %func_checkssum, i32 %cfg_checksum) + %12 = getelementptr inbounds %emit_arcs_args_ty, ptr %emit_arcs_args, i32 %ctr_idx + %13 = getelementptr inbounds nuw %emit_arcs_args_ty, ptr %12, i32 0, i32 0 + %num_counters = load i32, ptr %13, align 4 + %14 = getelementptr inbounds nuw %emit_arcs_args_ty, ptr %12, i32 0, i32 1 + %counters = load ptr, ptr %14, align 4 + call void @llvm_gcda_emit_arcs(i32 %num_counters, ptr %counters) + %15 = add i32 %ctr_idx, 1 + %16 = icmp slt i32 %15, %num_ctrs + br i1 %16, label %counter.loop.header, label %file.loop.latch + +file.loop.latch: ; preds = %counter.loop.header, %file.loop.header + call void @llvm_gcda_summary_info() + call void @llvm_gcda_end_file() + %next_file_idx = add i32 %file_idx, 1 + %17 = icmp slt i32 %next_file_idx, 1 + br i1 %17, label 
%file.loop.header, label %exit + +exit: ; preds = %file.loop.latch + ret void +} + +declare void @llvm_gcda_start_file(ptr, i32, i32) + +declare void @llvm_gcda_emit_function(i32, i32, i32) + +declare void @llvm_gcda_emit_arcs(i32, ptr) + +declare void @llvm_gcda_summary_info() + +declare void @llvm_gcda_end_file() + +define internal void @__llvm_gcov_reset() unnamed_addr { +entry: + call void @llvm.memset.p0.i64(ptr @__llvm_gcov_ctr, i8 0, i64 8, i1 false) + call void @llvm.memset.p0.i64(ptr @__llvm_gcov_ctr.1, i8 0, i64 8, i1 false) + ret void +} + +declare void @llvm.memset.p0.i64(ptr nocapture writeonly, i8, i64, i1 immarg) + + diff --git a/llvm/test/Transforms/GCOVProfiling/kcfi-normalize.ll b/llvm/test/Transforms/GCOVProfiling/kcfi-normalize.ll index 19122b920d1c..9ad0418025e5 100644 --- a/llvm/test/Transforms/GCOVProfiling/kcfi-normalize.ll +++ b/llvm/test/Transforms/GCOVProfiling/kcfi-normalize.ll @@ -1,9 +1,16 @@ ;; Ensure __llvm_gcov_(writeout|reset|init) have the correct !kcfi_type ;; with integer normalization. ; RUN: mkdir -p %t && cd %t -; RUN: opt < %s -S -passes=insert-gcov-profiling | FileCheck %s +; RUN: opt < %s -S -passes=insert-gcov-profiling \ +; RUN: -mtriple=x86_64-unknown-linux-gnu | FileCheck \ +; RUN: --check-prefixes=CHECK,CHECK-CTOR-INIT %s +; RUN: opt < %s -S -passes=insert-gcov-profiling \ +; RUN: -mtriple=powerpc64-ibm-aix | FileCheck \ +; RUN: --check-prefixes=CHECK,CHECK-RT-INIT %s -target triple = "x86_64-unknown-linux-gnu" +; Check for gcov initialization function pointers when we initialize +; the writeout and reset functions in the runtime. +; CHECK-RT-INIT: @__llvm_covinit_functions = private constant { ptr, ptr } { ptr @__llvm_gcov_writeout, ptr @__llvm_gcov_reset }, section "__llvm_covinit" define dso_local void @empty() !dbg !5 { entry: @@ -29,7 +36,7 @@ entry: ; CHECK-SAME: !kcfi_type ![[#TYPE:]] ; CHECK: define internal void @__llvm_gcov_reset() ; CHECK-SAME: !kcfi_type ![[#TYPE]] -; CHECK: define internal void @__llvm_gcov_init() -; CHECK-SAME: !kcfi_type ![[#TYPE]] +; CHECK-CTOR-INIT: define internal void @__llvm_gcov_init() +; CHECK-CTOR-INIT-SAME: !kcfi_type ![[#TYPE]] ; CHECK: ![[#TYPE]] = !{i32 -440107680} diff --git a/llvm/test/Transforms/GCOVProfiling/kcfi.ll b/llvm/test/Transforms/GCOVProfiling/kcfi.ll index 1b97d25294cd..5e0e91fc92f5 100644 --- a/llvm/test/Transforms/GCOVProfiling/kcfi.ll +++ b/llvm/test/Transforms/GCOVProfiling/kcfi.ll @@ -1,8 +1,15 @@ ;; Ensure __llvm_gcov_(writeout|reset|init) have !kcfi_type with KCFI. ; RUN: mkdir -p %t && cd %t -; RUN: opt < %s -S -passes=insert-gcov-profiling | FileCheck %s +; RUN: opt < %s -S -passes=insert-gcov-profiling \ +; RUN: -mtriple=x86_64-unknown-linux-gnu | FileCheck \ +; RUN: --check-prefixes=CHECK,CHECK-CTOR-INIT %s +; RUN: opt < %s -S -passes=insert-gcov-profiling \ +; RUN: -mtriple=powerpc64-ibm-aix | FileCheck \ +; RUN: --check-prefixes=CHECK,CHECK-RT-INIT %s -target triple = "x86_64-unknown-linux-gnu" +; Check for gcov initialization function pointers when we initialize +; the writeout and reset functions in the runtime. 
+; CHECK-RT-INIT: @__llvm_covinit_functions = private constant { ptr, ptr } { ptr @__llvm_gcov_writeout, ptr @__llvm_gcov_reset }, section "__llvm_covinit"

 define dso_local void @empty() !dbg !5 {
 entry:
@@ -27,7 +34,7 @@ entry:
 ; CHECK-SAME: !kcfi_type ![[#TYPE:]]
 ; CHECK: define internal void @__llvm_gcov_reset()
 ; CHECK-SAME: !kcfi_type ![[#TYPE]]
-; CHECK: define internal void @__llvm_gcov_init()
-; CHECK-SAME: !kcfi_type ![[#TYPE]]
+; CHECK-CTOR-INIT: define internal void @__llvm_gcov_init()
+; CHECK-CTOR-INIT-SAME: !kcfi_type ![[#TYPE]]
 ; CHECK: ![[#TYPE]] = !{i32 -1522505972}
diff --git a/llvm/test/Transforms/GCOVProfiling/module-flags.ll b/llvm/test/Transforms/GCOVProfiling/module-flags.ll
index 919dd41ea203..59f116d0d7e6 100644
--- a/llvm/test/Transforms/GCOVProfiling/module-flags.ll
+++ b/llvm/test/Transforms/GCOVProfiling/module-flags.ll
@@ -1,7 +1,14 @@
 ; RUN: mkdir -p %t && cd %t
-; RUN: opt < %s -S -passes=insert-gcov-profiling | FileCheck %s
+; RUN: opt < %s -S -passes=insert-gcov-profiling \
+; RUN:     -mtriple=x86_64-unknown-linux-gnu | FileCheck \
+; RUN:     --check-prefixes=CHECK,CHECK-CTOR-INIT %s
+; RUN: opt < %s -S -passes=insert-gcov-profiling \
+; RUN:     -mtriple=powerpc64-ibm-aix | FileCheck \
+; RUN:     --check-prefixes=CHECK,CHECK-RT-INIT %s

-target triple = "x86_64-unknown-linux-gnu"
+; Check for gcov initialization function pointers when we initialize
+; the writeout and reset functions in the runtime.
+; CHECK-RT-INIT: @__llvm_covinit_functions = private constant { ptr, ptr } { ptr @__llvm_gcov_writeout, ptr @__llvm_gcov_reset }, section "__llvm_covinit"

 define dso_local void @empty() !dbg !5 {
 entry:
@@ -30,5 +37,5 @@ entry:
 ;; Infer uwtable and "frame-pointer" from the module flags.
 ; CHECK: define internal void @__llvm_gcov_writeout() unnamed_addr #[[#ATTR:]]
 ; CHECK: define internal void @__llvm_gcov_reset() unnamed_addr #[[#ATTR]]
-; CHECK: define internal void @__llvm_gcov_init() unnamed_addr #[[#ATTR]]
+; CHECK-CTOR-INIT: define internal void @__llvm_gcov_init() unnamed_addr #[[#ATTR]]
 ; CHECK: attributes #[[#ATTR]] = { noinline nounwind uwtable "frame-pointer"="all" }
-- 
GitLab


From 8c60efe94ba33aaf0f4226377dbe6613966ea6cc Mon Sep 17 00:00:00 2001
From: hanbeom
Date: Thu, 17 Oct 2024 22:47:37 +0900
Subject: [PATCH 254/329] [InferAlignment][NFC] Unify Load/Store handling in
 tryToImproveAlign (#112699)
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Removes code duplication in tryToImproveAlign by unifying load and store
instruction handling with getLoadStore helper functions.

---
 llvm/include/llvm/IR/Instructions.h           | 10 +++++++
 llvm/lib/Transforms/Scalar/InferAlignment.cpp | 21 +++++++------------
 2 files changed, 17 insertions(+), 14 deletions(-)

diff --git a/llvm/include/llvm/IR/Instructions.h b/llvm/include/llvm/IR/Instructions.h
index 88c8c709c306..b6575d4c8572 100644
--- a/llvm/include/llvm/IR/Instructions.h
+++ b/llvm/include/llvm/IR/Instructions.h
@@ -4960,6 +4960,16 @@ inline Align getLoadStoreAlignment(const Value *I) {
   return cast<StoreInst>(I)->getAlign();
 }

+/// A helper function that sets the alignment of a load or store instruction.
+inline void setLoadStoreAlignment(Value *I, Align NewAlign) {
+  assert((isa<LoadInst>(I) || isa<StoreInst>(I)) &&
+         "Expected Load or Store instruction");
+  if (auto *LI = dyn_cast<LoadInst>(I))
+    LI->setAlignment(NewAlign);
+  else
+    cast<StoreInst>(I)->setAlignment(NewAlign);
+}
+
 /// A helper function that returns the address space of the pointer operand of
 /// load or store instruction.
 inline unsigned getLoadStoreAddressSpace(const Value *I) {
diff --git a/llvm/lib/Transforms/Scalar/InferAlignment.cpp b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
index 6e0c206bd198..21d373790ac5 100644
--- a/llvm/lib/Transforms/Scalar/InferAlignment.cpp
+++ b/llvm/lib/Transforms/Scalar/InferAlignment.cpp
@@ -25,21 +25,14 @@ using namespace llvm;
 static bool tryToImproveAlign(
     const DataLayout &DL, Instruction *I,
     function_ref<Align(Value *PtrOp, Align OldAlign, Align PrefAlign)> Fn) {
-  if (auto *LI = dyn_cast<LoadInst>(I)) {
-    Value *PtrOp = LI->getPointerOperand();
-    Align OldAlign = LI->getAlign();
-    Align NewAlign = Fn(PtrOp, OldAlign, DL.getPrefTypeAlign(LI->getType()));
-    if (NewAlign > OldAlign) {
-      LI->setAlignment(NewAlign);
-      return true;
-    }
-  } else if (auto *SI = dyn_cast<StoreInst>(I)) {
-    Value *PtrOp = SI->getPointerOperand();
-    Value *ValOp = SI->getValueOperand();
-    Align OldAlign = SI->getAlign();
-    Align NewAlign = Fn(PtrOp, OldAlign, DL.getPrefTypeAlign(ValOp->getType()));
+
+  if (auto *PtrOp = getLoadStorePointerOperand(I)) {
+    Align OldAlign = getLoadStoreAlignment(I);
+    Align PrefAlign = DL.getPrefTypeAlign(getLoadStoreType(I));
+
+    Align NewAlign = Fn(PtrOp, OldAlign, PrefAlign);
     if (NewAlign > OldAlign) {
-      SI->setAlignment(NewAlign);
+      setLoadStoreAlignment(I, NewAlign);
       return true;
     }
   }
-- 
GitLab


From dccebddb3b802c4c1fe287222e454b63f850f012 Mon Sep 17 00:00:00 2001
From: Luke Drummond
Date: Fri, 22 Mar 2024 17:09:54 +0000
Subject: [PATCH 255/329] Finally formalise our defacto line-ending policy

Historically, we've not automatically enforced how git tracks line
endings, but there are many, many commits that "undo" unintended CRLFs
getting into history. `git log --pretty=oneline --grep=CRLF` shows
nearly 100 commits involving reverts of CRLF making its way into the
index and then history. As far as I can tell, there are none the other
way round except for specific cases like `.bat` files or tests for
parsers that need to accept such sequences.

Of note, one of the earliest of those listed in that output is:

```
commit 9795860250734e5c2a879546c534e35d9edd5944
Author: NAKAMURA Takumi
Date:   Thu Feb 3 11:41:27 2011 +0000

    cmake/*: Add svn:eol-style=native and fix CRLF.

    llvm-svn: 124793
```

...which introduced such a defacto policy for subversion.

With old versions of git, it's been a bit of a crap-shoot whether
enforcing storing line endings in the history will upset checkouts on
machines where such line endings are the norm. Indeed many users have
enforced that git checks out the working copy according to a global or
per-user config via core crlf, or core autocrlf.

For ~8 years now[1], however, git has supported the ability to "do as
the Romans do" on checkout, but internally store subsets of text files
with line-endings specified via a system of patterns in the
`.gitattributes` file.

Since we now have this ability, and we've been specifying attributes
for various binary files, I think it makes sense to rid us of all that
work converting things "back", and just let git handle the local
checkout.

Thus the new toplevel policy here is

* text=auto

In simple terms this means "unless otherwise specified, convert all
files considered "text" files to LF in the project history, but check
them out as expected on the local machine."

What is "expected on the local machine" is dependent on configuration
and default.

For those files in the repository that *do* need CRLF endings, I've
adopted a policy of `eol=crlf` which means that git will store them in
history with LF, but regardless of user config, they'll be checked out
in tree with CRLF.
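To make that concrete, the combined effect of the two kinds of rules is
sketched below, using the same patterns the diff adds; the explanatory
comment lines are mine and are purely illustrative:

```
# Store as LF in history; check out according to local platform/config
* text=auto
# Store as LF in history; always check out with CRLF in the work tree
*.bat text eol=crlf
```

so a `.bat` file is normalized to LF in the object database but
materialises with CRLF in every checkout, regardless of the user's
core.autocrlf setting.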
Finally, existing files have been "corrected" in history via
`git add --renormalize .`

End users should *not* need to adjust their local git config or
workflow.

[1]: git 2.10 was released with fixed support for fine-grained
line-ending tracking that respects user-config *and* repo policy. This
can be considered the point at which git will respect both the user's
local working tree preference *and* the history as specified by the
maintainers. See
https://github.com/git/git/blob/master/Documentation/RelNotes/2.10.0.txt#L248
for the release note.
---
 .gitattributes                                          | 7 +++++++
 clang-tools-extra/clangd/test/.gitattributes            | 3 +++
 clang/test/.gitattributes                               | 4 ++++
 llvm/docs/TestingGuide.rst                              | 6 ++++++
 llvm/test/FileCheck/.gitattributes                      | 1 +
 llvm/test/tools/llvm-ar/Inputs/.gitattributes           | 1 +
 llvm/utils/lit/tests/Inputs/shtest-shell/.gitattributes | 1 +
 7 files changed, 23 insertions(+)
 create mode 100644 clang-tools-extra/clangd/test/.gitattributes
 create mode 100644 clang/test/.gitattributes
 create mode 100644 llvm/test/FileCheck/.gitattributes
 create mode 100644 llvm/test/tools/llvm-ar/Inputs/.gitattributes
 create mode 100644 llvm/utils/lit/tests/Inputs/shtest-shell/.gitattributes

diff --git a/.gitattributes b/.gitattributes
index 6b281f33f737..aced01d485c1 100644
--- a/.gitattributes
+++ b/.gitattributes
@@ -1,3 +1,10 @@
+# Checkout as native, commit as LF except in specific circumstances
+* text=auto
+*.bat text eol=crlf
+*.rc text eol=crlf
+*.sln text eol=crlf
+*.natvis text eol=crlf
+
 libcxx/src/**/*.cpp merge=libcxx-reformat
 libcxx/include/**/*.h merge=libcxx-reformat

diff --git a/clang-tools-extra/clangd/test/.gitattributes b/clang-tools-extra/clangd/test/.gitattributes
new file mode 100644
index 000000000000..20971adc2b5d
--- /dev/null
+++ b/clang-tools-extra/clangd/test/.gitattributes
@@ -0,0 +1,3 @@
+input-mirror.test text eol=crlf
+too_large.test text eol=crlf
+protocol.test text eol=crlf
diff --git a/clang/test/.gitattributes b/clang/test/.gitattributes
new file mode 100644
index 000000000000..160fc6cf5617
--- /dev/null
+++ b/clang/test/.gitattributes
@@ -0,0 +1,4 @@
+FixIt/fixit-newline-style.c text eol=crlf
+Frontend/system-header-line-directive-ms-lineendings.c text eol=crlf
+Frontend/rewrite-includes-mixed-eol-crlf.* text eol=crlf
+Frontend/rewrite-includes-mixed-eol-lf.h text eol=lf
diff --git a/llvm/docs/TestingGuide.rst b/llvm/docs/TestingGuide.rst
index 08617933519f..344a295226f6 100644
--- a/llvm/docs/TestingGuide.rst
+++ b/llvm/docs/TestingGuide.rst
@@ -360,6 +360,12 @@ Best practices for regression tests
 - Try to give values (including variables, blocks and functions) meaningful
   names, and avoid retaining complex names generated by the optimization
   pipeline (such as ``%foo.0.0.0.0.0.0``).
+- If your tests depend on specific input file encodings, beware of line-ending
+  issues across different platforms and in the project's history. Before you
+  commit tests that depend on explicit encodings, consider adding filetype or
+  specific line-ending annotations to a `.gitattributes
+  <https://git-scm.com/docs/gitattributes#_effects>`_ file in the appropriate
+  directory in the repository.
Extra files ----------- diff --git a/llvm/test/FileCheck/.gitattributes b/llvm/test/FileCheck/.gitattributes new file mode 100644 index 000000000000..ba27d7fad76d --- /dev/null +++ b/llvm/test/FileCheck/.gitattributes @@ -0,0 +1 @@ +dos-style-eol.txt text eol=crlf diff --git a/llvm/test/tools/llvm-ar/Inputs/.gitattributes b/llvm/test/tools/llvm-ar/Inputs/.gitattributes new file mode 100644 index 000000000000..6c8a26285daf --- /dev/null +++ b/llvm/test/tools/llvm-ar/Inputs/.gitattributes @@ -0,0 +1 @@ +mri-crlf.mri text eol=crlf diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/.gitattributes b/llvm/utils/lit/tests/Inputs/shtest-shell/.gitattributes new file mode 100644 index 000000000000..2df17345df5b --- /dev/null +++ b/llvm/utils/lit/tests/Inputs/shtest-shell/.gitattributes @@ -0,0 +1 @@ +*.dos text eol=crlf -- GitLab From 9d98acb196a40fee5229afeb08f95fd36d41c10a Mon Sep 17 00:00:00 2001 From: Luke Drummond Date: Thu, 17 Oct 2024 14:49:26 +0100 Subject: [PATCH 256/329] Renormalize line endings whitespace only after dccebddb3b80 Line ending policies were changed in the parent, dccebddb3b80. To make it easier to resolve downstream merge conflicts after line-ending policies are adjusted this is a separate whitespace-only commit. If you have merge conflicts as a result, you can simply `git add --renormalize -u && git merge --continue` or `git add --renormalize -u && git rebase --continue` - depending on your workflow. --- .../clangd/test/input-mirror.test | 34 +- clang-tools-extra/clangd/test/protocol.test | 226 +- clang-tools-extra/clangd/test/too_large.test | 14 +- clang/test/AST/HLSL/StructuredBuffer-AST.hlsl | 128 +- clang/test/C/C2y/n3262.c | 40 +- clang/test/C/C2y/n3274.c | 36 +- .../StructuredBuffer-annotations.hlsl | 44 +- .../StructuredBuffer-constructor.hlsl | 38 +- .../StructuredBuffer-elementtype.hlsl | 140 +- .../builtins/StructuredBuffer-subscript.hlsl | 34 +- clang/test/CodeGenHLSL/builtins/atan2.hlsl | 118 +- clang/test/CodeGenHLSL/builtins/cross.hlsl | 74 +- clang/test/CodeGenHLSL/builtins/length.hlsl | 146 +- .../test/CodeGenHLSL/builtins/normalize.hlsl | 170 +- clang/test/CodeGenHLSL/builtins/step.hlsl | 168 +- clang/test/Driver/flang/msvc-link.f90 | 10 +- clang/test/FixIt/fixit-newline-style.c | 22 +- .../rewrite-includes-mixed-eol-crlf.c | 16 +- .../rewrite-includes-mixed-eol-crlf.h | 22 +- ...tem-header-line-directive-ms-lineendings.c | 42 +- clang/test/ParserHLSL/bitfields.hlsl | 60 +- .../hlsl_annotations_on_struct_members.hlsl | 42 +- .../ParserHLSL/hlsl_contained_type_attr.hlsl | 50 +- .../hlsl_contained_type_attr_error.hlsl | 56 +- clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl | 44 +- .../ParserHLSL/hlsl_is_rov_attr_error.hlsl | 40 +- .../test/ParserHLSL/hlsl_raw_buffer_attr.hlsl | 44 +- .../hlsl_raw_buffer_attr_error.hlsl | 34 +- .../ParserHLSL/hlsl_resource_class_attr.hlsl | 74 +- .../hlsl_resource_class_attr_error.hlsl | 44 +- .../hlsl_resource_handle_attrs.hlsl | 42 +- clang/test/Sema/aarch64-sve-vector-trig-ops.c | 130 +- clang/test/Sema/riscv-rvv-vector-trig-ops.c | 134 +- .../avail-diag-default-compute.hlsl | 238 +- .../Availability/avail-diag-default-lib.hlsl | 360 +- .../avail-diag-relaxed-compute.hlsl | 238 +- .../Availability/avail-diag-relaxed-lib.hlsl | 324 +- .../avail-diag-strict-compute.hlsl | 256 +- .../Availability/avail-diag-strict-lib.hlsl | 384 +- .../avail-lib-multiple-stages.hlsl | 114 +- .../SemaHLSL/BuiltIns/StructuredBuffers.hlsl | 38 +- .../test/SemaHLSL/BuiltIns/cross-errors.hlsl | 86 +- .../BuiltIns/half-float-only-errors2.hlsl | 26 
+- .../test/SemaHLSL/BuiltIns/length-errors.hlsl | 64 +- .../SemaHLSL/BuiltIns/normalize-errors.hlsl | 62 +- clang/test/SemaHLSL/BuiltIns/step-errors.hlsl | 62 +- .../Types/Traits/IsIntangibleType.hlsl | 162 +- .../Types/Traits/IsIntangibleTypeErrors.hlsl | 24 +- .../resource_binding_attr_error_basic.hlsl | 84 +- .../resource_binding_attr_error_other.hlsl | 18 +- .../resource_binding_attr_error_resource.hlsl | 98 +- ...urce_binding_attr_error_silence_diags.hlsl | 54 +- .../resource_binding_attr_error_space.hlsl | 124 +- .../resource_binding_attr_error_udt.hlsl | 270 +- clang/tools/scan-build/bin/scan-build.bat | 2 +- .../tools/scan-build/libexec/c++-analyzer.bat | 2 +- .../tools/scan-build/libexec/ccc-analyzer.bat | 2 +- clang/utils/ClangVisualizers/clang.natvis | 2178 ++--- .../test/Driver/msvc-dependent-lib-flags.f90 | 72 +- .../ir-interpreter-phi-nodes/Makefile | 8 +- .../postmortem/minidump/fizzbuzz.syms | 4 +- .../target-new-solib-notifications/Makefile | 46 +- .../target-new-solib-notifications/a.cpp | 6 +- .../target-new-solib-notifications/b.cpp | 2 +- .../target-new-solib-notifications/c.cpp | 2 +- .../target-new-solib-notifications/d.cpp | 2 +- .../target-new-solib-notifications/main.cpp | 32 +- .../unwind/zeroth_frame/Makefile | 6 +- .../unwind/zeroth_frame/TestZerothFrame.py | 176 +- lldb/test/API/python_api/debugger/Makefile | 6 +- lldb/test/Shell/BuildScript/modes.test | 70 +- lldb/test/Shell/BuildScript/script-args.test | 64 +- .../Shell/BuildScript/toolchain-clang-cl.test | 98 +- .../Windows/Sigsegv/Inputs/sigsegv.cpp | 80 +- .../NativePDB/Inputs/inline_sites.s | 1244 +-- .../Inputs/inline_sites_live.lldbinit | 14 +- .../Inputs/local-variables-registers.lldbinit | 70 +- .../NativePDB/Inputs/lookup-by-types.lldbinit | 6 +- .../subfield_register_simple_type.lldbinit | 4 +- .../NativePDB/function-types-classes.cpp | 12 +- .../NativePDB/inline_sites_live.cpp | 68 +- .../SymbolFile/NativePDB/lookup-by-types.cpp | 92 +- lldb/unittests/Breakpoint/CMakeLists.txt | 20 +- llvm/benchmarks/FormatVariadicBM.cpp | 126 +- .../GetIntrinsicForClangBuiltin.cpp | 100 +- .../GetIntrinsicInfoTableEntriesBM.cpp | 60 +- llvm/docs/_static/LoopOptWG_invite.ics | 160 +- llvm/lib/Support/rpmalloc/CACHE.md | 38 +- llvm/lib/Support/rpmalloc/README.md | 440 +- llvm/lib/Support/rpmalloc/malloc.c | 1448 +-- llvm/lib/Support/rpmalloc/rpmalloc.c | 7984 ++++++++--------- llvm/lib/Support/rpmalloc/rpmalloc.h | 856 +- llvm/lib/Support/rpmalloc/rpnew.h | 226 +- .../Target/DirectX/DXILFinalizeLinkage.cpp | 130 +- .../DirectX/DirectXTargetTransformInfo.cpp | 76 +- llvm/test/CodeGen/DirectX/atan2.ll | 174 +- llvm/test/CodeGen/DirectX/atan2_error.ll | 22 +- llvm/test/CodeGen/DirectX/cross.ll | 112 +- llvm/test/CodeGen/DirectX/finalize_linkage.ll | 128 +- llvm/test/CodeGen/DirectX/normalize.ll | 224 +- llvm/test/CodeGen/DirectX/normalize_error.ll | 20 +- llvm/test/CodeGen/DirectX/step.ll | 156 +- .../CodeGen/SPIRV/hlsl-intrinsics/atan2.ll | 98 +- .../CodeGen/SPIRV/hlsl-intrinsics/cross.ll | 66 +- .../CodeGen/SPIRV/hlsl-intrinsics/length.ll | 58 +- .../SPIRV/hlsl-intrinsics/normalize.ll | 62 +- .../CodeGen/SPIRV/hlsl-intrinsics/step.ll | 66 +- .../Demangle/ms-placeholder-return-type.test | 36 +- llvm/test/FileCheck/dos-style-eol.txt | 20 +- llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri | 8 +- .../tools/llvm-cvtres/Inputs/languages.rc | 72 +- .../tools/llvm-cvtres/Inputs/test_resource.rc | 98 +- .../tools/llvm-rc/Inputs/dialog-with-menu.rc | 32 +- .../COFF/Inputs/resources/test_resource.rc | 88 +- 
llvm/unittests/Support/ModRefTest.cpp | 54 +- llvm/utils/LLVMVisualizers/llvm.natvis | 816 +- .../lit/tests/Inputs/shtest-shell/diff-in.dos | 6 +- llvm/utils/release/build_llvm_release.bat | 1030 +-- openmp/runtime/doc/doxygen/config | 3644 ++++---- pstl/CREDITS.txt | 42 +- 120 files changed, 14283 insertions(+), 14283 deletions(-) diff --git a/clang-tools-extra/clangd/test/input-mirror.test b/clang-tools-extra/clangd/test/input-mirror.test index a34a4a08cf60..bce3f9923a3b 100644 --- a/clang-tools-extra/clangd/test/input-mirror.test +++ b/clang-tools-extra/clangd/test/input-mirror.test @@ -1,17 +1,17 @@ -# RUN: clangd -pretty -sync -input-mirror-file %t < %s -# Note that we have to use '-b' as -input-mirror-file does not have a newline at the end of file. -# RUN: diff -b %t %s -# It is absolutely vital that this file has CRLF line endings. -# -Content-Length: 125 - -{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}} -Content-Length: 172 - -{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"int main() {\nint a;\na;\n}\n"}}} -Content-Length: 44 - -{"jsonrpc":"2.0","id":3,"method":"shutdown"} -Content-Length: 33 - -{"jsonrpc":"2.0","method":"exit"} +# RUN: clangd -pretty -sync -input-mirror-file %t < %s +# Note that we have to use '-b' as -input-mirror-file does not have a newline at the end of file. +# RUN: diff -b %t %s +# It is absolutely vital that this file has CRLF line endings. +# +Content-Length: 125 + +{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}} +Content-Length: 172 + +{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"int main() {\nint a;\na;\n}\n"}}} +Content-Length: 44 + +{"jsonrpc":"2.0","id":3,"method":"shutdown"} +Content-Length: 33 + +{"jsonrpc":"2.0","method":"exit"} diff --git a/clang-tools-extra/clangd/test/protocol.test b/clang-tools-extra/clangd/test/protocol.test index 5e852d1d9dee..64ccfaef1891 100644 --- a/clang-tools-extra/clangd/test/protocol.test +++ b/clang-tools-extra/clangd/test/protocol.test @@ -1,113 +1,113 @@ -# RUN: not clangd -pretty -sync -enable-test-uri-scheme < %s | FileCheck -strict-whitespace %s -# RUN: not clangd -pretty -sync -enable-test-uri-scheme < %s 2>&1 | FileCheck -check-prefix=STDERR %s -# vim: fileformat=dos -# It is absolutely vital that this file has CRLF line endings. -# -# Note that we invert the test because we intent to let clangd exit prematurely. 
-# -# Test protocol parsing -Content-Length: 125 -Content-Type: application/vscode-jsonrpc; charset-utf-8 - -{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}} -# Test message with Content-Type after Content-Length -# -# CHECK: "jsonrpc": "2.0", -# CHECK-NEXT: "result": { -# CHECK: } -Content-Length: 246 - -{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"struct fake { int a, bb, ccc; int f(int i, const float f) const; };\nint main() {\n fake f;\n f.\n}\n"}}} - -Content-Length: 104 - -{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"textDocument":{"uri":"test:///main.cpp"}}} - -Content-Type: application/vscode-jsonrpc; charset-utf-8 -Content-Length: 146 - -{"jsonrpc":"2.0","id":1,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message with Content-Type before Content-Length -# -# CHECK: "id": 1, -# CHECK-NEXT: "jsonrpc": "2.0", -# CHECK-NEXT: "result": { -# CHECK-NEXT: "isIncomplete": false, -# CHECK-NEXT: "items": [ -# CHECK: "filterText": "a", -# CHECK-NEXT: "insertText": "a", -# CHECK-NEXT: "insertTextFormat": 1, -# CHECK-NEXT: "kind": 5, -# CHECK-NEXT: "label": " a", -# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, -# CHECK-NEXT: "sortText": "{{.*}}" -# CHECK: ] -# CHECK-NEXT: } - -X-Test: Testing -Content-Type: application/vscode-jsonrpc; charset-utf-8 -Content-Length: 146 -Content-Type: application/vscode-jsonrpc; charset-utf-8 -X-Testing: Test - -{"jsonrpc":"2.0","id":2,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} - -Content-Type: application/vscode-jsonrpc; charset-utf-8 -Content-Length: 10 -Content-Length: 146 - -{"jsonrpc":"2.0","id":3,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message with duplicate Content-Length headers -# -# CHECK: "id": 3, -# CHECK-NEXT: "jsonrpc": "2.0", -# CHECK-NEXT: "result": { -# CHECK-NEXT: "isIncomplete": false, -# CHECK-NEXT: "items": [ -# CHECK: "filterText": "a", -# CHECK-NEXT: "insertText": "a", -# CHECK-NEXT: "insertTextFormat": 1, -# CHECK-NEXT: "kind": 5, -# CHECK-NEXT: "label": " a", -# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, -# CHECK-NEXT: "sortText": "{{.*}}" -# CHECK: ] -# CHECK-NEXT: } -# STDERR: Warning: Duplicate Content-Length header received. The previous value for this message (10) was ignored. 
- -Content-Type: application/vscode-jsonrpc; charset-utf-8 -Content-Length: 10 - -{"jsonrpc":"2.0","id":4,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message with malformed Content-Length -# -# STDERR: JSON parse error -# Ensure we recover by sending another (valid) message - -Content-Length: 146 - -{"jsonrpc":"2.0","id":5,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message with Content-Type before Content-Length -# -# CHECK: "id": 5, -# CHECK-NEXT: "jsonrpc": "2.0", -# CHECK-NEXT: "result": { -# CHECK-NEXT: "isIncomplete": false, -# CHECK-NEXT: "items": [ -# CHECK: "filterText": "a", -# CHECK-NEXT: "insertText": "a", -# CHECK-NEXT: "insertTextFormat": 1, -# CHECK-NEXT: "kind": 5, -# CHECK-NEXT: "label": " a", -# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, -# CHECK-NEXT: "sortText": "{{.*}}" -# CHECK: ] -# CHECK-NEXT: } -Content-Length: 1024 - -{"jsonrpc":"2.0","id":5,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} -# Test message which reads beyond the end of the stream. -# -# Ensure this is the last test in the file! -# STDERR: Input was aborted. Read only {{[0-9]+}} bytes of expected {{[0-9]+}}. - +# RUN: not clangd -pretty -sync -enable-test-uri-scheme < %s | FileCheck -strict-whitespace %s +# RUN: not clangd -pretty -sync -enable-test-uri-scheme < %s 2>&1 | FileCheck -check-prefix=STDERR %s +# vim: fileformat=dos +# It is absolutely vital that this file has CRLF line endings. +# +# Note that we invert the test because we intent to let clangd exit prematurely. +# +# Test protocol parsing +Content-Length: 125 +Content-Type: application/vscode-jsonrpc; charset-utf-8 + +{"jsonrpc":"2.0","id":0,"method":"initialize","params":{"processId":123,"rootPath":"clangd","capabilities":{},"trace":"off"}} +# Test message with Content-Type after Content-Length +# +# CHECK: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK: } +Content-Length: 246 + +{"jsonrpc":"2.0","method":"textDocument/didOpen","params":{"textDocument":{"uri":"test:///main.cpp","languageId":"cpp","version":1,"text":"struct fake { int a, bb, ccc; int f(int i, const float f) const; };\nint main() {\n fake f;\n f.\n}\n"}}} + +Content-Length: 104 + +{"jsonrpc":"2.0","method":"textDocument/didChange","params":{"textDocument":{"uri":"test:///main.cpp"}}} + +Content-Type: application/vscode-jsonrpc; charset-utf-8 +Content-Length: 146 + +{"jsonrpc":"2.0","id":1,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message with Content-Type before Content-Length +# +# CHECK: "id": 1, +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK-NEXT: "isIncomplete": false, +# CHECK-NEXT: "items": [ +# CHECK: "filterText": "a", +# CHECK-NEXT: "insertText": "a", +# CHECK-NEXT: "insertTextFormat": 1, +# CHECK-NEXT: "kind": 5, +# CHECK-NEXT: "label": " a", +# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, +# CHECK-NEXT: "sortText": "{{.*}}" +# CHECK: ] +# CHECK-NEXT: } + +X-Test: Testing +Content-Type: application/vscode-jsonrpc; charset-utf-8 +Content-Length: 146 +Content-Type: application/vscode-jsonrpc; charset-utf-8 +X-Testing: Test + +{"jsonrpc":"2.0","id":2,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} + +Content-Type: 
application/vscode-jsonrpc; charset-utf-8 +Content-Length: 10 +Content-Length: 146 + +{"jsonrpc":"2.0","id":3,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message with duplicate Content-Length headers +# +# CHECK: "id": 3, +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK-NEXT: "isIncomplete": false, +# CHECK-NEXT: "items": [ +# CHECK: "filterText": "a", +# CHECK-NEXT: "insertText": "a", +# CHECK-NEXT: "insertTextFormat": 1, +# CHECK-NEXT: "kind": 5, +# CHECK-NEXT: "label": " a", +# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, +# CHECK-NEXT: "sortText": "{{.*}}" +# CHECK: ] +# CHECK-NEXT: } +# STDERR: Warning: Duplicate Content-Length header received. The previous value for this message (10) was ignored. + +Content-Type: application/vscode-jsonrpc; charset-utf-8 +Content-Length: 10 + +{"jsonrpc":"2.0","id":4,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message with malformed Content-Length +# +# STDERR: JSON parse error +# Ensure we recover by sending another (valid) message + +Content-Length: 146 + +{"jsonrpc":"2.0","id":5,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message with Content-Type before Content-Length +# +# CHECK: "id": 5, +# CHECK-NEXT: "jsonrpc": "2.0", +# CHECK-NEXT: "result": { +# CHECK-NEXT: "isIncomplete": false, +# CHECK-NEXT: "items": [ +# CHECK: "filterText": "a", +# CHECK-NEXT: "insertText": "a", +# CHECK-NEXT: "insertTextFormat": 1, +# CHECK-NEXT: "kind": 5, +# CHECK-NEXT: "label": " a", +# CHECK-NEXT: "score": {{[0-9]+.[0-9]+}}, +# CHECK-NEXT: "sortText": "{{.*}}" +# CHECK: ] +# CHECK-NEXT: } +Content-Length: 1024 + +{"jsonrpc":"2.0","id":5,"method":"textDocument/completion","params":{"textDocument":{"uri":"test:/main.cpp"},"position":{"line":3,"character":5}}} +# Test message which reads beyond the end of the stream. +# +# Ensure this is the last test in the file! +# STDERR: Input was aborted. Read only {{[0-9]+}} bytes of expected {{[0-9]+}}. + diff --git a/clang-tools-extra/clangd/test/too_large.test b/clang-tools-extra/clangd/test/too_large.test index 7df981e79420..6986bd5e258e 100644 --- a/clang-tools-extra/clangd/test/too_large.test +++ b/clang-tools-extra/clangd/test/too_large.test @@ -1,7 +1,7 @@ -# RUN: not clangd -sync < %s 2>&1 | FileCheck -check-prefix=STDERR %s -# vim: fileformat=dos -# It is absolutely vital that this file has CRLF line endings. -# -Content-Length: 2147483648 - -# STDERR: Refusing to read message +# RUN: not clangd -sync < %s 2>&1 | FileCheck -check-prefix=STDERR %s +# vim: fileformat=dos +# It is absolutely vital that this file has CRLF line endings. +# +Content-Length: 2147483648 + +# STDERR: Refusing to read message diff --git a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl index 030fcfc31691..9c1630f6f570 100644 --- a/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl +++ b/clang/test/AST/HLSL/StructuredBuffer-AST.hlsl @@ -1,64 +1,64 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s - - -// This test tests two different AST generations. 
The "EMPTY" test mode verifies -// the AST generated by forward declaration of the HLSL types which happens on -// initializing the HLSL external AST with an AST Context. - -// The non-empty mode has a use that requires the StructuredBuffer type be complete, -// which results in the AST being populated by the external AST source. That -// case covers the full implementation of the template declaration and the -// instantiated specialization. - -// EMPTY: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer -// EMPTY-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type -// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer -// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final - -// There should be no more occurrances of StructuredBuffer -// EMPTY-NOT: StructuredBuffer - -#ifndef EMPTY - -StructuredBuffer Buffer; - -#endif - -// CHECK: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer -// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type -// CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer definition - -// CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] -// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer - -// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' -// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' -// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} -// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const StructuredBuffer' lvalue implicit this -// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline - -// CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &(unsigned int)' -// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' -// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> -// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} -// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'StructuredBuffer' lvalue implicit this -// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline - -// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <> class StructuredBuffer definition - -// CHECK: TemplateArgument type 'float' -// CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float' -// CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final -// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump -DEMPTY %s | FileCheck -check-prefix=EMPTY %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library -x hlsl -ast-dump %s | FileCheck %s + + +// This test tests two different AST generations. 
The "EMPTY" test mode verifies +// the AST generated by forward declaration of the HLSL types which happens on +// initializing the HLSL external AST with an AST Context. + +// The non-empty mode has a use that requires the StructuredBuffer type be complete, +// which results in the AST being populated by the external AST source. That +// case covers the full implementation of the template declaration and the +// instantiated specialization. + +// EMPTY: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer +// EMPTY-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type +// EMPTY-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer +// EMPTY-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final + +// There should be no more occurrances of StructuredBuffer +// EMPTY-NOT: StructuredBuffer + +#ifndef EMPTY + +StructuredBuffer Buffer; + +#endif + +// CHECK: ClassTemplateDecl 0x{{[0-9A-Fa-f]+}} <> implicit StructuredBuffer +// CHECK-NEXT: TemplateTypeParmDecl 0x{{[0-9A-Fa-f]+}} <> class depth 0 index 0 element_type +// CHECK-NEXT: CXXRecordDecl 0x{{[0-9A-Fa-f]+}} <> implicit class StructuredBuffer definition + +// CHECK: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(element_type)]] +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer + +// CHECK: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &const (unsigned int) const' +// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' +// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'const StructuredBuffer' lvalue implicit this +// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline + +// CHECK-NEXT: CXXMethodDecl 0x{{[0-9A-Fa-f]+}} <> operator[] 'element_type &(unsigned int)' +// CHECK-NEXT: ParmVarDecl 0x{{[0-9A-Fa-f]+}} <> Idx 'unsigned int' +// CHECK-NEXT: CompoundStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: ReturnStmt 0x{{[0-9A-Fa-f]+}} <> +// CHECK-NEXT: MemberExpr 0x{{[0-9A-Fa-f]+}} <> 'element_type' lvalue .e 0x{{[0-9A-Fa-f]+}} +// CHECK-NEXT: CXXThisExpr 0x{{[0-9A-Fa-f]+}} <> 'StructuredBuffer' lvalue implicit this +// CHECK-NEXT: AlwaysInlineAttr 0x{{[0-9A-Fa-f]+}} <> Implicit always_inline + +// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9A-Fa-f]+}} <> class StructuredBuffer definition + +// CHECK: TemplateArgument type 'float' +// CHECK-NEXT: BuiltinType 0x{{[0-9A-Fa-f]+}} 'float' +// CHECK-NEXT: FinalAttr 0x{{[0-9A-Fa-f]+}} <> Implicit final +// CHECK-NEXT: FieldDecl 0x{{[0-9A-Fa-f]+}} <> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] +// CHECK-NEXT: HLSLResourceAttr 0x{{[0-9A-Fa-f]+}} <> Implicit TypedBuffer diff --git a/clang/test/C/C2y/n3262.c b/clang/test/C/C2y/n3262.c index 3ff2062d88dd..864ab351bdbc 100644 --- a/clang/test/C/C2y/n3262.c +++ b/clang/test/C/C2y/n3262.c @@ -1,20 +1,20 @@ -// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s -// expected-no-diagnostics - -/* WG14 N3262: Yes - * Usability of a byte-wise copy of va_list - * - * NB: Clang 
explicitly documents this as being undefined behavior. A - * diagnostic is produced for some targets but not for others for assignment or - * initialization, but no diagnostic is possible to produce for use with memcpy - * in the general case, nor with a manual bytewise copy via a for loop. - * - * Therefore, nothing is tested in this file; it serves as a reminder that we - * validated our documentation against the paper. See - * clang/docs/LanguageExtensions.rst for more details. - * - * FIXME: it would be nice to add ubsan support for recognizing when an invalid - * copy is made and diagnosing on copy (or on use of the copied va_list). - */ - -int main() {} +// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s +// expected-no-diagnostics + +/* WG14 N3262: Yes + * Usability of a byte-wise copy of va_list + * + * NB: Clang explicitly documents this as being undefined behavior. A + * diagnostic is produced for some targets but not for others for assignment or + * initialization, but no diagnostic is possible to produce for use with memcpy + * in the general case, nor with a manual bytewise copy via a for loop. + * + * Therefore, nothing is tested in this file; it serves as a reminder that we + * validated our documentation against the paper. See + * clang/docs/LanguageExtensions.rst for more details. + * + * FIXME: it would be nice to add ubsan support for recognizing when an invalid + * copy is made and diagnosing on copy (or on use of the copied va_list). + */ + +int main() {} diff --git a/clang/test/C/C2y/n3274.c b/clang/test/C/C2y/n3274.c index ccdb89f4069d..6bf8d72d0f33 100644 --- a/clang/test/C/C2y/n3274.c +++ b/clang/test/C/C2y/n3274.c @@ -1,18 +1,18 @@ -// RUN: %clang_cc1 -verify -std=c23 -Wall -pedantic %s -// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s - -/* WG14 N3274: Yes - * Remove imaginary types - */ - -// Clang has never supported _Imaginary. -#ifdef __STDC_IEC_559_COMPLEX__ -#error "When did this happen?" -#endif - -_Imaginary float i; // expected-error {{imaginary types are not supported}} - -// _Imaginary is a keyword in older language modes, but doesn't need to be one -// in C2y or later. However, to improve diagnostic behavior, we retain it as a -// keyword in all language modes -- it is not available as an identifier. -static_assert(!__is_identifier(_Imaginary)); +// RUN: %clang_cc1 -verify -std=c23 -Wall -pedantic %s +// RUN: %clang_cc1 -verify -std=c2y -Wall -pedantic %s + +/* WG14 N3274: Yes + * Remove imaginary types + */ + +// Clang has never supported _Imaginary. +#ifdef __STDC_IEC_559_COMPLEX__ +#error "When did this happen?" +#endif + +_Imaginary float i; // expected-error {{imaginary types are not supported}} + +// _Imaginary is a keyword in older language modes, but doesn't need to be one +// in C2y or later. However, to improve diagnostic behavior, we retain it as a +// keyword in all language modes -- it is not available as an identifier. 
+static_assert(!__is_identifier(_Imaginary));
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl
index 4d3d4908c396..81c5837d8f20 100644
--- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-annotations.hlsl
@@ -1,22 +1,22 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
-
-StructuredBuffer<float> Buffer1;
-StructuredBuffer<vector<float, 4> > BufferArray[4];
-
-StructuredBuffer<float> Buffer2 : register(u3);
-StructuredBuffer<vector<float, 4> > BufferArray2[4] : register(u4);
-
-StructuredBuffer<float> Buffer3 : register(u3, space1);
-StructuredBuffer<vector<float, 4> > BufferArray3[4] : register(u4, space1);
-
-[numthreads(1,1,1)]
-void main() {
-}
-
-// CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]}
-// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 false, i32 -1, i32 0}
-// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 false, i32 -1, i32 0}
-// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 false, i32 3, i32 0}
-// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, i1 false, i32 4, i32 0}
-// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, i32 9, i1 false, i32 3, i32 1}
-// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 false, i32 4, i32 1}
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+
+StructuredBuffer<float> Buffer1;
+StructuredBuffer<vector<float, 4> > BufferArray[4];
+
+StructuredBuffer<float> Buffer2 : register(u3);
+StructuredBuffer<vector<float, 4> > BufferArray2[4] : register(u4);
+
+StructuredBuffer<float> Buffer3 : register(u3, space1);
+StructuredBuffer<vector<float, 4> > BufferArray3[4] : register(u4, space1);
+
+[numthreads(1,1,1)]
+void main() {
+}
+
+// CHECK: !hlsl.uavs = !{![[Single:[0-9]+]], ![[Array:[0-9]+]], ![[SingleAllocated:[0-9]+]], ![[ArrayAllocated:[0-9]+]], ![[SingleSpace:[0-9]+]], ![[ArraySpace:[0-9]+]]}
+// CHECK-DAG: ![[Single]] = !{ptr @Buffer1, i32 10, i32 9, i1 false, i32 -1, i32 0}
+// CHECK-DAG: ![[Array]] = !{ptr @BufferArray, i32 10, i32 9, i1 false, i32 -1, i32 0}
+// CHECK-DAG: ![[SingleAllocated]] = !{ptr @Buffer2, i32 10, i32 9, i1 false, i32 3, i32 0}
+// CHECK-DAG: ![[ArrayAllocated]] = !{ptr @BufferArray2, i32 10, i32 9, i1 false, i32 4, i32 0}
+// CHECK-DAG: ![[SingleSpace]] = !{ptr @Buffer3, i32 10, i32 9, i1 false, i32 3, i32 1}
+// CHECK-DAG: ![[ArraySpace]] = !{ptr @BufferArray3, i32 10, i32 9, i1 false, i32 4, i32 1}
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl
index 178332d03e64..f65090410ce6 100644
--- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl
@@ -1,19 +1,19 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
-// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV
-
-// XFAIL: *
-// This is expected to fail because create.handle is no longer invoked
-// from the StructuredBuffer constructor and the replacement has not been
-// implemented yet.
This test should be updated to expect
-// dx.create.handleFromBinding as part of issue #105076.
-
-StructuredBuffer<float> Buf;
-
-// CHECK: define linkonce_odr noundef ptr @"??0?$StructuredBuffer@M@hlsl@@QAA@XZ"
-// CHECK-NEXT: entry:
-
-// CHECK: %[[HandleRes:[0-9]+]] = call ptr @llvm.dx.create.handle(i8 1)
-// CHECK: store ptr %[[HandleRes]], ptr %h, align 4
-
-// CHECK-SPIRV: %[[HandleRes:[0-9]+]] = call ptr @llvm.spv.create.handle(i8 1)
-// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s
+// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV
+
+// XFAIL: *
+// This is expected to fail because create.handle is no longer invoked
+// from the StructuredBuffer constructor and the replacement has not been
+// implemented yet. This test should be updated to expect
+// dx.create.handleFromBinding as part of issue #105076.
+
+StructuredBuffer<float> Buf;
+
+// CHECK: define linkonce_odr noundef ptr @"??0?$StructuredBuffer@M@hlsl@@QAA@XZ"
+// CHECK-NEXT: entry:
+
+// CHECK: %[[HandleRes:[0-9]+]] = call ptr @llvm.dx.create.handle(i8 1)
+// CHECK: store ptr %[[HandleRes]], ptr %h, align 4
+
+// CHECK-SPIRV: %[[HandleRes:[0-9]+]] = call ptr @llvm.spv.create.handle(i8 1)
+// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
index a99c7f98a1af..435a904327a2 100644
--- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-elementtype.hlsl
@@ -1,70 +1,70 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s
-
-// NOTE: The number in the type name and whether the struct is packed or not will most
-// likely change once subscript operators are properly implemented (llvm/llvm-project#95956)
-// and the interim field of the contained type is removed.
-
-// CHECK: %"class.hlsl::StructuredBuffer" = type <{ target("dx.RawBuffer", i16, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.0" = type <{ target("dx.RawBuffer", i16, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.3" = type { target("dx.RawBuffer", i32, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.5" = type { target("dx.RawBuffer", i64, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.6" = type <{ target("dx.RawBuffer", half, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.8" = type { target("dx.RawBuffer", float, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.9" = type { target("dx.RawBuffer", double, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.10" = type { target("dx.RawBuffer", <4 x i16>, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.11" = type { target("dx.RawBuffer", <3 x i32>, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.12" = type { target("dx.RawBuffer", <2 x half>, 1, 0)
-// CHECK: %"class.hlsl::StructuredBuffer.13" = type { target("dx.RawBuffer", <3 x float>, 1, 0)
-
-StructuredBuffer<int16_t> BufI16;
-StructuredBuffer<uint16_t> BufU16;
-StructuredBuffer<int> BufI32;
-StructuredBuffer<uint> BufU32;
-StructuredBuffer<int64_t> BufI64;
-StructuredBuffer<uint64_t> BufU64;
-StructuredBuffer<half> BufF16;
-StructuredBuffer<float> BufF32;
-StructuredBuffer<double> BufF64;
-StructuredBuffer< vector<int16_t, 4> > BufI16x4;
-StructuredBuffer< vector<uint, 3> > BufU32x3;
-StructuredBuffer<half2> BufF16x2;
-StructuredBuffer<float3> BufF32x3;
-// TODO: StructuredBuffer<snorm half> BufSNormF16; -> 11
-// TODO: StructuredBuffer<unorm half> BufUNormF16; -> 12
-// TODO: StructuredBuffer<snorm float> BufSNormF32; -> 13
-// TODO: StructuredBuffer<unorm float> BufUNormF32; -> 14
-// TODO: StructuredBuffer<snorm double> BufSNormF64; -> 15
-// TODO: StructuredBuffer<unorm double> BufUNormF64; -> 16
-
-[numthreads(1,1,1)]
-void main(int GI : SV_GroupIndex) {
- BufI16[GI] = 0;
- BufU16[GI] = 0;
- BufI32[GI] = 0;
- BufU32[GI] = 0;
- BufI64[GI] = 0;
- BufU64[GI] = 0;
- BufF16[GI] = 0;
- BufF32[GI] = 0;
- BufF64[GI] = 0;
- BufI16x4[GI] = 0;
- BufU32x3[GI] = 0;
- BufF16x2[GI] = 0;
- BufF32x3[GI] = 0;
-}
-
-// CHECK: !{{[0-9]+}} = !{ptr @BufI16, i32 10, i32 2,
-// CHECK: !{{[0-9]+}} = !{ptr @BufU16, i32 10, i32 3,
-// CHECK: !{{[0-9]+}} = !{ptr @BufI32, i32 10, i32 4,
-// CHECK: !{{[0-9]+}} = !{ptr @BufU32, i32 10, i32 5,
-// CHECK: !{{[0-9]+}} = !{ptr @BufI64, i32 10, i32 6,
-// CHECK: !{{[0-9]+}} = !{ptr @BufU64, i32 10, i32 7,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF16, i32 10, i32 8,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF32, i32 10, i32 9,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF64, i32 10, i32 10,
-// CHECK: !{{[0-9]+}} = !{ptr @BufI16x4, i32 10, i32 2,
-// CHECK: !{{[0-9]+}} = !{ptr @BufU32x3, i32 10, i32 5,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF16x2, i32 10, i32 8,
-// CHECK: !{{[0-9]+}} = !{ptr @BufF32x3, i32 10, i32 9,
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.2-compute -finclude-default-header -fnative-half-type -emit-llvm -o - %s | FileCheck %s
+
+// NOTE: The number in the type name and whether the struct is packed or not will most
+// likely change once subscript operators are properly implemented (llvm/llvm-project#95956)
+// and the interim field of the contained type is removed.
+
+// CHECK: %"class.hlsl::StructuredBuffer" = type <{ target("dx.RawBuffer", i16, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.0" = type <{ target("dx.RawBuffer", i16, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.2" = type { target("dx.RawBuffer", i32, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.3" = type { target("dx.RawBuffer", i32, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.4" = type { target("dx.RawBuffer", i64, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.5" = type { target("dx.RawBuffer", i64, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.6" = type <{ target("dx.RawBuffer", half, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.8" = type { target("dx.RawBuffer", float, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.9" = type { target("dx.RawBuffer", double, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.10" = type { target("dx.RawBuffer", <4 x i16>, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.11" = type { target("dx.RawBuffer", <3 x i32>, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.12" = type { target("dx.RawBuffer", <2 x half>, 1, 0)
+// CHECK: %"class.hlsl::StructuredBuffer.13" = type { target("dx.RawBuffer", <3 x float>, 1, 0)
+
+StructuredBuffer<int16_t> BufI16;
+StructuredBuffer<uint16_t> BufU16;
+StructuredBuffer<int> BufI32;
+StructuredBuffer<uint> BufU32;
+StructuredBuffer<int64_t> BufI64;
+StructuredBuffer<uint64_t> BufU64;
+StructuredBuffer<half> BufF16;
+StructuredBuffer<float> BufF32;
+StructuredBuffer<double> BufF64;
+StructuredBuffer< vector<int16_t, 4> > BufI16x4;
+StructuredBuffer< vector<uint, 3> > BufU32x3;
+StructuredBuffer<half2> BufF16x2;
+StructuredBuffer<float3> BufF32x3;
+// TODO: StructuredBuffer<snorm half> BufSNormF16; -> 11
+// TODO: StructuredBuffer<unorm half> BufUNormF16; -> 12
+// TODO: StructuredBuffer<snorm float> BufSNormF32; -> 13
+// TODO: StructuredBuffer<unorm float> BufUNormF32; -> 14
+// TODO: StructuredBuffer<snorm double> BufSNormF64; -> 15
+// TODO: StructuredBuffer<unorm double> BufUNormF64; -> 16
+
+[numthreads(1,1,1)]
+void main(int GI : SV_GroupIndex) {
+ BufI16[GI] = 0;
+ BufU16[GI] = 0;
+ BufI32[GI] = 0;
+ BufU32[GI] = 0;
+ BufI64[GI] = 0;
+ BufU64[GI] = 0;
+ BufF16[GI] = 0;
+ BufF32[GI] = 0;
+ BufF64[GI] = 0;
+ BufI16x4[GI] = 0;
+ BufU32x3[GI] = 0;
+ BufF16x2[GI] = 0;
+ BufF32x3[GI] = 0;
+}
+
+// CHECK: !{{[0-9]+}} = !{ptr @BufI16, i32 10, i32 2,
+// CHECK: !{{[0-9]+}} = !{ptr @BufU16, i32 10, i32 3,
+// CHECK: !{{[0-9]+}} = !{ptr @BufI32, i32 10, i32 4,
+// CHECK: !{{[0-9]+}} = !{ptr @BufU32, i32 10, i32 5,
+// CHECK: !{{[0-9]+}} = !{ptr @BufI64, i32 10, i32 6,
+// CHECK: !{{[0-9]+}} = !{ptr @BufU64, i32 10, i32 7,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF16, i32 10, i32 8,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF32, i32 10, i32 9,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF64, i32 10, i32 10,
+// CHECK: !{{[0-9]+}} = !{ptr @BufI16x4, i32 10, i32 2,
+// CHECK: !{{[0-9]+}} = !{ptr @BufU32x3, i32 10, i32 5,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF16x2, i32 10, i32 8,
+// CHECK: !{{[0-9]+}} = !{ptr @BufF32x3, i32 10, i32 9,
diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
index 155749ec4f94..89bde9236288 100644
--- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-subscript.hlsl
@@ -1,17 +1,17 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s
-
-StructuredBuffer<int> In;
-StructuredBuffer<int> Out;
-
-[numthreads(1,1,1)]
-void main(unsigned GI : SV_GroupIndex) {
- Out[GI] = In[GI];
-}
-
-// Even at -O0 the subscript operators get inlined.
The -O0 IR is a bit messy
-// and confusing to follow so the match here is pretty weak.
-
-// CHECK: define void @main()
-// Verify inlining leaves only calls to "llvm." intrinsics
-// CHECK-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}}
-// CHECK: ret void
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -emit-llvm -o - -O0 %s | FileCheck %s
+
+StructuredBuffer<int> In;
+StructuredBuffer<int> Out;
+
+[numthreads(1,1,1)]
+void main(unsigned GI : SV_GroupIndex) {
+ Out[GI] = In[GI];
+}
+
+// Even at -O0 the subscript operators get inlined. The -O0 IR is a bit messy
+// and confusing to follow so the match here is pretty weak.
+
+// CHECK: define void @main()
+// Verify inlining leaves only calls to "llvm." intrinsics
+// CHECK-NOT: call {{[^@]*}} @{{[^l][^l][^v][^m][^\.]}}
+// CHECK: ret void
diff --git a/clang/test/CodeGenHLSL/builtins/atan2.hlsl b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
index 40796052e608..ada269db2f00 100644
--- a/clang/test/CodeGenHLSL/builtins/atan2.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/atan2.hlsl
@@ -1,59 +1,59 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
-// RUN: --check-prefixes=CHECK,NATIVE_HALF
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
-// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
-
-// CHECK-LABEL: test_atan2_half
-// NATIVE_HALF: call half @llvm.atan2.f16
-// NO_HALF: call float @llvm.atan2.f32
-half test_atan2_half (half p0, half p1) {
- return atan2(p0, p1);
-}
-
-// CHECK-LABEL: test_atan2_half2
-// NATIVE_HALF: call <2 x half> @llvm.atan2.v2f16
-// NO_HALF: call <2 x float> @llvm.atan2.v2f32
-half2 test_atan2_half2 (half2 p0, half2 p1) {
- return atan2(p0, p1);
-}
-
-// CHECK-LABEL: test_atan2_half3
-// NATIVE_HALF: call <3 x half> @llvm.atan2.v3f16
-// NO_HALF: call <3 x float> @llvm.atan2.v3f32
-half3 test_atan2_half3 (half3 p0, half3 p1) {
- return atan2(p0, p1);
-}
-
-// CHECK-LABEL: test_atan2_half4
-// NATIVE_HALF: call <4 x half> @llvm.atan2.v4f16
-// NO_HALF: call <4 x float> @llvm.atan2.v4f32
-half4 test_atan2_half4 (half4 p0, half4 p1) {
- return atan2(p0, p1);
-}
-
-// CHECK-LABEL: test_atan2_float
-// CHECK: call float @llvm.atan2.f32
-float test_atan2_float (float p0, float p1) {
- return atan2(p0, p1);
-}
-
-// CHECK-LABEL: test_atan2_float2
-// CHECK: call <2 x float> @llvm.atan2.v2f32
-float2 test_atan2_float2 (float2 p0, float2 p1) {
- return atan2(p0, p1);
-}
-
-// CHECK-LABEL: test_atan2_float3
-// CHECK: call <3 x float> @llvm.atan2.v3f32
-float3 test_atan2_float3 (float3 p0, float3 p1) {
- return atan2(p0, p1);
-}
-
-// CHECK-LABEL: test_atan2_float4
-// CHECK: call <4 x float> @llvm.atan2.v4f32
-float4 test_atan2_float4 (float4 p0, float4 p1) {
- return atan2(p0, p1);
-}
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN: --check-prefixes=CHECK,NATIVE_HALF
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF
+
+// CHECK-LABEL: test_atan2_half
+// NATIVE_HALF: call half @llvm.atan2.f16
+// NO_HALF: call float @llvm.atan2.f32
+half test_atan2_half (half p0, half p1) {
+ return atan2(p0, p1);
+} + +// CHECK-LABEL: test_atan2_half2 +// NATIVE_HALF: call <2 x half> @llvm.atan2.v2f16 +// NO_HALF: call <2 x float> @llvm.atan2.v2f32 +half2 test_atan2_half2 (half2 p0, half2 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_half3 +// NATIVE_HALF: call <3 x half> @llvm.atan2.v3f16 +// NO_HALF: call <3 x float> @llvm.atan2.v3f32 +half3 test_atan2_half3 (half3 p0, half3 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_half4 +// NATIVE_HALF: call <4 x half> @llvm.atan2.v4f16 +// NO_HALF: call <4 x float> @llvm.atan2.v4f32 +half4 test_atan2_half4 (half4 p0, half4 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_float +// CHECK: call float @llvm.atan2.f32 +float test_atan2_float (float p0, float p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_float2 +// CHECK: call <2 x float> @llvm.atan2.v2f32 +float2 test_atan2_float2 (float2 p0, float2 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_float3 +// CHECK: call <3 x float> @llvm.atan2.v3f32 +float3 test_atan2_float3 (float3 p0, float3 p1) { + return atan2(p0, p1); +} + +// CHECK-LABEL: test_atan2_float4 +// CHECK: call <4 x float> @llvm.atan2.v4f32 +float4 test_atan2_float4 (float4 p0, float4 p1) { + return atan2(p0, p1); +} diff --git a/clang/test/CodeGenHLSL/builtins/cross.hlsl b/clang/test/CodeGenHLSL/builtins/cross.hlsl index 514e57d36b20..eba710c905bf 100644 --- a/clang/test/CodeGenHLSL/builtins/cross.hlsl +++ b/clang/test/CodeGenHLSL/builtins/cross.hlsl @@ -1,37 +1,37 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv - -// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ -// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].cross.v3f16(<3 x half> -// NATIVE_HALF: ret <3 x half> %hlsl.cross -// NO_HALF: define [[FNATTRS]] <3 x float> @ -// NO_HALF: call <3 x float> @llvm.[[TARGET]].cross.v3f32(<3 x float> -// NO_HALF: ret <3 x float> %hlsl.cross -half3 test_cross_half3(half3 p0, half3 p1) -{ - return cross(p0, p1); -} - -// CHECK: define [[FNATTRS]] <3 x float> @ -// CHECK: %hlsl.cross = call <3 x float> @llvm.[[TARGET]].cross.v3f32( -// CHECK: ret <3 x float> %hlsl.cross -float3 test_cross_float3(float3 p0, float3 p1) -{ - return cross(p0, p1); -} +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: 
dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv + +// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ +// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].cross.v3f16(<3 x half> +// NATIVE_HALF: ret <3 x half> %hlsl.cross +// NO_HALF: define [[FNATTRS]] <3 x float> @ +// NO_HALF: call <3 x float> @llvm.[[TARGET]].cross.v3f32(<3 x float> +// NO_HALF: ret <3 x float> %hlsl.cross +half3 test_cross_half3(half3 p0, half3 p1) +{ + return cross(p0, p1); +} + +// CHECK: define [[FNATTRS]] <3 x float> @ +// CHECK: %hlsl.cross = call <3 x float> @llvm.[[TARGET]].cross.v3f32( +// CHECK: ret <3 x float> %hlsl.cross +float3 test_cross_float3(float3 p0, float3 p1) +{ + return cross(p0, p1); +} diff --git a/clang/test/CodeGenHLSL/builtins/length.hlsl b/clang/test/CodeGenHLSL/builtins/length.hlsl index 1c23b0df04df..9b0293c218a5 100644 --- a/clang/test/CodeGenHLSL/builtins/length.hlsl +++ b/clang/test/CodeGenHLSL/builtins/length.hlsl @@ -1,73 +1,73 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF - -// NATIVE_HALF: define noundef half @ -// NATIVE_HALF: call half @llvm.fabs.f16(half -// NO_HALF: call float @llvm.fabs.f32(float -// NATIVE_HALF: ret half -// NO_HALF: ret float -half test_length_half(half p0) -{ - return length(p0); -} -// NATIVE_HALF: define noundef half @ -// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v2f16 -// NO_HALF: %hlsl.length = call float @llvm.dx.length.v2f32( -// NATIVE_HALF: ret half %hlsl.length -// NO_HALF: ret float %hlsl.length -half test_length_half2(half2 p0) -{ - return length(p0); -} -// NATIVE_HALF: define noundef half @ -// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v3f16 -// NO_HALF: %hlsl.length = call float @llvm.dx.length.v3f32( -// NATIVE_HALF: ret half %hlsl.length -// NO_HALF: ret float %hlsl.length -half test_length_half3(half3 p0) -{ - return length(p0); -} -// NATIVE_HALF: define noundef half @ -// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v4f16 -// NO_HALF: %hlsl.length = call float @llvm.dx.length.v4f32( -// NATIVE_HALF: ret half %hlsl.length -// NO_HALF: ret float %hlsl.length -half test_length_half4(half4 p0) -{ - return length(p0); -} - -// CHECK: define noundef float @ -// CHECK: call float @llvm.fabs.f32(float -// CHECK: ret float -float test_length_float(float p0) -{ - return length(p0); -} -// CHECK: define noundef float @ -// CHECK: %hlsl.length = call float @llvm.dx.length.v2f32( -// CHECK: ret float %hlsl.length -float test_length_float2(float2 p0) -{ - return length(p0); -} -// CHECK: define 
noundef float @ -// CHECK: %hlsl.length = call float @llvm.dx.length.v3f32( -// CHECK: ret float %hlsl.length -float test_length_float3(float3 p0) -{ - return length(p0); -} -// CHECK: define noundef float @ -// CHECK: %hlsl.length = call float @llvm.dx.length.v4f32( -// CHECK: ret float %hlsl.length -float test_length_float4(float4 p0) -{ - return length(p0); -} +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF + +// NATIVE_HALF: define noundef half @ +// NATIVE_HALF: call half @llvm.fabs.f16(half +// NO_HALF: call float @llvm.fabs.f32(float +// NATIVE_HALF: ret half +// NO_HALF: ret float +half test_length_half(half p0) +{ + return length(p0); +} +// NATIVE_HALF: define noundef half @ +// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v2f16 +// NO_HALF: %hlsl.length = call float @llvm.dx.length.v2f32( +// NATIVE_HALF: ret half %hlsl.length +// NO_HALF: ret float %hlsl.length +half test_length_half2(half2 p0) +{ + return length(p0); +} +// NATIVE_HALF: define noundef half @ +// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v3f16 +// NO_HALF: %hlsl.length = call float @llvm.dx.length.v3f32( +// NATIVE_HALF: ret half %hlsl.length +// NO_HALF: ret float %hlsl.length +half test_length_half3(half3 p0) +{ + return length(p0); +} +// NATIVE_HALF: define noundef half @ +// NATIVE_HALF: %hlsl.length = call half @llvm.dx.length.v4f16 +// NO_HALF: %hlsl.length = call float @llvm.dx.length.v4f32( +// NATIVE_HALF: ret half %hlsl.length +// NO_HALF: ret float %hlsl.length +half test_length_half4(half4 p0) +{ + return length(p0); +} + +// CHECK: define noundef float @ +// CHECK: call float @llvm.fabs.f32(float +// CHECK: ret float +float test_length_float(float p0) +{ + return length(p0); +} +// CHECK: define noundef float @ +// CHECK: %hlsl.length = call float @llvm.dx.length.v2f32( +// CHECK: ret float %hlsl.length +float test_length_float2(float2 p0) +{ + return length(p0); +} +// CHECK: define noundef float @ +// CHECK: %hlsl.length = call float @llvm.dx.length.v3f32( +// CHECK: ret float %hlsl.length +float test_length_float3(float3 p0) +{ + return length(p0); +} +// CHECK: define noundef float @ +// CHECK: %hlsl.length = call float @llvm.dx.length.v4f32( +// CHECK: ret float %hlsl.length +float test_length_float4(float4 p0) +{ + return length(p0); +} diff --git a/clang/test/CodeGenHLSL/builtins/normalize.hlsl b/clang/test/CodeGenHLSL/builtins/normalize.hlsl index 83ad607c14a6..d14e7c70ce06 100644 --- a/clang/test/CodeGenHLSL/builtins/normalize.hlsl +++ b/clang/test/CodeGenHLSL/builtins/normalize.hlsl @@ -1,85 +1,85 @@ -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ -// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ -// RUN: --check-prefixes=CHECK,NATIVE_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS=noundef -DTARGET=dx -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: 
spirv-unknown-vulkan-compute %s -fnative-half-type \
-// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
-// RUN: --check-prefixes=CHECK,NATIVE_HALF \
-// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
-// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv
-
-// NATIVE_HALF: define [[FNATTRS]] half @
-// NATIVE_HALF: call half @llvm.[[TARGET]].normalize.f16(half
-// NO_HALF: call float @llvm.[[TARGET]].normalize.f32(float
-// NATIVE_HALF: ret half
-// NO_HALF: ret float
-half test_normalize_half(half p0)
-{
- return normalize(p0);
-}
-// NATIVE_HALF: define [[FNATTRS]] <2 x half> @
-// NATIVE_HALF: call <2 x half> @llvm.[[TARGET]].normalize.v2f16(<2 x half>
-// NO_HALF: call <2 x float> @llvm.[[TARGET]].normalize.v2f32(<2 x float>
-// NATIVE_HALF: ret <2 x half> %hlsl.normalize
-// NO_HALF: ret <2 x float> %hlsl.normalize
-half2 test_normalize_half2(half2 p0)
-{
- return normalize(p0);
-}
-// NATIVE_HALF: define [[FNATTRS]] <3 x half> @
-// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].normalize.v3f16(<3 x half>
-// NO_HALF: call <3 x float> @llvm.[[TARGET]].normalize.v3f32(<3 x float>
-// NATIVE_HALF: ret <3 x half> %hlsl.normalize
-// NO_HALF: ret <3 x float> %hlsl.normalize
-half3 test_normalize_half3(half3 p0)
-{
- return normalize(p0);
-}
-// NATIVE_HALF: define [[FNATTRS]] <4 x half> @
-// NATIVE_HALF: call <4 x half> @llvm.[[TARGET]].normalize.v4f16(<4 x half>
-// NO_HALF: call <4 x float> @llvm.[[TARGET]].normalize.v4f32(<4 x float>
-// NATIVE_HALF: ret <4 x half> %hlsl.normalize
-// NO_HALF: ret <4 x float> %hlsl.normalize
-half4 test_normalize_half4(half4 p0)
-{
- return normalize(p0);
-}
-
-// CHECK: define [[FNATTRS]] float @
-// CHECK: call float @llvm.[[TARGET]].normalize.f32(float
-// CHECK: ret float
-float test_normalize_float(float p0)
-{
- return normalize(p0);
-}
-// CHECK: define [[FNATTRS]] <2 x float> @
-// CHECK: %hlsl.normalize = call <2 x float> @llvm.[[TARGET]].normalize.v2f32(<2 x float>
-
-// CHECK: ret <2 x float> %hlsl.normalize
-float2 test_normalize_float2(float2 p0)
-{
- return normalize(p0);
-}
-// CHECK: define [[FNATTRS]] <3 x float> @
-// CHECK: %hlsl.normalize = call <3 x float> @llvm.[[TARGET]].normalize.v3f32(
-// CHECK: ret <3 x float> %hlsl.normalize
-float3 test_normalize_float3(float3 p0)
-{
- return normalize(p0);
-}
-// CHECK: define [[FNATTRS]] <4 x float> @
-// CHECK: %hlsl.normalize = call <4 x float> @llvm.[[TARGET]].normalize.v4f32(
-// CHECK: ret <4 x float> %hlsl.normalize
-float4 test_normalize_float4(float4 p0)
-{
- return normalize(p0);
-}
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \
+// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN: --check-prefixes=CHECK,NATIVE_HALF \
+// RUN: -DFNATTRS=noundef -DTARGET=dx
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
+// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
+// RUN: -DFNATTRS=noundef -DTARGET=dx
+// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \
+// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
+// RUN: --check-prefixes=CHECK,NATIVE_HALF \
+// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv
+// 
RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
+// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \
+// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
+// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv
+
+// NATIVE_HALF: define [[FNATTRS]] half @
+// NATIVE_HALF: call half @llvm.[[TARGET]].normalize.f16(half
+// NO_HALF: call float @llvm.[[TARGET]].normalize.f32(float
+// NATIVE_HALF: ret half
+// NO_HALF: ret float
+half test_normalize_half(half p0)
+{
+ return normalize(p0);
+}
+// NATIVE_HALF: define [[FNATTRS]] <2 x half> @
+// NATIVE_HALF: call <2 x half> @llvm.[[TARGET]].normalize.v2f16(<2 x half>
+// NO_HALF: call <2 x float> @llvm.[[TARGET]].normalize.v2f32(<2 x float>
+// NATIVE_HALF: ret <2 x half> %hlsl.normalize
+// NO_HALF: ret <2 x float> %hlsl.normalize
+half2 test_normalize_half2(half2 p0)
+{
+ return normalize(p0);
+}
+// NATIVE_HALF: define [[FNATTRS]] <3 x half> @
+// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].normalize.v3f16(<3 x half>
+// NO_HALF: call <3 x float> @llvm.[[TARGET]].normalize.v3f32(<3 x float>
+// NATIVE_HALF: ret <3 x half> %hlsl.normalize
+// NO_HALF: ret <3 x float> %hlsl.normalize
+half3 test_normalize_half3(half3 p0)
+{
+ return normalize(p0);
+}
+// NATIVE_HALF: define [[FNATTRS]] <4 x half> @
+// NATIVE_HALF: call <4 x half> @llvm.[[TARGET]].normalize.v4f16(<4 x half>
+// NO_HALF: call <4 x float> @llvm.[[TARGET]].normalize.v4f32(<4 x float>
+// NATIVE_HALF: ret <4 x half> %hlsl.normalize
+// NO_HALF: ret <4 x float> %hlsl.normalize
+half4 test_normalize_half4(half4 p0)
+{
+ return normalize(p0);
+}
+
+// CHECK: define [[FNATTRS]] float @
+// CHECK: call float @llvm.[[TARGET]].normalize.f32(float
+// CHECK: ret float
+float test_normalize_float(float p0)
+{
+ return normalize(p0);
+}
+// CHECK: define [[FNATTRS]] <2 x float> @
+// CHECK: %hlsl.normalize = call <2 x float> @llvm.[[TARGET]].normalize.v2f32(<2 x float>
+
+// CHECK: ret <2 x float> %hlsl.normalize
+float2 test_normalize_float2(float2 p0)
+{
+ return normalize(p0);
+}
+// CHECK: define [[FNATTRS]] <3 x float> @
+// CHECK: %hlsl.normalize = call <3 x float> @llvm.[[TARGET]].normalize.v3f32(
+// CHECK: ret <3 x float> %hlsl.normalize
+float3 test_normalize_float3(float3 p0)
+{
+ return normalize(p0);
+}
+// CHECK: define [[FNATTRS]] <4 x float> @
+// CHECK: %hlsl.normalize = call <4 x float> @llvm.[[TARGET]].normalize.v4f32(
+// CHECK: ret <4 x float> %hlsl.normalize
+float4 test_normalize_float4(float4 p0)
+{
+ return normalize(p0);
+}
diff --git a/clang/test/CodeGenHLSL/builtins/step.hlsl b/clang/test/CodeGenHLSL/builtins/step.hlsl
index 442f4930ca57..8ef52794a3be 100644
--- a/clang/test/CodeGenHLSL/builtins/step.hlsl
+++ b/clang/test/CodeGenHLSL/builtins/step.hlsl
@@ -1,84 +1,84 @@
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \
-// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
-// RUN: --check-prefixes=CHECK,NATIVE_HALF \
-// RUN: -DFNATTRS=noundef -DTARGET=dx
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \
-// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \
-// RUN: -DFNATTRS=noundef -DTARGET=dx
-// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \
-// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \
-// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \
-// RUN: --check-prefixes=CHECK,NATIVE_HALF \
-// RUN:
-DFNATTRS="spir_func noundef" -DTARGET=spv -// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ -// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ -// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ -// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv - -// NATIVE_HALF: define [[FNATTRS]] half @ -// NATIVE_HALF: call half @llvm.[[TARGET]].step.f16(half -// NO_HALF: call float @llvm.[[TARGET]].step.f32(float -// NATIVE_HALF: ret half -// NO_HALF: ret float -half test_step_half(half p0, half p1) -{ - return step(p0, p1); -} -// NATIVE_HALF: define [[FNATTRS]] <2 x half> @ -// NATIVE_HALF: call <2 x half> @llvm.[[TARGET]].step.v2f16(<2 x half> -// NO_HALF: call <2 x float> @llvm.[[TARGET]].step.v2f32(<2 x float> -// NATIVE_HALF: ret <2 x half> %hlsl.step -// NO_HALF: ret <2 x float> %hlsl.step -half2 test_step_half2(half2 p0, half2 p1) -{ - return step(p0, p1); -} -// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ -// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].step.v3f16(<3 x half> -// NO_HALF: call <3 x float> @llvm.[[TARGET]].step.v3f32(<3 x float> -// NATIVE_HALF: ret <3 x half> %hlsl.step -// NO_HALF: ret <3 x float> %hlsl.step -half3 test_step_half3(half3 p0, half3 p1) -{ - return step(p0, p1); -} -// NATIVE_HALF: define [[FNATTRS]] <4 x half> @ -// NATIVE_HALF: call <4 x half> @llvm.[[TARGET]].step.v4f16(<4 x half> -// NO_HALF: call <4 x float> @llvm.[[TARGET]].step.v4f32(<4 x float> -// NATIVE_HALF: ret <4 x half> %hlsl.step -// NO_HALF: ret <4 x float> %hlsl.step -half4 test_step_half4(half4 p0, half4 p1) -{ - return step(p0, p1); -} - -// CHECK: define [[FNATTRS]] float @ -// CHECK: call float @llvm.[[TARGET]].step.f32(float -// CHECK: ret float -float test_step_float(float p0, float p1) -{ - return step(p0, p1); -} -// CHECK: define [[FNATTRS]] <2 x float> @ -// CHECK: %hlsl.step = call <2 x float> @llvm.[[TARGET]].step.v2f32( -// CHECK: ret <2 x float> %hlsl.step -float2 test_step_float2(float2 p0, float2 p1) -{ - return step(p0, p1); -} -// CHECK: define [[FNATTRS]] <3 x float> @ -// CHECK: %hlsl.step = call <3 x float> @llvm.[[TARGET]].step.v3f32( -// CHECK: ret <3 x float> %hlsl.step -float3 test_step_float3(float3 p0, float3 p1) -{ - return step(p0, p1); -} -// CHECK: define [[FNATTRS]] <4 x float> @ -// CHECK: %hlsl.step = call <4 x float> @llvm.[[TARGET]].step.v4f32( -// CHECK: ret <4 x float> %hlsl.step -float4 test_step_float4(float4 p0, float4 p1) -{ - return step(p0, p1); -} +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: dxil-pc-shadermodel6.3-library %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS=noundef -DTARGET=dx +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -fnative-half-type \ +// RUN: -emit-llvm -disable-llvm-passes -o - | FileCheck %s \ +// RUN: --check-prefixes=CHECK,NATIVE_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv +// RUN: %clang_cc1 -finclude-default-header -x hlsl -triple \ +// RUN: spirv-unknown-vulkan-compute %s -emit-llvm -disable-llvm-passes \ +// RUN: -o - | FileCheck %s --check-prefixes=CHECK,NO_HALF \ +// RUN: -DFNATTRS="spir_func noundef" -DTARGET=spv + +// NATIVE_HALF: 
define [[FNATTRS]] half @ +// NATIVE_HALF: call half @llvm.[[TARGET]].step.f16(half +// NO_HALF: call float @llvm.[[TARGET]].step.f32(float +// NATIVE_HALF: ret half +// NO_HALF: ret float +half test_step_half(half p0, half p1) +{ + return step(p0, p1); +} +// NATIVE_HALF: define [[FNATTRS]] <2 x half> @ +// NATIVE_HALF: call <2 x half> @llvm.[[TARGET]].step.v2f16(<2 x half> +// NO_HALF: call <2 x float> @llvm.[[TARGET]].step.v2f32(<2 x float> +// NATIVE_HALF: ret <2 x half> %hlsl.step +// NO_HALF: ret <2 x float> %hlsl.step +half2 test_step_half2(half2 p0, half2 p1) +{ + return step(p0, p1); +} +// NATIVE_HALF: define [[FNATTRS]] <3 x half> @ +// NATIVE_HALF: call <3 x half> @llvm.[[TARGET]].step.v3f16(<3 x half> +// NO_HALF: call <3 x float> @llvm.[[TARGET]].step.v3f32(<3 x float> +// NATIVE_HALF: ret <3 x half> %hlsl.step +// NO_HALF: ret <3 x float> %hlsl.step +half3 test_step_half3(half3 p0, half3 p1) +{ + return step(p0, p1); +} +// NATIVE_HALF: define [[FNATTRS]] <4 x half> @ +// NATIVE_HALF: call <4 x half> @llvm.[[TARGET]].step.v4f16(<4 x half> +// NO_HALF: call <4 x float> @llvm.[[TARGET]].step.v4f32(<4 x float> +// NATIVE_HALF: ret <4 x half> %hlsl.step +// NO_HALF: ret <4 x float> %hlsl.step +half4 test_step_half4(half4 p0, half4 p1) +{ + return step(p0, p1); +} + +// CHECK: define [[FNATTRS]] float @ +// CHECK: call float @llvm.[[TARGET]].step.f32(float +// CHECK: ret float +float test_step_float(float p0, float p1) +{ + return step(p0, p1); +} +// CHECK: define [[FNATTRS]] <2 x float> @ +// CHECK: %hlsl.step = call <2 x float> @llvm.[[TARGET]].step.v2f32( +// CHECK: ret <2 x float> %hlsl.step +float2 test_step_float2(float2 p0, float2 p1) +{ + return step(p0, p1); +} +// CHECK: define [[FNATTRS]] <3 x float> @ +// CHECK: %hlsl.step = call <3 x float> @llvm.[[TARGET]].step.v3f32( +// CHECK: ret <3 x float> %hlsl.step +float3 test_step_float3(float3 p0, float3 p1) +{ + return step(p0, p1); +} +// CHECK: define [[FNATTRS]] <4 x float> @ +// CHECK: %hlsl.step = call <4 x float> @llvm.[[TARGET]].step.v4f32( +// CHECK: ret <4 x float> %hlsl.step +float4 test_step_float4(float4 p0, float4 p1) +{ + return step(p0, p1); +} diff --git a/clang/test/Driver/flang/msvc-link.f90 b/clang/test/Driver/flang/msvc-link.f90 index 463749510eb5..3f7e162a9a61 100644 --- a/clang/test/Driver/flang/msvc-link.f90 +++ b/clang/test/Driver/flang/msvc-link.f90 @@ -1,5 +1,5 @@ -! RUN: %clang --driver-mode=flang --target=x86_64-pc-windows-msvc -### %s -Ltest 2>&1 | FileCheck %s -! -! Test that user provided paths come before the Flang runtimes -! CHECK: "-libpath:test" -! CHECK: "-libpath:{{.*(\\|/)}}lib" +! RUN: %clang --driver-mode=flang --target=x86_64-pc-windows-msvc -### %s -Ltest 2>&1 | FileCheck %s +! +! Test that user provided paths come before the Flang runtimes +! CHECK: "-libpath:test" +! 
CHECK: "-libpath:{{.*(\\|/)}}lib" diff --git a/clang/test/FixIt/fixit-newline-style.c b/clang/test/FixIt/fixit-newline-style.c index 61e4df67e85b..2aac143d4d75 100644 --- a/clang/test/FixIt/fixit-newline-style.c +++ b/clang/test/FixIt/fixit-newline-style.c @@ -1,11 +1,11 @@ -// RUN: %clang_cc1 -pedantic -Wunused-label -fno-diagnostics-show-line-numbers -x c %s 2>&1 | FileCheck %s -strict-whitespace - -// This file intentionally uses a CRLF newline style -// CHECK: warning: unused label 'ddd' -// CHECK-NEXT: {{^ ddd:}} -// CHECK-NEXT: {{^ \^~~~$}} -// CHECK-NOT: {{^ ;}} -void f(void) { - ddd: - ; -} +// RUN: %clang_cc1 -pedantic -Wunused-label -fno-diagnostics-show-line-numbers -x c %s 2>&1 | FileCheck %s -strict-whitespace + +// This file intentionally uses a CRLF newline style +// CHECK: warning: unused label 'ddd' +// CHECK-NEXT: {{^ ddd:}} +// CHECK-NEXT: {{^ \^~~~$}} +// CHECK-NOT: {{^ ;}} +void f(void) { + ddd: + ; +} diff --git a/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.c b/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.c index d6724444c066..2faeaba32292 100644 --- a/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.c +++ b/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.c @@ -1,8 +1,8 @@ -// RUN: %clang_cc1 -E -frewrite-includes %s | %clang_cc1 - -// expected-no-diagnostics -// Note: This source file has CRLF line endings. -// This test validates that -frewrite-includes translates the end of line (EOL) -// form used in header files to the EOL form used in the the primary source -// file when the files use different EOL forms. -#include "rewrite-includes-mixed-eol-crlf.h" -#include "rewrite-includes-mixed-eol-lf.h" +// RUN: %clang_cc1 -E -frewrite-includes %s | %clang_cc1 - +// expected-no-diagnostics +// Note: This source file has CRLF line endings. +// This test validates that -frewrite-includes translates the end of line (EOL) +// form used in header files to the EOL form used in the the primary source +// file when the files use different EOL forms. +#include "rewrite-includes-mixed-eol-crlf.h" +#include "rewrite-includes-mixed-eol-lf.h" diff --git a/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.h b/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.h index 0439b88b75e2..baedc282296b 100644 --- a/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.h +++ b/clang/test/Frontend/rewrite-includes-mixed-eol-crlf.h @@ -1,11 +1,11 @@ -// Note: This header file has CRLF line endings. -// The indentation in some of the conditional inclusion directives below is -// intentional and is required for this test to function as a regression test -// for GH59736. -_Static_assert(__LINE__ == 5, ""); -#if 1 -_Static_assert(__LINE__ == 7, ""); - #if 1 - _Static_assert(__LINE__ == 9, ""); - #endif -#endif +// Note: This header file has CRLF line endings. +// The indentation in some of the conditional inclusion directives below is +// intentional and is required for this test to function as a regression test +// for GH59736. 
+_Static_assert(__LINE__ == 5, "");
+#if 1
+_Static_assert(__LINE__ == 7, "");
+ #if 1
+ _Static_assert(__LINE__ == 9, "");
+ #endif
+#endif
diff --git a/clang/test/Frontend/system-header-line-directive-ms-lineendings.c b/clang/test/Frontend/system-header-line-directive-ms-lineendings.c
index 92fc07f65e0d..dffdd5cf1959 100644
--- a/clang/test/Frontend/system-header-line-directive-ms-lineendings.c
+++ b/clang/test/Frontend/system-header-line-directive-ms-lineendings.c
@@ -1,21 +1,21 @@
-// RUN: %clang_cc1 %s -E -o - -I %S/Inputs -isystem %S/Inputs/SystemHeaderPrefix | FileCheck %s
-#include <noline.h>
-#include <line-directive-in-system.h>
-
-#include "line-directive.h"
-
-// This tests that the line numbers for the current file are output correctly
-// for the include-file-completed test case. This file should be CRLF.
-
-// CHECK: # 1 "{{.*}}system-header-line-directive-ms-lineendings.c" 2
-// CHECK: # 1 "{{.*}}noline.h" 1 3
-// CHECK: foo(void);
-// CHECK: # 3 "{{.*}}system-header-line-directive-ms-lineendings.c" 2
-// CHECK: # 1 "{{.*}}line-directive-in-system.h" 1 3
-// The "3" below indicates that "foo.h" is considered a system header.
-// CHECK: # 1 "foo.h" 3
-// CHECK: foo(void);
-// CHECK: # 4 "{{.*}}system-header-line-directive-ms-lineendings.c" 2
-// CHECK: # 1 "{{.*}}line-directive.h" 1
-// CHECK: # 10 "foo.h"{{$}}
-// CHECK: # 6 "{{.*}}system-header-line-directive-ms-lineendings.c" 2
+// RUN: %clang_cc1 %s -E -o - -I %S/Inputs -isystem %S/Inputs/SystemHeaderPrefix | FileCheck %s
+#include <noline.h>
+#include <line-directive-in-system.h>
+
+#include "line-directive.h"
+
+// This tests that the line numbers for the current file are output correctly
+// for the include-file-completed test case. This file should be CRLF.
+
+// CHECK: # 1 "{{.*}}system-header-line-directive-ms-lineendings.c" 2
+// CHECK: # 1 "{{.*}}noline.h" 1 3
+// CHECK: foo(void);
+// CHECK: # 3 "{{.*}}system-header-line-directive-ms-lineendings.c" 2
+// CHECK: # 1 "{{.*}}line-directive-in-system.h" 1 3
+// The "3" below indicates that "foo.h" is considered a system header.
+// CHECK: # 1 "foo.h" 3 +// CHECK: foo(void); +// CHECK: # 4 "{{.*}}system-header-line-directive-ms-lineendings.c" 2 +// CHECK: # 1 "{{.*}}line-directive.h" 1 +// CHECK: # 10 "foo.h"{{$}} +// CHECK: # 6 "{{.*}}system-header-line-directive-ms-lineendings.c" 2 diff --git a/clang/test/ParserHLSL/bitfields.hlsl b/clang/test/ParserHLSL/bitfields.hlsl index 307d1143a068..57b6705babdc 100644 --- a/clang/test/ParserHLSL/bitfields.hlsl +++ b/clang/test/ParserHLSL/bitfields.hlsl @@ -1,31 +1,31 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -ast-dump -x hlsl -o - %s | FileCheck %s - - -struct MyBitFields { - // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:16 referenced field1 'unsigned int' - // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' - // CHECK:-value: Int 3 - // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 3 - unsigned int field1 : 3; // 3 bits for field1 - - // CHECK:FieldDecl 0x{{[0-9a-f]+}} col:16 referenced field2 'unsigned int' - // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' - // CHECK:-value: Int 4 - // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 4 - unsigned int field2 : 4; // 4 bits for field2 - - // CHECK:FieldDecl 0x{{[0-9a-f]+}} col:7 field3 'int' - // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' - // CHECK:-value: Int 5 - // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 5 - int field3 : 5; // 5 bits for field3 (signed) -}; - - - -[numthreads(1,1,1)] -void main() { - MyBitFields m; - m.field1 = 4; - m.field2 = m.field1*2; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -ast-dump -x hlsl -o - %s | FileCheck %s + + +struct MyBitFields { + // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:16 referenced field1 'unsigned int' + // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' + // CHECK:-value: Int 3 + // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 3 + unsigned int field1 : 3; // 3 bits for field1 + + // CHECK:FieldDecl 0x{{[0-9a-f]+}} col:16 referenced field2 'unsigned int' + // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' + // CHECK:-value: Int 4 + // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 4 + unsigned int field2 : 4; // 4 bits for field2 + + // CHECK:FieldDecl 0x{{[0-9a-f]+}} col:7 field3 'int' + // CHECK:-ConstantExpr 0x{{[0-9a-f]+}} 'int' + // CHECK:-value: Int 5 + // CHECK:-IntegerLiteral 0x{{[0-9a-f]+}} 'int' 5 + int field3 : 5; // 5 bits for field3 (signed) +}; + + + +[numthreads(1,1,1)] +void main() { + MyBitFields m; + m.field1 = 4; + m.field2 = m.field1*2; } \ No newline at end of file diff --git a/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl b/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl index 2eebc920388b..5b228d039345 100644 --- a/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl +++ b/clang/test/ParserHLSL/hlsl_annotations_on_struct_members.hlsl @@ -1,21 +1,21 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s - -// tests that hlsl annotations are properly parsed when applied on field decls, -// and that the annotation gets properly placed on the AST. 
-
-struct Eg9{
- // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:8 implicit struct Eg9
- // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:16 referenced a 'unsigned int'
- // CHECK: -HLSLSV_DispatchThreadIDAttr 0x{{[0-9a-f]+}}
- unsigned int a : SV_DispatchThreadID;
-};
-Eg9 e9;
-
-
-RWBuffer<unsigned int> In : register(u1);
-
-
-[numthreads(1,1,1)]
-void main() {
- In[0] = e9.a;
-}
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s
+
+// tests that hlsl annotations are properly parsed when applied on field decls,
+// and that the annotation gets properly placed on the AST.
+
+struct Eg9{
+ // CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:8 implicit struct Eg9
+ // CHECK: FieldDecl 0x{{[0-9a-f]+}} col:16 referenced a 'unsigned int'
+ // CHECK: -HLSLSV_DispatchThreadIDAttr 0x{{[0-9a-f]+}}
+ unsigned int a : SV_DispatchThreadID;
+};
+Eg9 e9;
+
+
+RWBuffer<unsigned int> In : register(u1);
+
+
+[numthreads(1,1,1)]
+void main() {
+ In[0] = e9.a;
+}
diff --git a/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl b/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl
index 5a72aa242e58..476ec39e14da 100644
--- a/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl
+++ b/clang/test/ParserHLSL/hlsl_contained_type_attr.hlsl
@@ -1,25 +1,25 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -x hlsl -ast-dump -o - %s | FileCheck %s
-
-typedef vector<float, 4> float4;
-
-// CHECK: -TypeAliasDecl 0x{{[0-9a-f]+}}
-// CHECK: -HLSLAttributedResourceType 0x{{[0-9a-f]+}} '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
-// CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]]
-using ResourceIntAliasT = __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(int)]];
-ResourceIntAliasT h1;
-
-// CHECK: -VarDecl 0x{{[0-9a-f]+}} col:82 h2 '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
-// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float4)]]
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float4)]] h2;
-
-// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 6]]:30 S
-// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:20 referenced typename depth 0 index 0 T
-// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:30 struct S definition
-// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:79 h '__hlsl_resource_t
-// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
-// CHECK-SAME{LITERAL}: [[hlsl::contained_type(T)]]
-template <typename T> struct S {
- __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(T)]] h;
-};
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -x hlsl -ast-dump -o - %s | FileCheck %s
+
+typedef vector<float, 4> float4;
+
+// CHECK: -TypeAliasDecl 0x{{[0-9a-f]+}}
+// CHECK: -HLSLAttributedResourceType 0x{{[0-9a-f]+}} '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
+// CHECK-SAME{LITERAL}: [[hlsl::contained_type(int)]]
+using ResourceIntAliasT = __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(int)]];
+ResourceIntAliasT h1;
+
+// CHECK: -VarDecl 0x{{[0-9a-f]+}} col:82 h2 '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
+// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float4)]]
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float4)]] h2;
+
+// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 6]]:30 S
+// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:20 referenced typename depth 0 index 0 T
+// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:30 struct S
definition
+// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:79 h '__hlsl_resource_t
+// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]]
+// CHECK-SAME{LITERAL}: [[hlsl::contained_type(T)]]
+template <typename T> struct S {
+ __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(T)]] h;
+};
diff --git a/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl
index b2d492d95945..673ff8693b83 100644
--- a/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl
+++ b/clang/test/ParserHLSL/hlsl_contained_type_attr_error.hlsl
@@ -1,28 +1,28 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -x hlsl -o - %s -verify
-
-typedef vector<float, 4> float4;
-
-// expected-error@+1{{'contained_type' attribute cannot be applied to a declaration}}
-[[hlsl::contained_type(float4)]] __hlsl_resource_t h1;
-
-// expected-error@+1{{'contained_type' attribute takes one argument}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type()]] h3;
-
-// expected-error@+1{{expected a type}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(0)]] h4;
-
-// expected-error@+1{{unknown type name 'a'}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(a)]] h5;
-
-// expected-error@+1{{expected a type}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type("b", c)]] h6;
-
-// expected-warning@+1{{attribute 'contained_type' is already applied}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] [[hlsl::contained_type(float)]] h7;
-
-// expected-warning@+1{{attribute 'contained_type' is already applied with different arguments}}
-__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] [[hlsl::contained_type(int)]] h8;
-
-// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
-// expected-error@+1{{attribute 'contained_type' can be used only on HLSL intangible type '__hlsl_resource_t'}}
-float [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] res5;
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -std=hlsl202x -x hlsl -o - %s -verify
+
+typedef vector<float, 4> float4;
+
+// expected-error@+1{{'contained_type' attribute cannot be applied to a declaration}}
+[[hlsl::contained_type(float4)]] __hlsl_resource_t h1;
+
+// expected-error@+1{{'contained_type' attribute takes one argument}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type()]] h3;
+
+// expected-error@+1{{expected a type}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(0)]] h4;
+
+// expected-error@+1{{unknown type name 'a'}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(a)]] h5;
+
+// expected-error@+1{{expected a type}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type("b", c)]] h6;
+
+// expected-warning@+1{{attribute 'contained_type' is already applied}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] [[hlsl::contained_type(float)]] h7;
+
+// expected-warning@+1{{attribute 'contained_type' is already applied with different arguments}}
+__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] [[hlsl::contained_type(int)]] h8;
+
+// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}}
+// expected-error@+1{{attribute 'contained_type' can be used only on HLSL intangible type
'__hlsl_resource_t'}} +float [[hlsl::resource_class(UAV)]] [[hlsl::contained_type(float)]] res5; diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl index 836d129c8d00..487dc3241303 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr.hlsl @@ -1,22 +1,22 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] -struct MyBuffer { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] h; -}; - -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] -// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] -__hlsl_resource_t [[hlsl::is_rov]] [[hlsl::resource_class(SRV)]] res; - -// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void () -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]] -// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] -void f() { - __hlsl_resource_t [[hlsl::resource_class(Sampler)]] [[hlsl::is_rov]] r; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s + +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:68 h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] +struct MyBuffer { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] h; +}; + +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:66 res '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] +// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] +__hlsl_resource_t [[hlsl::is_rov]] [[hlsl::resource_class(SRV)]] res; + +// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void () +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 r '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]] +// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] +void f() { + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] [[hlsl::is_rov]] r; +} diff --git a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl index 3b2c12e7a96c..9bb64ea990e2 100644 --- a/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_is_rov_attr_error.hlsl @@ -1,20 +1,20 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify - -// expected-error@+1{{'is_rov' attribute cannot be applied to a declaration}} -[[hlsl::is_rov]] __hlsl_resource_t res0; - -// expected-error@+1{{HLSL resource needs to have [[hlsl::resource_class()]] attribute}} -__hlsl_resource_t [[hlsl::is_rov]] res1; - -// expected-error@+1{{'is_rov' attribute takes no arguments}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2; - -// expected-error@+1{{use of undeclared identifier 'gibberish'}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(gibberish)]] res3; - -// expected-warning@+1{{attribute 'is_rov' is already applied}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] [[hlsl::is_rov]] res4; - -// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} -// 
expected-error@+1{{attribute 'is_rov' can be used only on HLSL intangible type '__hlsl_resource_t'}} -float [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] res5; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify + +// expected-error@+1{{'is_rov' attribute cannot be applied to a declaration}} +[[hlsl::is_rov]] __hlsl_resource_t res0; + +// expected-error@+1{{HLSL resource needs to have [[hlsl::resource_class()]] attribute}} +__hlsl_resource_t [[hlsl::is_rov]] res1; + +// expected-error@+1{{'is_rov' attribute takes no arguments}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(3)]] res2; + +// expected-error@+1{{use of undeclared identifier 'gibberish'}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov(gibberish)]] res3; + +// expected-warning@+1{{attribute 'is_rov' is already applied}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] [[hlsl::is_rov]] res4; + +// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} +// expected-error@+1{{attribute 'is_rov' can be used only on HLSL intangible type '__hlsl_resource_t'}} +float [[hlsl::resource_class(UAV)]] [[hlsl::is_rov]] res5; diff --git a/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl b/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl index 84c924eec24e..e09ed5586c10 100644 --- a/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_raw_buffer_attr.hlsl @@ -1,22 +1,22 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:72 h1 '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -struct MyBuffer { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] h1; -}; - -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:70 h2 '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -__hlsl_resource_t [[hlsl::raw_buffer]] [[hlsl::resource_class(SRV)]] h2; - -// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void () -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 h3 '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] -void f() { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] h3; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s + +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:72 h1 '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +struct MyBuffer { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] h1; +}; + +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:70 h2 '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +__hlsl_resource_t [[hlsl::raw_buffer]] [[hlsl::resource_class(SRV)]] h2; + +// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:6 f 'void () +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:72 h3 '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: [[hlsl::raw_buffer]] +void f() { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] h3; +} diff --git 
a/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl index 77530cbf9e4d..a10aca4e96fc 100644 --- a/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_raw_buffer_attr_error.hlsl @@ -1,17 +1,17 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify - -// expected-error@+1{{'raw_buffer' attribute cannot be applied to a declaration}} -[[hlsl::raw_buffer]] __hlsl_resource_t res0; - -// expected-error@+1{{'raw_buffer' attribute takes no arguments}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(3)]] res2; - -// expected-error@+1{{use of undeclared identifier 'gibberish'}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(gibberish)]] res3; - -// expected-warning@+1{{attribute 'raw_buffer' is already applied}} -__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] [[hlsl::raw_buffer]] res4; - -// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} -// expected-error@+1{{attribute 'raw_buffer' can be used only on HLSL intangible type '__hlsl_resource_t'}} -float [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] res5; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify + +// expected-error@+1{{'raw_buffer' attribute cannot be applied to a declaration}} +[[hlsl::raw_buffer]] __hlsl_resource_t res0; + +// expected-error@+1{{'raw_buffer' attribute takes no arguments}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(3)]] res2; + +// expected-error@+1{{use of undeclared identifier 'gibberish'}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer(gibberish)]] res3; + +// expected-warning@+1{{attribute 'raw_buffer' is already applied}} +__hlsl_resource_t [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] [[hlsl::raw_buffer]] res4; + +// expected-error@+2{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} +// expected-error@+1{{attribute 'raw_buffer' can be used only on HLSL intangible type '__hlsl_resource_t'}} +float [[hlsl::resource_class(UAV)]] [[hlsl::raw_buffer]] res5; diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl index fbada8b4b99f..9fee9edddf61 100644 --- a/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_class_attr.hlsl @@ -1,37 +1,37 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -struct MyBuffer { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; -}; - -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:49 res '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] -__hlsl_resource_t [[hlsl::resource_class(SRV)]] res; - -// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 3]]:6 f 'void () -// CHECK: VarDecl 0x{{[0-9a-f]+}} col:55 r '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]] -void f() { - __hlsl_resource_t [[hlsl::resource_class(Sampler)]] r; -} - -// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 6]]:29 MyBuffer2 -// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:19 typename depth 0 index 0 T -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} 
line:[[# @LINE + 4]]:29 struct MyBuffer2 definition -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -template<typename T> struct MyBuffer2 { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; -}; - -// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} line:[[# @LINE - 4]]:29 struct MyBuffer2 definition implicit_instantiation -// CHECK: TemplateArgument type 'float' -// CHECK: BuiltinType 0x{{[0-9a-f]+}} 'float' -// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 -// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -MyBuffer2<float> myBuffer2; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -ast-dump -o - %s | FileCheck %s + +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} {{.*}} struct MyBuffer definition +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +struct MyBuffer { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; +}; + +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:49 res '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(SRV)]] +__hlsl_resource_t [[hlsl::resource_class(SRV)]] res; + +// CHECK: FunctionDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 3]]:6 f 'void () +// CHECK: VarDecl 0x{{[0-9a-f]+}} col:55 r '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(Sampler)]] +void f() { + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] r; +} + +// CHECK: ClassTemplateDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 6]]:29 MyBuffer2 +// CHECK: TemplateTypeParmDecl 0x{{[0-9a-f]+}} col:19 typename depth 0 index 0 T +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} line:[[# @LINE + 4]]:29 struct MyBuffer2 definition +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +template<typename T> struct MyBuffer2 { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] h; +}; + +// CHECK: ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} line:[[# @LINE - 4]]:29 struct MyBuffer2 definition implicit_instantiation +// CHECK: TemplateArgument type 'float' +// CHECK: BuiltinType 0x{{[0-9a-f]+}} 'float' +// CHECK: CXXRecordDecl 0x{{[0-9a-f]+}} col:29 implicit struct MyBuffer2 +// CHECK: FieldDecl 0x{{[0-9a-f]+}} col:51 h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +MyBuffer2<float> myBuffer2; diff --git a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl index 63e39daff949..a0a4da1dc2bf 100644 --- a/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_class_attr_error.hlsl @@ -1,22 +1,22 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify - -// expected-error@+1{{'resource_class' attribute cannot be applied to a declaration}} -[[hlsl::resource_class(UAV)]] __hlsl_resource_t e0; - -// expected-error@+1{{'resource_class' attribute takes one argument}} -__hlsl_resource_t [[hlsl::resource_class()]] e1; - -// expected-warning@+1{{ResourceClass attribute argument not supported: gibberish}} -__hlsl_resource_t [[hlsl::resource_class(gibberish)]] e2; - -// expected-warning@+1{{attribute 'resource_class' is already applied with different arguments}} -__hlsl_resource_t [[hlsl::resource_class(SRV)]] 
[[hlsl::resource_class(UAV)]] e3; - -// expected-warning@+1{{attribute 'resource_class' is already applied}} -__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(SRV)]] e4; - -// expected-error@+1{{'resource_class' attribute takes one argument}} -__hlsl_resource_t [[hlsl::resource_class(SRV, "aa")]] e5; - -// expected-error@+1{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} -float [[hlsl::resource_class(UAV)]] e6; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -o - %s -verify + +// expected-error@+1{{'resource_class' attribute cannot be applied to a declaration}} +[[hlsl::resource_class(UAV)]] __hlsl_resource_t e0; + +// expected-error@+1{{'resource_class' attribute takes one argument}} +__hlsl_resource_t [[hlsl::resource_class()]] e1; + +// expected-warning@+1{{ResourceClass attribute argument not supported: gibberish}} +__hlsl_resource_t [[hlsl::resource_class(gibberish)]] e2; + +// expected-warning@+1{{attribute 'resource_class' is already applied with different arguments}} +__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(UAV)]] e3; + +// expected-warning@+1{{attribute 'resource_class' is already applied}} +__hlsl_resource_t [[hlsl::resource_class(SRV)]] [[hlsl::resource_class(SRV)]] e4; + +// expected-error@+1{{'resource_class' attribute takes one argument}} +__hlsl_resource_t [[hlsl::resource_class(SRV, "aa")]] e5; + +// expected-error@+1{{attribute 'resource_class' can be used only on HLSL intangible type '__hlsl_resource_t'}} +float [[hlsl::resource_class(UAV)]] e6; diff --git a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl index 38d27bc21e4a..8885e3923735 100644 --- a/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl +++ b/clang/test/ParserHLSL/hlsl_resource_handle_attrs.hlsl @@ -1,21 +1,21 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s - -// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <> class RWBuffer definition implicit_instantiation -// CHECK: -TemplateArgument type 'float' -// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(float)]] -// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer -RWBuffer<float> Buffer1; - -// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <> class RasterizerOrderedBuffer definition implicit_instantiation -// CHECK: -TemplateArgument type 'vector<float, 4>' -// CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector<float, 4>' 4 -// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' -// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit h '__hlsl_resource_t -// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)] -// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] -// CHECK-SAME{LITERAL}: [[hlsl::contained_type(vector<float, 4>)]] -// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer -RasterizerOrderedBuffer<vector<float, 4> > BufferArray3[4]; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -ast-dump -o - %s | FileCheck %s + +// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <> class RWBuffer definition implicit_instantiation +// CHECK: -TemplateArgument type 'float' +// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)]] +// CHECK-SAME{LITERAL}: 
[[hlsl::contained_type(float)]] +// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer +RWBuffer<float> Buffer1; + +// CHECK: -ClassTemplateSpecializationDecl 0x{{[0-9a-f]+}} <> class RasterizerOrderedBuffer definition implicit_instantiation +// CHECK: -TemplateArgument type 'vector<float, 4>' +// CHECK: `-ExtVectorType 0x{{[0-9a-f]+}} 'vector<float, 4>' 4 +// CHECK: `-BuiltinType 0x{{[0-9a-f]+}} 'float' +// CHECK: -FieldDecl 0x{{[0-9a-f]+}} <> implicit h '__hlsl_resource_t +// CHECK-SAME{LITERAL}: [[hlsl::resource_class(UAV)] +// CHECK-SAME{LITERAL}: [[hlsl::is_rov]] +// CHECK-SAME{LITERAL}: [[hlsl::contained_type(vector<float, 4>)]] +// CHECK: -HLSLResourceAttr 0x{{[0-9a-f]+}} <> Implicit TypedBuffer +RasterizerOrderedBuffer<vector<float, 4> > BufferArray3[4]; diff --git a/clang/test/Sema/aarch64-sve-vector-trig-ops.c b/clang/test/Sema/aarch64-sve-vector-trig-ops.c index 3fe6834be2e0..f853abcd3379 100644 --- a/clang/test/Sema/aarch64-sve-vector-trig-ops.c +++ b/clang/test/Sema/aarch64-sve-vector-trig-ops.c @@ -1,65 +1,65 @@ -// RUN: %clang_cc1 -triple aarch64 -target-feature +sve \ -// RUN: -disable-O0-optnone -o - -fsyntax-only %s -verify -// REQUIRES: aarch64-registered-target - -#include <arm_sve.h> - -svfloat32_t test_asin_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_asin(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_acos_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_acos(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_atan_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_atan(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_atan2_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_atan2(v, v); - // expected-error@-1 {{1st argument must be a floating point type}} -} - -svfloat32_t test_sin_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_sin(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_cos_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_cos(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_tan_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_tan(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_sinh_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_sinh(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_cosh_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_cosh(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -svfloat32_t test_tanh_vv_i8mf8(svfloat32_t v) { - - return __builtin_elementwise_tanh(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} +// RUN: %clang_cc1 -triple aarch64 -target-feature +sve \ +// RUN: -disable-O0-optnone -o - -fsyntax-only %s -verify +// REQUIRES: aarch64-registered-target + +#include <arm_sve.h> + +svfloat32_t test_asin_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_asin(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_acos_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_acos(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + 
+svfloat32_t test_atan_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_atan(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_atan2_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_atan2(v, v); + // expected-error@-1 {{1st argument must be a floating point type}} +} + +svfloat32_t test_sin_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_sin(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_cos_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_cos(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_tan_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_tan(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_sinh_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_sinh(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_cosh_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_cosh(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +svfloat32_t test_tanh_vv_i8mf8(svfloat32_t v) { + + return __builtin_elementwise_tanh(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} diff --git a/clang/test/Sema/riscv-rvv-vector-trig-ops.c b/clang/test/Sema/riscv-rvv-vector-trig-ops.c index 0aed1b2a0998..006c136f8033 100644 --- a/clang/test/Sema/riscv-rvv-vector-trig-ops.c +++ b/clang/test/Sema/riscv-rvv-vector-trig-ops.c @@ -1,67 +1,67 @@ -// RUN: %clang_cc1 -triple riscv64 -target-feature +f -target-feature +d \ -// RUN: -target-feature +v -target-feature +zfh -target-feature +zvfh \ -// RUN: -disable-O0-optnone -o - -fsyntax-only %s -verify -// REQUIRES: riscv-registered-target - -#include <riscv_vector.h> - -vfloat32mf2_t test_asin_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_asin(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} - } - - vfloat32mf2_t test_acos_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_acos(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} - } - - vfloat32mf2_t test_atan_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_atan(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} - } - -vfloat32mf2_t test_atan2_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_atan2(v, v); - // expected-error@-1 {{1st argument must be a floating point type}} -} - -vfloat32mf2_t test_sin_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_sin(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -vfloat32mf2_t test_cos_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_cos(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -vfloat32mf2_t test_tan_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_tan(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} -} - -vfloat32mf2_t test_sinh_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_sinh(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} - } - - vfloat32mf2_t test_cosh_vv_i8mf8(vfloat32mf2_t v) { - - return 
__builtin_elementwise_cosh(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} - } - - vfloat32mf2_t test_tanh_vv_i8mf8(vfloat32mf2_t v) { - - return __builtin_elementwise_tanh(v); - // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} - } - +// RUN: %clang_cc1 -triple riscv64 -target-feature +f -target-feature +d \ +// RUN: -target-feature +v -target-feature +zfh -target-feature +zvfh \ +// RUN: -disable-O0-optnone -o - -fsyntax-only %s -verify +// REQUIRES: riscv-registered-target + +#include <riscv_vector.h> + +vfloat32mf2_t test_asin_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_asin(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} + } + + vfloat32mf2_t test_acos_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_acos(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} + } + + vfloat32mf2_t test_atan_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_atan(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} + } + +vfloat32mf2_t test_atan2_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_atan2(v, v); + // expected-error@-1 {{1st argument must be a floating point type}} +} + +vfloat32mf2_t test_sin_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_sin(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +vfloat32mf2_t test_cos_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_cos(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +vfloat32mf2_t test_tan_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_tan(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} +} + +vfloat32mf2_t test_sinh_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_sinh(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} + } + + vfloat32mf2_t test_cosh_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_cosh(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} + } + + vfloat32mf2_t test_tanh_vv_i8mf8(vfloat32mf2_t v) { + + return __builtin_elementwise_tanh(v); + // expected-error@-1 {{1st argument must be a vector, integer or floating point type}} + } + diff --git a/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl index 764b9e843f7f..b60fba62bdb0 100644 --- a/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-default-compute.hlsl @@ -1,119 +1,119 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ -// RUN: -fsyntax-only -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -float also_alive(float f) { - // expected-error@#also_alive_fx_call {{'fx' is only 
available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_alive_fx_call - // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - // expected-error@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - return 0; -} - -float alive(float f) { - // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - // expected-error@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - return 0; -} - -float dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - - return also_dead(f); -} - -template<typename T> -T aliveTemp(T f) { - // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template<typename T> T aliveTemp2(T f) { - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 
6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); -} - -float test(float x) { - return aliveTemp2(x); -} - -class MyClass -{ - float F; - float makeF() { - // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // #MyClass_makeF_fy_call - // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(F); // #MyClass_makeF_fz_call - return 0; - } -}; - -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - return a * b * c; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ +// RUN: -fsyntax-only -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +float also_alive(float f) { + // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + // expected-error@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + return 0; +} + +float alive(float f) { + // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment 
target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + // expected-error@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + return 0; +} + +float dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + + return also_dead(f); +} + +template<typename T> +T aliveTemp(T f) { + // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template<typename T> T aliveTemp2(T f) { + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); +} + +float test(float x) { + return aliveTemp2(x); +} + +class MyClass +{ + float F; + float makeF() { + // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + return 0; + } +}; + +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); + float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = test((half)1.0); + return a * b * c; +} diff --git 
a/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl index 6bfc8577670c..35b7c384f26c 100644 --- a/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-default-lib.hlsl @@ -1,180 +1,180 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ -// RUN: -fsyntax-only -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -float also_alive(float f) { - // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_alive_fx_call - - // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - - // expected-error@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - - return 0; -} - -float alive(float f) { - // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - - // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - - // expected-error@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - return 0; -} - -float dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - return also_dead(f); -} - -template<typename T> -T aliveTemp(T f) { - // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-error@#aliveTemp_fy_call {{'fy' is only available in 
compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template<typename T> T aliveTemp2(T f) { - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); -} - -float test(float x) { - return aliveTemp2(x); -} - -class MyClass -{ - float F; - float makeF() { - // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // #MyClass_makeF_fy_call - // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(F); // #MyClass_makeF_fz_call - return 0; - } -}; - -// Exported function without body, not used -export void exportedFunctionUnused(float f); - -// Exported function with body, without export, not used -void exportedFunctionUnused(float f) { - // expected-error@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #exportedFunctionUnused_fx_call - - // API with shader-stage-specific availability in unused exported library function - // - no errors expected because the actual shader stage this function - // will be used in not known at this time - float B = fy(f); - float C = fz(f); -} - -// Exported function with body - called from main() which is a compute shader entry point -export void exportedFunctionUsed(float f) { - // expected-error@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #exportedFunctionUsed_fx_call - - // expected-error@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy 
{{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #exportedFunctionUsed_fy_call - - // expected-error@#exportedFunctionUsed_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #exportedFunctionUsed_fz_call -} - -namespace A { - namespace B { - export { - void exportedFunctionInNS(float x) { - // expected-error@#exportedFunctionInNS_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(x); // #exportedFunctionInNS_fx_call - - // API with shader-stage-specific availability in exported library function - // - no errors expected because the actual shader stage this function - // will be used in not known at this time - float B = fy(x); - float C = fz(x); - } - } - } -} - -// Shader entry point without body -[shader("compute")] -[numthreads(4,1,1)] -float main(); - -// Shader entry point with body -[shader("compute")] -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - exportedFunctionUsed(1.0f); - return a * b * c; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ +// RUN: -fsyntax-only -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +float also_alive(float f) { + // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + + // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + + // expected-error@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + + return 0; +} + +float alive(float f) { + // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + + // expected-error@#alive_fy_call {{'fy' is only 
available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + + // expected-error@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + return 0; +} + +float dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + return also_dead(f); +} + +template<typename T> +T aliveTemp(T f) { + // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template<typename T> T aliveTemp2(T f) { + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); +} + +float test(float x) { + return aliveTemp2(x); +} + +class MyClass +{ + float F; + float makeF() { + // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + return 0; + } +}; + +// Exported function without body, not used +export void 
exportedFunctionUnused(float f); + +// Exported function with body, without export, not used +void exportedFunctionUnused(float f) { + // expected-error@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #exportedFunctionUnused_fx_call + + // API with shader-stage-specific availability in unused exported library function + // - no errors expected because the actual shader stage this function + // will be used in not known at this time + float B = fy(f); + float C = fz(f); +} + +// Exported function with body - called from main() which is a compute shader entry point +export void exportedFunctionUsed(float f) { + // expected-error@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #exportedFunctionUsed_fx_call + + // expected-error@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #exportedFunctionUsed_fy_call + + // expected-error@#exportedFunctionUsed_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #exportedFunctionUsed_fz_call +} + +namespace A { + namespace B { + export { + void exportedFunctionInNS(float x) { + // expected-error@#exportedFunctionInNS_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(x); // #exportedFunctionInNS_fx_call + + // API with shader-stage-specific availability in exported library function + // - no errors expected because the actual shader stage this function + // will be used in not known at this time + float B = fy(x); + float C = fz(x); + } + } + } +} + +// Shader entry point without body +[shader("compute")] +[numthreads(4,1,1)] +float main(); + +// Shader entry point with body +[shader("compute")] +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); + float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = test((half)1.0); + exportedFunctionUsed(1.0f); + return a * b * c; +} diff --git a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl index 65836c55821d..406879838393 100644 --- a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-compute.hlsl @@ -1,119 +1,119 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ -// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, 
introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -float also_alive(float f) { - // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_alive_fx_call - // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - // expected-warning@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - return 0; -} - -float alive(float f) { - // expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - // expected-warning@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - // expected-warning@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - return 0; -} - -float dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - - return also_dead(f); -} - -template<typename T> -T aliveTemp(T f) { - // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template<typename T> T aliveTemp2(T f) { - // 
expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); -} - -float test(float x) { - return aliveTemp2(x); -} - -class MyClass -{ - float F; - float makeF() { - // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // #MyClass_makeF_fy_call - // expected-warning@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(F); // #MyClass_makeF_fz_call - return 0; - } -}; - -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - return a * b * c; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ +// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +float also_alive(float f) { + // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + // expected-warning@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + return 0; +} + +float alive(float f) { + // 
expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + // expected-warning@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + // expected-warning@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + return 0; +} + +float dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + + return also_dead(f); +} + +template <typename T> +T aliveTemp(T f) { + // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template <typename T> T aliveTemp2(T f) { + // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); +} + +float test(float x) { + return aliveTemp2(x); +} + +class MyClass +{ + float F; + float makeF() { + // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-warning@#MyClass_makeF_fz_call 
{{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + return 0; + } +}; + +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); + float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = test((half)1.0); + return a * b * c; +} diff --git a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl index 4c9783138f67..a23e91a546b1 100644 --- a/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-relaxed-lib.hlsl @@ -1,162 +1,162 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ -// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -float also_alive(float f) { - // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_alive_fx_call - - // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - - // expected-warning@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - - return 0; -} - -float alive(float f) { - // expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - - // expected-warning@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - - // expected-warning@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = 
fy(f); - float C = fz(f); - return 0; -} - -float dead(float f) { - // unreachable code - no errors expected - float A = fx(f); - float B = fy(f); - float C = fz(f); - return also_dead(f); -} - -template <typename T> -T aliveTemp(T f) { - // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template <typename T> T aliveTemp2(T f) { - // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); -} - -float test(float x) { - return aliveTemp2(x); -} - -class MyClass -{ - float F; - float makeF() { - // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // #MyClass_makeF_fy_call - // expected-warning@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(F); // #MyClass_makeF_fz_call - return 0; - } -}; - -// Exported function without body, not used -export void exportedFunctionUnused(float f); - -// Exported function with body, without export, not used -void exportedFunctionUnused(float f) { - // expected-warning@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #exportedFunctionUnused_fx_call - - // API with shader-stage-specific availability in unused exported library function - // - no errors expected because the actual shader stage this function - // will be used in is not known at this time - float B = fy(f); - float C = fz(f); -} - -// 
Exported function with body - called from main() which is a compute shader entry point -export void exportedFunctionUsed(float f) { - // expected-warning@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #exportedFunctionUsed_fx_call - - // expected-warning@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #exportedFunctionUsed_fy_call - - // expected-warning@#exportedFunctionUsed_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #exportedFunctionUsed_fz_call -} - -// Shader entry point without body -[shader("compute")] -[numthreads(4,1,1)] -float main(); - -// Shader entry point with body -[shader("compute")] -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - exportedFunctionUsed(1.0f); - return a * b * c; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ +// RUN: -fsyntax-only -Wno-error=hlsl-availability -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +float also_alive(float f) { + // expected-warning@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + + // expected-warning@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + + // expected-warning@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + + return 0; +} + +float alive(float f) { + // expected-warning@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + + // expected-warning@#alive_fy_call {{'fy' is only 
available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + + // expected-warning@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + return 0; +} + +float dead(float f) { + // unreachable code - no errors expected + float A = fx(f); + float B = fy(f); + float C = fz(f); + return also_dead(f); +} + +template <typename T> +T aliveTemp(T f) { + // expected-warning@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-warning@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-warning@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template <typename T> T aliveTemp2(T f) { + // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-warning@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); +} + +float test(float x) { + return aliveTemp2(x); +} + +class MyClass +{ + float F; + float makeF() { + // expected-warning@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-warning@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-warning@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + return 0; + } +}; + +// Exported function without body, not used 
+export void exportedFunctionUnused(float f); + +// Exported function with body, without export, not used +void exportedFunctionUnused(float f) { + // expected-warning@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #exportedFunctionUnused_fx_call + + // API with shader-stage-specific availability in unused exported library function + // - no errors expected because the actual shader stage this function + // will be used in is not known at this time + float B = fy(f); + float C = fz(f); +} + +// Exported function with body - called from main() which is a compute shader entry point +export void exportedFunctionUsed(float f) { + // expected-warning@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #exportedFunctionUsed_fx_call + + // expected-warning@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #exportedFunctionUsed_fy_call + + // expected-warning@#exportedFunctionUsed_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #exportedFunctionUsed_fz_call +} + +// Shader entry point without body +[shader("compute")] +[numthreads(4,1,1)] +float main(); + +// Shader entry point with body +[shader("compute")] +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); + float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = test((half)1.0); + exportedFunctionUsed(1.0f); + return a * b * c; +} diff --git a/clang/test/SemaHLSL/Availability/avail-diag-strict-compute.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-strict-compute.hlsl index b67e10c9a901..a8783c10cbab 100644 --- a/clang/test/SemaHLSL/Availability/avail-diag-strict-compute.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-strict-compute.hlsl @@ -1,129 +1,129 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ -// RUN: -fhlsl-strict-availability -fsyntax-only -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -float also_alive(float f) { - // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = 
fx(f); // #also_alive_fx_call - // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - // expected-error@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - return 0; -} - -float alive(float f) { - // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - // expected-error@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // expected-error@#also_dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_dead_fx_call - // expected-error@#also_dead_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_dead_fy_call - // expected-error@#also_dead_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_dead_fz_call - return 0; -} - -float dead(float f) { - // expected-error@#dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #dead_fx_call - // expected-error@#dead_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #dead_fy_call - // expected-error@#dead_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #dead_fz_call - - return also_dead(f); -} - -template <typename T> -T aliveTemp(T f) { - // expected-error@#aliveTemp_fx_call 
{{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#aliveTemp_inst {{in instantiation of function template specialization 'aliveTemp<float>' requested here}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template <typename T> T aliveTemp2(T f) { - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2<half>' requested here}} -} - -float test(float x) { - return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2<float>' requested here}} -} - -class MyClass -{ - float F; - float makeF() { - // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // #MyClass_makeF_fy_call - // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(F); // #MyClass_makeF_fz_call - } -}; - -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); - float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - return a * b * c; +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute \ +// RUN: -fhlsl-strict-availability -fsyntax-only -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = 
compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +float also_alive(float f) { + // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + // expected-error@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + return 0; +} + +float alive(float f) { + // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + // expected-error@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // expected-error@#also_dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_dead_fx_call + // expected-error@#also_dead_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_dead_fy_call + // expected-error@#also_dead_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_dead_fz_call + return 0; +} + +float dead(float f) { + // expected-error@#dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #dead_fx_call + // expected-error@#dead_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in 
compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #dead_fy_call + // expected-error@#dead_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #dead_fz_call + + return also_dead(f); +} + +template <typename T> +T aliveTemp(T f) { + // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#aliveTemp_inst {{in instantiation of function template specialization 'aliveTemp<float>' requested here}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template <typename T> T aliveTemp2(T f) { + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2<half>' requested here}} +} + +float test(float x) { + return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2<float>' requested here}} +} + +class MyClass +{ + float F; + float makeF() { + // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + } +}; + +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); + float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = 
test((half)1.0); + return a * b * c; } \ No newline at end of file diff --git a/clang/test/SemaHLSL/Availability/avail-diag-strict-lib.hlsl b/clang/test/SemaHLSL/Availability/avail-diag-strict-lib.hlsl index c7be5afbc2d2..0fffbc96dac1 100644 --- a/clang/test/SemaHLSL/Availability/avail-diag-strict-lib.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-diag-strict-lib.hlsl @@ -1,192 +1,192 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ -// RUN: -fhlsl-strict-availability -fsyntax-only -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 6.6))) -half fx(half); // #fx_half - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) -float fz(float); // #fz - -// FIXME: all diagnostics marked as FUTURE will come alive when HLSL default -// diagnostic mode is implemented in a future PR which will verify calls in -// all functions that are reachable from the shader library entry points - -float also_alive(float f) { - // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_alive_fx_call - - // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #also_alive_fy_call - - // expected-error@#also_alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #also_alive_fz_call - - return 0; -} - -float alive(float f) { - // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #alive_fx_call - - // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #alive_fy_call - - // expected-error@#alive_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #alive_fz_call - - return also_alive(f); -} - -float also_dead(float f) { - // expected-error@#also_dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #also_dead_fx_call - - // Call to environment-specific 
function from an unreachable function - // in a shader library - no diagnostic expected. - float B = fy(f); // #also_dead_fy_call - - // Call to environment-specific function from an unreachable function - // in a shader library - no diagnostic expected. - float C = fz(f); // #also_dead_fz_call - return 0; -} - -float dead(float f) { - // expected-error@#dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #dead_fx_call - - // Call to environment-specific function from an unreachable function - // in a shader library - no diagnostic expected. - float B = fy(f); // #dead_fy_call - - // Call to environment-specific function from an unreachable function - // in a shader library - no diagnostic expected. - float C = fz(f); // #dead_fz_call - - return also_dead(f); -} - -template <typename T> -T aliveTemp(T f) { - // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#aliveTemp_inst {{in instantiation of function template specialization 'aliveTemp<float>' requested here}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #aliveTemp_fx_call - // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #aliveTemp_fy_call - // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #aliveTemp_fz_call - return 0; -} - -template <typename T> T aliveTemp2(T f) { - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} - // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} - // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - return fx(f); // #aliveTemp2_fx_call -} - -half test(half x) { - return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2<half>' requested here}} -} - -float test(float x) { - return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2<float>' requested here}} -} - -class MyClass -{ - float F; - float makeF() { - // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(F); // #MyClass_makeF_fx_call - // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(F); // 
#MyClass_makeF_fy_call - // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(F); // #MyClass_makeF_fz_call - } -}; - -// Exported function without body, not used -export void exportedFunctionUnused(float f); - -// Exported function with body, without export, not used -void exportedFunctionUnused(float f) { - // expected-error@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #exportedFunctionUnused_fx_call - - // API with shader-stage-specific availability in unused exported library function - // - no errors expected because the actual shader stage this function - // will be used in is not known at this time - float B = fy(f); - float C = fz(f); -} - -// Exported function with body - called from main() which is a compute shader entry point -export void exportedFunctionUsed(float f) { - // expected-error@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #exportedFunctionUsed_fx_call - - // expected-error@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #exportedFunctionUsed_fy_call - - // expected-error@#exportedFunctionUsed_fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} - float C = fz(f); // #exportedFunctionUsed_fz_call -} - -namespace A { - namespace B { - export { - void exportedFunctionInNS(float x) { - // expected-error@#exportedFunctionInNS_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(x); // #exportedFunctionInNS_fx_call - - // API with shader-stage-specific availability in exported library function - // - no errors expected because the actual shader stage this function - // will be used in is not known at this time - float B = fy(x); - float C = fz(x); - } - } - } -} - -[shader("compute")] -[numthreads(4,1,1)] -float main() { - float f = 3; - MyClass C = { 1.0f }; - float a = alive(f); float b = aliveTemp(f); // #aliveTemp_inst - float c = C.makeF(); - float d = test((float)1.0); - float e = test((half)1.0); - exportedFunctionUsed(1.0f); - return a * b * c; -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ +// RUN: -fhlsl-strict-availability -fsyntax-only -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 6.6))) +half fx(half); // #fx_half + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, 
environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = mesh))) +float fz(float); // #fz + +// FIXME: all diagnostics marked as FUTURE will come alive when HLSL default +// diagnostic mode is implemented in a future PR which will verify calls in +// all functions that are reachable from the shader library entry points + +float also_alive(float f) { + // expected-error@#also_alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_alive_fx_call + + // expected-error@#also_alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #also_alive_fy_call + + // expected-error@#also_alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #also_alive_fz_call + + return 0; +} + +float alive(float f) { + // expected-error@#alive_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #alive_fx_call + + // expected-error@#alive_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #alive_fy_call + + // expected-error@#alive_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #alive_fz_call + + return also_alive(f); +} + +float also_dead(float f) { + // expected-error@#also_dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #also_dead_fx_call + + // Call to environment-specific function from an unreachable function + // in a shader library - no diagnostic expected. + float B = fy(f); // #also_dead_fy_call + + // Call to environment-specific function from an unreachable function + // in a shader library - no diagnostic expected. + float C = fz(f); // #also_dead_fz_call + return 0; +} + +float dead(float f) { + // expected-error@#dead_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #dead_fx_call + + // Call to environment-specific function from an unreachable function + // in a shader library - no diagnostic expected. 
+ float B = fy(f); // #dead_fy_call + + // Call to environment-specific function from an unreachable function + // in a shader library - no diagnostic expected. + float C = fz(f); // #dead_fz_call + + return also_dead(f); +} + +template <typename T> +T aliveTemp(T f) { + // expected-error@#aliveTemp_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#aliveTemp_inst {{in instantiation of function template specialization 'aliveTemp<float>' requested here}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #aliveTemp_fx_call + // expected-error@#aliveTemp_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #aliveTemp_fy_call + // expected-error@#aliveTemp_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #aliveTemp_fz_call + return 0; +} + +template <typename T> T aliveTemp2(T f) { + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.6 or newer}} + // expected-note@#fx_half {{'fx' has been marked as being introduced in Shader Model 6.6 here, but the deployment target is Shader Model 6.0}} + // expected-error@#aliveTemp2_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + return fx(f); // #aliveTemp2_fx_call +} + +half test(half x) { + return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2<half>' requested here}} +} + +float test(float x) { + return aliveTemp2(x); // expected-note {{in instantiation of function template specialization 'aliveTemp2<float>' requested here}} +} + +class MyClass +{ + float F; + float makeF() { + // expected-error@#MyClass_makeF_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(F); // #MyClass_makeF_fx_call + // expected-error@#MyClass_makeF_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(F); // #MyClass_makeF_fy_call + // expected-error@#MyClass_makeF_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(F); // #MyClass_makeF_fz_call + } +}; + +// Exported function without body, not used +export void exportedFunctionUnused(float f); + +// Exported function with body, without export, not used +void exportedFunctionUnused(float f) { + // expected-error@#exportedFunctionUnused_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is 
Shader Model 6.0}} + float A = fx(f); // #exportedFunctionUnused_fx_call + + // API with shader-stage-specific availability in unused exported library function + // - no errors expected because the actual shader stage this function + // will be used in is not known at this time + float B = fy(f); + float C = fz(f); +} + +// Exported function with body - called from main() which is a compute shader entry point +export void exportedFunctionUsed(float f) { + // expected-error@#exportedFunctionUsed_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #exportedFunctionUsed_fx_call + + // expected-error@#exportedFunctionUsed_fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #exportedFunctionUsed_fy_call + + // expected-error@#exportedFunctionUsed_fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 6.5 in mesh environment here, but the deployment target is Shader Model 6.0 compute environment}} + float C = fz(f); // #exportedFunctionUsed_fz_call +} + +namespace A { + namespace B { + export { + void exportedFunctionInNS(float x) { + // expected-error@#exportedFunctionInNS_fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(x); // #exportedFunctionInNS_fx_call + + // API with shader-stage-specific availability in exported library function + // - no errors expected because the actual shader stage this function + // will be used in is not known at this time + float B = fy(x); + float C = fz(x); + } + } + } +} + +[shader("compute")] +[numthreads(4,1,1)] +float main() { + float f = 3; + MyClass C = { 1.0f }; + float a = alive(f); float b = aliveTemp(f); // #aliveTemp_inst + float c = C.makeF(); + float d = test((float)1.0); + float e = test((half)1.0); + exportedFunctionUsed(1.0f); + return a * b * c; +} diff --git a/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl b/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl index b56ab8fe4526..bfefc9b116a6 100644 --- a/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl +++ b/clang/test/SemaHLSL/Availability/avail-lib-multiple-stages.hlsl @@ -1,57 +1,57 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ -// RUN: -fsyntax-only -verify %s - -__attribute__((availability(shadermodel, introduced = 6.5))) -float fx(float); // #fx - -__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) -__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) -float fy(float); // #fy - -__attribute__((availability(shadermodel, introduced = 5.0, environment = compute))) -float fz(float); // #fz - - -void F(float f) { - // Make sure we only get this error once, even though this function is scanned twice - once - // in compute shader context and once in pixel shader context. 
- // expected-error@#fx_call {{'fx' is only available on Shader Model 6.5 or newer}} - // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} - float A = fx(f); // #fx_call - - // expected-error@#fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} - // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} - float B = fy(f); // #fy_call - - // expected-error@#fz_call {{'fz' is unavailable}} - // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 5.0 in compute environment here, but the deployment target is Shader Model 6.0 pixel environment}} - float X = fz(f); // #fz_call -} - -void deadCode(float f) { - // no diagnostics expected under default diagnostic mode - float A = fx(f); - float B = fy(f); - float X = fz(f); -} - -// Pixel shader -[shader("pixel")] -void mainPixel() { - F(1.0); -} - -// First Compute shader -[shader("compute")] -[numthreads(4,1,1)] -void mainCompute1() { - F(2.0); -} - -// Second compute shader to make sure we do not get duplicate messages if F is called -// from multiple entry points. -[shader("compute")] -[numthreads(4,1,1)] -void mainCompute2() { - F(3.0); -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-library \ +// RUN: -fsyntax-only -verify %s + +__attribute__((availability(shadermodel, introduced = 6.5))) +float fx(float); // #fx + +__attribute__((availability(shadermodel, introduced = 5.0, environment = pixel))) +__attribute__((availability(shadermodel, introduced = 6.5, environment = compute))) +float fy(float); // #fy + +__attribute__((availability(shadermodel, introduced = 5.0, environment = compute))) +float fz(float); // #fz + + +void F(float f) { + // Make sure we only get this error once, even though this function is scanned twice - once + // in compute shader context and once in pixel shader context. + // expected-error@#fx_call {{'fx' is only available on Shader Model 6.5 or newer}} + // expected-note@#fx {{'fx' has been marked as being introduced in Shader Model 6.5 here, but the deployment target is Shader Model 6.0}} + float A = fx(f); // #fx_call + + // expected-error@#fy_call {{'fy' is only available in compute environment on Shader Model 6.5 or newer}} + // expected-note@#fy {{'fy' has been marked as being introduced in Shader Model 6.5 in compute environment here, but the deployment target is Shader Model 6.0 compute environment}} + float B = fy(f); // #fy_call + + // expected-error@#fz_call {{'fz' is unavailable}} + // expected-note@#fz {{'fz' has been marked as being introduced in Shader Model 5.0 in compute environment here, but the deployment target is Shader Model 6.0 pixel environment}} + float X = fz(f); // #fz_call +} + +void deadCode(float f) { + // no diagnostics expected under default diagnostic mode + float A = fx(f); + float B = fy(f); + float X = fz(f); +} + +// Pixel shader +[shader("pixel")] +void mainPixel() { + F(1.0); +} + +// First Compute shader +[shader("compute")] +[numthreads(4,1,1)] +void mainCompute1() { + F(2.0); +} + +// Second compute shader to make sure we do not get duplicate messages if F is called +// from multiple entry points. 
+[shader("compute")]
+[numthreads(4,1,1)]
+void mainCompute2() {
+  F(3.0);
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl
index a472d5519dc5..1ec56542113d 100644
--- a/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/StructuredBuffers.hlsl
@@ -1,19 +1,19 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s
-
-typedef vector<float, 3> float3;
-
-StructuredBuffer<float3> Buffer;
-
-// expected-error@+2 {{class template 'StructuredBuffer' requires template arguments}}
-// expected-note@*:* {{template declaration from hidden source: template class StructuredBuffer}}
-StructuredBuffer BufferErr1;
-
-// expected-error@+2 {{too few template arguments for class template 'StructuredBuffer'}}
-// expected-note@*:* {{template declaration from hidden source: template class StructuredBuffer}}
-StructuredBuffer<> BufferErr2;
-
-[numthreads(1,1,1)]
-void main() {
-  (void)Buffer.h; // expected-error {{'h' is a private member of 'hlsl::StructuredBuffer<vector<float, 3>>'}}
-  // expected-note@* {{implicitly declared private here}}
-}
+// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.0-compute -x hlsl -fsyntax-only -verify %s
+
+typedef vector<float, 3> float3;
+
+StructuredBuffer<float3> Buffer;
+
+// expected-error@+2 {{class template 'StructuredBuffer' requires template arguments}}
+// expected-note@*:* {{template declaration from hidden source: template class StructuredBuffer}}
+StructuredBuffer BufferErr1;
+
+// expected-error@+2 {{too few template arguments for class template 'StructuredBuffer'}}
+// expected-note@*:* {{template declaration from hidden source: template class StructuredBuffer}}
+StructuredBuffer<> BufferErr2;
+
+[numthreads(1,1,1)]
+void main() {
+  (void)Buffer.h; // expected-error {{'h' is a private member of 'hlsl::StructuredBuffer<vector<float, 3>>'}}
+  // expected-note@* {{implicitly declared private here}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl
index 423f5bac9471..354e7abb8a31 100644
--- a/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/cross-errors.hlsl
@@ -1,43 +1,43 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify
-
-void test_too_few_arg()
-{
-  return __builtin_hlsl_cross();
-  // expected-error@-1 {{too few arguments to function call, expected 2, have 0}}
-}
-
-void test_too_many_arg(float3 p0)
-{
-  return __builtin_hlsl_cross(p0, p0, p0);
-  // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
-}
-
-bool builtin_bool_to_float_type_promotion(bool p1)
-{
-  return __builtin_hlsl_cross(p1, p1);
-  // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}}
-}
-
-bool builtin_cross_int_to_float_promotion(int p1)
-{
-  return __builtin_hlsl_cross(p1, p1);
-  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
-}
-
-bool2 builtin_cross_int2_to_float2_promotion(int2 p1)
-{
-  return __builtin_hlsl_cross(p1, p1);
-  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
-}
-
-float2 builtin_cross_float2(float2 p1, float2 p2)
-{
-  return __builtin_hlsl_cross(p1, p2);
-  // expected-error@-1 {{too many elements in vector operand (expected 3 elements, have 2)}}
-}
-
-float3 builtin_cross_float3_int3(float3 p1, int3 p2)
-{
-  return __builtin_hlsl_cross(p1, p2);
-  // expected-error@-1 {{all arguments to '__builtin_hlsl_cross' must have the same type}}
-}
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify
+
+void test_too_few_arg()
+{
+  return __builtin_hlsl_cross();
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 0}}
+}
+
+void test_too_many_arg(float3 p0)
+{
+  return __builtin_hlsl_cross(p0, p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
+}
+
+bool builtin_bool_to_float_type_promotion(bool p1)
+{
+  return __builtin_hlsl_cross(p1, p1);
+  // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}}
+}
+
+bool builtin_cross_int_to_float_promotion(int p1)
+{
+  return __builtin_hlsl_cross(p1, p1);
+  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
+}
+
+bool2 builtin_cross_int2_to_float2_promotion(int2 p1)
+{
+  return __builtin_hlsl_cross(p1, p1);
+  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
+
+float2 builtin_cross_float2(float2 p1, float2 p2)
+{
+  return __builtin_hlsl_cross(p1, p2);
+  // expected-error@-1 {{too many elements in vector operand (expected 3 elements, have 2)}}
+}
+
+float3 builtin_cross_float3_int3(float3 p1, int3 p2)
+{
+  return __builtin_hlsl_cross(p1, p2);
+  // expected-error@-1 {{all arguments to '__builtin_hlsl_cross' must have the same type}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
index bfbd8b28257a..b876a8e84cb3 100644
--- a/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/half-float-only-errors2.hlsl
@@ -1,13 +1,13 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan2
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow
-
-double test_double_builtin(double p0, double p1) {
-  return TEST_FUNC(p0, p1);
-  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
-}
-
-double2 test_vec_double_builtin(double2 p0, double2 p1) {
-  return TEST_FUNC(p0, p1);
-  // expected-error@-1 {{passing 'double2' (aka 'vector<double, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
-}
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_atan2
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_fmod
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -emit-llvm-only -disable-llvm-passes -verify -DTEST_FUNC=__builtin_elementwise_pow
+
+double test_double_builtin(double p0, double p1) {
+  return TEST_FUNC(p0, p1);
+  // expected-error@-1 {{passing 'double' to parameter of incompatible type 'float'}}
+}
+
+double2 test_vec_double_builtin(double2 p0, double2 p1) {
+  return TEST_FUNC(p0, p1);
+  // expected-error@-1 {{passing 'double2' (aka 'vector<double, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl
index 281faada6f5e..c5e2ac0b502d 100644
--- a/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/length-errors.hlsl
@@ -1,32 +1,32 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected
-
-
-void test_too_few_arg()
-{
-  return __builtin_hlsl_length();
-  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
-}
-
-void test_too_many_arg(float2 p0)
-{
-  return __builtin_hlsl_length(p0, p0);
-  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
-}
-
-bool builtin_bool_to_float_type_promotion(bool p1)
-{
-  return __builtin_hlsl_length(p1);
-  // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}}
-}
-
-bool builtin_length_int_to_float_promotion(int p1)
-{
-  return __builtin_hlsl_length(p1);
-  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
-}
-
-bool2 builtin_length_int2_to_float2_promotion(int2 p1)
-{
-  return __builtin_hlsl_length(p1);
-  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
-}
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected
+
+
+void test_too_few_arg()
+{
+  return __builtin_hlsl_length();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+}
+
+void test_too_many_arg(float2 p0)
+{
+  return __builtin_hlsl_length(p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+}
+
+bool builtin_bool_to_float_type_promotion(bool p1)
+{
+  return __builtin_hlsl_length(p1);
+  // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}}
+}
+
+bool builtin_length_int_to_float_promotion(int p1)
+{
+  return __builtin_hlsl_length(p1);
+  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
+}
+
+bool2 builtin_length_int2_to_float2_promotion(int2 p1)
+{
+  return __builtin_hlsl_length(p1);
+  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl
index fc48c9b2589f..3720dca9b88a 100644
--- a/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/normalize-errors.hlsl
@@ -1,31 +1,31 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected
-
-void test_too_few_arg()
-{
-  return __builtin_hlsl_normalize();
-  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
-}
-
-void test_too_many_arg(float2 p0)
-{
-  return __builtin_hlsl_normalize(p0, p0);
-  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
-}
-
-bool builtin_bool_to_float_type_promotion(bool p1)
-{
-  return __builtin_hlsl_normalize(p1);
-  // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}}
-}
-
-bool builtin_normalize_int_to_float_promotion(int p1)
-{
-  return __builtin_hlsl_normalize(p1);
-  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
-}
-
-bool2 builtin_normalize_int2_to_float2_promotion(int2 p1)
-{
-  return __builtin_hlsl_normalize(p1);
-  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
-}
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected
+
+void test_too_few_arg()
+{
+  return __builtin_hlsl_normalize();
+  // expected-error@-1 {{too few arguments to function call, expected 1, have 0}}
+}
+
+void test_too_many_arg(float2 p0)
+{
+  return __builtin_hlsl_normalize(p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 1, have 2}}
+}
+
+bool builtin_bool_to_float_type_promotion(bool p1)
+{
+  return __builtin_hlsl_normalize(p1);
+  // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}}
+}
+
+bool builtin_normalize_int_to_float_promotion(int p1)
+{
+  return __builtin_hlsl_normalize(p1);
+  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
+}
+
+bool2 builtin_normalize_int2_to_float2_promotion(int2 p1)
+{
+  return __builtin_hlsl_normalize(p1);
+  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
diff --git a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl b/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl
index 823585201ca6..a76c5ff5dbd2 100644
--- a/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl
+++ b/clang/test/SemaHLSL/BuiltIns/step-errors.hlsl
@@ -1,31 +1,31 @@
-// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected
-
-void test_too_few_arg()
-{
-  return __builtin_hlsl_step();
-  // expected-error@-1 {{too few arguments to function call, expected 2, have 0}}
-}
-
-void test_too_many_arg(float2 p0)
-{
-  return __builtin_hlsl_step(p0, p0, p0);
-  // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
-}
-
-bool builtin_bool_to_float_type_promotion(bool p1)
-{
-  return __builtin_hlsl_step(p1, p1);
-  // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}}
-}
-
-bool builtin_step_int_to_float_promotion(int p1)
-{
-  return __builtin_hlsl_step(p1, p1);
-  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
-}
-
-bool2 builtin_step_int2_to_float2_promotion(int2 p1)
-{
-  return __builtin_hlsl_step(p1, p1);
-  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
-}
+// RUN: %clang_cc1 -finclude-default-header -triple dxil-pc-shadermodel6.6-library %s -fnative-half-type -disable-llvm-passes -verify -verify-ignore-unexpected
+
+void test_too_few_arg()
+{
+  return __builtin_hlsl_step();
+  // expected-error@-1 {{too few arguments to function call, expected 2, have 0}}
+}
+
+void test_too_many_arg(float2 p0)
+{
+  return __builtin_hlsl_step(p0, p0, p0);
+  // expected-error@-1 {{too many arguments to function call, expected 2, have 3}}
+}
+
+bool builtin_bool_to_float_type_promotion(bool p1)
+{
+  return __builtin_hlsl_step(p1, p1);
+  // expected-error@-1 {{passing 'bool' to parameter of incompatible type 'float'}}
+}
+
+bool builtin_step_int_to_float_promotion(int p1)
+{
+  return __builtin_hlsl_step(p1, p1);
+  // expected-error@-1 {{passing 'int' to parameter of incompatible type 'float'}}
+}
+
+bool2 builtin_step_int2_to_float2_promotion(int2 p1)
+{
+  return __builtin_hlsl_step(p1, p1);
+  // expected-error@-1 {{passing 'int2' (aka 'vector<int, 2>') to parameter of incompatible type '__attribute__((__vector_size__(2 * sizeof(float)))) float' (vector of 2 'float' values)}}
+}
diff --git a/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl b/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl
index 8c0f8d6f271d..1223a131af35 100644
--- a/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl
+++ b/clang/test/SemaHLSL/Types/Traits/IsIntangibleType.hlsl
@@ -1,81 +1,81 @@
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s
-// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -verify %s
-// expected-no-diagnostics
-
-_Static_assert(__builtin_hlsl_is_intangible(__hlsl_resource_t), "");
-// no need to check array of __hlsl_resource_t, arrays of sizeless types are not supported
-
-_Static_assert(!__builtin_hlsl_is_intangible(int), "");
-_Static_assert(!__builtin_hlsl_is_intangible(float3), "");
-_Static_assert(!__builtin_hlsl_is_intangible(half[4]), "");
-
-typedef __hlsl_resource_t Res;
-_Static_assert(__builtin_hlsl_is_intangible(const Res), "");
-// no need to check array of Res, arrays of sizeless types are not supported
-
-struct ABuffer {
-  const int i[10];
-  __hlsl_resource_t h;
-};
-_Static_assert(__builtin_hlsl_is_intangible(ABuffer), "");
-_Static_assert(__builtin_hlsl_is_intangible(ABuffer[10]), "");
-
-struct MyStruct {
-  half2 h2;
-  int3 i3;
-};
-_Static_assert(!__builtin_hlsl_is_intangible(MyStruct), "");
-_Static_assert(!__builtin_hlsl_is_intangible(MyStruct[10]), "");
-
-class MyClass {
-  int3 ivec;
-  float farray[12];
-  MyStruct ms;
-  ABuffer buf;
-};
-_Static_assert(__builtin_hlsl_is_intangible(MyClass), "");
-_Static_assert(__builtin_hlsl_is_intangible(MyClass[2]), "");
-
-union U {
-  double d[4];
-  Res buf;
-};
-_Static_assert(__builtin_hlsl_is_intangible(U), "");
-_Static_assert(__builtin_hlsl_is_intangible(U[100]), "");
-
-class MyClass2 {
-  int3 ivec;
-  float farray[12];
-  U u;
-};
-_Static_assert(__builtin_hlsl_is_intangible(MyClass2), "");
-_Static_assert(__builtin_hlsl_is_intangible(MyClass2[5]), "");
-
-class Simple {
-  int a;
-};
-
-template struct TemplatedBuffer {
-  T a;
-  __hlsl_resource_t h;
-};
-_Static_assert(__builtin_hlsl_is_intangible(TemplatedBuffer), "");
-
-struct MyStruct2 : TemplatedBuffer {
-  float x;
-};
-_Static_assert(__builtin_hlsl_is_intangible(MyStruct2), "");
-
-struct MyStruct3 {
-  const TemplatedBuffer TB[10];
-};
-_Static_assert(__builtin_hlsl_is_intangible(MyStruct3), "");
-
-template struct SimpleTemplate {
-  T a;
-};
-_Static_assert(__builtin_hlsl_is_intangible(SimpleTemplate<__hlsl_resource_t>), "");
-_Static_assert(!__builtin_hlsl_is_intangible(SimpleTemplate), "");
-
-_Static_assert(__builtin_hlsl_is_intangible(RWBuffer), "");
-_Static_assert(__builtin_hlsl_is_intangible(StructuredBuffer), ""); +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -fnative-half-type -verify %s +// expected-no-diagnostics + +_Static_assert(__builtin_hlsl_is_intangible(__hlsl_resource_t), ""); +// no need to check array of __hlsl_resource_t, arrays of sizeless types are not supported + +_Static_assert(!__builtin_hlsl_is_intangible(int), ""); +_Static_assert(!__builtin_hlsl_is_intangible(float3), ""); +_Static_assert(!__builtin_hlsl_is_intangible(half[4]), ""); + +typedef __hlsl_resource_t Res; +_Static_assert(__builtin_hlsl_is_intangible(const Res), ""); +// no need to check array of Res, arrays of sizeless types are not supported + +struct ABuffer { + const int i[10]; + __hlsl_resource_t h; +}; +_Static_assert(__builtin_hlsl_is_intangible(ABuffer), ""); +_Static_assert(__builtin_hlsl_is_intangible(ABuffer[10]), ""); + +struct MyStruct { + half2 h2; + int3 i3; +}; +_Static_assert(!__builtin_hlsl_is_intangible(MyStruct), ""); +_Static_assert(!__builtin_hlsl_is_intangible(MyStruct[10]), ""); + +class MyClass { + int3 ivec; + float farray[12]; + MyStruct ms; + ABuffer buf; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyClass), ""); +_Static_assert(__builtin_hlsl_is_intangible(MyClass[2]), ""); + +union U { + double d[4]; + Res buf; +}; +_Static_assert(__builtin_hlsl_is_intangible(U), ""); +_Static_assert(__builtin_hlsl_is_intangible(U[100]), ""); + +class MyClass2 { + int3 ivec; + float farray[12]; + U u; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyClass2), ""); +_Static_assert(__builtin_hlsl_is_intangible(MyClass2[5]), ""); + +class Simple { + int a; +}; + +template struct TemplatedBuffer { + T a; + __hlsl_resource_t h; +}; +_Static_assert(__builtin_hlsl_is_intangible(TemplatedBuffer), ""); + +struct MyStruct2 : TemplatedBuffer { + float x; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyStruct2), ""); + +struct MyStruct3 { + const TemplatedBuffer TB[10]; +}; +_Static_assert(__builtin_hlsl_is_intangible(MyStruct3), ""); + +template struct SimpleTemplate { + T a; +}; +_Static_assert(__builtin_hlsl_is_intangible(SimpleTemplate<__hlsl_resource_t>), ""); +_Static_assert(!__builtin_hlsl_is_intangible(SimpleTemplate), ""); + +_Static_assert(__builtin_hlsl_is_intangible(RWBuffer), ""); +_Static_assert(__builtin_hlsl_is_intangible(StructuredBuffer), ""); diff --git a/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl b/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl index de9ac90b895f..33614e87640d 100644 --- a/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl +++ b/clang/test/SemaHLSL/Types/Traits/IsIntangibleTypeErrors.hlsl @@ -1,12 +1,12 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s - -struct Undefined; // expected-note {{forward declaration of 'Undefined'}} -_Static_assert(!__builtin_hlsl_is_intangible(Undefined), ""); // expected-error{{incomplete type 'Undefined' used in type trait expression}} - -void fn(int X) { // expected-note {{declared here}} - // expected-error@#vla {{variable length arrays are not supported for the current target}} - // expected-error@#vla {{variable length arrays are not supported in '__builtin_hlsl_is_intangible'}} - // expected-warning@#vla {{variable length arrays in C++ are a Clang extension}} - // expected-note@#vla {{function parameter 'X' with unknown value cannot be used in a 
constant expression}} - _Static_assert(!__builtin_hlsl_is_intangible(int[X]), ""); // #vla -} +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.6-library -finclude-default-header -verify %s + +struct Undefined; // expected-note {{forward declaration of 'Undefined'}} +_Static_assert(!__builtin_hlsl_is_intangible(Undefined), ""); // expected-error{{incomplete type 'Undefined' used in type trait expression}} + +void fn(int X) { // expected-note {{declared here}} + // expected-error@#vla {{variable length arrays are not supported for the current target}} + // expected-error@#vla {{variable length arrays are not supported in '__builtin_hlsl_is_intangible'}} + // expected-warning@#vla {{variable length arrays in C++ are a Clang extension}} + // expected-note@#vla {{function parameter 'X' with unknown value cannot be used in a constant expression}} + _Static_assert(!__builtin_hlsl_is_intangible(int[X]), ""); // #vla +} diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl index 760c057630a7..4e50f70952ad 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_basic.hlsl @@ -1,42 +1,42 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify - -// expected-error@+1{{binding type 't' only applies to SRV resources}} -float f1 : register(t0); - -// expected-error@+1 {{binding type 'u' only applies to UAV resources}} -float f2 : register(u0); - -// expected-error@+1{{binding type 'b' only applies to constant buffers. The 'bool constant' binding type is no longer supported}} -float f3 : register(b9); - -// expected-error@+1 {{binding type 's' only applies to sampler state}} -float f4 : register(s0); - -// expected-error@+1{{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} -float f5 : register(i9); - -// expected-error@+1{{binding type 'x' is invalid}} -float f6 : register(x9); - -cbuffer g_cbuffer1 { -// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}} - float f7 : register(c2); -}; - -tbuffer g_tbuffer1 { -// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}} - float f8 : register(c2); -}; - -cbuffer g_cbuffer2 { -// expected-error@+1{{binding type 'b' only applies to constant buffer resources}} - float f9 : register(b2); -}; - -tbuffer g_tbuffer2 { -// expected-error@+1{{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} - float f10 : register(i2); -}; - -// expected-error@+1{{binding type 'c' only applies to numeric variables in the global scope}} -RWBuffer f11 : register(c3); +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify + +// expected-error@+1{{binding type 't' only applies to SRV resources}} +float f1 : register(t0); + +// expected-error@+1 {{binding type 'u' only applies to UAV resources}} +float f2 : register(u0); + +// expected-error@+1{{binding type 'b' only applies to constant buffers. The 'bool constant' binding type is no longer supported}} +float f3 : register(b9); + +// expected-error@+1 {{binding type 's' only applies to sampler state}} +float f4 : register(s0); + +// expected-error@+1{{binding type 'i' ignored. 
The 'integer constant' binding type is no longer supported}} +float f5 : register(i9); + +// expected-error@+1{{binding type 'x' is invalid}} +float f6 : register(x9); + +cbuffer g_cbuffer1 { +// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}} + float f7 : register(c2); +}; + +tbuffer g_tbuffer1 { +// expected-error@+1{{binding type 'c' ignored in buffer declaration. Did you mean 'packoffset'?}} + float f8 : register(c2); +}; + +cbuffer g_cbuffer2 { +// expected-error@+1{{binding type 'b' only applies to constant buffer resources}} + float f9 : register(b2); +}; + +tbuffer g_tbuffer2 { +// expected-error@+1{{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} + float f10 : register(i2); +}; + +// expected-error@+1{{binding type 'c' only applies to numeric variables in the global scope}} +RWBuffer f11 : register(c3); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl index 4c9e9a6b44c9..503c8469666f 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_other.hlsl @@ -1,9 +1,9 @@ -// RUN: not %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s | FileCheck %s - -// XFAIL: * -// This expectedly fails because RayQuery is an unsupported type. -// When it becomes supported, we should expect an error due to -// the variable type being classified as "other", and according -// to the spec, err_hlsl_unsupported_register_type_and_variable_type -// should be emitted. -RayQuery<0> r1: register(t0); +// RUN: not %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s | FileCheck %s + +// XFAIL: * +// This expectedly fails because RayQuery is an unsupported type. +// When it becomes supported, we should expect an error due to +// the variable type being classified as "other", and according +// to the spec, err_hlsl_unsupported_register_type_and_variable_type +// should be emitted. +RayQuery<0> r1: register(t0); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl index 4b6af47c0ab7..ea43e27b5b5a 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_resource.hlsl @@ -1,49 +1,49 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify - -// This test validates the diagnostics that are emitted when a variable with a "resource" type -// is bound to a register using the register annotation - - -template -struct MyTemplatedSRV { - __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; -}; - -struct MySRV { - __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; -}; - -struct MySampler { - __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x; -}; - -struct MyUAV { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; -}; - -struct MyCBuffer { - __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x; -}; - - -// expected-error@+1 {{binding type 'i' ignored. 
The 'integer constant' binding type is no longer supported}} -MySRV invalid : register(i2); - -// expected-error@+1 {{binding type 't' only applies to SRV resources}} -MyUAV a : register(t2, space1); - -// expected-error@+1 {{binding type 'u' only applies to UAV resources}} -MySampler b : register(u2, space1); - -// expected-error@+1 {{binding type 'b' only applies to constant buffer resources}} -MyTemplatedSRV c : register(b2); - -// expected-error@+1 {{binding type 's' only applies to sampler state}} -MyUAV d : register(s2, space1); - -// empty binding prefix cases: -// expected-error@+1 {{expected identifier}} -MyTemplatedSRV e: register(); - -// expected-error@+1 {{expected identifier}} -MyTemplatedSRV f: register(""); +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify + +// This test validates the diagnostics that are emitted when a variable with a "resource" type +// is bound to a register using the register annotation + + +template +struct MyTemplatedSRV { + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; +}; + +struct MySRV { + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; +}; + +struct MySampler { + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x; +}; + +struct MyUAV { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; +}; + +struct MyCBuffer { + __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x; +}; + + +// expected-error@+1 {{binding type 'i' ignored. The 'integer constant' binding type is no longer supported}} +MySRV invalid : register(i2); + +// expected-error@+1 {{binding type 't' only applies to SRV resources}} +MyUAV a : register(t2, space1); + +// expected-error@+1 {{binding type 'u' only applies to UAV resources}} +MySampler b : register(u2, space1); + +// expected-error@+1 {{binding type 'b' only applies to constant buffer resources}} +MyTemplatedSRV c : register(b2); + +// expected-error@+1 {{binding type 's' only applies to sampler state}} +MyUAV d : register(s2, space1); + +// empty binding prefix cases: +// expected-error@+1 {{expected identifier}} +MyTemplatedSRV e: register(); + +// expected-error@+1 {{expected identifier}} +MyTemplatedSRV f: register(""); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl index e63f264452da..7f248e30c070 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_silence_diags.hlsl @@ -1,27 +1,27 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only -Wno-legacy-constant-register-binding %s -verify - -// expected-no-diagnostics -float f2 : register(b9); - -float f3 : register(i9); - -cbuffer g_cbuffer1 { - float f4 : register(c2); -}; - - -struct Eg12{ - RWBuffer a; -}; - -Eg12 e12 : register(c9); - -Eg12 bar : register(i1); - -struct Eg7 { - struct Bar { - float f; - }; - Bar b; -}; -Eg7 e7 : register(t0); +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only -Wno-legacy-constant-register-binding %s -verify + +// expected-no-diagnostics +float f2 : register(b9); + +float f3 : register(i9); + +cbuffer g_cbuffer1 { + float f4 : register(c2); +}; + + +struct Eg12{ + RWBuffer a; +}; + +Eg12 e12 : register(c9); + +Eg12 bar : register(i1); + +struct Eg7 { + struct Bar { + float f; + }; + Bar b; +}; +Eg7 e7 : register(t0); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl 
b/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl index 70e64e6ca752..3001dbb1e3ec 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_space.hlsl @@ -1,62 +1,62 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify - -// valid -cbuffer cbuf { - RWBuffer r : register(u0, space0); -} - -cbuffer cbuf2 { - struct x { - // this test validates that no diagnostic is emitted on the space parameter, because - // this register annotation is not in the global scope. - // expected-error@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} - RWBuffer E : register(u2, space3); - }; -} - -struct MyStruct { - RWBuffer E; -}; - -cbuffer cbuf3 { - // valid - MyStruct E : register(u2, space3); -} - -// valid -MyStruct F : register(u3, space4); - -cbuffer cbuf4 { - // this test validates that no diagnostic is emitted on the space parameter, because - // this register annotation is not in the global scope. - // expected-error@+1 {{binding type 'u' only applies to UAV resources}} - float a : register(u2, space3); -} - -// expected-error@+1 {{invalid space specifier 's2' used; expected 'space' followed by an integer, like space1}} -cbuffer a : register(b0, s2) { - -} - -// expected-error@+1 {{invalid space specifier 'spaces' used; expected 'space' followed by an integer, like space1}} -cbuffer b : register(b2, spaces) { - -} - -// expected-error@+1 {{wrong argument format for hlsl attribute, use space3 instead}} -cbuffer c : register(b2, space 3) {} - -// expected-error@+1 {{register space cannot be specified on global constants}} -int d : register(c2, space3); - -// expected-error@+1 {{register space cannot be specified on global constants}} -int e : register(c2, space0); - -// expected-error@+1 {{register space cannot be specified on global constants}} -int f : register(c2, space00); - -// valid -RWBuffer g : register(u2, space0); - -// valid -RWBuffer h : register(u2, space0); +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify + +// valid +cbuffer cbuf { + RWBuffer r : register(u0, space0); +} + +cbuffer cbuf2 { + struct x { + // this test validates that no diagnostic is emitted on the space parameter, because + // this register annotation is not in the global scope. + // expected-error@+1 {{'register' attribute only applies to cbuffer/tbuffer and external global variables}} + RWBuffer E : register(u2, space3); + }; +} + +struct MyStruct { + RWBuffer E; +}; + +cbuffer cbuf3 { + // valid + MyStruct E : register(u2, space3); +} + +// valid +MyStruct F : register(u3, space4); + +cbuffer cbuf4 { + // this test validates that no diagnostic is emitted on the space parameter, because + // this register annotation is not in the global scope. 
+ // expected-error@+1 {{binding type 'u' only applies to UAV resources}} + float a : register(u2, space3); +} + +// expected-error@+1 {{invalid space specifier 's2' used; expected 'space' followed by an integer, like space1}} +cbuffer a : register(b0, s2) { + +} + +// expected-error@+1 {{invalid space specifier 'spaces' used; expected 'space' followed by an integer, like space1}} +cbuffer b : register(b2, spaces) { + +} + +// expected-error@+1 {{wrong argument format for hlsl attribute, use space3 instead}} +cbuffer c : register(b2, space 3) {} + +// expected-error@+1 {{register space cannot be specified on global constants}} +int d : register(c2, space3); + +// expected-error@+1 {{register space cannot be specified on global constants}} +int e : register(c2, space0); + +// expected-error@+1 {{register space cannot be specified on global constants}} +int f : register(c2, space00); + +// valid +RWBuffer g : register(u2, space0); + +// valid +RWBuffer h : register(u2, space0); diff --git a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl index 40517f393e12..235004102a53 100644 --- a/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl +++ b/clang/test/SemaHLSL/resource_binding_attr_error_udt.hlsl @@ -1,135 +1,135 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify - -template -struct MyTemplatedUAV { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; -}; - -struct MySRV { - __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; -}; - -struct MySampler { - __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x; -}; - -struct MyUAV { - __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; -}; - -struct MyCBuffer { - __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x; -}; - -// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0 -struct Eg1 { - float f; - MySRV SRVBuf; - MyUAV UAVBuf; - }; -Eg1 e1 : register(t0) : register(u0); - -// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0. -// UAVBuf2 gets automatically assigned to u1 even though there is no explicit binding for u1. -struct Eg2 { - float f; - MySRV SRVBuf; - MyUAV UAVBuf; - MyUAV UAVBuf2; - }; -Eg2 e2 : register(t0) : register(u0); - -// Valid: Bar, the struct within Eg3, has a valid resource that can be bound to t0. 
-struct Eg3 { - struct Bar { - MyUAV a; - }; - Bar b; -}; -Eg3 e3 : register(u0); - -// Valid: the first sampler state object within 's' is bound to slot 5 -struct Eg4 { - MySampler s[3]; -}; - -Eg4 e4 : register(s5); - - -struct Eg5 { - float f; -}; -// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}} -Eg5 e5 : register(t0); - -struct Eg6 { - float f; -}; -// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}} -Eg6 e6 : register(u0); - -struct Eg7 { - float f; -}; -// expected-warning@+1{{binding type 'b' only applies to types containing constant buffer resources}} -Eg7 e7 : register(b0); - -struct Eg8 { - float f; -}; -// expected-warning@+1{{binding type 's' only applies to types containing sampler state}} -Eg8 e8 : register(s0); - -struct Eg9 { - MySRV s; -}; -// expected-warning@+1{{binding type 'c' only applies to types containing numeric types}} -Eg9 e9 : register(c0); - -struct Eg10{ - // expected-error@+1{{'register' attribute only applies to cbuffer/tbuffer and external global variables}} - MyTemplatedUAV a : register(u9); -}; -Eg10 e10; - - -template -struct Eg11 { - R b; -}; -// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}} -Eg11 e11 : register(u0); -// invalid because after template expansion, there are no valid resources inside Eg11 to bind as a UAV, only an SRV - - -struct Eg12{ - MySRV s1; - MySRV s2; -}; -// expected-warning@+2{{binding type 'u' only applies to types containing UAV resources}} -// expected-error@+1{{binding type 'u' cannot be applied more than once}} -Eg12 e12 : register(u9) : register(u10); - -struct Eg13{ - MySRV s1; - MySRV s2; -}; -// expected-warning@+3{{binding type 'u' only applies to types containing UAV resources}} -// expected-error@+2{{binding type 'u' cannot be applied more than once}} -// expected-error@+1{{binding type 'u' cannot be applied more than once}} -Eg13 e13 : register(u9) : register(u10) : register(u11); - -// expected-error@+1{{binding type 't' cannot be applied more than once}} -Eg13 e13_2 : register(t11) : register(t12); - -struct Eg14{ - MyTemplatedUAV r1; -}; -// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}} -Eg14 e14 : register(t9); - -struct Eg15 { - float f[4]; -}; -// expected no error -Eg15 e15 : register(c0); +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -o - -fsyntax-only %s -verify + +template +struct MyTemplatedUAV { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; +}; + +struct MySRV { + __hlsl_resource_t [[hlsl::resource_class(SRV)]] x; +}; + +struct MySampler { + __hlsl_resource_t [[hlsl::resource_class(Sampler)]] x; +}; + +struct MyUAV { + __hlsl_resource_t [[hlsl::resource_class(UAV)]] x; +}; + +struct MyCBuffer { + __hlsl_resource_t [[hlsl::resource_class(CBuffer)]] x; +}; + +// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0 +struct Eg1 { + float f; + MySRV SRVBuf; + MyUAV UAVBuf; + }; +Eg1 e1 : register(t0) : register(u0); + +// Valid: f is skipped, SRVBuf is bound to t0, UAVBuf is bound to u0. +// UAVBuf2 gets automatically assigned to u1 even though there is no explicit binding for u1. +struct Eg2 { + float f; + MySRV SRVBuf; + MyUAV UAVBuf; + MyUAV UAVBuf2; + }; +Eg2 e2 : register(t0) : register(u0); + +// Valid: Bar, the struct within Eg3, has a valid resource that can be bound to t0. 
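For reference, the binding-type rules these diagnostics enforce pair each register class with one resource kind. A minimal sketch of the valid combinations, with illustrative declarations that are not taken from the patch:

Buffer<float> SrvBuf : register(t0);           // 't' binds SRV resources
RWBuffer<float> UavBuf : register(u0, space1); // 'u' binds UAV resources; space is optional
SamplerState Samp : register(s0);              // 's' binds sampler state
cbuffer Params : register(b0) {                // 'b' binds constant buffers
  float Scale;                                 // members may use packoffset instead
}
float Gain : register(c1);                     // 'c' binds numeric variables at global scope

Mismatching any of these (for example, a sampler on 'u', or 'c' on a resource type) produces the errors and warnings tested above.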
+struct Eg3 { + struct Bar { + MyUAV a; + }; + Bar b; +}; +Eg3 e3 : register(u0); + +// Valid: the first sampler state object within 's' is bound to slot 5 +struct Eg4 { + MySampler s[3]; +}; + +Eg4 e4 : register(s5); + + +struct Eg5 { + float f; +}; +// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}} +Eg5 e5 : register(t0); + +struct Eg6 { + float f; +}; +// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}} +Eg6 e6 : register(u0); + +struct Eg7 { + float f; +}; +// expected-warning@+1{{binding type 'b' only applies to types containing constant buffer resources}} +Eg7 e7 : register(b0); + +struct Eg8 { + float f; +}; +// expected-warning@+1{{binding type 's' only applies to types containing sampler state}} +Eg8 e8 : register(s0); + +struct Eg9 { + MySRV s; +}; +// expected-warning@+1{{binding type 'c' only applies to types containing numeric types}} +Eg9 e9 : register(c0); + +struct Eg10{ + // expected-error@+1{{'register' attribute only applies to cbuffer/tbuffer and external global variables}} + MyTemplatedUAV a : register(u9); +}; +Eg10 e10; + + +template +struct Eg11 { + R b; +}; +// expected-warning@+1{{binding type 'u' only applies to types containing UAV resources}} +Eg11 e11 : register(u0); +// invalid because after template expansion, there are no valid resources inside Eg11 to bind as a UAV, only an SRV + + +struct Eg12{ + MySRV s1; + MySRV s2; +}; +// expected-warning@+2{{binding type 'u' only applies to types containing UAV resources}} +// expected-error@+1{{binding type 'u' cannot be applied more than once}} +Eg12 e12 : register(u9) : register(u10); + +struct Eg13{ + MySRV s1; + MySRV s2; +}; +// expected-warning@+3{{binding type 'u' only applies to types containing UAV resources}} +// expected-error@+2{{binding type 'u' cannot be applied more than once}} +// expected-error@+1{{binding type 'u' cannot be applied more than once}} +Eg13 e13 : register(u9) : register(u10) : register(u11); + +// expected-error@+1{{binding type 't' cannot be applied more than once}} +Eg13 e13_2 : register(t11) : register(t12); + +struct Eg14{ + MyTemplatedUAV r1; +}; +// expected-warning@+1{{binding type 't' only applies to types containing SRV resources}} +Eg14 e14 : register(t9); + +struct Eg15 { + float f[4]; +}; +// expected no error +Eg15 e15 : register(c0); diff --git a/clang/tools/scan-build/bin/scan-build.bat b/clang/tools/scan-build/bin/scan-build.bat index 77be6746318f..f765f205b8ec 100644 --- a/clang/tools/scan-build/bin/scan-build.bat +++ b/clang/tools/scan-build/bin/scan-build.bat @@ -1 +1 @@ -perl -S scan-build %* +perl -S scan-build %* diff --git a/clang/tools/scan-build/libexec/c++-analyzer.bat b/clang/tools/scan-build/libexec/c++-analyzer.bat index 69f048a91671..83c7172456a5 100644 --- a/clang/tools/scan-build/libexec/c++-analyzer.bat +++ b/clang/tools/scan-build/libexec/c++-analyzer.bat @@ -1 +1 @@ -perl -S c++-analyzer %* +perl -S c++-analyzer %* diff --git a/clang/tools/scan-build/libexec/ccc-analyzer.bat b/clang/tools/scan-build/libexec/ccc-analyzer.bat index 2a85376eb82b..fdd36f3bdd04 100644 --- a/clang/tools/scan-build/libexec/ccc-analyzer.bat +++ b/clang/tools/scan-build/libexec/ccc-analyzer.bat @@ -1 +1 @@ -perl -S ccc-analyzer %* +perl -S ccc-analyzer %* diff --git a/clang/utils/ClangVisualizers/clang.natvis b/clang/utils/ClangVisualizers/clang.natvis index a7c70186bc46..611c20dacce1 100644 --- a/clang/utils/ClangVisualizers/clang.natvis +++ b/clang/utils/ClangVisualizers/clang.natvis @@ -1,1089 
+1,1089 @@ - - - - - - - LocInfoType - {(clang::Type::TypeClass)TypeBits.TC, en}Type - - {*(clang::BuiltinType *)this} - {*(clang::PointerType *)this} - {*(clang::ParenType *)this} - {(clang::BitIntType *)this} - {*(clang::LValueReferenceType *)this} - {*(clang::RValueReferenceType *)this} - {(clang::ConstantArrayType *)this,na} - {(clang::ConstantArrayType *)this,view(left)na} - {(clang::ConstantArrayType *)this,view(right)na} - {(clang::VariableArrayType *)this,na} - {(clang::VariableArrayType *)this,view(left)na} - {(clang::VariableArrayType *)this,view(right)na} - {(clang::IncompleteArrayType *)this,na} - {(clang::IncompleteArrayType *)this,view(left)na} - {(clang::IncompleteArrayType *)this,view(right)na} - {(clang::TypedefType *)this,na} - {(clang::TypedefType *)this,view(cpp)na} - {*(clang::AttributedType *)this} - {(clang::DecayedType *)this,na} - {(clang::DecayedType *)this,view(left)na} - {(clang::DecayedType *)this,view(right)na} - {(clang::ElaboratedType *)this,na} - {(clang::ElaboratedType *)this,view(left)na} - {(clang::ElaboratedType *)this,view(right)na} - {*(clang::TemplateTypeParmType *)this} - {*(clang::TemplateTypeParmType *)this,view(cpp)} - {*(clang::SubstTemplateTypeParmType *)this} - {*(clang::RecordType *)this} - {*(clang::RecordType *)this,view(cpp)} - {(clang::FunctionProtoType *)this,na} - {(clang::FunctionProtoType *)this,view(left)na} - {(clang::FunctionProtoType *)this,view(right)na} - {*(clang::TemplateSpecializationType *)this} - {*(clang::DeducedTemplateSpecializationType *)this} - {*(clang::DeducedTemplateSpecializationType *)this,view(cpp)} - {*(clang::InjectedClassNameType *)this} - {*(clang::DependentNameType *)this} - {*(clang::PackExpansionType *)this} - {(clang::LocInfoType *)this,na} - {(clang::LocInfoType *)this,view(cpp)na} - {this,view(poly)na} - {*this,view(cpp)} - - No visualizer yet for {(clang::Type::TypeClass)TypeBits.TC,en}Type - Dependence{" ",en} - - CachedLinkage: {(clang::Linkage)TypeBits.CachedLinkage,en} CachedLocalOrUnnamed - CachedLinkage: {(clang::Linkage)TypeBits.CachedLinkage,en}{" ",sb} - - FromAST - - - No TypeBits set beyond TypeClass - - {*this, view(Dependence)}{*this, view(Cache)}{*this, view(FromAST)} - {*this,view(cmn)} {{{*this,view(poly)}}} - - (clang::Type::TypeClass)TypeBits.TC - this,view(flags)na - CanonicalType - *(clang::BuiltinType *)this - *(clang::PointerType *)this - *(clang::ParenType*)this - *(clang::BitIntType*)this - *(clang::LValueReferenceType *)this - *(clang::RValueReferenceType *)this - (clang::ConstantArrayType *)this - (clang::VariableArrayType *)this - (clang::IncompleteArrayType *)this - *(clang::AttributedType *)this - (clang::DecayedType *)this - (clang::ElaboratedType *)this - (clang::TemplateTypeParmType *)this - (clang::SubstTemplateTypeParmType *)this - (clang::RecordType *)this - (clang::FunctionProtoType *)this - (clang::TemplateSpecializationType *)this - (clang::DeducedTemplateSpecializationType *)this - (clang::InjectedClassNameType *)this - (clang::DependentNameType *)this - (clang::PackExpansionType *)this - (clang::LocInfoType *)this - - - - - ElementType - - - - {ElementType,view(cpp)} - [{Size}] - {ElementType,view(cpp)}[{Size}] - - Size - (clang::ArrayType *)this - - - - {ElementType,view(cpp)} - [] - {ElementType,view(cpp)}[] - - (clang::ArrayType *)this - - - - {ElementType,view(cpp)} - [*] - {ElementType,view(cpp)}[*] - - (clang::Expr *)SizeExpr - (clang::ArrayType *)this - - - - {Decl,view(name)nd} - {Decl} - - Decl - *(clang::Type *)this, view(cmn) - - - - {PointeeType, 
view(cpp)} * - - PointeeType - *(clang::Type *)this, view(cmn) - - - - {Inner, view(cpp)} - - Inner - *(clang::Type *)this, view(cmn) - - - - signed _BitInt({NumBits}) - unsigned _BitInt({NumBits})( - - NumBits - (clang::Type *)this, view(cmn) - - - - - {((clang::ReferenceType *)this)->PointeeType,view(cpp)} & - - *(clang::Type *)this, view(cmn) - PointeeType - - - - {((clang::ReferenceType *)this)->PointeeType,view(cpp)} && - - *(clang::Type *)this, view(cmn) - PointeeType - - - - {ModifiedType} Attribute={(clang::AttributedType::Kind)AttributedTypeBits.AttrKind} - - - - - {(clang::Decl::Kind)DeclContextBits.DeclKind,en}Decl - - (clang::Decl::Kind)DeclContextBits.DeclKind,en - - - - - FirstDecl - (clang::Decl *)(*(intptr_t *)NextInContextAndBits.Value.Data & ~3) - *this - - - - - - - Field {{{*(clang::DeclaratorDecl *)this,view(cpp)nd}}} - - - {*(clang::FunctionDecl *)this,nd} - Method {{{*this,view(cpp)}}} - - - Constructor {{{Name,view(cpp)}({*(clang::FunctionDecl *)this,view(parm0)nd})}} - - - Destructor {{~{Name,view(cpp)}()}} - - - typename - class - (not yet known if parameter pack) - ... - - {(TypeSourceInfo *)(*(uintptr_t *)DefaultArgument.ValueOrInherited.Val.Value.Data&~3LL),view(cpp)} - {{InheritedInitializer}} - = {this,view(DefaultArg)na} - - {*this,view(TorC)} {*this,view(MaybeEllipses)}{Name,view(cpp)} {this,view(Initializer)na} - - - {*TemplatedDecl,view(cpp)} - template{TemplateParams,na} {*TemplatedDecl}; - - TemplateParams,na - TemplatedDecl,na - - - - - {(clang::TypeSourceInfo *)(*(uintptr_t *)MaybeModedTInfo.Value.Data & ~7LL),view(cpp)na} - {(clang::TypedefNameDecl::ModedTInfo *)(*(uintptr_t *)MaybeModedTInfo.Value.Data & ~7LL),view(cpp)na} - {(TypeDecl *)this,view(cpp)nand} - typedef {this,view(type)na} {this,view(name)na}; - - "Not yet calculated",sb - (bool)(*(uintptr_t *)MaybeModedTInfo.Value.Data & 2) - (clang::TypeSourceInfo *)(*(uintptr_t *)MaybeModedTInfo.Value.Data & ~7LL) - (clang::TypedefNameDecl::ModedTInfo *)(*(uintptr_t *)MaybeModedTInfo.Value.Data & ~7LL) - (TypeDecl *)this,nd - - - - {(TypedefNameDecl *)this,view(name)nand} - using {(TypedefNameDecl *)this,view(name)nand} = {(TypedefNameDecl *)this,view(type)nand} - - - {Name} - - - Kind={(UncommonTemplateNameStorage::Kind)Kind,en}, Size={Size} - - (UncommonTemplateNameStorage::Kind)Kind - Size - - - - {Bits}, - {this,view(cmn)na},{(OverloadedTemplateStorage*)this,na} - {this,view(cmn)na},{(AssumedTemplateStorage*)this,na} - {this,view(cmn)na},{(SubstTemplateTemplateParmStorage*)this,na} - {this,view(cmn)na},{(SubstTemplateTemplateParmPackStorage*)this,na} - {this,view(cmn)na} - - Bits - (OverloadedTemplateStorage*)this - (AssumedTemplateStorage*)this - (SubstTemplateTemplateParmStorage*)this - (SubstTemplateTemplateParmPackStorage*)this - - - - - - - {(clang::TemplateDecl *)(Val.Value & ~3LL),view(cpp)na} - - - {(clang::TemplateDecl *)(Val.Value & ~3LL),na} - - - {(clang::UncommonTemplateNameStorage *)(Val.Value & ~3LL),view(cpp)na} - - - {(clang::UncommonTemplateNameStorage *)(Val.Value & ~3LL),na} - - - {(clang::QualifiedTemplateName *)(Val.Value & ~3LL),view(cpp)na} - - - {(clang::QualifiedTemplateName *)(Val.Value & ~3LL),na} - - - {(clang::DependentTemplateName *)(Val.Value & ~3LL),view(cpp)na} - - - {(clang::DependentTemplateName *)(Val.Value & ~3LL),na} - - - "TemplateDecl",s8b - - (clang::TemplateDecl *)(Val.Value & ~3LL) - - "UncommonTemplateNameStorage",s8b - - (clang::UncommonTemplateNameStorage *)(Val.Value & ~3LL) - - "QualifiedTemplateName",s8b - - (clang::QualifiedTemplateName 
*)(Val.Value & ~3LL) - - "DependentTemplateName",s8b - - (clang::DependentTemplateName *)(Val.Value & ~3LL) - - Val - - - - - {Storage,view(cpp)na} - {Storage,na} - - Storage - - - - {Name,view(cpp)} - {Name} - - - implicit{" ",sb} - - {*this,view(implicit)nd} - {*this,view(modifiers)}{Name,view(cpp)} - {*this,view(modifiers)nd}struct {Name,view(cpp)} - {*this,view(modifiers)nd}interface {Name,view(cpp)} - {*this,view(modifiers)nd}union {Name,view(cpp)} - {*this,view(modifiers)nd}class {Name,view(cpp)} - {*this,view(modifiers)nd}enum {Name,view(cpp)} - - (clang::DeclContext *)this - - - - {decl,view(cpp)na} - {*decl} - - *(clang::Type *)this, view(cmn) - decl - - - - {(clang::TagType *)this,view(cpp)na} - {(clang::TagType *)this,na} - - *(clang::TagType *)this - - - - {{{*Replaced,view(cpp)} <= {CanonicalType,view(cpp)}}} - - *(clang::Type *)this, view(cmn) - *Replaced - - - - - - {ResultType,view(cpp)} - - {*(clang::QualType *)(this+1),view(cpp)}{*this,view(parm1)} - - , {*((clang::QualType *)(this+1)+1),view(cpp)}{*this,view(parm2)} - - , {*((clang::QualType *)(this+1)+2),view(cpp)}{*this,view(parm3)} - - , {*((clang::QualType *)(this+1)+3),view(cpp)}{*this,view(parm4)} - - , {*((clang::QualType *)(this+1)+4),view(cpp)}{*this,view(parm5)} - - , /* expand for more params */ - ({*this,view(parm0)}) -> {ResultType,view(cpp)} - ({*this,view(parm0)}) - {this,view(left)na}{this,view(right)na} - - ResultType - - {*this,view(parm0)} - - - FunctionTypeBits.NumParams - (clang::QualType *)(this+1) - - - - *(clang::Type *)this, view(cmn) - - - - - {OriginalTy} adjusted to {AdjustedTy} - - OriginalTy - AdjustedTy - - - - {OriginalTy,view(left)} - {OriginalTy,view(right)} - {OriginalTy} - - (clang::AdjustedType *)this - - - - {NamedType,view(left)} - {NamedType,view(right)} - {NamedType} - - (clang::ElaboratedTypeKeyword)TypeWithKeywordBits.Keyword - NNS - NamedType,view(cmn) - - - - {TTPDecl->Name,view(cpp)} - Non-canonical: {*TTPDecl} - Canonical: {CanTTPTInfo} - - *(clang::Type *)this, view(cmn) - - - - {Decl,view(cpp)} - - Decl - InjectedType - *(clang::Type *)this, view(cmn) - - - - {NNS}{Name,view(cpp)na} - - NNS - Name - *(clang::Type *)this, view(cmn) - - - - - {(IdentifierInfo*)Specifier,view(cpp)na}:: - {(NamedDecl*)Specifier,view(cpp)na}:: - {(Type*)Specifier,view(cpp)na}:: - - (NestedNameSpecifier::StoredSpecifierKind)((*(uintptr_t *)Prefix.Value.Data>>1)&3) - - - - {Pattern} - - Pattern - NumExpansions - *(clang::Type *)this, view(cmn) - - - - {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType,view(poly)}{*this,view(fastQuals)} - {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType,view(cpp)}{*this,view(fastQuals)} - {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType,view(left)}{*this,view(fastQuals)} - {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 1U)))->BaseType,view(right)}{*this,view(fastQuals)} - - - {" ",sb}const - {" ",sb}restrict - {" ",sb}const restrict - {" ",sb}volatile - {" ",sb}const volatile - {" ",sb}volatile restrict - {" ",sb}const volatile restrict - Cannot visualize non-fast qualifiers - Null - {((clang::ExtQualsTypeCommonBase *)((*(uintptr_t *)Value.Value.Data) & ~(uintptr_t)((1U << clang::TypeAlignmentInBits) - 
[clang.natvis hunk reduced to a placeholder: the Clang debugger-visualizer file is removed and re-added in full with normalized line endings, its content otherwise unchanged. The natvis XML markup did not survive extraction of this patch, leaving only the raw DisplayString/Item expressions of the visualizers for Clang's AST, lexer, and Sema types, so the hunk cannot be reproduced faithfully here.]
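For reference, each surviving expression originally sat inside a natvis <Type> element. Below is a minimal reconstruction sketch for one of the simpler entries, clang::PointerType, assembled from the expressions that survive in the hunk; the <Type>/<DisplayString>/<Expand> schema is the standard natvis format, but the exact Item names are assumptions rather than a quote of the original file:

    <!-- Reconstruction sketch, not the original file contents. -->
    <Type Name="clang::PointerType">
      <!-- Render a pointer as its pointee's C++ spelling followed by "*". -->
      <DisplayString>{PointeeType, view(cpp)} *</DisplayString>
      <Expand>
        <!-- Children shown when the value is expanded in the debugger. -->
        <Item Name="PointeeType">PointeeType</Item>
        <Item Name="Base">*(clang::Type *)this, view(cmn)</Item>
      </Expand>
    </Type>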
diff --git a/flang/test/Driver/msvc-dependent-lib-flags.f90 b/flang/test/Driver/msvc-dependent-lib-flags.f90
index 765917f07d8e..1b7ecb604ad6 100644
--- a/flang/test/Driver/msvc-dependent-lib-flags.f90
+++ b/flang/test/Driver/msvc-dependent-lib-flags.f90
@@ -1,36 +1,36 @@
-! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC
-! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=static_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DEBUG
-! RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL
-!
RUN: %flang -### --target=aarch64-windows-msvc -resource-dir=%S/Inputs/resource_dir -fms-runtime-lib=dll_dbg %S/Inputs/hello.f90 -v 2>&1 | FileCheck %s --check-prefixes=MSVC-DLL-DEBUG + +! MSVC: -fc1 +! MSVC-SAME: --dependent-lib=clang_rt.builtins.lib +! MSVC-SAME: -D_MT +! MSVC-SAME: --dependent-lib=libcmt +! MSVC-SAME: --dependent-lib=FortranRuntime.static.lib +! MSVC-SAME: --dependent-lib=FortranDecimal.static.lib + +! MSVC-DEBUG: -fc1 +! MSVC-DEBUG-SAME: --dependent-lib=clang_rt.builtins.lib +! MSVC-DEBUG-SAME: -D_MT +! MSVC-DEBUG-SAME: -D_DEBUG +! MSVC-DEBUG-SAME: --dependent-lib=libcmtd +! MSVC-DEBUG-SAME: --dependent-lib=FortranRuntime.static_dbg.lib +! MSVC-DEBUG-SAME: --dependent-lib=FortranDecimal.static_dbg.lib + +! MSVC-DLL: -fc1 +! MSVC-DLL-SAME: --dependent-lib=clang_rt.builtins.lib +! MSVC-DLL-SAME: -D_MT +! MSVC-DLL-SAME: -D_DLL +! MSVC-DLL-SAME: --dependent-lib=msvcrt +! MSVC-DLL-SAME: --dependent-lib=FortranRuntime.dynamic.lib +! MSVC-DLL-SAME: --dependent-lib=FortranDecimal.dynamic.lib + +! MSVC-DLL-DEBUG: -fc1 +! MSVC-DLL-DEBUG-SAME: --dependent-lib=clang_rt.builtins.lib +! MSVC-DLL-DEBUG-SAME: -D_MT +! MSVC-DLL-DEBUG-SAME: -D_DEBUG +! MSVC-DLL-DEBUG-SAME: -D_DLL +! MSVC-DLL-DEBUG-SAME: --dependent-lib=msvcrtd +! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranRuntime.dynamic_dbg.lib +! MSVC-DLL-DEBUG-SAME: --dependent-lib=FortranDecimal.dynamic_dbg.lib diff --git a/lldb/test/API/commands/expression/ir-interpreter-phi-nodes/Makefile b/lldb/test/API/commands/expression/ir-interpreter-phi-nodes/Makefile index a1f689e07c77..d420a34c03e7 100644 --- a/lldb/test/API/commands/expression/ir-interpreter-phi-nodes/Makefile +++ b/lldb/test/API/commands/expression/ir-interpreter-phi-nodes/Makefile @@ -1,4 +1,4 @@ - -CXX_SOURCES := main.cpp - -include Makefile.rules + +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/API/functionalities/postmortem/minidump/fizzbuzz.syms b/lldb/test/API/functionalities/postmortem/minidump/fizzbuzz.syms index cab06c1c9d50..e817a491af57 100644 --- a/lldb/test/API/functionalities/postmortem/minidump/fizzbuzz.syms +++ b/lldb/test/API/functionalities/postmortem/minidump/fizzbuzz.syms @@ -1,2 +1,2 @@ -MODULE windows x86 0F45B7919A9646F9BF8F2D6076EA421A11 fizzbuzz.pdb -PUBLIC 1000 0 main +MODULE windows x86 0F45B7919A9646F9BF8F2D6076EA421A11 fizzbuzz.pdb +PUBLIC 1000 0 main diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile index e3b48697fd78..745f6cc9d65a 100644 --- a/lldb/test/API/functionalities/target-new-solib-notifications/Makefile +++ b/lldb/test/API/functionalities/target-new-solib-notifications/Makefile @@ -1,23 +1,23 @@ -CXX_SOURCES := main.cpp -LD_EXTRAS := -L. -l_d -l_c -l_a -l_b - -a.out: lib_b lib_a lib_c lib_d - -include Makefile.rules - -lib_a: lib_b - "$(MAKE)" -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=_a \ - LD_EXTRAS="-L. -l_b" - -lib_b: - "$(MAKE)" -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=_b - -lib_c: - "$(MAKE)" -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=_c - -lib_d: - "$(MAKE)" -f $(MAKEFILE_RULES) \ - DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=_d +CXX_SOURCES := main.cpp +LD_EXTRAS := -L. -l_d -l_c -l_a -l_b + +a.out: lib_b lib_a lib_c lib_d + +include Makefile.rules + +lib_a: lib_b + "$(MAKE)" -f $(MAKEFILE_RULES) \ + DYLIB_ONLY=YES DYLIB_CXX_SOURCES=a.cpp DYLIB_NAME=_a \ + LD_EXTRAS="-L. 
-l_b"
+
+lib_b:
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
+		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=b.cpp DYLIB_NAME=_b
+
+lib_c:
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
+		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=c.cpp DYLIB_NAME=_c
+
+lib_d:
+	"$(MAKE)" -f $(MAKEFILE_RULES) \
+		DYLIB_ONLY=YES DYLIB_CXX_SOURCES=d.cpp DYLIB_NAME=_d
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp
index 778b46ed5cef..66633b70ee1e 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/a.cpp
@@ -1,3 +1,3 @@
-extern "C" int b_function();
-
-extern "C" int a_function() { return b_function(); }
+extern "C" int b_function();
+
+extern "C" int a_function() { return b_function(); }
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp
index 4f1a4032ee0e..8b16fbdb5728 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/b.cpp
@@ -1 +1 @@
-extern "C" int b_function() { return 500; }
+extern "C" int b_function() { return 500; }
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp
index 8abd1b155a75..120c88f2bb60 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/c.cpp
@@ -1 +1 @@
-extern "C" int c_function() { return 600; }
+extern "C" int c_function() { return 600; }
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp
index 58888a29ba32..d37ad2621ae4 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/d.cpp
@@ -1 +1 @@
-extern "C" int d_function() { return 700; }
+extern "C" int d_function() { return 700; }
diff --git a/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp b/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp
index 77b38c5ccdc6..bd2c79cdab9d 100644
--- a/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp
+++ b/lldb/test/API/functionalities/target-new-solib-notifications/main.cpp
@@ -1,16 +1,16 @@
-#include <stdio.h>
-
-extern "C" int a_function();
-extern "C" int c_function();
-extern "C" int b_function();
-extern "C" int d_function();
-
-int main() {
-  a_function();
-  b_function();
-  c_function();
-  d_function();
-
-  puts("running"); // breakpoint here
-  return 0;
-}
+#include <stdio.h>
+
+extern "C" int a_function();
+extern "C" int c_function();
+extern "C" int b_function();
+extern "C" int d_function();
+
+int main() {
+  a_function();
+  b_function();
+  c_function();
+  d_function();
+
+  puts("running"); // breakpoint here
+  return 0;
+}
diff --git a/lldb/test/API/functionalities/unwind/zeroth_frame/Makefile b/lldb/test/API/functionalities/unwind/zeroth_frame/Makefile
index 15a931850e17..10495940055b 100644
--- a/lldb/test/API/functionalities/unwind/zeroth_frame/Makefile
+++ b/lldb/test/API/functionalities/unwind/zeroth_frame/Makefile
@@ -1,3 +1,3 @@
-C_SOURCES := main.c
-
-include Makefile.rules
+C_SOURCES := main.c
+
+include Makefile.rules
diff --git a/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py
b/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py index d660844405e1..70f72c72c834 100644 --- a/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py +++ b/lldb/test/API/functionalities/unwind/zeroth_frame/TestZerothFrame.py @@ -1,88 +1,88 @@ -""" -Test that line information is recalculated properly for a frame when it moves -from the middle of the backtrace to a zero index. - -This is a regression test for a StackFrame bug, where whether frame is zero or -not depends on an internal field. When LLDB was updating its frame list value -of the field wasn't copied into existing StackFrame instances, so those -StackFrame instances, would use an incorrect line entry evaluation logic in -situations if it was in the middle of the stack frame list (not zeroth), and -then moved to the top position. The difference in logic is that for zeroth -frames line entry is returned for program counter, while for other frame -(except for those that "behave like zeroth") it is for the instruction -preceding PC, as PC points to the next instruction after function call. When -the bug is present, when execution stops at the second breakpoint -SBFrame.GetLineEntry() returns line entry for the previous line, rather than -the one with a breakpoint. Note that this is specific to -SBFrame.GetLineEntry(), SBFrame.GetPCAddress().GetLineEntry() would return -correct entry. - -This bug doesn't reproduce through an LLDB interpretator, however it happens -when using API directly, for example in LLDB-MI. -""" - -import lldb -from lldbsuite.test.decorators import * -from lldbsuite.test.lldbtest import * -from lldbsuite.test import lldbutil - - -class ZerothFrame(TestBase): - def test(self): - """ - Test that line information is recalculated properly for a frame when it moves - from the middle of the backtrace to a zero index. - """ - self.build() - self.setTearDownCleanup() - - exe = self.getBuildArtifact("a.out") - target = self.dbg.CreateTarget(exe) - self.assertTrue(target, VALID_TARGET) - - main_dot_c = lldb.SBFileSpec("main.c") - bp1 = target.BreakpointCreateBySourceRegex( - "// Set breakpoint 1 here", main_dot_c - ) - bp2 = target.BreakpointCreateBySourceRegex( - "// Set breakpoint 2 here", main_dot_c - ) - - process = target.LaunchSimple(None, None, self.get_process_working_directory()) - self.assertTrue(process, VALID_PROCESS) - - thread = self.thread() - - if self.TraceOn(): - print("Backtrace at the first breakpoint:") - for f in thread.frames: - print(f) - - # Check that we have stopped at correct breakpoint. - self.assertEqual( - thread.frame[0].GetLineEntry().GetLine(), - bp1.GetLocationAtIndex(0).GetAddress().GetLineEntry().GetLine(), - "LLDB reported incorrect line number.", - ) - - # Important to use SBProcess::Continue() instead of - # self.runCmd('continue'), because the problem doesn't reproduce with - # 'continue' command. 
-        process.Continue()
-
-        if self.TraceOn():
-            print("Backtrace at the second breakpoint:")
-            for f in thread.frames:
-                print(f)
-        # Check that we have stopped at the breakpoint
-        self.assertEqual(
-            thread.frame[0].GetLineEntry().GetLine(),
-            bp2.GetLocationAtIndex(0).GetAddress().GetLineEntry().GetLine(),
-            "LLDB reported incorrect line number.",
-        )
-        # Double-check with GetPCAddress()
-        self.assertEqual(
-            thread.frame[0].GetLineEntry().GetLine(),
-            thread.frame[0].GetPCAddress().GetLineEntry().GetLine(),
-            "LLDB reported incorrect line number.",
-        )
+"""
+Test that line information is recalculated properly for a frame when it moves
+from the middle of the backtrace to a zero index.
+
+This is a regression test for a StackFrame bug where whether a frame is zeroth
+or not depends on an internal field. When LLDB updated its frame list, the
+value of the field wasn't copied into existing StackFrame instances, so those
+StackFrame instances would use incorrect line-entry evaluation logic when a
+frame was in the middle of the stack frame list (not zeroth) and then moved to
+the top position. The difference in logic is that for zeroth frames the line
+entry is returned for the program counter, while for other frames (except
+those that "behave like zeroth") it is for the instruction preceding the PC,
+since the PC points to the next instruction after the function call. When the
+bug is present and execution stops at the second breakpoint,
+SBFrame.GetLineEntry() returns the line entry for the previous line rather
+than the one with the breakpoint. Note that this is specific to
+SBFrame.GetLineEntry(); SBFrame.GetPCAddress().GetLineEntry() would return the
+correct entry.
+
+This bug doesn't reproduce through the LLDB interpreter, but it happens when
+using the API directly, for example in LLDB-MI.
+"""
+
+import lldb
+from lldbsuite.test.decorators import *
+from lldbsuite.test.lldbtest import *
+from lldbsuite.test import lldbutil
+
+
+class ZerothFrame(TestBase):
+    def test(self):
+        """
+        Test that line information is recalculated properly for a frame when it moves
+        from the middle of the backtrace to a zero index.
+        """
+        self.build()
+        self.setTearDownCleanup()
+
+        exe = self.getBuildArtifact("a.out")
+        target = self.dbg.CreateTarget(exe)
+        self.assertTrue(target, VALID_TARGET)
+
+        main_dot_c = lldb.SBFileSpec("main.c")
+        bp1 = target.BreakpointCreateBySourceRegex(
+            "// Set breakpoint 1 here", main_dot_c
+        )
+        bp2 = target.BreakpointCreateBySourceRegex(
+            "// Set breakpoint 2 here", main_dot_c
+        )
+
+        process = target.LaunchSimple(None, None, self.get_process_working_directory())
+        self.assertTrue(process, VALID_PROCESS)
+
+        thread = self.thread()
+
+        if self.TraceOn():
+            print("Backtrace at the first breakpoint:")
+            for f in thread.frames:
+                print(f)
+
+        # Check that we have stopped at correct breakpoint.
+        self.assertEqual(
+            thread.frame[0].GetLineEntry().GetLine(),
+            bp1.GetLocationAtIndex(0).GetAddress().GetLineEntry().GetLine(),
+            "LLDB reported incorrect line number.",
+        )
+
+        # Important to use SBProcess::Continue() instead of
+        # self.runCmd('continue'), because the problem doesn't reproduce with
+        # 'continue' command.
+ process.Continue() + + if self.TraceOn(): + print("Backtrace at the second breakpoint:") + for f in thread.frames: + print(f) + # Check that we have stopped at the breakpoint + self.assertEqual( + thread.frame[0].GetLineEntry().GetLine(), + bp2.GetLocationAtIndex(0).GetAddress().GetLineEntry().GetLine(), + "LLDB reported incorrect line number.", + ) + # Double-check with GetPCAddress() + self.assertEqual( + thread.frame[0].GetLineEntry().GetLine(), + thread.frame[0].GetPCAddress().GetLineEntry().GetLine(), + "LLDB reported incorrect line number.", + ) diff --git a/lldb/test/API/python_api/debugger/Makefile b/lldb/test/API/python_api/debugger/Makefile index bfad5f33e867..99998b20bcb0 100644 --- a/lldb/test/API/python_api/debugger/Makefile +++ b/lldb/test/API/python_api/debugger/Makefile @@ -1,3 +1,3 @@ -CXX_SOURCES := main.cpp - -include Makefile.rules +CXX_SOURCES := main.cpp + +include Makefile.rules diff --git a/lldb/test/Shell/BuildScript/modes.test b/lldb/test/Shell/BuildScript/modes.test index 02311f712d77..1ce50104855f 100644 --- a/lldb/test/Shell/BuildScript/modes.test +++ b/lldb/test/Shell/BuildScript/modes.test @@ -1,35 +1,35 @@ -RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any -o %t/foo.out foobar.c \ -RUN: | FileCheck --check-prefix=COMPILE %s - -RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any --outdir %t foo.c bar.c \ -RUN: | FileCheck --check-prefix=COMPILE-MULTI %s - -RUN: %build -n --verbose --arch=32 --mode=link --compiler=any -o %t/foo.exe foobar.obj \ -RUN: | FileCheck --check-prefix=LINK %s - -RUN: %build -n --verbose --arch=32 --mode=link --compiler=any -o %t/foobar.exe foo.obj bar.obj \ -RUN: | FileCheck --check-prefix=LINK-MULTI %s - -RUN: %build -n --verbose --arch=32 --mode=compile-and-link --compiler=any -o %t/foobar.exe foobar.c \ -RUN: | FileCheck --check-prefix=BOTH %s - -RUN: %build -n --verbose --arch=32 --mode=compile-and-link --compiler=any -o %t/foobar.exe foo.c bar.c \ -RUN: | FileCheck --check-prefix=BOTH-MULTI %s - - -COMPILE: compiling foobar.c -> foo.out - -COMPILE-MULTI: compiling foo.c -> foo.o{{(bj)?}} -COMPILE-MULTI: compiling bar.c -> bar.o{{(bj)?}} - - -LINK: linking foobar.obj -> foo.exe - -LINK-MULTI: linking foo.obj+bar.obj -> foobar.exe - -BOTH: compiling foobar.c -> [[OBJFOO:foobar.exe-foobar.o(bj)?]] -BOTH: linking [[OBJFOO]] -> foobar.exe - -BOTH-MULTI: compiling foo.c -> [[OBJFOO:foobar.exe-foo.o(bj)?]] -BOTH-MULTI: compiling bar.c -> [[OBJBAR:foobar.exe-bar.o(bj)?]] -BOTH-MULTI: linking [[OBJFOO]]+[[OBJBAR]] -> foobar.exe +RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any -o %t/foo.out foobar.c \ +RUN: | FileCheck --check-prefix=COMPILE %s + +RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any --outdir %t foo.c bar.c \ +RUN: | FileCheck --check-prefix=COMPILE-MULTI %s + +RUN: %build -n --verbose --arch=32 --mode=link --compiler=any -o %t/foo.exe foobar.obj \ +RUN: | FileCheck --check-prefix=LINK %s + +RUN: %build -n --verbose --arch=32 --mode=link --compiler=any -o %t/foobar.exe foo.obj bar.obj \ +RUN: | FileCheck --check-prefix=LINK-MULTI %s + +RUN: %build -n --verbose --arch=32 --mode=compile-and-link --compiler=any -o %t/foobar.exe foobar.c \ +RUN: | FileCheck --check-prefix=BOTH %s + +RUN: %build -n --verbose --arch=32 --mode=compile-and-link --compiler=any -o %t/foobar.exe foo.c bar.c \ +RUN: | FileCheck --check-prefix=BOTH-MULTI %s + + +COMPILE: compiling foobar.c -> foo.out + +COMPILE-MULTI: compiling foo.c -> foo.o{{(bj)?}} +COMPILE-MULTI: compiling bar.c -> 
bar.o{{(bj)?}} + + +LINK: linking foobar.obj -> foo.exe + +LINK-MULTI: linking foo.obj+bar.obj -> foobar.exe + +BOTH: compiling foobar.c -> [[OBJFOO:foobar.exe-foobar.o(bj)?]] +BOTH: linking [[OBJFOO]] -> foobar.exe + +BOTH-MULTI: compiling foo.c -> [[OBJFOO:foobar.exe-foo.o(bj)?]] +BOTH-MULTI: compiling bar.c -> [[OBJBAR:foobar.exe-bar.o(bj)?]] +BOTH-MULTI: linking [[OBJFOO]]+[[OBJBAR]] -> foobar.exe diff --git a/lldb/test/Shell/BuildScript/script-args.test b/lldb/test/Shell/BuildScript/script-args.test index 13e8a5160942..647a48e4442b 100644 --- a/lldb/test/Shell/BuildScript/script-args.test +++ b/lldb/test/Shell/BuildScript/script-args.test @@ -1,32 +1,32 @@ -RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any -o %t/foo.out foobar.c \ -RUN: | FileCheck %s -RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any --outdir %t foo.c bar.c \ -RUN: | FileCheck --check-prefix=MULTI-INPUT %s - - -CHECK: Script Arguments: -CHECK-NEXT: Arch: 32 -CHECK: Compiler: any -CHECK: Outdir: {{.*}}script-args.test.tmp -CHECK: Output: {{.*}}script-args.test.tmp{{.}}foo.out -CHECK: Nodefaultlib: False -CHECK: Opt: none -CHECK: Mode: compile -CHECK: Clean: True -CHECK: Verbose: True -CHECK: Dryrun: True -CHECK: Inputs: foobar.c - -MULTI-INPUT: Script Arguments: -MULTI-INPUT-NEXT: Arch: 32 -MULTI-INPUT-NEXT: Compiler: any -MULTI-INPUT-NEXT: Outdir: {{.*}}script-args.test.tmp -MULTI-INPUT-NEXT: Output: -MULTI-INPUT-NEXT: Nodefaultlib: False -MULTI-INPUT-NEXT: Opt: none -MULTI-INPUT-NEXT: Mode: compile -MULTI-INPUT-NEXT: Clean: True -MULTI-INPUT-NEXT: Verbose: True -MULTI-INPUT-NEXT: Dryrun: True -MULTI-INPUT-NEXT: Inputs: foo.c -MULTI-INPUT-NEXT: bar.c +RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any -o %t/foo.out foobar.c \ +RUN: | FileCheck %s +RUN: %build -n --verbose --arch=32 --mode=compile --compiler=any --outdir %t foo.c bar.c \ +RUN: | FileCheck --check-prefix=MULTI-INPUT %s + + +CHECK: Script Arguments: +CHECK-NEXT: Arch: 32 +CHECK: Compiler: any +CHECK: Outdir: {{.*}}script-args.test.tmp +CHECK: Output: {{.*}}script-args.test.tmp{{.}}foo.out +CHECK: Nodefaultlib: False +CHECK: Opt: none +CHECK: Mode: compile +CHECK: Clean: True +CHECK: Verbose: True +CHECK: Dryrun: True +CHECK: Inputs: foobar.c + +MULTI-INPUT: Script Arguments: +MULTI-INPUT-NEXT: Arch: 32 +MULTI-INPUT-NEXT: Compiler: any +MULTI-INPUT-NEXT: Outdir: {{.*}}script-args.test.tmp +MULTI-INPUT-NEXT: Output: +MULTI-INPUT-NEXT: Nodefaultlib: False +MULTI-INPUT-NEXT: Opt: none +MULTI-INPUT-NEXT: Mode: compile +MULTI-INPUT-NEXT: Clean: True +MULTI-INPUT-NEXT: Verbose: True +MULTI-INPUT-NEXT: Dryrun: True +MULTI-INPUT-NEXT: Inputs: foo.c +MULTI-INPUT-NEXT: bar.c diff --git a/lldb/test/Shell/BuildScript/toolchain-clang-cl.test b/lldb/test/Shell/BuildScript/toolchain-clang-cl.test index 8c9ea9fddb8a..4f64859a02b6 100644 --- a/lldb/test/Shell/BuildScript/toolchain-clang-cl.test +++ b/lldb/test/Shell/BuildScript/toolchain-clang-cl.test @@ -1,49 +1,49 @@ -REQUIRES: lld, system-windows - -RUN: %build -n --verbose --arch=32 --compiler=clang-cl --mode=compile-and-link -o %t/foo.exe foobar.c \ -RUN: | FileCheck --check-prefix=CHECK-32 %s - -RUN: %build -n --verbose --arch=64 --compiler=clang-cl --mode=compile-and-link -o %t/foo.exe foobar.c \ -RUN: | FileCheck --check-prefix=CHECK-64 %s - -CHECK-32: Script Arguments: -CHECK-32: Arch: 32 -CHECK-32: Compiler: clang-cl -CHECK-32: Outdir: {{.*}} -CHECK-32: Output: {{.*}}toolchain-clang-cl.test.tmp\foo.exe -CHECK-32: Nodefaultlib: False -CHECK-32: Opt: none -CHECK-32: 
Mode: compile -CHECK-32: Clean: True -CHECK-32: Verbose: True -CHECK-32: Dryrun: True -CHECK-32: Inputs: foobar.c -CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foobar.ilk -CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe-foobar.obj -CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.pdb -CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe -CHECK-32: compiling foobar.c -> foo.exe-foobar.obj -CHECK-32: {{.*}}clang-cl{{(\.EXE)?}} -m32 -CHECK-32: linking foo.exe-foobar.obj -> foo.exe -CHECK-32: {{.*}}lld-link{{(\.EXE)?}} - -CHECK-64: Script Arguments: -CHECK-64: Arch: 64 -CHECK-64: Compiler: clang-cl -CHECK-64: Outdir: {{.*}} -CHECK-64: Output: {{.*}}toolchain-clang-cl.test.tmp\foo.exe -CHECK-64: Nodefaultlib: False -CHECK-64: Opt: none -CHECK-64: Mode: compile -CHECK-64: Clean: True -CHECK-64: Verbose: True -CHECK-64: Dryrun: True -CHECK-64: Inputs: foobar.c -CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foobar.ilk -CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe-foobar.obj -CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.pdb -CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe -CHECK-64: compiling foobar.c -> foo.exe-foobar.obj -CHECK-64: {{.*}}clang-cl{{(\.EXE)?}} -m64 -CHECK-64: linking foo.exe-foobar.obj -> foo.exe -CHECK-64: {{.*}}lld-link{{(\.EXE)?}} +REQUIRES: lld, system-windows + +RUN: %build -n --verbose --arch=32 --compiler=clang-cl --mode=compile-and-link -o %t/foo.exe foobar.c \ +RUN: | FileCheck --check-prefix=CHECK-32 %s + +RUN: %build -n --verbose --arch=64 --compiler=clang-cl --mode=compile-and-link -o %t/foo.exe foobar.c \ +RUN: | FileCheck --check-prefix=CHECK-64 %s + +CHECK-32: Script Arguments: +CHECK-32: Arch: 32 +CHECK-32: Compiler: clang-cl +CHECK-32: Outdir: {{.*}} +CHECK-32: Output: {{.*}}toolchain-clang-cl.test.tmp\foo.exe +CHECK-32: Nodefaultlib: False +CHECK-32: Opt: none +CHECK-32: Mode: compile +CHECK-32: Clean: True +CHECK-32: Verbose: True +CHECK-32: Dryrun: True +CHECK-32: Inputs: foobar.c +CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foobar.ilk +CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe-foobar.obj +CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.pdb +CHECK-32: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe +CHECK-32: compiling foobar.c -> foo.exe-foobar.obj +CHECK-32: {{.*}}clang-cl{{(\.EXE)?}} -m32 +CHECK-32: linking foo.exe-foobar.obj -> foo.exe +CHECK-32: {{.*}}lld-link{{(\.EXE)?}} + +CHECK-64: Script Arguments: +CHECK-64: Arch: 64 +CHECK-64: Compiler: clang-cl +CHECK-64: Outdir: {{.*}} +CHECK-64: Output: {{.*}}toolchain-clang-cl.test.tmp\foo.exe +CHECK-64: Nodefaultlib: False +CHECK-64: Opt: none +CHECK-64: Mode: compile +CHECK-64: Clean: True +CHECK-64: Verbose: True +CHECK-64: Dryrun: True +CHECK-64: Inputs: foobar.c +CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foobar.ilk +CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe-foobar.obj +CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.pdb +CHECK-64: Cleaning {{.*}}toolchain-clang-cl.test.tmp{{.}}foo.exe +CHECK-64: compiling foobar.c -> foo.exe-foobar.obj +CHECK-64: {{.*}}clang-cl{{(\.EXE)?}} -m64 +CHECK-64: linking foo.exe-foobar.obj -> foo.exe +CHECK-64: {{.*}}lld-link{{(\.EXE)?}} diff --git a/lldb/test/Shell/Minidump/Windows/Sigsegv/Inputs/sigsegv.cpp b/lldb/test/Shell/Minidump/Windows/Sigsegv/Inputs/sigsegv.cpp index 6bf78b5dc43b..d5b96472eb11 100644 --- 
a/lldb/test/Shell/Minidump/Windows/Sigsegv/Inputs/sigsegv.cpp
+++ b/lldb/test/Shell/Minidump/Windows/Sigsegv/Inputs/sigsegv.cpp
@@ -1,40 +1,40 @@
-
-// nodefaultlib build: cl -Zi sigsegv.cpp /link /nodefaultlib
-
-#ifdef USE_CRT
-#include <stdio.h>
-#else
-int main();
-extern "C"
-{
-    int _fltused;
-    void mainCRTStartup() { main(); }
-    void printf(const char*, ...) {}
-}
-#endif
-
-void crash(bool crash_self)
-{
-    printf("Before...\n");
-    if(crash_self)
-    {
-        printf("Crashing in 3, 2, 1 ...\n");
-        *(volatile int*)nullptr = 0;
-    }
-    printf("After...\n");
-}
-
-int foo(int x, float y, const char* msg)
-{
-    bool flag = x > y;
-    if(flag)
-        printf("x = %d, y = %f, msg = %s\n", x, y, msg);
-    crash(flag);
-    return x << 1;
-}
-
-int main()
-{
-    foo(10, 3.14, "testing");
-}
-
+
+// nodefaultlib build: cl -Zi sigsegv.cpp /link /nodefaultlib
+
+#ifdef USE_CRT
+#include <stdio.h>
+#else
+int main();
+extern "C"
+{
+    int _fltused;
+    void mainCRTStartup() { main(); }
+    void printf(const char*, ...) {}
+}
+#endif
+
+void crash(bool crash_self)
+{
+    printf("Before...\n");
+    if(crash_self)
+    {
+        printf("Crashing in 3, 2, 1 ...\n");
+        *(volatile int*)nullptr = 0;
+    }
+    printf("After...\n");
+}
+
+int foo(int x, float y, const char* msg)
+{
+    bool flag = x > y;
+    if(flag)
+        printf("x = %d, y = %f, msg = %s\n", x, y, msg);
+    crash(flag);
+    return x << 1;
+}
+
+int main()
+{
+    foo(10, 3.14, "testing");
+}
+
diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites.s b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites.s
index aac8f4c16980..a9d248758bfc 100644
--- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites.s
+++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites.s
@@ -1,622 +1,622 @@
-# Compiled from the following files, but replaced the call to abort with nop.
-# clang-cl -fuse-ld=lld-link /Z7 /O1 /Faa.asm /winsysroot~/win_toolchain a.cpp -# a.cpp: -# #include "a.h" -# int main(int argc, char** argv) { -# volatile int main_local = Namespace1::foo(2); -# return 0; -# } -# a.h: -# #include -# #include "b.h" -# namespace Namespace1 { -# inline int foo(int x) { -# volatile int foo_local = x + 1; -# ++foo_local; -# if (!foo_local) -# abort(); -# return Class1::bar(foo_local); -# } -# } // namespace Namespace1 -# b.h: -# #include "c.h" -# class Class1 { -# public: -# inline static int bar(int x) { -# volatile int bar_local = x + 1; -# ++bar_local; -# return Namespace2::Class2::func(bar_local); -# } -# }; -# c.h: -# namespace Namespace2 { -# class Class2 { -# public: -# inline static int func(int x) { -# volatile int func_local = x + 1; -# func_local += x; -# return func_local; -# } -# }; -# } // namespace Namespace2 - - .text - .def @feat.00; - .scl 3; - .type 0; - .endef - .globl @feat.00 -.set @feat.00, 0 - .intel_syntax noprefix - .file "a.cpp" - .def main; - .scl 2; - .type 32; - .endef - .section .text,"xr",one_only,main - .globl main # -- Begin function main -main: # @main -.Lfunc_begin0: - .cv_func_id 0 - .cv_file 1 "/tmp/a.cpp" "4FFB96E5DF1A95CE7DB9732CFFE001D7" 1 - .cv_loc 0 1 2 0 # a.cpp:2:0 -.seh_proc main -# %bb.0: - #DEBUG_VALUE: main:argv <- $rdx - #DEBUG_VALUE: main:argc <- $ecx - #DEBUG_VALUE: foo:x <- 2 - sub rsp, 56 - .seh_stackalloc 56 - .seh_endprologue -.Ltmp0: - .cv_file 2 "/tmp/./a.h" "BBFED90EF093E9C1D032CC9B05B5D167" 1 - .cv_inline_site_id 1 within 0 inlined_at 1 3 0 - .cv_loc 1 2 5 0 # ./a.h:5:0 - mov dword ptr [rsp + 44], 3 - .cv_loc 1 2 6 0 # ./a.h:6:0 - inc dword ptr [rsp + 44] - .cv_loc 1 2 7 0 # ./a.h:7:0 - mov eax, dword ptr [rsp + 44] - test eax, eax - je .LBB0_2 -.Ltmp1: -# %bb.1: - #DEBUG_VALUE: main:argv <- $rdx - #DEBUG_VALUE: main:argc <- $ecx - #DEBUG_VALUE: foo:x <- 2 - .cv_loc 1 2 9 0 # ./a.h:9:0 - mov eax, dword ptr [rsp + 44] -.Ltmp2: - #DEBUG_VALUE: bar:x <- $eax - .cv_file 3 "/tmp/./b.h" "A26CC743A260115F33AF91AB11F95877" 1 - .cv_inline_site_id 2 within 1 inlined_at 2 9 0 - .cv_loc 2 3 5 0 # ./b.h:5:0 - inc eax -.Ltmp3: - mov dword ptr [rsp + 52], eax - .cv_loc 2 3 6 0 # ./b.h:6:0 - inc dword ptr [rsp + 52] - .cv_loc 2 3 7 0 # ./b.h:7:0 - mov eax, dword ptr [rsp + 52] -.Ltmp4: - #DEBUG_VALUE: func:x <- $eax - .cv_file 4 "/tmp/./c.h" "8AF4613F78624BBE96D1C408ABA39B2D" 1 - .cv_inline_site_id 3 within 2 inlined_at 3 7 0 - .cv_loc 3 4 5 0 # ./c.h:5:0 - lea ecx, [rax + 1] -.Ltmp5: - #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx - mov dword ptr [rsp + 48], ecx - .cv_loc 3 4 6 0 # ./c.h:6:0 - add dword ptr [rsp + 48], eax - .cv_loc 3 4 7 0 # ./c.h:7:0 - mov eax, dword ptr [rsp + 48] -.Ltmp6: - .cv_loc 0 1 3 0 # a.cpp:3:0 - mov dword ptr [rsp + 48], eax - .cv_loc 0 1 4 0 # a.cpp:4:0 - xor eax, eax - # Use fake debug info to tests inline info. 
- .cv_loc 1 2 20 0 - add rsp, 56 - ret -.Ltmp7: -.LBB0_2: - #DEBUG_VALUE: main:argv <- $rdx - #DEBUG_VALUE: main:argc <- $ecx - #DEBUG_VALUE: foo:x <- 2 - .cv_loc 1 2 8 0 # ./a.h:8:0 - nop -.Ltmp8: - int3 -.Ltmp9: - #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx - #DEBUG_VALUE: main:argv <- [DW_OP_LLVM_entry_value 1] $rdx -.Lfunc_end0: - .seh_endproc - # -- End function - .section .drectve,"yn" - .ascii " /DEFAULTLIB:libcmt.lib" - .ascii " /DEFAULTLIB:oldnames.lib" - .section .debug$S,"dr" - .p2align 2 - .long 4 # Debug section magic - .long 241 - .long .Ltmp11-.Ltmp10 # Subsection size -.Ltmp10: - .short .Ltmp13-.Ltmp12 # Record length -.Ltmp12: - .short 4353 # Record kind: S_OBJNAME - .long 0 # Signature - .asciz "/tmp/a-2b2ba0.obj" # Object name - .p2align 2 -.Ltmp13: - .short .Ltmp15-.Ltmp14 # Record length -.Ltmp14: - .short 4412 # Record kind: S_COMPILE3 - .long 1 # Flags and language - .short 208 # CPUType - .short 15 # Frontend version - .short 0 - .short 0 - .short 0 - .short 15000 # Backend version - .short 0 - .short 0 - .short 0 - .asciz "clang version 15.0.0" # Null-terminated compiler version string - .p2align 2 -.Ltmp15: -.Ltmp11: - .p2align 2 - .long 246 # Inlinee lines subsection - .long .Ltmp17-.Ltmp16 # Subsection size -.Ltmp16: - .long 0 # Inlinee lines signature - - # Inlined function foo starts at ./a.h:4 - .long 4099 # Type index of inlined function - .cv_filechecksumoffset 2 # Offset into filechecksum table - .long 4 # Starting line number - - # Inlined function bar starts at ./b.h:4 - .long 4106 # Type index of inlined function - .cv_filechecksumoffset 3 # Offset into filechecksum table - .long 4 # Starting line number - - # Inlined function func starts at ./c.h:4 - .long 4113 # Type index of inlined function - .cv_filechecksumoffset 4 # Offset into filechecksum table - .long 4 # Starting line number -.Ltmp17: - .p2align 2 - .section .debug$S,"dr",associative,main - .p2align 2 - .long 4 # Debug section magic - .long 241 # Symbol subsection for main - .long .Ltmp19-.Ltmp18 # Subsection size -.Ltmp18: - .short .Ltmp21-.Ltmp20 # Record length -.Ltmp20: - .short 4423 # Record kind: S_GPROC32_ID - .long 0 # PtrParent - .long 0 # PtrEnd - .long 0 # PtrNext - .long .Lfunc_end0-main # Code size - .long 0 # Offset after prologue - .long 0 # Offset before epilogue - .long 4117 # Function type index - .secrel32 main # Function section relative address - .secidx main # Function section index - .byte 0 # Flags - .asciz "main" # Function name - .p2align 2 -.Ltmp21: - .short .Ltmp23-.Ltmp22 # Record length -.Ltmp22: - .short 4114 # Record kind: S_FRAMEPROC - .long 56 # FrameSize - .long 0 # Padding - .long 0 # Offset of padding - .long 0 # Bytes of callee saved registers - .long 0 # Exception handler offset - .short 0 # Exception handler section - .long 81920 # Flags (defines frame register) - .p2align 2 -.Ltmp23: - .short .Ltmp25-.Ltmp24 # Record length -.Ltmp24: - .short 4414 # Record kind: S_LOCAL - .long 116 # TypeIndex - .short 1 # Flags - .asciz "argc" - .p2align 2 -.Ltmp25: - .cv_def_range .Lfunc_begin0 .Ltmp5 .Ltmp7 .Ltmp8, reg, 18 - .short .Ltmp27-.Ltmp26 # Record length -.Ltmp26: - .short 4414 # Record kind: S_LOCAL - .long 4114 # TypeIndex - .short 1 # Flags - .asciz "argv" - .p2align 2 -.Ltmp27: - .cv_def_range .Lfunc_begin0 .Ltmp8, reg, 331 - .short .Ltmp29-.Ltmp28 # Record length -.Ltmp28: - .short 4414 # Record kind: S_LOCAL - .long 4118 # TypeIndex - .short 0 # Flags - .asciz "main_local" - .p2align 2 -.Ltmp29: - .cv_def_range .Ltmp0 .Ltmp9, 
frame_ptr_rel, 48 - .short .Ltmp31-.Ltmp30 # Record length -.Ltmp30: - .short 4429 # Record kind: S_INLINESITE - .long 0 # PtrParent - .long 0 # PtrEnd - .long 4099 # Inlinee type index - .cv_inline_linetable 1 2 4 .Lfunc_begin0 .Lfunc_end0 - .p2align 2 -.Ltmp31: - .short .Ltmp33-.Ltmp32 # Record length -.Ltmp32: - .short 4414 # Record kind: S_LOCAL - .long 116 # TypeIndex - .short 257 # Flags - .asciz "x" - .p2align 2 -.Ltmp33: - .short .Ltmp35-.Ltmp34 # Record length -.Ltmp34: - .short 4414 # Record kind: S_LOCAL - .long 4118 # TypeIndex - .short 0 # Flags - .asciz "foo_local" - .p2align 2 -.Ltmp35: - .cv_def_range .Ltmp0 .Ltmp6 .Ltmp7 .Ltmp9, frame_ptr_rel, 44 - .short .Ltmp37-.Ltmp36 # Record length -.Ltmp36: - .short 4429 # Record kind: S_INLINESITE - .long 0 # PtrParent - .long 0 # PtrEnd - .long 4106 # Inlinee type index - .cv_inline_linetable 2 3 4 .Lfunc_begin0 .Lfunc_end0 - .p2align 2 -.Ltmp37: - .short .Ltmp39-.Ltmp38 # Record length -.Ltmp38: - .short 4414 # Record kind: S_LOCAL - .long 116 # TypeIndex - .short 1 # Flags - .asciz "x" - .p2align 2 -.Ltmp39: - .cv_def_range .Ltmp2 .Ltmp3, reg, 17 - .short .Ltmp41-.Ltmp40 # Record length -.Ltmp40: - .short 4414 # Record kind: S_LOCAL - .long 4118 # TypeIndex - .short 0 # Flags - .asciz "bar_local" - .p2align 2 -.Ltmp41: - .cv_def_range .Ltmp2 .Ltmp6, frame_ptr_rel, 52 - .short .Ltmp43-.Ltmp42 # Record length -.Ltmp42: - .short 4429 # Record kind: S_INLINESITE - .long 0 # PtrParent - .long 0 # PtrEnd - .long 4113 # Inlinee type index - .cv_inline_linetable 3 4 4 .Lfunc_begin0 .Lfunc_end0 - .p2align 2 -.Ltmp43: - .short .Ltmp45-.Ltmp44 # Record length -.Ltmp44: - .short 4414 # Record kind: S_LOCAL - .long 116 # TypeIndex - .short 1 # Flags - .asciz "x" - .p2align 2 -.Ltmp45: - .cv_def_range .Ltmp4 .Ltmp6, reg, 17 - .short .Ltmp47-.Ltmp46 # Record length -.Ltmp46: - .short 4414 # Record kind: S_LOCAL - .long 4118 # TypeIndex - .short 0 # Flags - .asciz "func_local" - .p2align 2 -.Ltmp47: - .cv_def_range .Ltmp4 .Ltmp6, frame_ptr_rel, 48 - .short 2 # Record length - .short 4430 # Record kind: S_INLINESITE_END - .short 2 # Record length - .short 4430 # Record kind: S_INLINESITE_END - .short 2 # Record length - .short 4430 # Record kind: S_INLINESITE_END - .short 2 # Record length - .short 4431 # Record kind: S_PROC_ID_END -.Ltmp19: - .p2align 2 - .cv_linetable 0, main, .Lfunc_end0 - .section .debug$S,"dr" - .long 241 - .long .Ltmp49-.Ltmp48 # Subsection size -.Ltmp48: - .short .Ltmp51-.Ltmp50 # Record length -.Ltmp50: - .short 4360 # Record kind: S_UDT - .long 4103 # Type - .asciz "Class1" - .p2align 2 -.Ltmp51: - .short .Ltmp53-.Ltmp52 # Record length -.Ltmp52: - .short 4360 # Record kind: S_UDT - .long 4110 # Type - .asciz "Namespace2::Class2" - .p2align 2 -.Ltmp53: -.Ltmp49: - .p2align 2 - .cv_filechecksums # File index to string table offset subsection - .cv_stringtable # String table - .long 241 - .long .Ltmp55-.Ltmp54 # Subsection size -.Ltmp54: - .short .Ltmp57-.Ltmp56 # Record length -.Ltmp56: - .short 4428 # Record kind: S_BUILDINFO - .long 4124 # LF_BUILDINFO index - .p2align 2 -.Ltmp57: -.Ltmp55: - .p2align 2 - .section .debug$T,"dr" - .p2align 2 - .long 4 # Debug section magic - # StringId (0x1000) - .short 0x12 # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "Namespace1" # StringData - .byte 241 - # ArgList (0x1001) - .short 0xa # Record length - .short 0x1201 # Record kind: LF_ARGLIST - .long 0x1 # NumArgs - .long 0x74 # Argument: int - # Procedure (0x1002) - .short 0xe # Record length 
- .short 0x1008 # Record kind: LF_PROCEDURE - .long 0x74 # ReturnType: int - .byte 0x0 # CallingConvention: NearC - .byte 0x0 # FunctionOptions - .short 0x1 # NumParameters - .long 0x1001 # ArgListType: (int) - # FuncId (0x1003) - .short 0xe # Record length - .short 0x1601 # Record kind: LF_FUNC_ID - .long 0x1000 # ParentScope: Namespace1 - .long 0x1002 # FunctionType: int (int) - .asciz "foo" # Name - # Class (0x1004) - .short 0x2a # Record length - .short 0x1504 # Record kind: LF_CLASS - .short 0x0 # MemberCount - .short 0x280 # Properties ( ForwardReference (0x80) | HasUniqueName (0x200) ) - .long 0x0 # FieldList - .long 0x0 # DerivedFrom - .long 0x0 # VShape - .short 0x0 # SizeOf - .asciz "Class1" # Name - .asciz ".?AVClass1@@" # LinkageName - .byte 242 - .byte 241 - # MemberFunction (0x1005) - .short 0x1a # Record length - .short 0x1009 # Record kind: LF_MFUNCTION - .long 0x74 # ReturnType: int - .long 0x1004 # ClassType: Class1 - .long 0x0 # ThisType - .byte 0x0 # CallingConvention: NearC - .byte 0x0 # FunctionOptions - .short 0x1 # NumParameters - .long 0x1001 # ArgListType: (int) - .long 0x0 # ThisAdjustment - # FieldList (0x1006) - .short 0xe # Record length - .short 0x1203 # Record kind: LF_FIELDLIST - .short 0x1511 # Member kind: OneMethod ( LF_ONEMETHOD ) - .short 0xb # Attrs: Public, Static - .long 0x1005 # Type: int Class1::(int) - .asciz "bar" # Name - # Class (0x1007) - .short 0x2a # Record length - .short 0x1504 # Record kind: LF_CLASS - .short 0x1 # MemberCount - .short 0x200 # Properties ( HasUniqueName (0x200) ) - .long 0x1006 # FieldList: - .long 0x0 # DerivedFrom - .long 0x0 # VShape - .short 0x1 # SizeOf - .asciz "Class1" # Name - .asciz ".?AVClass1@@" # LinkageName - .byte 242 - .byte 241 - # StringId (0x1008) - .short 0x12 # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "/tmp/./b.h" # StringData - .byte 241 - # UdtSourceLine (0x1009) - .short 0xe # Record length - .short 0x1606 # Record kind: LF_UDT_SRC_LINE - .long 0x1007 # UDT: Class1 - .long 0x1008 # SourceFile: /tmp/./b.h - .long 0x2 # LineNumber - # MemberFuncId (0x100A) - .short 0xe # Record length - .short 0x1602 # Record kind: LF_MFUNC_ID - .long 0x1004 # ClassType: Class1 - .long 0x1005 # FunctionType: int Class1::(int) - .asciz "bar" # Name - # Class (0x100B) - .short 0x42 # Record length - .short 0x1504 # Record kind: LF_CLASS - .short 0x0 # MemberCount - .short 0x280 # Properties ( ForwardReference (0x80) | HasUniqueName (0x200) ) - .long 0x0 # FieldList - .long 0x0 # DerivedFrom - .long 0x0 # VShape - .short 0x0 # SizeOf - .asciz "Namespace2::Class2" # Name - .asciz ".?AVClass2@Namespace2@@" # LinkageName - .byte 243 - .byte 242 - .byte 241 - # MemberFunction (0x100C) - .short 0x1a # Record length - .short 0x1009 # Record kind: LF_MFUNCTION - .long 0x74 # ReturnType: int - .long 0x100b # ClassType: Namespace2::Class2 - .long 0x0 # ThisType - .byte 0x0 # CallingConvention: NearC - .byte 0x0 # FunctionOptions - .short 0x1 # NumParameters - .long 0x1001 # ArgListType: (int) - .long 0x0 # ThisAdjustment - # FieldList (0x100D) - .short 0x12 # Record length - .short 0x1203 # Record kind: LF_FIELDLIST - .short 0x1511 # Member kind: OneMethod ( LF_ONEMETHOD ) - .short 0xb # Attrs: Public, Static - .long 0x100c # Type: int Namespace2::Class2::(int) - .asciz "func" # Name - .byte 243 - .byte 242 - .byte 241 - # Class (0x100E) - .short 0x42 # Record length - .short 0x1504 # Record kind: LF_CLASS - .short 0x1 # MemberCount - .short 0x200 # Properties ( HasUniqueName (0x200) ) 
- .long 0x100d # FieldList: - .long 0x0 # DerivedFrom - .long 0x0 # VShape - .short 0x1 # SizeOf - .asciz "Namespace2::Class2" # Name - .asciz ".?AVClass2@Namespace2@@" # LinkageName - .byte 243 - .byte 242 - .byte 241 - # StringId (0x100F) - .short 0x12 # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "/tmp/./c.h" # StringData - .byte 241 - # UdtSourceLine (0x1010) - .short 0xe # Record length - .short 0x1606 # Record kind: LF_UDT_SRC_LINE - .long 0x100e # UDT: Namespace2::Class2 - .long 0x100f # SourceFile: /tmp/./c.h - .long 0x2 # LineNumber - # MemberFuncId (0x1011) - .short 0x12 # Record length - .short 0x1602 # Record kind: LF_MFUNC_ID - .long 0x100b # ClassType: Namespace2::Class2 - .long 0x100c # FunctionType: int Namespace2::Class2::(int) - .asciz "func" # Name - .byte 243 - .byte 242 - .byte 241 - # Pointer (0x1012) - .short 0xa # Record length - .short 0x1002 # Record kind: LF_POINTER - .long 0x670 # PointeeType: char* - .long 0x1000c # Attrs: [ Type: Near64, Mode: Pointer, SizeOf: 8 ] - # ArgList (0x1013) - .short 0xe # Record length - .short 0x1201 # Record kind: LF_ARGLIST - .long 0x2 # NumArgs - .long 0x74 # Argument: int - .long 0x1012 # Argument: char** - # Procedure (0x1014) - .short 0xe # Record length - .short 0x1008 # Record kind: LF_PROCEDURE - .long 0x74 # ReturnType: int - .byte 0x0 # CallingConvention: NearC - .byte 0x0 # FunctionOptions - .short 0x2 # NumParameters - .long 0x1013 # ArgListType: (int, char**) - # FuncId (0x1015) - .short 0x12 # Record length - .short 0x1601 # Record kind: LF_FUNC_ID - .long 0x0 # ParentScope - .long 0x1014 # FunctionType: int (int, char**) - .asciz "main" # Name - .byte 243 - .byte 242 - .byte 241 - # Modifier (0x1016) - .short 0xa # Record length - .short 0x1001 # Record kind: LF_MODIFIER - .long 0x74 # ModifiedType: int - .short 0x2 # Modifiers ( Volatile (0x2) ) - .byte 242 - .byte 241 - # StringId (0x1017) - .short 0xe # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "/tmp" # StringData - .byte 243 - .byte 242 - .byte 241 - # StringId (0x1018) - .short 0xe # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "a.cpp" # StringData - .byte 242 - .byte 241 - # StringId (0x1019) - .short 0xa # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .byte 0 # StringData - .byte 243 - .byte 242 - .byte 241 - # StringId (0x101A) - .short 0x4e # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "/usr/local/google/home/zequanwu/llvm-project/build/release/bin/clang" # StringData - .byte 243 - .byte 242 - .byte 241 - # StringId (0x101B) - .short 0x9f6 # Record length - .short 0x1605 # Record kind: LF_STRING_ID - .long 0x0 # Id - .asciz "\"-cc1\" \"-triple\" \"x86_64-pc-windows-msvc19.20.0\" \"-S\" \"-disable-free\" \"-clear-ast-before-backend\" \"-disable-llvm-verifier\" \"-discard-value-names\" \"-mrelocation-model\" \"pic\" \"-pic-level\" \"2\" \"-mframe-pointer=none\" \"-relaxed-aliasing\" \"-fmath-errno\" \"-ffp-contract=on\" \"-fno-rounding-math\" \"-mconstructor-aliases\" \"-funwind-tables=2\" \"-target-cpu\" \"x86-64\" \"-mllvm\" \"-x86-asm-syntax=intel\" \"-tune-cpu\" \"generic\" \"-mllvm\" \"-treat-scalable-fixed-error-as-warning\" \"-D_MT\" \"-flto-visibility-public-std\" \"--dependent-lib=libcmt\" \"--dependent-lib=oldnames\" \"-stack-protector\" \"2\" \"-fms-volatile\" \"-fdiagnostics-format\" \"msvc\" \"-gno-column-info\" \"-gcodeview\" 
\"-debug-info-kind=constructor\" \"-ffunction-sections\" \"-fcoverage-compilation-dir=/tmp\" \"-resource-dir\" \"/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/DIA SDK/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/atlmfc/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/ucrt\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/shared\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/um\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/winrt\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/cppwinrt\" \"-Os\" \"-fdeprecated-macro\" \"-fdebug-compilation-dir=/tmp\" \"-ferror-limit\" \"19\" \"-fno-use-cxa-atexit\" \"-fms-extensions\" \"-fms-compatibility\" \"-fms-compatibility-version=19.20\" \"-std=c++14\" \"-fdelayed-template-parsing\" \"-fcolor-diagnostics\" \"-vectorize-loops\" \"-vectorize-slp\" \"-faddrsig\" \"-x\" \"c++\"" # StringData - .byte 242 - .byte 241 - # BuildInfo (0x101C) - .short 0x1a # Record length - .short 0x1603 # Record kind: LF_BUILDINFO - .short 0x5 # NumArgs - .long 0x1017 # Argument: /tmp - .long 0x101a # Argument: /usr/local/google/home/zequanwu/llvm-project/build/release/bin/clang - .long 0x1018 # Argument: a.cpp - .long 0x1019 # Argument - .long 0x101b # Argument: "-cc1" "-triple" "x86_64-pc-windows-msvc19.20.0" "-S" "-disable-free" "-clear-ast-before-backend" "-disable-llvm-verifier" "-discard-value-names" "-mrelocation-model" "pic" "-pic-level" "2" "-mframe-pointer=none" "-relaxed-aliasing" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" "x86-64" "-mllvm" "-x86-asm-syntax=intel" "-tune-cpu" "generic" "-mllvm" "-treat-scalable-fixed-error-as-warning" "-D_MT" "-flto-visibility-public-std" "--dependent-lib=libcmt" "--dependent-lib=oldnames" "-stack-protector" "2" "-fms-volatile" "-fdiagnostics-format" "msvc" "-gno-column-info" "-gcodeview" "-debug-info-kind=constructor" "-ffunction-sections" "-fcoverage-compilation-dir=/tmp" "-resource-dir" "/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0" "-internal-isystem" "/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/DIA SDK/include" "-internal-isystem" 
"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/atlmfc/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/ucrt" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/shared" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/um" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/winrt" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/cppwinrt" "-Os" "-fdeprecated-macro" "-fdebug-compilation-dir=/tmp" "-ferror-limit" "19" "-fno-use-cxa-atexit" "-fms-extensions" "-fms-compatibility" "-fms-compatibility-version=19.20" "-std=c++14" "-fdelayed-template-parsing" "-fcolor-diagnostics" "-vectorize-loops" "-vectorize-slp" "-faddrsig" "-x" "c++" - .byte 242 - .byte 241 - .addrsig +# Compiled from the following files, but replaced the call to abort with nop. +# clang-cl -fuse-ld=lld-link /Z7 /O1 /Faa.asm /winsysroot~/win_toolchain a.cpp +# a.cpp: +# #include "a.h" +# int main(int argc, char** argv) { +# volatile int main_local = Namespace1::foo(2); +# return 0; +# } +# a.h: +# #include +# #include "b.h" +# namespace Namespace1 { +# inline int foo(int x) { +# volatile int foo_local = x + 1; +# ++foo_local; +# if (!foo_local) +# abort(); +# return Class1::bar(foo_local); +# } +# } // namespace Namespace1 +# b.h: +# #include "c.h" +# class Class1 { +# public: +# inline static int bar(int x) { +# volatile int bar_local = x + 1; +# ++bar_local; +# return Namespace2::Class2::func(bar_local); +# } +# }; +# c.h: +# namespace Namespace2 { +# class Class2 { +# public: +# inline static int func(int x) { +# volatile int func_local = x + 1; +# func_local += x; +# return func_local; +# } +# }; +# } // namespace Namespace2 + + .text + .def @feat.00; + .scl 3; + .type 0; + .endef + .globl @feat.00 +.set @feat.00, 0 + .intel_syntax noprefix + .file "a.cpp" + .def main; + .scl 2; + .type 32; + .endef + .section .text,"xr",one_only,main + .globl main # -- Begin function main +main: # @main +.Lfunc_begin0: + .cv_func_id 0 + .cv_file 1 "/tmp/a.cpp" "4FFB96E5DF1A95CE7DB9732CFFE001D7" 1 + .cv_loc 0 1 2 0 # a.cpp:2:0 +.seh_proc main +# %bb.0: + #DEBUG_VALUE: main:argv <- $rdx + #DEBUG_VALUE: main:argc <- $ecx + #DEBUG_VALUE: foo:x <- 2 + sub rsp, 56 + .seh_stackalloc 56 + .seh_endprologue +.Ltmp0: + .cv_file 2 "/tmp/./a.h" "BBFED90EF093E9C1D032CC9B05B5D167" 1 + .cv_inline_site_id 1 within 0 inlined_at 1 3 0 + .cv_loc 1 2 5 0 # ./a.h:5:0 + mov dword ptr [rsp + 44], 3 + .cv_loc 1 2 6 0 # ./a.h:6:0 + inc dword ptr [rsp + 44] + .cv_loc 1 2 7 0 # ./a.h:7:0 + mov eax, dword ptr [rsp + 44] + test eax, eax + je .LBB0_2 +.Ltmp1: +# %bb.1: + #DEBUG_VALUE: main:argv <- $rdx + #DEBUG_VALUE: main:argc <- $ecx + #DEBUG_VALUE: foo:x <- 2 + .cv_loc 1 2 9 0 # ./a.h:9:0 + mov eax, dword ptr [rsp + 44] +.Ltmp2: + #DEBUG_VALUE: bar:x <- $eax + .cv_file 3 
"/tmp/./b.h" "A26CC743A260115F33AF91AB11F95877" 1 + .cv_inline_site_id 2 within 1 inlined_at 2 9 0 + .cv_loc 2 3 5 0 # ./b.h:5:0 + inc eax +.Ltmp3: + mov dword ptr [rsp + 52], eax + .cv_loc 2 3 6 0 # ./b.h:6:0 + inc dword ptr [rsp + 52] + .cv_loc 2 3 7 0 # ./b.h:7:0 + mov eax, dword ptr [rsp + 52] +.Ltmp4: + #DEBUG_VALUE: func:x <- $eax + .cv_file 4 "/tmp/./c.h" "8AF4613F78624BBE96D1C408ABA39B2D" 1 + .cv_inline_site_id 3 within 2 inlined_at 3 7 0 + .cv_loc 3 4 5 0 # ./c.h:5:0 + lea ecx, [rax + 1] +.Ltmp5: + #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx + mov dword ptr [rsp + 48], ecx + .cv_loc 3 4 6 0 # ./c.h:6:0 + add dword ptr [rsp + 48], eax + .cv_loc 3 4 7 0 # ./c.h:7:0 + mov eax, dword ptr [rsp + 48] +.Ltmp6: + .cv_loc 0 1 3 0 # a.cpp:3:0 + mov dword ptr [rsp + 48], eax + .cv_loc 0 1 4 0 # a.cpp:4:0 + xor eax, eax + # Use fake debug info to tests inline info. + .cv_loc 1 2 20 0 + add rsp, 56 + ret +.Ltmp7: +.LBB0_2: + #DEBUG_VALUE: main:argv <- $rdx + #DEBUG_VALUE: main:argc <- $ecx + #DEBUG_VALUE: foo:x <- 2 + .cv_loc 1 2 8 0 # ./a.h:8:0 + nop +.Ltmp8: + int3 +.Ltmp9: + #DEBUG_VALUE: main:argc <- [DW_OP_LLVM_entry_value 1] $ecx + #DEBUG_VALUE: main:argv <- [DW_OP_LLVM_entry_value 1] $rdx +.Lfunc_end0: + .seh_endproc + # -- End function + .section .drectve,"yn" + .ascii " /DEFAULTLIB:libcmt.lib" + .ascii " /DEFAULTLIB:oldnames.lib" + .section .debug$S,"dr" + .p2align 2 + .long 4 # Debug section magic + .long 241 + .long .Ltmp11-.Ltmp10 # Subsection size +.Ltmp10: + .short .Ltmp13-.Ltmp12 # Record length +.Ltmp12: + .short 4353 # Record kind: S_OBJNAME + .long 0 # Signature + .asciz "/tmp/a-2b2ba0.obj" # Object name + .p2align 2 +.Ltmp13: + .short .Ltmp15-.Ltmp14 # Record length +.Ltmp14: + .short 4412 # Record kind: S_COMPILE3 + .long 1 # Flags and language + .short 208 # CPUType + .short 15 # Frontend version + .short 0 + .short 0 + .short 0 + .short 15000 # Backend version + .short 0 + .short 0 + .short 0 + .asciz "clang version 15.0.0" # Null-terminated compiler version string + .p2align 2 +.Ltmp15: +.Ltmp11: + .p2align 2 + .long 246 # Inlinee lines subsection + .long .Ltmp17-.Ltmp16 # Subsection size +.Ltmp16: + .long 0 # Inlinee lines signature + + # Inlined function foo starts at ./a.h:4 + .long 4099 # Type index of inlined function + .cv_filechecksumoffset 2 # Offset into filechecksum table + .long 4 # Starting line number + + # Inlined function bar starts at ./b.h:4 + .long 4106 # Type index of inlined function + .cv_filechecksumoffset 3 # Offset into filechecksum table + .long 4 # Starting line number + + # Inlined function func starts at ./c.h:4 + .long 4113 # Type index of inlined function + .cv_filechecksumoffset 4 # Offset into filechecksum table + .long 4 # Starting line number +.Ltmp17: + .p2align 2 + .section .debug$S,"dr",associative,main + .p2align 2 + .long 4 # Debug section magic + .long 241 # Symbol subsection for main + .long .Ltmp19-.Ltmp18 # Subsection size +.Ltmp18: + .short .Ltmp21-.Ltmp20 # Record length +.Ltmp20: + .short 4423 # Record kind: S_GPROC32_ID + .long 0 # PtrParent + .long 0 # PtrEnd + .long 0 # PtrNext + .long .Lfunc_end0-main # Code size + .long 0 # Offset after prologue + .long 0 # Offset before epilogue + .long 4117 # Function type index + .secrel32 main # Function section relative address + .secidx main # Function section index + .byte 0 # Flags + .asciz "main" # Function name + .p2align 2 +.Ltmp21: + .short .Ltmp23-.Ltmp22 # Record length +.Ltmp22: + .short 4114 # Record kind: S_FRAMEPROC + .long 56 # FrameSize + .long 0 # 
Padding + .long 0 # Offset of padding + .long 0 # Bytes of callee saved registers + .long 0 # Exception handler offset + .short 0 # Exception handler section + .long 81920 # Flags (defines frame register) + .p2align 2 +.Ltmp23: + .short .Ltmp25-.Ltmp24 # Record length +.Ltmp24: + .short 4414 # Record kind: S_LOCAL + .long 116 # TypeIndex + .short 1 # Flags + .asciz "argc" + .p2align 2 +.Ltmp25: + .cv_def_range .Lfunc_begin0 .Ltmp5 .Ltmp7 .Ltmp8, reg, 18 + .short .Ltmp27-.Ltmp26 # Record length +.Ltmp26: + .short 4414 # Record kind: S_LOCAL + .long 4114 # TypeIndex + .short 1 # Flags + .asciz "argv" + .p2align 2 +.Ltmp27: + .cv_def_range .Lfunc_begin0 .Ltmp8, reg, 331 + .short .Ltmp29-.Ltmp28 # Record length +.Ltmp28: + .short 4414 # Record kind: S_LOCAL + .long 4118 # TypeIndex + .short 0 # Flags + .asciz "main_local" + .p2align 2 +.Ltmp29: + .cv_def_range .Ltmp0 .Ltmp9, frame_ptr_rel, 48 + .short .Ltmp31-.Ltmp30 # Record length +.Ltmp30: + .short 4429 # Record kind: S_INLINESITE + .long 0 # PtrParent + .long 0 # PtrEnd + .long 4099 # Inlinee type index + .cv_inline_linetable 1 2 4 .Lfunc_begin0 .Lfunc_end0 + .p2align 2 +.Ltmp31: + .short .Ltmp33-.Ltmp32 # Record length +.Ltmp32: + .short 4414 # Record kind: S_LOCAL + .long 116 # TypeIndex + .short 257 # Flags + .asciz "x" + .p2align 2 +.Ltmp33: + .short .Ltmp35-.Ltmp34 # Record length +.Ltmp34: + .short 4414 # Record kind: S_LOCAL + .long 4118 # TypeIndex + .short 0 # Flags + .asciz "foo_local" + .p2align 2 +.Ltmp35: + .cv_def_range .Ltmp0 .Ltmp6 .Ltmp7 .Ltmp9, frame_ptr_rel, 44 + .short .Ltmp37-.Ltmp36 # Record length +.Ltmp36: + .short 4429 # Record kind: S_INLINESITE + .long 0 # PtrParent + .long 0 # PtrEnd + .long 4106 # Inlinee type index + .cv_inline_linetable 2 3 4 .Lfunc_begin0 .Lfunc_end0 + .p2align 2 +.Ltmp37: + .short .Ltmp39-.Ltmp38 # Record length +.Ltmp38: + .short 4414 # Record kind: S_LOCAL + .long 116 # TypeIndex + .short 1 # Flags + .asciz "x" + .p2align 2 +.Ltmp39: + .cv_def_range .Ltmp2 .Ltmp3, reg, 17 + .short .Ltmp41-.Ltmp40 # Record length +.Ltmp40: + .short 4414 # Record kind: S_LOCAL + .long 4118 # TypeIndex + .short 0 # Flags + .asciz "bar_local" + .p2align 2 +.Ltmp41: + .cv_def_range .Ltmp2 .Ltmp6, frame_ptr_rel, 52 + .short .Ltmp43-.Ltmp42 # Record length +.Ltmp42: + .short 4429 # Record kind: S_INLINESITE + .long 0 # PtrParent + .long 0 # PtrEnd + .long 4113 # Inlinee type index + .cv_inline_linetable 3 4 4 .Lfunc_begin0 .Lfunc_end0 + .p2align 2 +.Ltmp43: + .short .Ltmp45-.Ltmp44 # Record length +.Ltmp44: + .short 4414 # Record kind: S_LOCAL + .long 116 # TypeIndex + .short 1 # Flags + .asciz "x" + .p2align 2 +.Ltmp45: + .cv_def_range .Ltmp4 .Ltmp6, reg, 17 + .short .Ltmp47-.Ltmp46 # Record length +.Ltmp46: + .short 4414 # Record kind: S_LOCAL + .long 4118 # TypeIndex + .short 0 # Flags + .asciz "func_local" + .p2align 2 +.Ltmp47: + .cv_def_range .Ltmp4 .Ltmp6, frame_ptr_rel, 48 + .short 2 # Record length + .short 4430 # Record kind: S_INLINESITE_END + .short 2 # Record length + .short 4430 # Record kind: S_INLINESITE_END + .short 2 # Record length + .short 4430 # Record kind: S_INLINESITE_END + .short 2 # Record length + .short 4431 # Record kind: S_PROC_ID_END +.Ltmp19: + .p2align 2 + .cv_linetable 0, main, .Lfunc_end0 + .section .debug$S,"dr" + .long 241 + .long .Ltmp49-.Ltmp48 # Subsection size +.Ltmp48: + .short .Ltmp51-.Ltmp50 # Record length +.Ltmp50: + .short 4360 # Record kind: S_UDT + .long 4103 # Type + .asciz "Class1" + .p2align 2 +.Ltmp51: + .short .Ltmp53-.Ltmp52 # Record length +.Ltmp52: + 
.short 4360 # Record kind: S_UDT + .long 4110 # Type + .asciz "Namespace2::Class2" + .p2align 2 +.Ltmp53: +.Ltmp49: + .p2align 2 + .cv_filechecksums # File index to string table offset subsection + .cv_stringtable # String table + .long 241 + .long .Ltmp55-.Ltmp54 # Subsection size +.Ltmp54: + .short .Ltmp57-.Ltmp56 # Record length +.Ltmp56: + .short 4428 # Record kind: S_BUILDINFO + .long 4124 # LF_BUILDINFO index + .p2align 2 +.Ltmp57: +.Ltmp55: + .p2align 2 + .section .debug$T,"dr" + .p2align 2 + .long 4 # Debug section magic + # StringId (0x1000) + .short 0x12 # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "Namespace1" # StringData + .byte 241 + # ArgList (0x1001) + .short 0xa # Record length + .short 0x1201 # Record kind: LF_ARGLIST + .long 0x1 # NumArgs + .long 0x74 # Argument: int + # Procedure (0x1002) + .short 0xe # Record length + .short 0x1008 # Record kind: LF_PROCEDURE + .long 0x74 # ReturnType: int + .byte 0x0 # CallingConvention: NearC + .byte 0x0 # FunctionOptions + .short 0x1 # NumParameters + .long 0x1001 # ArgListType: (int) + # FuncId (0x1003) + .short 0xe # Record length + .short 0x1601 # Record kind: LF_FUNC_ID + .long 0x1000 # ParentScope: Namespace1 + .long 0x1002 # FunctionType: int (int) + .asciz "foo" # Name + # Class (0x1004) + .short 0x2a # Record length + .short 0x1504 # Record kind: LF_CLASS + .short 0x0 # MemberCount + .short 0x280 # Properties ( ForwardReference (0x80) | HasUniqueName (0x200) ) + .long 0x0 # FieldList + .long 0x0 # DerivedFrom + .long 0x0 # VShape + .short 0x0 # SizeOf + .asciz "Class1" # Name + .asciz ".?AVClass1@@" # LinkageName + .byte 242 + .byte 241 + # MemberFunction (0x1005) + .short 0x1a # Record length + .short 0x1009 # Record kind: LF_MFUNCTION + .long 0x74 # ReturnType: int + .long 0x1004 # ClassType: Class1 + .long 0x0 # ThisType + .byte 0x0 # CallingConvention: NearC + .byte 0x0 # FunctionOptions + .short 0x1 # NumParameters + .long 0x1001 # ArgListType: (int) + .long 0x0 # ThisAdjustment + # FieldList (0x1006) + .short 0xe # Record length + .short 0x1203 # Record kind: LF_FIELDLIST + .short 0x1511 # Member kind: OneMethod ( LF_ONEMETHOD ) + .short 0xb # Attrs: Public, Static + .long 0x1005 # Type: int Class1::(int) + .asciz "bar" # Name + # Class (0x1007) + .short 0x2a # Record length + .short 0x1504 # Record kind: LF_CLASS + .short 0x1 # MemberCount + .short 0x200 # Properties ( HasUniqueName (0x200) ) + .long 0x1006 # FieldList: + .long 0x0 # DerivedFrom + .long 0x0 # VShape + .short 0x1 # SizeOf + .asciz "Class1" # Name + .asciz ".?AVClass1@@" # LinkageName + .byte 242 + .byte 241 + # StringId (0x1008) + .short 0x12 # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "/tmp/./b.h" # StringData + .byte 241 + # UdtSourceLine (0x1009) + .short 0xe # Record length + .short 0x1606 # Record kind: LF_UDT_SRC_LINE + .long 0x1007 # UDT: Class1 + .long 0x1008 # SourceFile: /tmp/./b.h + .long 0x2 # LineNumber + # MemberFuncId (0x100A) + .short 0xe # Record length + .short 0x1602 # Record kind: LF_MFUNC_ID + .long 0x1004 # ClassType: Class1 + .long 0x1005 # FunctionType: int Class1::(int) + .asciz "bar" # Name + # Class (0x100B) + .short 0x42 # Record length + .short 0x1504 # Record kind: LF_CLASS + .short 0x0 # MemberCount + .short 0x280 # Properties ( ForwardReference (0x80) | HasUniqueName (0x200) ) + .long 0x0 # FieldList + .long 0x0 # DerivedFrom + .long 0x0 # VShape + .short 0x0 # SizeOf + .asciz "Namespace2::Class2" # Name + .asciz ".?AVClass2@Namespace2@@" 
# LinkageName + .byte 243 + .byte 242 + .byte 241 + # MemberFunction (0x100C) + .short 0x1a # Record length + .short 0x1009 # Record kind: LF_MFUNCTION + .long 0x74 # ReturnType: int + .long 0x100b # ClassType: Namespace2::Class2 + .long 0x0 # ThisType + .byte 0x0 # CallingConvention: NearC + .byte 0x0 # FunctionOptions + .short 0x1 # NumParameters + .long 0x1001 # ArgListType: (int) + .long 0x0 # ThisAdjustment + # FieldList (0x100D) + .short 0x12 # Record length + .short 0x1203 # Record kind: LF_FIELDLIST + .short 0x1511 # Member kind: OneMethod ( LF_ONEMETHOD ) + .short 0xb # Attrs: Public, Static + .long 0x100c # Type: int Namespace2::Class2::(int) + .asciz "func" # Name + .byte 243 + .byte 242 + .byte 241 + # Class (0x100E) + .short 0x42 # Record length + .short 0x1504 # Record kind: LF_CLASS + .short 0x1 # MemberCount + .short 0x200 # Properties ( HasUniqueName (0x200) ) + .long 0x100d # FieldList: + .long 0x0 # DerivedFrom + .long 0x0 # VShape + .short 0x1 # SizeOf + .asciz "Namespace2::Class2" # Name + .asciz ".?AVClass2@Namespace2@@" # LinkageName + .byte 243 + .byte 242 + .byte 241 + # StringId (0x100F) + .short 0x12 # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "/tmp/./c.h" # StringData + .byte 241 + # UdtSourceLine (0x1010) + .short 0xe # Record length + .short 0x1606 # Record kind: LF_UDT_SRC_LINE + .long 0x100e # UDT: Namespace2::Class2 + .long 0x100f # SourceFile: /tmp/./c.h + .long 0x2 # LineNumber + # MemberFuncId (0x1011) + .short 0x12 # Record length + .short 0x1602 # Record kind: LF_MFUNC_ID + .long 0x100b # ClassType: Namespace2::Class2 + .long 0x100c # FunctionType: int Namespace2::Class2::(int) + .asciz "func" # Name + .byte 243 + .byte 242 + .byte 241 + # Pointer (0x1012) + .short 0xa # Record length + .short 0x1002 # Record kind: LF_POINTER + .long 0x670 # PointeeType: char* + .long 0x1000c # Attrs: [ Type: Near64, Mode: Pointer, SizeOf: 8 ] + # ArgList (0x1013) + .short 0xe # Record length + .short 0x1201 # Record kind: LF_ARGLIST + .long 0x2 # NumArgs + .long 0x74 # Argument: int + .long 0x1012 # Argument: char** + # Procedure (0x1014) + .short 0xe # Record length + .short 0x1008 # Record kind: LF_PROCEDURE + .long 0x74 # ReturnType: int + .byte 0x0 # CallingConvention: NearC + .byte 0x0 # FunctionOptions + .short 0x2 # NumParameters + .long 0x1013 # ArgListType: (int, char**) + # FuncId (0x1015) + .short 0x12 # Record length + .short 0x1601 # Record kind: LF_FUNC_ID + .long 0x0 # ParentScope + .long 0x1014 # FunctionType: int (int, char**) + .asciz "main" # Name + .byte 243 + .byte 242 + .byte 241 + # Modifier (0x1016) + .short 0xa # Record length + .short 0x1001 # Record kind: LF_MODIFIER + .long 0x74 # ModifiedType: int + .short 0x2 # Modifiers ( Volatile (0x2) ) + .byte 242 + .byte 241 + # StringId (0x1017) + .short 0xe # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "/tmp" # StringData + .byte 243 + .byte 242 + .byte 241 + # StringId (0x1018) + .short 0xe # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "a.cpp" # StringData + .byte 242 + .byte 241 + # StringId (0x1019) + .short 0xa # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .byte 0 # StringData + .byte 243 + .byte 242 + .byte 241 + # StringId (0x101A) + .short 0x4e # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "/usr/local/google/home/zequanwu/llvm-project/build/release/bin/clang" # StringData + .byte 243 + .byte 242 + 
.byte 241 + # StringId (0x101B) + .short 0x9f6 # Record length + .short 0x1605 # Record kind: LF_STRING_ID + .long 0x0 # Id + .asciz "\"-cc1\" \"-triple\" \"x86_64-pc-windows-msvc19.20.0\" \"-S\" \"-disable-free\" \"-clear-ast-before-backend\" \"-disable-llvm-verifier\" \"-discard-value-names\" \"-mrelocation-model\" \"pic\" \"-pic-level\" \"2\" \"-mframe-pointer=none\" \"-relaxed-aliasing\" \"-fmath-errno\" \"-ffp-contract=on\" \"-fno-rounding-math\" \"-mconstructor-aliases\" \"-funwind-tables=2\" \"-target-cpu\" \"x86-64\" \"-mllvm\" \"-x86-asm-syntax=intel\" \"-tune-cpu\" \"generic\" \"-mllvm\" \"-treat-scalable-fixed-error-as-warning\" \"-D_MT\" \"-flto-visibility-public-std\" \"--dependent-lib=libcmt\" \"--dependent-lib=oldnames\" \"-stack-protector\" \"2\" \"-fms-volatile\" \"-fdiagnostics-format\" \"msvc\" \"-gno-column-info\" \"-gcodeview\" \"-debug-info-kind=constructor\" \"-ffunction-sections\" \"-fcoverage-compilation-dir=/tmp\" \"-resource-dir\" \"/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/DIA SDK/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/atlmfc/include\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/ucrt\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/shared\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/um\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/winrt\" \"-internal-isystem\" \"/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/cppwinrt\" \"-Os\" \"-fdeprecated-macro\" \"-fdebug-compilation-dir=/tmp\" \"-ferror-limit\" \"19\" \"-fno-use-cxa-atexit\" \"-fms-extensions\" \"-fms-compatibility\" \"-fms-compatibility-version=19.20\" \"-std=c++14\" \"-fdelayed-template-parsing\" \"-fcolor-diagnostics\" \"-vectorize-loops\" \"-vectorize-slp\" \"-faddrsig\" \"-x\" \"c++\"" # StringData + .byte 242 + .byte 241 + # BuildInfo (0x101C) + .short 0x1a # Record length + .short 0x1603 # Record kind: LF_BUILDINFO + .short 0x5 # NumArgs + .long 0x1017 # Argument: /tmp + .long 0x101a # Argument: /usr/local/google/home/zequanwu/llvm-project/build/release/bin/clang + .long 0x1018 # Argument: a.cpp + .long 0x1019 # Argument + .long 0x101b # Argument: "-cc1" "-triple" "x86_64-pc-windows-msvc19.20.0" "-S" "-disable-free" "-clear-ast-before-backend" "-disable-llvm-verifier" "-discard-value-names" "-mrelocation-model" "pic" "-pic-level" "2" "-mframe-pointer=none" "-relaxed-aliasing" "-fmath-errno" "-ffp-contract=on" "-fno-rounding-math" "-mconstructor-aliases" "-funwind-tables=2" "-target-cpu" 
"x86-64" "-mllvm" "-x86-asm-syntax=intel" "-tune-cpu" "generic" "-mllvm" "-treat-scalable-fixed-error-as-warning" "-D_MT" "-flto-visibility-public-std" "--dependent-lib=libcmt" "--dependent-lib=oldnames" "-stack-protector" "2" "-fms-volatile" "-fdiagnostics-format" "msvc" "-gno-column-info" "-gcodeview" "-debug-info-kind=constructor" "-ffunction-sections" "-fcoverage-compilation-dir=/tmp" "-resource-dir" "/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0" "-internal-isystem" "/usr/local/google/home/zequanwu/llvm-project/build/release/lib/clang/15.0.0/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/DIA SDK/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/VC/Tools/MSVC/14.26.28801/atlmfc/include" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/ucrt" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/shared" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/um" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/winrt" "-internal-isystem" "/usr/local/google/home/zequanwu/chromium/src/third_party/depot_tools/win_toolchain/vs_files/3bda71a11e/Windows Kits/10/Include/10.0.19041.0/cppwinrt" "-Os" "-fdeprecated-macro" "-fdebug-compilation-dir=/tmp" "-ferror-limit" "19" "-fno-use-cxa-atexit" "-fms-extensions" "-fms-compatibility" "-fms-compatibility-version=19.20" "-std=c++14" "-fdelayed-template-parsing" "-fcolor-diagnostics" "-vectorize-loops" "-vectorize-slp" "-faddrsig" "-x" "c++" + .byte 242 + .byte 241 + .addrsig diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites_live.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites_live.lldbinit index 2291c7c45271..eab5061dafbd 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites_live.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/inline_sites_live.lldbinit @@ -1,7 +1,7 @@ -br set -p BP_bar -f inline_sites_live.cpp -br set -p BP_foo -f inline_sites_live.cpp -run -expression param -continue -expression param -expression local +br set -p BP_bar -f inline_sites_live.cpp +br set -p BP_foo -f inline_sites_live.cpp +run +expression param +continue +expression param +expression local diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/local-variables-registers.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/local-variables-registers.lldbinit index ad080da24dab..feda74856757 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/local-variables-registers.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/local-variables-registers.lldbinit @@ -1,35 +1,35 @@ -image lookup -a 0x140001000 -v -image lookup -a 0x140001003 -v -image lookup -a 0x140001006 -v - -image lookup -a 0x140001011 -v -image lookup -a 0x140001017 -v -image lookup -a 0x140001019 -v -image lookup -a 0x14000101e -v -image lookup -a 
0x14000102c -v - -image lookup -a 0x140001031 -v -image lookup -a 0x140001032 -v -image lookup -a 0x140001033 -v -image lookup -a 0x140001034 -v -image lookup -a 0x140001035 -v -image lookup -a 0x140001036 -v -image lookup -a 0x140001037 -v -image lookup -a 0x14000103b -v -image lookup -a 0x14000103d -v -image lookup -a 0x14000103f -v -image lookup -a 0x140001041 -v -image lookup -a 0x140001043 -v -image lookup -a 0x140001045 -v -image lookup -a 0x140001046 -v -image lookup -a 0x140001047 -v -image lookup -a 0x140001048 -v -image lookup -a 0x140001049 -v -image lookup -a 0x14000104a -v -image lookup -a 0x14000104b -v -image lookup -a 0x14000104c -v -image lookup -a 0x14000104e -v -image lookup -a 0x14000104f -v -image lookup -a 0x140001050 -v -image lookup -a 0x140001051 -v -exit +image lookup -a 0x140001000 -v +image lookup -a 0x140001003 -v +image lookup -a 0x140001006 -v + +image lookup -a 0x140001011 -v +image lookup -a 0x140001017 -v +image lookup -a 0x140001019 -v +image lookup -a 0x14000101e -v +image lookup -a 0x14000102c -v + +image lookup -a 0x140001031 -v +image lookup -a 0x140001032 -v +image lookup -a 0x140001033 -v +image lookup -a 0x140001034 -v +image lookup -a 0x140001035 -v +image lookup -a 0x140001036 -v +image lookup -a 0x140001037 -v +image lookup -a 0x14000103b -v +image lookup -a 0x14000103d -v +image lookup -a 0x14000103f -v +image lookup -a 0x140001041 -v +image lookup -a 0x140001043 -v +image lookup -a 0x140001045 -v +image lookup -a 0x140001046 -v +image lookup -a 0x140001047 -v +image lookup -a 0x140001048 -v +image lookup -a 0x140001049 -v +image lookup -a 0x14000104a -v +image lookup -a 0x14000104b -v +image lookup -a 0x14000104c -v +image lookup -a 0x14000104e -v +image lookup -a 0x14000104f -v +image lookup -a 0x140001050 -v +image lookup -a 0x140001051 -v +exit diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/lookup-by-types.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/lookup-by-types.lldbinit index afe3f2c8b943..3f639eb2e539 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/lookup-by-types.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/lookup-by-types.lldbinit @@ -1,4 +1,4 @@ -image lookup -type A -image lookup -type B - +image lookup -type A +image lookup -type B + quit \ No newline at end of file diff --git a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/subfield_register_simple_type.lldbinit b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/subfield_register_simple_type.lldbinit index 3dc33fd789da..32758f1fbc51 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/Inputs/subfield_register_simple_type.lldbinit +++ b/lldb/test/Shell/SymbolFile/NativePDB/Inputs/subfield_register_simple_type.lldbinit @@ -1,2 +1,2 @@ -image lookup -a 0x40102f -v -quit +image lookup -a 0x40102f -v +quit diff --git a/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp b/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp index ca2a84de7698..f0fac90e5065 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/function-types-classes.cpp @@ -113,9 +113,9 @@ auto incomplete = &three; // CHECK: |-CXXRecordDecl {{.*}} union U // CHECK: |-EnumDecl {{.*}} E // CHECK: |-CXXRecordDecl {{.*}} struct S -// CHECK: |-VarDecl {{.*}} a 'S (*)(C *, U &, E &&)' -// CHECK: |-VarDecl {{.*}} b 'E (*)(const S *, const C &, const U &&)' -// CHECK: |-VarDecl {{.*}} c 'U (*)(volatile E *, volatile S &, volatile C &&)' +// CHECK: |-VarDecl {{.*}} a 'S (*)(C *, U &, E &&)' +// CHECK: 
|-VarDecl {{.*}} b 'E (*)(const S *, const C &, const U &&)' +// CHECK: |-VarDecl {{.*}} c 'U (*)(volatile E *, volatile S &, volatile C &&)' // CHECK: |-VarDecl {{.*}} d 'C (*)(const volatile U *, const volatile E &, const volatile S &&)' // CHECK: |-CXXRecordDecl {{.*}} struct B // CHECK: | `-CXXRecordDecl {{.*}} struct A @@ -125,14 +125,14 @@ auto incomplete = &three; // CHECK: | | `-CXXRecordDecl {{.*}} struct S // CHECK: | `-NamespaceDecl {{.*}} B // CHECK: | `-CXXRecordDecl {{.*}} struct S -// CHECK: |-VarDecl {{.*}} e 'A::B::S *(*)(B::A::S *, A::C::S &)' -// CHECK: |-VarDecl {{.*}} f 'A::C::S &(*)(A::B::S *, B::A::S *)' +// CHECK: |-VarDecl {{.*}} e 'A::B::S *(*)(B::A::S *, A::C::S &)' +// CHECK: |-VarDecl {{.*}} f 'A::C::S &(*)(A::B::S *, B::A::S *)' // CHECK: |-VarDecl {{.*}} g 'B::A::S *(*)(A::C::S &, A::B::S *)' // CHECK: |-CXXRecordDecl {{.*}} struct TC // CHECK: |-CXXRecordDecl {{.*}} struct TC> // CHECK: |-CXXRecordDecl {{.*}} struct TC // CHECK: |-CXXRecordDecl {{.*}} struct TC -// CHECK: |-VarDecl {{.*}} h 'TC (*)(TC, TC>, TC)' +// CHECK: |-VarDecl {{.*}} h 'TC (*)(TC, TC>, TC)' // CHECK: |-VarDecl {{.*}} i 'A::B::S (*)()' // CHECK: |-CXXRecordDecl {{.*}} struct Incomplete // CHECK: `-VarDecl {{.*}} incomplete 'Incomplete *(*)(Incomplete **, const Incomplete *)' diff --git a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp index 767149ea18c4..402982726965 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/inline_sites_live.cpp @@ -1,34 +1,34 @@ -// clang-format off -// REQUIRES: system-windows - -// RUN: %build -o %t.exe -- %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ -// RUN: %p/Inputs/inline_sites_live.lldbinit 2>&1 | FileCheck %s - -void use(int) {} - -void __attribute__((always_inline)) bar(int param) { - use(param); // BP_bar -} - -void __attribute__((always_inline)) foo(int param) { - int local = param+1; - bar(local); - use(param); - use(local); // BP_foo -} - -int main(int argc, char** argv) { - foo(argc); -} - -// CHECK: * thread #1, stop reason = breakpoint 1 -// CHECK-NEXT: frame #0: {{.*}}`main [inlined] bar(param=2) -// CHECK: (lldb) expression param -// CHECK-NEXT: (int) $0 = 2 -// CHECK: * thread #1, stop reason = breakpoint 2 -// CHECK-NEXT: frame #0: {{.*}}`main [inlined] foo(param=1) -// CHECK: (lldb) expression param -// CHECK-NEXT: (int) $1 = 1 -// CHECK-NEXT: (lldb) expression local -// CHECK-NEXT: (int) $2 = 2 +// clang-format off +// REQUIRES: system-windows + +// RUN: %build -o %t.exe -- %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %p/Inputs/inline_sites_live.lldbinit 2>&1 | FileCheck %s + +void use(int) {} + +void __attribute__((always_inline)) bar(int param) { + use(param); // BP_bar +} + +void __attribute__((always_inline)) foo(int param) { + int local = param+1; + bar(local); + use(param); + use(local); // BP_foo +} + +int main(int argc, char** argv) { + foo(argc); +} + +// CHECK: * thread #1, stop reason = breakpoint 1 +// CHECK-NEXT: frame #0: {{.*}}`main [inlined] bar(param=2) +// CHECK: (lldb) expression param +// CHECK-NEXT: (int) $0 = 2 +// CHECK: * thread #1, stop reason = breakpoint 2 +// CHECK-NEXT: frame #0: {{.*}}`main [inlined] foo(param=1) +// CHECK: (lldb) expression param +// CHECK-NEXT: (int) $1 = 1 +// CHECK-NEXT: (lldb) expression local +// CHECK-NEXT: (int) $2 = 2 diff --git a/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp 
b/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp index f3aea8115f38..cd5bbfc30fa0 100644 --- a/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp +++ b/lldb/test/Shell/SymbolFile/NativePDB/lookup-by-types.cpp @@ -1,46 +1,46 @@ -// clang-format off - -// RUN: %build -o %t.exe -- %s -// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ -// RUN: %p/Inputs/lookup-by-types.lldbinit 2>&1 | FileCheck %s - -class B; -class A { -public: - static const A constA; - static A a; - static B b; - int val = 1; -}; -class B { -public: - static A a; - int val = 2; -}; -A varA; -B varB; -const A A::constA = varA; -A A::a = varA; -B A::b = varB; -A B::a = varA; - -int main(int argc, char **argv) { - return varA.val + varB.val; -} - -// CHECK: image lookup -type A -// CHECK-NEXT: 1 match found in {{.*}}.exe -// CHECK-NEXT: compiler_type = "class A { -// CHECK-NEXT: static const A constA; -// CHECK-NEXT: static A a; -// CHECK-NEXT: static B b; -// CHECK-NEXT: public: -// CHECK-NEXT: int val; -// CHECK-NEXT: }" -// CHECK: image lookup -type B -// CHECK-NEXT: 1 match found in {{.*}}.exe -// CHECK-NEXT: compiler_type = "class B { -// CHECK-NEXT: static A a; -// CHECK-NEXT: public: -// CHECK-NEXT: int val; -// CHECK-NEXT: }" +// clang-format off + +// RUN: %build -o %t.exe -- %s +// RUN: env LLDB_USE_NATIVE_PDB_READER=1 %lldb -f %t.exe -s \ +// RUN: %p/Inputs/lookup-by-types.lldbinit 2>&1 | FileCheck %s + +class B; +class A { +public: + static const A constA; + static A a; + static B b; + int val = 1; +}; +class B { +public: + static A a; + int val = 2; +}; +A varA; +B varB; +const A A::constA = varA; +A A::a = varA; +B A::b = varB; +A B::a = varA; + +int main(int argc, char **argv) { + return varA.val + varB.val; +} + +// CHECK: image lookup -type A +// CHECK-NEXT: 1 match found in {{.*}}.exe +// CHECK-NEXT: compiler_type = "class A { +// CHECK-NEXT: static const A constA; +// CHECK-NEXT: static A a; +// CHECK-NEXT: static B b; +// CHECK-NEXT: public: +// CHECK-NEXT: int val; +// CHECK-NEXT: }" +// CHECK: image lookup -type B +// CHECK-NEXT: 1 match found in {{.*}}.exe +// CHECK-NEXT: compiler_type = "class B { +// CHECK-NEXT: static A a; +// CHECK-NEXT: public: +// CHECK-NEXT: int val; +// CHECK-NEXT: }" diff --git a/lldb/unittests/Breakpoint/CMakeLists.txt b/lldb/unittests/Breakpoint/CMakeLists.txt index 757c2da1a4d9..db985bc82dc5 100644 --- a/lldb/unittests/Breakpoint/CMakeLists.txt +++ b/lldb/unittests/Breakpoint/CMakeLists.txt @@ -1,10 +1,10 @@ -add_lldb_unittest(LLDBBreakpointTests - BreakpointIDTest.cpp - WatchpointAlgorithmsTests.cpp - - LINK_LIBS - lldbBreakpoint - lldbCore - LINK_COMPONENTS - Support - ) +add_lldb_unittest(LLDBBreakpointTests + BreakpointIDTest.cpp + WatchpointAlgorithmsTests.cpp + + LINK_LIBS + lldbBreakpoint + lldbCore + LINK_COMPONENTS + Support + ) diff --git a/llvm/benchmarks/FormatVariadicBM.cpp b/llvm/benchmarks/FormatVariadicBM.cpp index c03ead400d0d..e351db338730 100644 --- a/llvm/benchmarks/FormatVariadicBM.cpp +++ b/llvm/benchmarks/FormatVariadicBM.cpp @@ -1,63 +1,63 @@ -//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. 
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "benchmark/benchmark.h"
-#include "llvm/Support/FormatVariadic.h"
-#include <algorithm>
-#include <string>
-#include <vector>
-
-using namespace llvm;
-using namespace std;
-
-// Generate a list of format strings that have `NumReplacements` replacements
-// by permuting the replacements and some literal text.
-static vector<string> getFormatStrings(int NumReplacements) {
-  vector<string> Components;
-  for (int I = 0; I < NumReplacements; I++)
-    Components.push_back("{" + to_string(I) + "}");
-  // Intersperse these with some other literal text (_).
-  const string_view Literal = "____";
-  for (char C : Literal)
-    Components.push_back(string(1, C));
-
-  vector<string> Formats;
-  do {
-    string Concat;
-    for (const string &C : Components)
-      Concat += C;
-    Formats.emplace_back(Concat);
-  } while (next_permutation(Components.begin(), Components.end()));
-  return Formats;
-}
-
-// Generate the set of formats to exercise outside the benchmark code.
-static const vector<vector<string>> Formats = {
-    getFormatStrings(1), getFormatStrings(2), getFormatStrings(3),
-    getFormatStrings(4), getFormatStrings(5),
-};
-
-// Benchmark formatv() for a variety of format strings and 1-5 replacements.
-static void BM_FormatVariadic(benchmark::State &state) {
-  for (auto _ : state) {
-    for (const string &Fmt : Formats[0])
-      formatv(Fmt.c_str(), 1).str();
-    for (const string &Fmt : Formats[1])
-      formatv(Fmt.c_str(), 1, 2).str();
-    for (const string &Fmt : Formats[2])
-      formatv(Fmt.c_str(), 1, 2, 3).str();
-    for (const string &Fmt : Formats[3])
-      formatv(Fmt.c_str(), 1, 2, 3, 4).str();
-    for (const string &Fmt : Formats[4])
-      formatv(Fmt.c_str(), 1, 2, 3, 4, 5).str();
-  }
-}
-
-BENCHMARK(BM_FormatVariadic);
-
-BENCHMARK_MAIN();
+//===- FormatVariadicBM.cpp - formatv() benchmark ---------- --------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "benchmark/benchmark.h"
+#include "llvm/Support/FormatVariadic.h"
+#include <algorithm>
+#include <string>
+#include <vector>
+
+using namespace llvm;
+using namespace std;
+
+// Generate a list of format strings that have `NumReplacements` replacements
+// by permuting the replacements and some literal text.
+static vector<string> getFormatStrings(int NumReplacements) {
+  vector<string> Components;
+  for (int I = 0; I < NumReplacements; I++)
+    Components.push_back("{" + to_string(I) + "}");
+  // Intersperse these with some other literal text (_).
+  const string_view Literal = "____";
+  for (char C : Literal)
+    Components.push_back(string(1, C));
+
+  vector<string> Formats;
+  do {
+    string Concat;
+    for (const string &C : Components)
+      Concat += C;
+    Formats.emplace_back(Concat);
+  } while (next_permutation(Components.begin(), Components.end()));
+  return Formats;
+}
+
+// Generate the set of formats to exercise outside the benchmark code.
+static const vector<vector<string>> Formats = {
+    getFormatStrings(1), getFormatStrings(2), getFormatStrings(3),
+    getFormatStrings(4), getFormatStrings(5),
+};
+
+// Benchmark formatv() for a variety of format strings and 1-5 replacements.
+static void BM_FormatVariadic(benchmark::State &state) { + for (auto _ : state) { + for (const string &Fmt : Formats[0]) + formatv(Fmt.c_str(), 1).str(); + for (const string &Fmt : Formats[1]) + formatv(Fmt.c_str(), 1, 2).str(); + for (const string &Fmt : Formats[2]) + formatv(Fmt.c_str(), 1, 2, 3).str(); + for (const string &Fmt : Formats[3]) + formatv(Fmt.c_str(), 1, 2, 3, 4).str(); + for (const string &Fmt : Formats[4]) + formatv(Fmt.c_str(), 1, 2, 3, 4, 5).str(); + } +} + +BENCHMARK(BM_FormatVariadic); + +BENCHMARK_MAIN(); diff --git a/llvm/benchmarks/GetIntrinsicForClangBuiltin.cpp b/llvm/benchmarks/GetIntrinsicForClangBuiltin.cpp index fa9c528424c9..953d9125e11e 100644 --- a/llvm/benchmarks/GetIntrinsicForClangBuiltin.cpp +++ b/llvm/benchmarks/GetIntrinsicForClangBuiltin.cpp @@ -1,50 +1,50 @@ -#include "benchmark/benchmark.h" -#include "llvm/IR/Intrinsics.h" - -using namespace llvm; -using namespace Intrinsic; - -// Benchmark intrinsic lookup from a variety of targets. -static void BM_GetIntrinsicForClangBuiltin(benchmark::State &state) { - static const char *Builtins[] = { - "__builtin_adjust_trampoline", - "__builtin_trap", - "__builtin_arm_ttest", - "__builtin_amdgcn_cubetc", - "__builtin_amdgcn_udot2", - "__builtin_arm_stc", - "__builtin_bpf_compare", - "__builtin_HEXAGON_A2_max", - "__builtin_lasx_xvabsd_b", - "__builtin_mips_dlsa", - "__nvvm_floor_f", - "__builtin_altivec_vslb", - "__builtin_r600_read_tgid_x", - "__builtin_riscv_aes64im", - "__builtin_s390_vcksm", - "__builtin_ve_vl_pvfmksge_Mvl", - "__builtin_ia32_axor64", - "__builtin_bitrev", - }; - static const char *Targets[] = {"", "aarch64", "amdgcn", "mips", - "nvvm", "r600", "riscv"}; - - for (auto _ : state) { - for (auto Builtin : Builtins) - for (auto Target : Targets) - getIntrinsicForClangBuiltin(Target, Builtin); - } -} - -static void -BM_GetIntrinsicForClangBuiltinHexagonFirst(benchmark::State &state) { - // Exercise the worst case by looking for the first builtin for a target - // that has a lot of builtins. - for (auto _ : state) - getIntrinsicForClangBuiltin("hexagon", "__builtin_HEXAGON_A2_abs"); -} - -BENCHMARK(BM_GetIntrinsicForClangBuiltin); -BENCHMARK(BM_GetIntrinsicForClangBuiltinHexagonFirst); - -BENCHMARK_MAIN(); +#include "benchmark/benchmark.h" +#include "llvm/IR/Intrinsics.h" + +using namespace llvm; +using namespace Intrinsic; + +// Benchmark intrinsic lookup from a variety of targets. +static void BM_GetIntrinsicForClangBuiltin(benchmark::State &state) { + static const char *Builtins[] = { + "__builtin_adjust_trampoline", + "__builtin_trap", + "__builtin_arm_ttest", + "__builtin_amdgcn_cubetc", + "__builtin_amdgcn_udot2", + "__builtin_arm_stc", + "__builtin_bpf_compare", + "__builtin_HEXAGON_A2_max", + "__builtin_lasx_xvabsd_b", + "__builtin_mips_dlsa", + "__nvvm_floor_f", + "__builtin_altivec_vslb", + "__builtin_r600_read_tgid_x", + "__builtin_riscv_aes64im", + "__builtin_s390_vcksm", + "__builtin_ve_vl_pvfmksge_Mvl", + "__builtin_ia32_axor64", + "__builtin_bitrev", + }; + static const char *Targets[] = {"", "aarch64", "amdgcn", "mips", + "nvvm", "r600", "riscv"}; + + for (auto _ : state) { + for (auto Builtin : Builtins) + for (auto Target : Targets) + getIntrinsicForClangBuiltin(Target, Builtin); + } +} + +static void +BM_GetIntrinsicForClangBuiltinHexagonFirst(benchmark::State &state) { + // Exercise the worst case by looking for the first builtin for a target + // that has a lot of builtins. 
+  for (auto _ : state)
+    getIntrinsicForClangBuiltin("hexagon", "__builtin_HEXAGON_A2_abs");
+}
+
+BENCHMARK(BM_GetIntrinsicForClangBuiltin);
+BENCHMARK(BM_GetIntrinsicForClangBuiltinHexagonFirst);
+
+BENCHMARK_MAIN();
diff --git a/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp b/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp
index 7f3bd3bc9eb6..758291274675 100644
--- a/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp
+++ b/llvm/benchmarks/GetIntrinsicInfoTableEntriesBM.cpp
@@ -1,30 +1,30 @@
-//===- GetIntrinsicInfoTableEntries.cpp - IIT signature benchmark ---------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "benchmark/benchmark.h"
-#include "llvm/ADT/SmallVector.h"
-#include "llvm/IR/Intrinsics.h"
-
-using namespace llvm;
-using namespace Intrinsic;
-
-static void BM_GetIntrinsicInfoTableEntries(benchmark::State &state) {
-  SmallVector<IITDescriptor> Table;
-  for (auto _ : state) {
-    for (ID ID = 1; ID < num_intrinsics; ++ID) {
-      // Clearing keeps the vector from growing and ensures that, after the
-      // first iteration, no additional allocations occur.
-      Table.clear();
-      getIntrinsicInfoTableEntries(ID, Table);
-    }
-  }
-}
-
-BENCHMARK(BM_GetIntrinsicInfoTableEntries);
-
-BENCHMARK_MAIN();
+//===- GetIntrinsicInfoTableEntries.cpp - IIT signature benchmark ---------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "benchmark/benchmark.h"
+#include "llvm/ADT/SmallVector.h"
+#include "llvm/IR/Intrinsics.h"
+
+using namespace llvm;
+using namespace Intrinsic;
+
+static void BM_GetIntrinsicInfoTableEntries(benchmark::State &state) {
+  SmallVector<IITDescriptor> Table;
+  for (auto _ : state) {
+    for (ID ID = 1; ID < num_intrinsics; ++ID) {
+      // Clearing keeps the vector from growing and ensures that, after the
+      // first iteration, no additional allocations occur.
+ Table.clear(); + getIntrinsicInfoTableEntries(ID, Table); + } + } +} + +BENCHMARK(BM_GetIntrinsicInfoTableEntries); + +BENCHMARK_MAIN(); diff --git a/llvm/docs/_static/LoopOptWG_invite.ics b/llvm/docs/_static/LoopOptWG_invite.ics index 65597d90a9c8..7c92e4048cc3 100644 --- a/llvm/docs/_static/LoopOptWG_invite.ics +++ b/llvm/docs/_static/LoopOptWG_invite.ics @@ -1,80 +1,80 @@ -BEGIN:VCALENDAR -PRODID:-//Google Inc//Google Calendar 70.9054//EN -VERSION:2.0 -CALSCALE:GREGORIAN -METHOD:PUBLISH -X-WR-CALNAME:LLVM Loop Optimization Discussion -X-WR-TIMEZONE:Europe/Berlin -BEGIN:VTIMEZONE -TZID:America/New_York -X-LIC-LOCATION:America/New_York -BEGIN:DAYLIGHT -TZOFFSETFROM:-0500 -TZOFFSETTO:-0400 -TZNAME:EDT -DTSTART:19700308T020000 -RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=2SU -END:DAYLIGHT -BEGIN:STANDARD -TZOFFSETFROM:-0400 -TZOFFSETTO:-0500 -TZNAME:EST -DTSTART:19701101T020000 -RRULE:FREQ=YEARLY;BYMONTH=11;BYDAY=1SU -END:STANDARD -END:VTIMEZONE -BEGIN:VEVENT -DTSTART;TZID=America/New_York:20240904T110000 -DTEND;TZID=America/New_York:20240904T120000 -RRULE:FREQ=MONTHLY;BYDAY=1WE -DTSTAMP:20240821T160951Z -UID:58h3f0kd3aooohmeii0johh23c@google.com -X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg -CREATED:20240821T151507Z -DESCRIPTION:LLVM Loop Optimization Discussion
Video call link: https://meet.google.c - om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB - 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ - :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ - nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) - +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm - z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp - ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n - -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ - :~:~::~:~::- -LAST-MODIFIED:20240821T160941Z -SEQUENCE:0 -STATUS:CONFIRMED -SUMMARY:LLVM Loop Optimization Discussion -TRANSP:OPAQUE -END:VEVENT -BEGIN:VEVENT -DTSTART;TZID=America/New_York:20240904T110000 -DTEND;TZID=America/New_York:20240904T120000 -DTSTAMP:20240821T160951Z -UID:58h3f0kd3aooohmeii0johh23c@google.com -X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg -RECURRENCE-ID;TZID=America/New_York:20240904T110000 -CREATED:20240821T151507Z -DESCRIPTION:LLVM Loop Optimization Discussion
Video call link: https://meet.google.c - om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB - 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ - :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ - nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) - +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm - z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp - ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n - -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ - :~:~::~:~::- -LAST-MODIFIED:20240821T160941Z -SEQUENCE:0 -STATUS:CONFIRMED -SUMMARY:LLVM Loop Optimization Discussion -TRANSP:OPAQUE -END:VEVENT -END:VCALENDAR +BEGIN:VCALENDAR +PRODID:-//Google Inc//Google Calendar 70.9054//EN +VERSION:2.0 +CALSCALE:GREGORIAN +METHOD:PUBLISH +X-WR-CALNAME:LLVM Loop Optimization Discussion +X-WR-TIMEZONE:Europe/Berlin +BEGIN:VTIMEZONE +TZID:America/New_York +X-LIC-LOCATION:America/New_York +BEGIN:DAYLIGHT +TZOFFSETFROM:-0500 +TZOFFSETTO:-0400 +TZNAME:EDT +DTSTART:19700308T020000 +RRULE:FREQ=YEARLY;BYMONTH=3;BYDAY=2SU +END:DAYLIGHT +BEGIN:STANDARD +TZOFFSETFROM:-0400 +TZOFFSETTO:-0500 +TZNAME:EST +DTSTART:19701101T020000 +RRULE:FREQ=YEARLY;BYMONTH=11;BYDAY=1SU +END:STANDARD +END:VTIMEZONE +BEGIN:VEVENT +DTSTART;TZID=America/New_York:20240904T110000 +DTEND;TZID=America/New_York:20240904T120000 +RRULE:FREQ=MONTHLY;BYDAY=1WE +DTSTAMP:20240821T160951Z +UID:58h3f0kd3aooohmeii0johh23c@google.com +X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg +CREATED:20240821T151507Z +DESCRIPTION:LLVM Loop Optimization Discussion
Video call link: https://meet.google.c + om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB + 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ + :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ + nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) + +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm + z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp + ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n + -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ + :~:~::~:~::- +LAST-MODIFIED:20240821T160941Z +SEQUENCE:0 +STATUS:CONFIRMED +SUMMARY:LLVM Loop Optimization Discussion +TRANSP:OPAQUE +END:VEVENT +BEGIN:VEVENT +DTSTART;TZID=America/New_York:20240904T110000 +DTEND;TZID=America/New_York:20240904T120000 +DTSTAMP:20240821T160951Z +UID:58h3f0kd3aooohmeii0johh23c@google.com +X-GOOGLE-CONFERENCE:https://meet.google.com/fmz-gspu-odg +RECURRENCE-ID;TZID=America/New_York:20240904T110000 +CREATED:20240821T151507Z +DESCRIPTION:LLVM Loop Optimization Discussion
Video call link: https://meet.google.c + om/fmz-gspu-odg
Agenda/Minutes/Discussion: https://docs.google.com/document/d/1sdzoyB + 11s0ccTZ3fobqctDpgJmRoFcz0sviKxqczs4g/edit?usp=sharing\n\n-::~:~::~:~:~ + :~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~::~:~::-\ + nJoin with Google Meet: https://meet.google.com/fmz-gspu-odg\nOr dial: (DE) + +49 40 8081617343 PIN: 948106286#\nMore phone numbers: https://tel.meet/fm + z-gspu-odg?pin=6273693382184&hs=7\n\nLearn more about Meet at: https://supp + ort.google.com/a/users/answer/9282720\n\nPlease do not edit this section.\n + -::~:~::~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~:~ + :~:~::~:~::- +LAST-MODIFIED:20240821T160941Z +SEQUENCE:0 +STATUS:CONFIRMED +SUMMARY:LLVM Loop Optimization Discussion +TRANSP:OPAQUE +END:VEVENT +END:VCALENDAR diff --git a/llvm/lib/Support/rpmalloc/CACHE.md b/llvm/lib/Support/rpmalloc/CACHE.md index 052320baf532..645093026deb 100644 --- a/llvm/lib/Support/rpmalloc/CACHE.md +++ b/llvm/lib/Support/rpmalloc/CACHE.md @@ -1,19 +1,19 @@ -# Thread caches -rpmalloc has a thread cache of free memory blocks which can be used in allocations without interfering with other threads or going to system to map more memory, as well as a global cache shared by all threads to let spans of memory pages flow between threads. Configuring the size of these caches can be crucial to obtaining good performance while minimizing memory overhead blowup. Below is a simple case study using the benchmark tool to compare different thread cache configurations for rpmalloc. - -The rpmalloc thread cache is configured to be unlimited, performance oriented as meaning default values, size oriented where both thread cache and global cache is reduced significantly, or disabled where both thread and global caches are disabled and completely free pages are directly unmapped. - -The benchmark is configured to run threads allocating 150000 blocks distributed in the `[16, 16000]` bytes range with a linear falloff probability. It runs 1000 loops, and every iteration 75000 blocks (50%) are freed and allocated in a scattered pattern. There are no cross thread allocations/deallocations. Parameters: `benchmark n 0 0 0 1000 150000 75000 16 16000`. The benchmarks are run on an Ubuntu 16.10 machine with 8 cores (4 physical, HT) and 12GiB RAM. - -The benchmark also includes results for the standard library malloc implementation as a reference for comparison with the nocache setting. - -![Ubuntu 16.10 random [16, 16000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=387883204&format=image) -![Ubuntu 16.10 random [16, 16000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=1644710241&format=image) - -For single threaded case the unlimited cache and performance oriented cache settings have identical performance and memory overhead, indicating that the memory pages fit in the combined thread and global cache. As number of threads increase to 2-4 threads, the performance settings have slightly higher performance which can seem odd at first, but can be explained by low contention on the global cache where some memory pages can flow between threads without stalling, reducing the overall number of calls to map new memory pages (also indicated by the slightly lower memory overhead). - -As threads increase even more to 5-10 threads, the increased contention and eventual limit of global cache cause the unlimited setting to gain a slight advantage in performance. 
As expected the memory overhead remains constant for unlimited caches, while going down for the performance setting when the number of threads increases.
-
-The size oriented setting maintains good performance compared to the standard library while reducing the memory overhead compared to the performance setting by a decent amount.
-
-The nocache setting still outperforms the reference standard library allocator for workloads up to 6 threads while maintaining a near zero memory overhead, which is even slightly lower than the standard library. For use case scenarios where the number of allocations of each size class is lower, the overhead in rpmalloc from the 64KiB span size will of course increase.
+# Thread caches
+rpmalloc has a thread cache of free memory blocks which can be used in allocations without interfering with other threads or going to the system to map more memory, as well as a global cache shared by all threads to let spans of memory pages flow between threads. Configuring the size of these caches can be crucial to obtaining good performance while minimizing memory overhead blowup. Below is a simple case study using the benchmark tool to compare different thread cache configurations for rpmalloc.
+
+The rpmalloc thread cache is configured to be unlimited, performance oriented (the default values), size oriented (both the thread cache and the global cache are reduced significantly), or disabled (both thread and global caches are disabled and completely free pages are directly unmapped).
+
+The benchmark is configured to run threads allocating 150000 blocks distributed in the `[16, 16000]` bytes range with a linear falloff probability. It runs 1000 loops, and every iteration 75000 blocks (50%) are freed and allocated in a scattered pattern. There are no cross thread allocations/deallocations. Parameters: `benchmark n 0 0 0 1000 150000 75000 16 16000`. The benchmarks are run on an Ubuntu 16.10 machine with 8 cores (4 physical, HT) and 12GiB RAM.
+
+The benchmark also includes results for the standard library malloc implementation as a reference for comparison with the nocache setting.
+
+![Ubuntu 16.10 random [16, 16000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=387883204&format=image)
+![Ubuntu 16.10 random [16, 16000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=1644710241&format=image)
+
+For the single threaded case the unlimited cache and performance oriented cache settings have identical performance and memory overhead, indicating that the memory pages fit in the combined thread and global cache. As the number of threads increases to 2-4 threads, the performance settings have slightly higher performance, which can seem odd at first but can be explained by low contention on the global cache, where some memory pages can flow between threads without stalling, reducing the overall number of calls to map new memory pages (also indicated by the slightly lower memory overhead).
+
+As threads increase even more to 5-10 threads, the increased contention and eventual limit of the global cache cause the unlimited setting to gain a slight advantage in performance. As expected the memory overhead remains constant for unlimited caches, while going down for the performance setting when the number of threads increases.
+
+The size oriented setting maintains good performance compared to the standard library while reducing the memory overhead compared to the performance setting by a decent amount.
+
+The nocache setting still outperforms the reference standard library allocator for workloads up to 6 threads while maintaining a near zero memory overhead, which is even slightly lower than the standard library. For use case scenarios where the number of allocations of each size class is lower, the overhead in rpmalloc from the 64KiB span size will of course increase.
diff --git a/llvm/lib/Support/rpmalloc/README.md b/llvm/lib/Support/rpmalloc/README.md
index 916bca0118d8..2233df9da42d 100644
--- a/llvm/lib/Support/rpmalloc/README.md
+++ b/llvm/lib/Support/rpmalloc/README.md
@@ -1,220 +1,220 @@
-# rpmalloc - General Purpose Memory Allocator
-This library provides a cross platform lock free thread caching 16-byte aligned memory allocator implemented in C.
-This is a fork of rpmalloc 1.4.5.
-
-Platforms currently supported:
-
-- Windows
-- MacOS
-- iOS
-- Linux
-- Android
-- Haiku
-
-The code should be easily portable to any platform with atomic operations and an mmap-style virtual memory management API. The API used to map/unmap memory pages can be configured at runtime to a custom implementation and mapping granularity/size.
-
-This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. Or, if you choose, you can use it under the MIT license.
-
-# Performance
-We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~3000 lines of C code. All allocations have a natural 16-byte alignment.
-
-Contained in a parallel repository is a benchmark utility that performs interleaved unaligned allocations and deallocations (both in-thread and cross-thread) in multiple threads. It measures the number of memory operations performed per CPU second, as well as memory overhead by comparing the virtual memory mapped with the number of bytes requested in allocation calls. The number of threads, cross-thread deallocation rate and allocation size limits are configured by command line arguments.
-
-https://github.com/mjansson/rpmalloc-benchmark
-
-Below is an example performance comparison chart of rpmalloc and other popular allocator implementations, with default configurations used.
-
-![Ubuntu 16.10, random [16, 8000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=301017877&format=image)
-
-The benchmark producing these numbers was run on an Ubuntu 16.10 machine with 8 logical cores (4 physical, HT). The actual numbers are not to be interpreted as absolute performance figures, but rather as relative comparisons between the different allocators. For additional benchmark results, see the [BENCHMARKS](BENCHMARKS.md) file.
-
-Configuration of the thread and global caches can be important depending on your use pattern. See [CACHE](CACHE.md) for a case study and some comments/guidelines.
-
-# Required functions
-
-Before calling any other function in the API, you __MUST__ call the initialization function, either __rpmalloc_initialize__ or __rpmalloc_initialize_config__, or you will get undefined behaviour when calling other rpmalloc entry points.
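To make the required call order concrete, here is a minimal lifecycle sketch using only entry points documented in this README. It assumes the `rpmalloc.h` header from this directory is on the include path and follows the header's convention that __rpmalloc_initialize__ returns 0 on success; error handling is otherwise elided.

```c
#include "rpmalloc.h"

int main(void) {
  // MUST come before any other rpmalloc call.
  if (rpmalloc_initialize() != 0)
    return 1;

  void *block = rpmalloc(128); // 16-byte aligned, like all rpmalloc blocks
  rpfree(block);

  // SHOULD come last: releases caches and unmaps virtual memory.
  rpmalloc_finalize();
  return 0;
}
```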
-
-Before terminating your use of the allocator, you __SHOULD__ call __rpmalloc_finalize__ in order to release caches and unmap virtual memory, as well as prepare the allocator for global scope cleanup at process exit or dynamic library unload depending on your use case.
-
-# Using
-The easiest way to use the library is simply adding __rpmalloc.[h|c]__ to your project and compiling them along with your sources. This contains only the rpmalloc specific entry points and does not provide internal hooks into process and/or thread creation at the moment. You are required to call these functions from your own code in order to initialize and finalize the allocator in your process and threads:
-
-__rpmalloc_initialize__ : Call at process start to initialize the allocator
-
-__rpmalloc_initialize_config__ : Optional entry point to call at process start to initialize the allocator with a custom memory mapping backend, memory page size and mapping granularity.
-
-__rpmalloc_finalize__: Call at process exit to finalize the allocator
-
-__rpmalloc_thread_initialize__: Call at each thread start to initialize the thread local data for the allocator
-
-__rpmalloc_thread_finalize__: Call at each thread exit to finalize and release the thread cache back to the global cache
-
-__rpmalloc_config__: Get the current runtime configuration of the allocator
-
-Then simply use the __rpmalloc__/__rpfree__ and the other malloc style replacement functions. Remember all allocations are 16-byte aligned, so there is no need to call the explicit rpmemalign/rpaligned_alloc/rpposix_memalign functions unless you need greater alignment; they are simply wrappers to make it easier to replace in existing code.
-
-If you wish to override the standard library malloc family of functions and have automatic initialization/finalization of process and threads, define __ENABLE_OVERRIDE__ to non-zero, which will include the `malloc.c` file in compilation of __rpmalloc.c__, and then rebuild the library or your project where you added the rpmalloc source. If you compile rpmalloc as a separate library you must make the linker use the override symbols from the library by referencing at least one symbol. The easiest way is to simply include `rpmalloc.h` in at least one source file and call `rpmalloc_linker_reference` somewhere - it's a dummy empty function. On Windows platforms and C++ overrides you have to `#include <rpnew.h>` in at least one source file and also manually handle the initialize/finalize of the process and all threads. The list of libc entry points replaced may not be complete; use libc/stdc++ replacement only as a convenience for testing the library on an existing code base, not as a final solution.
-
-For explicit first class heaps, see the __rpmalloc_heap_*__ API under the [first class heaps](#first-class-heaps) section, requiring __RPMALLOC_FIRST_CLASS_HEAPS__ to be defined to 1.
-
-# Building
-To compile as a static library run the configure python script, which generates a Ninja build script, then build using ninja. The ninja build produces two static libraries, one named `rpmalloc` and one named `rpmallocwrap`, where the latter includes the libc entry point overrides.
-
-The configure + ninja build also produces two shared object/dynamic libraries. The `rpmallocwrap` shared library can be used with LD_PRELOAD/DYLD_INSERT_LIBRARIES to inject into a preexisting binary, replacing any malloc/free family of function calls. This is only implemented for Linux and macOS targets.
The list of libc entry points replaced may not be complete, use preloading as a convenience for testing the library on an existing binary, not a final solution. The dynamic library also provides automatic init/fini of process and threads for all platforms. - -The latest stable release is available in the master branch. For latest development code, use the develop branch. - -# Cache configuration options -Free memory pages are cached both per thread and in a global cache for all threads. The size of the thread caches is determined by an adaptive scheme where each cache is limited by a percentage of the maximum allocation count of the corresponding size class. The size of the global caches is determined by a multiple of the maximum of all thread caches. The factors controlling the cache sizes can be set by editing the individual defines in the `rpmalloc.c` source file for fine tuned control. - -__ENABLE_UNLIMITED_CACHE__: By default defined to 0, set to 1 to make all caches infinite, i.e never release spans to global cache unless thread finishes and never unmap memory pages back to the OS. Highest performance but largest memory overhead. - -__ENABLE_UNLIMITED_GLOBAL_CACHE__: By default defined to 0, set to 1 to make global caches infinite, i.e never unmap memory pages back to the OS. - -__ENABLE_UNLIMITED_THREAD_CACHE__: By default defined to 0, set to 1 to make thread caches infinite, i.e never release spans to global cache unless thread finishes. - -__ENABLE_GLOBAL_CACHE__: By default defined to 1, enables the global cache shared between all threads. Set to 0 to disable the global cache and directly unmap pages evicted from the thread cache. - -__ENABLE_THREAD_CACHE__: By default defined to 1, enables the per-thread cache. Set to 0 to disable the thread cache and directly unmap pages no longer in use (also disables the global cache). - -__ENABLE_ADAPTIVE_THREAD_CACHE__: Introduces a simple heuristics in the thread cache size, keeping 25% of the high water mark for each span count class. - -# Other configuration options -Detailed statistics are available if __ENABLE_STATISTICS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. This will cause a slight overhead in runtime to collect statistics for each memory operation, and will also add 4 bytes overhead per allocation to track sizes. - -Integer safety checks on all calls are enabled if __ENABLE_VALIDATE_ARGS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. If enabled, size arguments to the global entry points are verified not to cause integer overflows in calculations. - -Asserts are enabled if __ENABLE_ASSERTS__ is defined to 1 (default is 0, or disabled), either on compile command line or by setting the value in `rpmalloc.c`. - -To include __malloc.c__ in compilation and provide overrides of standard library malloc entry points define __ENABLE_OVERRIDE__ to 1. To enable automatic initialization of finalization of process and threads in order to preload the library into executables using standard library malloc, define __ENABLE_PRELOAD__ to 1. - -To enable the runtime configurable memory page and span sizes, define __RPMALLOC_CONFIGURABLE__ to 1. By default, memory page size is determined by system APIs and memory span size is set to 64KiB. - -To enable support for first class heaps, define __RPMALLOC_FIRST_CLASS_HEAPS__ to 1. By default, the first class heap API is disabled. 
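As a hedged sketch of the scoped-allocation pattern this define enables: the function names below come from the __rpmalloc_heap_*__ API referenced in this README, but the exact signatures are assumptions taken from `rpmalloc.h`, and the heap API is not thread-safe, so each heap must stay on a single thread at any given time.

```c
#include "rpmalloc.h"

// Allocate a batch of temporary blocks, then release them all in one call.
void with_scoped_heap(void) {
  rpmalloc_heap_t *heap = rpmalloc_heap_acquire();

  void *a = rpmalloc_heap_alloc(heap, 64);
  void *b = rpmalloc_heap_alloc(heap, 4096);
  (void)a;
  (void)b;

  rpmalloc_heap_free_all(heap); // frees every block owned by this heap
  rpmalloc_heap_release(heap);  // hands the heap itself back to the allocator
}
```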
- -# Huge pages -The allocator has support for huge/large pages on Windows, Linux and MacOS. To enable it, pass a non-zero value in the config value `enable_huge_pages` when initializing the allocator with `rpmalloc_initialize_config`. If the system does not support huge pages it will be automatically disabled. You can query the status by looking at `enable_huge_pages` in the config returned from a call to `rpmalloc_config` after initialization is done. - -# Quick overview -The allocator is similar in spirit to tcmalloc from the [Google Performance Toolkit](https://github.com/gperftools/gperftools). It uses separate heaps for each thread and partitions memory blocks according to a preconfigured set of size classes, up to 2MiB. Larger blocks are mapped and unmapped directly. Allocations for different size classes will be served from different set of memory pages, each "span" of pages is dedicated to one size class. Spans of pages can flow between threads when the thread cache overflows and are released to a global cache, or when the thread ends. Unlike tcmalloc, single blocks do not flow between threads, only entire spans of pages. - -# Implementation details -The allocator is based on a fixed but configurable page alignment (defaults to 64KiB) and 16 byte block alignment, where all runs of memory pages (spans) are mapped to this alignment boundary. On Windows this is automatically guaranteed up to 64KiB by the VirtualAlloc granularity, and on mmap systems it is achieved by oversizing the mapping and aligning the returned virtual memory address to the required boundaries. By aligning to a fixed size the free operation can locate the header of the memory span without having to do a table lookup (as tcmalloc does) by simply masking out the low bits of the address (for 64KiB this would be the low 16 bits). - -Memory blocks are divided into three categories. For 64KiB span size/alignment the small blocks are [16, 1024] bytes, medium blocks (1024, 32256] bytes, and large blocks (32256, 2097120] bytes. The three categories are further divided in size classes. If the span size is changed, the small block classes remain but medium blocks go from (1024, span size] bytes. - -Small blocks have a size class granularity of 16 bytes each in 64 buckets. Medium blocks have a granularity of 512 bytes, 61 buckets (default). Large blocks have the same granularity as the configured span size (default 64KiB). All allocations are fitted to these size class boundaries (an allocation of 36 bytes will allocate a block of 48 bytes). Each small and medium size class has an associated span (meaning a contiguous set of memory pages) configuration describing how many pages the size class will allocate each time the cache is empty and a new allocation is requested. - -Spans for small and medium blocks are cached in four levels to avoid calls to map/unmap memory pages. The first level is a per thread single active span for each size class. The second level is a per thread list of partially free spans for each size class. The third level is a per thread list of free spans. The fourth level is a global list of free spans. - -Each span for a small and medium size class keeps track of how many blocks are allocated/free, as well as a list of which blocks that are free for allocation. To avoid locks, each span is completely owned by the allocating thread, and all cross-thread deallocations will be deferred to the owner thread through a separate free list per span. - -Large blocks, or super spans, are cached in two levels. 
The first level is a per thread list of free super spans. The second level is a global list of free super spans. - -# Memory mapping -By default the allocator uses OS APIs to map virtual memory pages as needed, either `VirtualAlloc` on Windows or `mmap` on POSIX systems. If you want to use your own custom memory mapping provider you can use __rpmalloc_initialize_config__ and pass function pointers to map and unmap virtual memory. These function should reserve and free the requested number of bytes. - -The returned memory address from the memory map function MUST be aligned to the memory page size and the memory span size (which ever is larger), both of which is configurable. Either provide the page and span sizes during initialization using __rpmalloc_initialize_config__, or use __rpmalloc_config__ to find the required alignment which is equal to the maximum of page and span size. The span size MUST be a power of two in [4096, 262144] range, and be a multiple or divisor of the memory page size. - -Memory mapping requests are always done in multiples of the memory page size. You can specify a custom page size when initializing rpmalloc with __rpmalloc_initialize_config__, or pass 0 to let rpmalloc determine the system memory page size using OS APIs. The page size MUST be a power of two. - -To reduce system call overhead, memory spans are mapped in batches controlled by the `span_map_count` configuration variable (which defaults to the `DEFAULT_SPAN_MAP_COUNT` value if 0, which in turn is sized according to the cache configuration define, defaulting to 64). If the memory page size is larger than the span size, the number of spans to map in a single call will be adjusted to guarantee a multiple of the page size, and the spans will be kept mapped until the entire span range can be unmapped in one call (to avoid trying to unmap partial pages). - -On macOS and iOS mmap requests are tagged with tag 240 for easy identification with the vmmap tool. - -# Span breaking -Super spans (spans a multiple > 1 of the span size) can be subdivided into smaller spans to fulfill a need to map a new span of memory. By default the allocator will greedily grab and break any larger span from the available caches before mapping new virtual memory. However, spans can currently not be glued together to form larger super spans again. Subspans can traverse the cache and be used by different threads individually. - -A span that is a subspan of a larger super span can be individually decommitted to reduce physical memory pressure when the span is evicted from caches and scheduled to be unmapped. The entire original super span will keep track of the subspans it is broken up into, and when the entire range is decommitted the super span will be unmapped. This allows platforms like Windows that require the entire virtual memory range that was mapped in a call to VirtualAlloc to be unmapped in one call to VirtualFree, while still decommitting individual pages in subspans (if the page size is smaller than the span size). - -If you use a custom memory map/unmap function you need to take this into account by looking at the `release` parameter given to the `memory_unmap` function. It is set to 0 for decommitting individual pages and the total super span byte size for finally releasing the entire super span memory range. 
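A hedged sketch of a custom map/unmap pair honoring that `release` protocol, for POSIX systems: the `rpmalloc_config_t` field names, the `*offset` reporting scheme and the 64KiB span alignment are assumptions based on the surrounding text and `rpmalloc.h`, not a verified drop-in implementation (for brevity, the final `munmap` ignores any alignment padding beyond the reported offset).

```c
#include <stdint.h>
#include <string.h>
#include <sys/mman.h>
#include "rpmalloc.h"

#define ASSUMED_SPAN_SIZE ((size_t)64 * 1024) // default span size/alignment

static void *my_map(size_t size, size_t *offset) {
  // Oversize the mapping so the returned address can be aligned to the span
  // size, reporting the alignment padding back through *offset.
  void *ptr = mmap(0, size + ASSUMED_SPAN_SIZE, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (ptr == MAP_FAILED)
    return 0;
  uintptr_t aligned = ((uintptr_t)ptr + (ASSUMED_SPAN_SIZE - 1)) &
                      ~((uintptr_t)ASSUMED_SPAN_SIZE - 1);
  *offset = (size_t)(aligned - (uintptr_t)ptr);
  return (void *)aligned;
}

static void my_unmap(void *address, size_t size, size_t offset, size_t release) {
  if (release) {
    // Non-zero release: final unmap of the whole range, including the
    // leading alignment padding reported from my_map.
    munmap((char *)address - offset, release + offset);
  } else {
    // Zero release: decommit the pages but keep the address range mapped.
    madvise(address, size, MADV_DONTNEED);
  }
}

void init_with_custom_mapping(void) {
  rpmalloc_config_t config;
  memset(&config, 0, sizeof(config));
  config.memory_map = my_map;
  config.memory_unmap = my_unmap;
  rpmalloc_initialize_config(&config);
}
```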
- -# Memory fragmentation -There is no memory fragmentation by the allocator in the sense that it will not leave unallocated and unusable "holes" in the memory pages by calls to allocate and free blocks of different sizes. This is due to the fact that the memory pages allocated for each size class is split up in perfectly aligned blocks which are not reused for a request of a different size. The block freed by a call to `rpfree` will always be immediately available for an allocation request within the same size class. - -However, there is memory fragmentation in the meaning that a request for x bytes followed by a request of y bytes where x and y are at least one size class different in size will return blocks that are at least one memory page apart in virtual address space. Only blocks of the same size will potentially be within the same memory page span. - -rpmalloc keeps an "active span" and free list for each size class. This leads to back-to-back allocations will most likely be served from within the same span of memory pages (unless the span runs out of free blocks). The rpmalloc implementation will also use any "holes" in memory pages in semi-filled spans before using a completely free span. - -# First class heaps -rpmalloc provides a first class heap type with explicit heap control API. Heaps are maintained with calls to __rpmalloc_heap_acquire__ and __rpmalloc_heap_release__ and allocations/frees are done with __rpmalloc_heap_alloc__ and __rpmalloc_heap_free__. See the `rpmalloc.h` documentation for the full list of functions in the heap API. The main use case of explicit heap control is to scope allocations in a heap and release everything with a single call to __rpmalloc_heap_free_all__ without having to maintain ownership of memory blocks. Note that the heap API is not thread-safe, the caller must make sure that each heap is only used in a single thread at any given time. - -# Producer-consumer scenario -Compared to the some other allocators, rpmalloc does not suffer as much from a producer-consumer thread scenario where one thread allocates memory blocks and another thread frees the blocks. In some allocators the free blocks need to traverse both the thread cache of the thread doing the free operations as well as the global cache before being reused in the allocating thread. In rpmalloc the freed blocks will be reused as soon as the allocating thread needs to get new spans from the thread cache. This enables faster release of completely freed memory pages as blocks in a memory page will not be aliased between different owning threads. - -# Best case scenarios -Threads that keep ownership of allocated memory blocks within the thread and free the blocks from the same thread will have optimal performance. - -Threads that have allocation patterns where the difference in memory usage high and low water marks fit within the thread cache thresholds in the allocator will never touch the global cache except during thread init/fini and have optimal performance. Tweaking the cache limits can be done on a per-size-class basis. - -# Worst case scenarios -Since each thread cache maps spans of memory pages per size class, a thread that allocates just a few blocks of each size class (16, 32, ...) for many size classes will never fill each bucket, and thus map a lot of memory pages while only using a small fraction of the mapped memory. 
However, the wasted memory will always be less than 4KiB (or the configured memory page size) per size class as each span is initialized one memory page at a time. The cache for free spans will be reused by all size classes. - -Threads that perform a lot of allocations and deallocations in a pattern that have a large difference in high and low water marks, and that difference is larger than the thread cache size, will put a lot of contention on the global cache. What will happen is the thread cache will overflow on each low water mark causing pages to be released to the global cache, then underflow on high water mark causing pages to be re-acquired from the global cache. This can be mitigated by changing the __MAX_SPAN_CACHE_DIVISOR__ define in the source code (at the cost of higher average memory overhead). - -# Caveats -VirtualAlloc has an internal granularity of 64KiB. However, mmap lacks this granularity control, and the implementation instead oversizes the memory mapping with configured span size to be able to always return a memory area with the required alignment. Since the extra memory pages are never touched this will not result in extra committed physical memory pages, but rather only increase virtual memory address space. - -All entry points assume the passed values are valid, for example passing an invalid pointer to free would most likely result in a segmentation fault. __The library does not try to guard against errors!__. - -To support global scope data doing dynamic allocation/deallocation such as C++ objects with custom constructors and destructors, the call to __rpmalloc_finalize__ will not completely terminate the allocator but rather empty all caches and put the allocator in finalization mode. Once this call has been made, the allocator is no longer thread safe and expects all remaining calls to originate from global data destruction on main thread. Any spans or heaps becoming free during this phase will be immediately unmapped to allow correct teardown of the process or dynamic library without any leaks. - -# Other languages - -[Johan Andersson](https://github.com/repi) at Embark has created a Rust wrapper available at [rpmalloc-rs](https://github.com/EmbarkStudios/rpmalloc-rs) - -[Stas Denisov](https://github.com/nxrighthere) has created a C# wrapper available at [Rpmalloc-CSharp](https://github.com/nxrighthere/Rpmalloc-CSharp) - -# License - -This is free and unencumbered software released into the public domain. - -Anyone is free to copy, modify, publish, use, compile, sell, or -distribute this software, either in source code form or as a compiled -binary, for any purpose, commercial or non-commercial, and by any -means. - -In jurisdictions that recognize copyright laws, the author or authors -of this software dedicate any and all copyright interest in the -software to the public domain. We make this dedication for the benefit -of the public at large and to the detriment of our heirs and -successors. We intend this dedication to be an overt act of -relinquishment in perpetuity of all present and future rights to this -software under copyright law. - -THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, -EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF -MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR
-OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
-ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
-OTHER DEALINGS IN THE SOFTWARE.
-
-For more information, please refer to <http://unlicense.org>
-
-
-You can also use this software under the MIT license if public domain is
-not recognized in your country
-
-
-The MIT License (MIT)
-
-Copyright (c) 2017 Mattias Jansson
-
-Permission is hereby granted, free of charge, to any person obtaining a copy
-of this software and associated documentation files (the "Software"), to deal
-in the Software without restriction, including without limitation the rights
-to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
-copies of the Software, and to permit persons to whom the Software is
-furnished to do so, subject to the following conditions:
-
-The above copyright notice and this permission notice shall be included in
-all copies or substantial portions of the Software.
-
-THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
-IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
-FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
-AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
-LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
-OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
-THE SOFTWARE.
+# rpmalloc - General Purpose Memory Allocator
+This library provides a cross platform lock free thread caching 16-byte aligned memory allocator implemented in C.
+This is a fork of rpmalloc 1.4.5.
+
+Platforms currently supported:
+
+- Windows
+- MacOS
+- iOS
+- Linux
+- Android
+- Haiku
+
+The code should be easily portable to any platform with atomic operations and an mmap-style virtual memory management API. The API used to map/unmap memory pages can be configured at runtime to a custom implementation and mapping granularity/size.
+
+This library is put in the public domain; you can redistribute it and/or modify it without any restrictions. Or, if you choose, you can use it under the MIT license.
+
+# Performance
+We believe rpmalloc is faster than most popular memory allocators like tcmalloc, hoard, ptmalloc3 and others without causing extra allocated memory overhead in the thread caches compared to these allocators. We also believe the implementation to be easier to read and modify compared to these allocators, as it is a single source file of ~3000 lines of C code. All allocations have a natural 16-byte alignment.
+
+Contained in a parallel repository is a benchmark utility that performs interleaved unaligned allocations and deallocations (both in-thread and cross-thread) in multiple threads. It measures the number of memory operations performed per CPU second, as well as memory overhead by comparing the virtual memory mapped with the number of bytes requested in allocation calls. The number of threads, cross-thread deallocation rate and allocation size limits are configured by command line arguments.
+
+https://github.com/mjansson/rpmalloc-benchmark
+
+Below is an example performance comparison chart of rpmalloc and other popular allocator implementations, with default configurations used.
+
+![Ubuntu 16.10, random [16, 8000] bytes, 8 cores](https://docs.google.com/spreadsheets/d/1NWNuar1z0uPCB5iVS_Cs6hSo2xPkTmZf0KsgWS_Fb_4/pubchart?oid=301017877&format=image)
+
+The benchmark producing these numbers was run on an Ubuntu 16.10 machine with 8 logical cores (4 physical, HT). The actual numbers are not to be interpreted as absolute performance figures, but rather as relative comparisons between the different allocators. For additional benchmark results, see the [BENCHMARKS](BENCHMARKS.md) file.
+
+Configuration of the thread and global caches can be important depending on your use pattern. See [CACHE](CACHE.md) for a case study and some comments/guidelines.
+
+# Required functions
+
+Before calling any other function in the API, you __MUST__ call the initialization function, either __rpmalloc_initialize__ or __rpmalloc_initialize_config__, or you will get undefined behaviour when calling other rpmalloc entry points.
+
+Before terminating your use of the allocator, you __SHOULD__ call __rpmalloc_finalize__ in order to release caches and unmap virtual memory, as well as prepare the allocator for global scope cleanup at process exit or dynamic library unload depending on your use case.
+
+# Using
+The easiest way to use the library is simply adding __rpmalloc.[h|c]__ to your project and compiling them along with your sources. This contains only the rpmalloc specific entry points and does not provide internal hooks into process and/or thread creation at the moment. You are required to call these functions from your own code in order to initialize and finalize the allocator in your process and threads:
+
+__rpmalloc_initialize__ : Call at process start to initialize the allocator
+
+__rpmalloc_initialize_config__ : Optional entry point to call at process start to initialize the allocator with a custom memory mapping backend, memory page size and mapping granularity.
+
+__rpmalloc_finalize__: Call at process exit to finalize the allocator
+
+__rpmalloc_thread_initialize__: Call at each thread start to initialize the thread local data for the allocator
+
+__rpmalloc_thread_finalize__: Call at each thread exit to finalize and release the thread cache back to the global cache
+
+__rpmalloc_config__: Get the current runtime configuration of the allocator
+
+Then simply use the __rpmalloc__/__rpfree__ and the other malloc style replacement functions. Remember all allocations are 16-byte aligned, so there is no need to call the explicit rpmemalign/rpaligned_alloc/rpposix_memalign functions unless you need greater alignment; they are simply wrappers to make it easier to replace in existing code.
+
+If you wish to override the standard library malloc family of functions and have automatic initialization/finalization of process and threads, define __ENABLE_OVERRIDE__ to non-zero, which will include the `malloc.c` file in compilation of __rpmalloc.c__, and then rebuild the library or your project where you added the rpmalloc source. If you compile rpmalloc as a separate library you must make the linker use the override symbols from the library by referencing at least one symbol. The easiest way is to simply include `rpmalloc.h` in at least one source file and call `rpmalloc_linker_reference` somewhere - it's a dummy empty function. On Windows platforms and C++ overrides you have to `#include <rpnew.h>` in at least one source file and also manually handle the initialize/finalize of the process and all threads.
The list of libc entry points replaced may not be complete; use libc/stdc++ replacement only as a convenience for testing the library on an existing code base, not as a final solution.
+
+For explicit first class heaps, see the __rpmalloc_heap_*__ API under the [first class heaps](#first-class-heaps) section, requiring __RPMALLOC_FIRST_CLASS_HEAPS__ to be defined to 1.
+
+# Building
+To compile as a static library run the configure python script, which generates a Ninja build script, then build using ninja. The ninja build produces two static libraries, one named `rpmalloc` and one named `rpmallocwrap`, where the latter includes the libc entry point overrides.
+
+The configure + ninja build also produces two shared object/dynamic libraries. The `rpmallocwrap` shared library can be used with LD_PRELOAD/DYLD_INSERT_LIBRARIES to inject into a preexisting binary, replacing any malloc/free family of function calls. This is only implemented for Linux and macOS targets. The list of libc entry points replaced may not be complete; use preloading as a convenience for testing the library on an existing binary, not as a final solution. The dynamic library also provides automatic init/fini of process and threads for all platforms.
+
+The latest stable release is available in the master branch. For the latest development code, use the develop branch.
+
+# Cache configuration options
+Free memory pages are cached both per thread and in a global cache for all threads. The size of the thread caches is determined by an adaptive scheme where each cache is limited by a percentage of the maximum allocation count of the corresponding size class. The size of the global caches is determined by a multiple of the maximum of all thread caches. The factors controlling the cache sizes can be set by editing the individual defines in the `rpmalloc.c` source file for fine-tuned control.
+
+__ENABLE_UNLIMITED_CACHE__: By default defined to 0, set to 1 to make all caches infinite, i.e. never release spans to the global cache unless the thread finishes and never unmap memory pages back to the OS. Highest performance but largest memory overhead.
+
+__ENABLE_UNLIMITED_GLOBAL_CACHE__: By default defined to 0, set to 1 to make global caches infinite, i.e. never unmap memory pages back to the OS.
+
+__ENABLE_UNLIMITED_THREAD_CACHE__: By default defined to 0, set to 1 to make thread caches infinite, i.e. never release spans to the global cache unless the thread finishes.
+
+__ENABLE_GLOBAL_CACHE__: By default defined to 1, enables the global cache shared between all threads. Set to 0 to disable the global cache and directly unmap pages evicted from the thread cache.
+
+__ENABLE_THREAD_CACHE__: By default defined to 1, enables the per-thread cache. Set to 0 to disable the thread cache and directly unmap pages no longer in use (also disables the global cache).
+
+__ENABLE_ADAPTIVE_THREAD_CACHE__: Introduces a simple heuristic in the thread cache size, keeping 25% of the high water mark for each span count class.
+
+# Other configuration options
+Detailed statistics are available if __ENABLE_STATISTICS__ is defined to 1 (default is 0, or disabled), either on the compile command line or by setting the value in `rpmalloc.c`. This will cause a slight overhead in runtime to collect statistics for each memory operation, and will also add 4 bytes of overhead per allocation to track sizes.
+
+Integer safety checks on all calls are enabled if __ENABLE_VALIDATE_ARGS__ is defined to 1 (default is 0, or disabled), either on the compile command line or by setting the value in `rpmalloc.c`.
If enabled, size arguments to the global entry points are verified not to cause integer overflows in calculations.
+
+Asserts are enabled if __ENABLE_ASSERTS__ is defined to 1 (default is 0, or disabled), either on the compile command line or by setting the value in `rpmalloc.c`.
+
+To include __malloc.c__ in compilation and provide overrides of standard library malloc entry points, define __ENABLE_OVERRIDE__ to 1. To enable automatic initialization and finalization of process and threads in order to preload the library into executables using standard library malloc, define __ENABLE_PRELOAD__ to 1.
+
+To enable the runtime configurable memory page and span sizes, define __RPMALLOC_CONFIGURABLE__ to 1. By default, memory page size is determined by system APIs and memory span size is set to 64KiB.
+
+To enable support for first class heaps, define __RPMALLOC_FIRST_CLASS_HEAPS__ to 1. By default, the first class heap API is disabled.
+
+# Huge pages
+The allocator has support for huge/large pages on Windows, Linux and MacOS. To enable it, pass a non-zero value in the config value `enable_huge_pages` when initializing the allocator with `rpmalloc_initialize_config`. If the system does not support huge pages it will be automatically disabled. You can query the status by looking at `enable_huge_pages` in the config returned from a call to `rpmalloc_config` after initialization is done.
+
+# Quick overview
+The allocator is similar in spirit to tcmalloc from the [Google Performance Toolkit](https://github.com/gperftools/gperftools). It uses separate heaps for each thread and partitions memory blocks according to a preconfigured set of size classes, up to 2MiB. Larger blocks are mapped and unmapped directly. Allocations for different size classes will be served from different sets of memory pages; each "span" of pages is dedicated to one size class. Spans of pages can flow between threads when the thread cache overflows and are released to a global cache, or when the thread ends. Unlike tcmalloc, single blocks do not flow between threads, only entire spans of pages.
+
+# Implementation details
+The allocator is based on a fixed but configurable page alignment (defaults to 64KiB) and 16 byte block alignment, where all runs of memory pages (spans) are mapped to this alignment boundary. On Windows this is automatically guaranteed up to 64KiB by the VirtualAlloc granularity, and on mmap systems it is achieved by oversizing the mapping and aligning the returned virtual memory address to the required boundaries. By aligning to a fixed size the free operation can locate the header of the memory span without having to do a table lookup (as tcmalloc does) by simply masking out the low bits of the address (for 64KiB this would be the low 16 bits).
+
+Memory blocks are divided into three categories. For 64KiB span size/alignment the small blocks are [16, 1024] bytes, medium blocks (1024, 32256] bytes, and large blocks (32256, 2097120] bytes. The three categories are further divided into size classes. If the span size is changed, the small block classes remain but medium blocks go from (1024, span size] bytes.
+
+Small blocks have a size class granularity of 16 bytes each in 64 buckets. Medium blocks have a granularity of 512 bytes, 61 buckets (default). Large blocks have the same granularity as the configured span size (default 64KiB). All allocations are fitted to these size class boundaries (an allocation of 36 bytes will allocate a block of 48 bytes).
Each small and medium size class has an associated span (meaning a contiguous set of memory pages) configuration describing how many pages the size class will allocate each time the cache is empty and a new allocation is requested.
+
+Spans for small and medium blocks are cached in four levels to avoid calls to map/unmap memory pages. The first level is a per thread single active span for each size class. The second level is a per thread list of partially free spans for each size class. The third level is a per thread list of free spans. The fourth level is a global list of free spans.
+
+Each span for a small and medium size class keeps track of how many blocks are allocated/free, as well as a list of which blocks are free for allocation. To avoid locks, each span is completely owned by the allocating thread, and all cross-thread deallocations will be deferred to the owner thread through a separate free list per span.
+
+Large blocks, or super spans, are cached in two levels. The first level is a per thread list of free super spans. The second level is a global list of free super spans.
+
+# Memory mapping
+By default the allocator uses OS APIs to map virtual memory pages as needed, either `VirtualAlloc` on Windows or `mmap` on POSIX systems. If you want to use your own custom memory mapping provider you can use __rpmalloc_initialize_config__ and pass function pointers to map and unmap virtual memory. These functions should reserve and free the requested number of bytes.
+
+The returned memory address from the memory map function MUST be aligned to the memory page size and the memory span size (whichever is larger), both of which are configurable. Either provide the page and span sizes during initialization using __rpmalloc_initialize_config__, or use __rpmalloc_config__ to find the required alignment, which is equal to the maximum of page and span size. The span size MUST be a power of two in the [4096, 262144] range, and be a multiple or divisor of the memory page size.
+
+Memory mapping requests are always done in multiples of the memory page size. You can specify a custom page size when initializing rpmalloc with __rpmalloc_initialize_config__, or pass 0 to let rpmalloc determine the system memory page size using OS APIs. The page size MUST be a power of two.
+
+To reduce system call overhead, memory spans are mapped in batches controlled by the `span_map_count` configuration variable (which defaults to the `DEFAULT_SPAN_MAP_COUNT` value if 0, which in turn is sized according to the cache configuration define, defaulting to 64). If the memory page size is larger than the span size, the number of spans to map in a single call will be adjusted to guarantee a multiple of the page size, and the spans will be kept mapped until the entire span range can be unmapped in one call (to avoid trying to unmap partial pages).
+
+On macOS and iOS mmap requests are tagged with tag 240 for easy identification with the vmmap tool.
+
+# Span breaking
+Super spans (spans a multiple > 1 of the span size) can be subdivided into smaller spans to fulfill a need to map a new span of memory. By default the allocator will greedily grab and break any larger span from the available caches before mapping new virtual memory. However, spans can currently not be glued together to form larger super spans again. Subspans can traverse the cache and be used by different threads individually.
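The fixed span alignment described under "Implementation details" above is what keeps span lookups cheap even as spans are broken up and migrate between threads: any block can find its owning span header by masking the low bits of its address. A minimal sketch, assuming the default 64KiB span size and a hypothetical `span_t` standing in for rpmalloc's internal span header type:

```c
#include <stdint.h>

#define SPAN_SIZE ((uintptr_t)64 * 1024) // assumed default span size/alignment

typedef struct span_t span_t; // placeholder for the real internal header

static span_t *span_of(const void *block) {
  // Spans are mapped on SPAN_SIZE boundaries, so clearing the low 16 bits
  // of the block address yields the start of the owning span.
  return (span_t *)((uintptr_t)block & ~(SPAN_SIZE - 1));
}
```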
+
+A span that is a subspan of a larger super span can be individually decommitted to reduce physical memory pressure when the span is evicted from caches and scheduled to be unmapped. The entire original super span will keep track of the subspans it is broken up into, and when the entire range is decommitted the super span will be unmapped. This allows platforms like Windows that require the entire virtual memory range that was mapped in a call to VirtualAlloc to be unmapped in one call to VirtualFree, while still decommitting individual pages in subspans (if the page size is smaller than the span size).
+
+If you use a custom memory map/unmap function you need to take this into account by looking at the `release` parameter given to the `memory_unmap` function. It is set to 0 when decommitting individual pages, and to the total super span byte size when finally releasing the entire super span memory range.
+
+# Memory fragmentation
+There is no memory fragmentation by the allocator in the sense that it will not leave unallocated and unusable "holes" in the memory pages by calls to allocate and free blocks of different sizes. This is due to the fact that the memory pages allocated for each size class are split up into perfectly aligned blocks which are not reused for a request of a different size. The block freed by a call to `rpfree` will always be immediately available for an allocation request within the same size class.
+
+However, there is memory fragmentation in the sense that a request for x bytes followed by a request of y bytes, where x and y are at least one size class apart, will return blocks that are at least one memory page apart in virtual address space. Only blocks of the same size will potentially be within the same memory page span.
+
+rpmalloc keeps an "active span" and free list for each size class. This means back-to-back allocations will most likely be served from within the same span of memory pages (unless the span runs out of free blocks). The rpmalloc implementation will also use any "holes" in memory pages in semi-filled spans before using a completely free span.
+
+# First class heaps
+rpmalloc provides a first class heap type with an explicit heap control API. Heaps are maintained with calls to __rpmalloc_heap_acquire__ and __rpmalloc_heap_release__ and allocations/frees are done with __rpmalloc_heap_alloc__ and __rpmalloc_heap_free__. See the `rpmalloc.h` documentation for the full list of functions in the heap API. The main use case of explicit heap control is to scope allocations in a heap and release everything with a single call to __rpmalloc_heap_free_all__ without having to maintain ownership of memory blocks. Note that the heap API is not thread-safe; the caller must make sure that each heap is only used in a single thread at any given time.
+
+# Producer-consumer scenario
+Compared to some other allocators, rpmalloc does not suffer as much from a producer-consumer thread scenario where one thread allocates memory blocks and another thread frees the blocks. In some allocators the free blocks need to traverse both the thread cache of the thread doing the free operations as well as the global cache before being reused in the allocating thread. In rpmalloc the freed blocks will be reused as soon as the allocating thread needs to get new spans from the thread cache. This enables faster release of completely freed memory pages as blocks in a memory page will not be aliased between different owning threads.
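A hedged illustration of that scenario with POSIX threads: one thread allocates, another frees, and the cross-thread `rpfree` calls are deferred back to the owning heap as described above. The `release_caches` argument to __rpmalloc_thread_finalize__ is an assumption based on recent rpmalloc versions, and the two threads run one after the other purely to keep the sketch free of extra synchronization.

```c
#include <pthread.h>
#include "rpmalloc.h"

#define NBLOCKS 1024
static void *blocks[NBLOCKS];

static void *producer(void *arg) {
  rpmalloc_thread_initialize(); // set up this thread's heap
  for (int i = 0; i < NBLOCKS; i++)
    blocks[i] = rpmalloc(256);
  rpmalloc_thread_finalize(1); // 1 = release this thread's caches (assumed flag)
  return arg;
}

static void *consumer(void *arg) {
  rpmalloc_thread_initialize();
  for (int i = 0; i < NBLOCKS; i++)
    rpfree(blocks[i]); // cross-thread free: deferred to the owning thread's span
  rpmalloc_thread_finalize(1);
  return arg;
}

int main(void) {
  pthread_t t;
  rpmalloc_initialize();
  pthread_create(&t, 0, producer, 0);
  pthread_join(t, 0);
  pthread_create(&t, 0, consumer, 0);
  pthread_join(t, 0);
  rpmalloc_finalize();
  return 0;
}
```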
+ +# Best case scenarios +Threads that keep ownership of allocated memory blocks within the thread and free the blocks from the same thread will have optimal performance. + +Threads that have allocation patterns where the difference between the memory usage high and low water marks fits within the thread cache thresholds in the allocator will never touch the global cache except during thread init/fini, and will have optimal performance. Tweaking the cache limits can be done on a per-size-class basis. + +# Worst case scenarios +Since each thread cache maps spans of memory pages per size class, a thread that allocates just a few blocks of each size class (16, 32, ...) for many size classes will never fill each bucket, and thus map a lot of memory pages while only using a small fraction of the mapped memory. However, the wasted memory will always be less than 4KiB (or the configured memory page size) per size class, as each span is initialized one memory page at a time. The cache for free spans will be reused by all size classes. + +Threads that perform a lot of allocations and deallocations in a pattern that has a large difference between high and low water marks, where that difference is larger than the thread cache size, will put a lot of contention on the global cache. The thread cache will overflow at each low water mark, causing spans to be released to the global cache, then underflow at the high water mark, causing spans to be re-acquired from the global cache. This can be mitigated by changing the __MAX_SPAN_CACHE_DIVISOR__ define in the source code (at the cost of higher average memory overhead). + +# Caveats +VirtualAlloc has an internal granularity of 64KiB. However, mmap lacks this granularity control, and the implementation instead oversizes the memory mapping with the configured span size to be able to always return a memory area with the required alignment. Since the extra memory pages are never touched, this will not result in extra committed physical memory pages, but only increases the virtual memory address space used. + +All entry points assume the passed values are valid; for example, passing an invalid pointer to free would most likely result in a segmentation fault. __The library does not try to guard against errors!__ + +To support global scope data doing dynamic allocation/deallocation, such as C++ objects with custom constructors and destructors, the call to __rpmalloc_finalize__ will not completely terminate the allocator but rather empty all caches and put the allocator in finalization mode. Once this call has been made, the allocator is no longer thread safe and expects all remaining calls to originate from global data destruction on the main thread. Any spans or heaps that become free during this phase will be immediately unmapped, to allow correct teardown of the process or dynamic library without any leaks. + +# Other languages + +[Johan Andersson](https://github.com/repi) at Embark has created a Rust wrapper available at [rpmalloc-rs](https://github.com/EmbarkStudios/rpmalloc-rs) + +[Stas Denisov](https://github.com/nxrighthere) has created a C# wrapper available at [Rpmalloc-CSharp](https://github.com/nxrighthere/Rpmalloc-CSharp) + +# License + +This is free and unencumbered software released into the public domain. + +Anyone is free to copy, modify, publish, use, compile, sell, or +distribute this software, either in source code form or as a compiled +binary, for any purpose, commercial or non-commercial, and by any +means.
+ +In jurisdictions that recognize copyright laws, the author or authors +of this software dedicate any and all copyright interest in the +software to the public domain. We make this dedication for the benefit +of the public at large and to the detriment of our heirs and +successors. We intend this dedication to be an overt act of +relinquishment in perpetuity of all present and future rights to this +software under copyright law. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, +EXPRESS OR IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF +MERCHANTABILITY, FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. +IN NO EVENT SHALL THE AUTHORS BE LIABLE FOR ANY CLAIM, DAMAGES OR +OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, +ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR +OTHER DEALINGS IN THE SOFTWARE. + +For more information, please refer to <http://unlicense.org> + + +You can also use this software under the MIT license if public domain is +not recognized in your country + + +The MIT License (MIT) + +Copyright (c) 2017 Mattias Jansson + +Permission is hereby granted, free of charge, to any person obtaining a copy +of this software and associated documentation files (the "Software"), to deal +in the Software without restriction, including without limitation the rights +to use, copy, modify, merge, publish, distribute, sublicense, and/or sell +copies of the Software, and to permit persons to whom the Software is +furnished to do so, subject to the following conditions: + +The above copyright notice and this permission notice shall be included in +all copies or substantial portions of the Software. + +THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR +IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, +FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE +AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER +LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, +OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN +THE SOFTWARE. diff --git a/llvm/lib/Support/rpmalloc/malloc.c b/llvm/lib/Support/rpmalloc/malloc.c index 3fcfe848250c..59e13aab3ef7 100644 --- a/llvm/lib/Support/rpmalloc/malloc.c +++ b/llvm/lib/Support/rpmalloc/malloc.c @@ -1,724 +1,724 @@ -//===------------------------ malloc.c ------------------*- C -*-=============// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This library provides a cross-platform lock free thread caching malloc -// implementation in C11.
-// -// -// This file provides overrides for the standard library malloc entry points for -// C and new/delete operators for C++ It also provides automatic -// initialization/finalization of process and threads -// -//===----------------------------------------------------------------------===// - -#if defined(__TINYC__) -#include -#endif - -#ifndef ARCH_64BIT -#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64) -#define ARCH_64BIT 1 -_Static_assert(sizeof(size_t) == 8, "Data type size mismatch"); -_Static_assert(sizeof(void *) == 8, "Data type size mismatch"); -#else -#define ARCH_64BIT 0 -_Static_assert(sizeof(size_t) == 4, "Data type size mismatch"); -_Static_assert(sizeof(void *) == 4, "Data type size mismatch"); -#endif -#endif - -#if (defined(__GNUC__) || defined(__clang__)) -#pragma GCC visibility push(default) -#endif - -#define USE_IMPLEMENT 1 -#define USE_INTERPOSE 0 -#define USE_ALIAS 0 - -#if defined(__APPLE__) -#undef USE_INTERPOSE -#define USE_INTERPOSE 1 - -typedef struct interpose_t { - void *new_func; - void *orig_func; -} interpose_t; - -#define MAC_INTERPOSE_PAIR(newf, oldf) {(void *)newf, (void *)oldf} -#define MAC_INTERPOSE_SINGLE(newf, oldf) \ - __attribute__((used)) static const interpose_t macinterpose##newf##oldf \ - __attribute__((section("__DATA, __interpose"))) = \ - MAC_INTERPOSE_PAIR(newf, oldf) - -#endif - -#if !defined(_WIN32) && !defined(__APPLE__) -#undef USE_IMPLEMENT -#undef USE_ALIAS -#define USE_IMPLEMENT 0 -#define USE_ALIAS 1 -#endif - -#ifdef _MSC_VER -#pragma warning(disable : 4100) -#undef malloc -#undef free -#undef calloc -#define RPMALLOC_RESTRICT __declspec(restrict) -#else -#define RPMALLOC_RESTRICT -#endif - -#if ENABLE_OVERRIDE - -typedef struct rp_nothrow_t { - int __dummy; -} rp_nothrow_t; - -#if USE_IMPLEMENT - -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL malloc(size_t size) { - return rpmalloc(size); -} -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL calloc(size_t count, - size_t size) { - return rpcalloc(count, size); -} -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL realloc(void *ptr, - size_t size) { - return rprealloc(ptr, size); -} -extern inline void *RPMALLOC_CDECL reallocf(void *ptr, size_t size) { - return rprealloc(ptr, size); -} -extern inline void *RPMALLOC_CDECL aligned_alloc(size_t alignment, - size_t size) { - return rpaligned_alloc(alignment, size); -} -extern inline void *RPMALLOC_CDECL memalign(size_t alignment, size_t size) { - return rpmemalign(alignment, size); -} -extern inline int RPMALLOC_CDECL posix_memalign(void **memptr, size_t alignment, - size_t size) { - return rpposix_memalign(memptr, alignment, size); -} -extern inline void RPMALLOC_CDECL free(void *ptr) { rpfree(ptr); } -extern inline void RPMALLOC_CDECL cfree(void *ptr) { rpfree(ptr); } -extern inline size_t RPMALLOC_CDECL malloc_usable_size(void *ptr) { - return rpmalloc_usable_size(ptr); -} -extern inline size_t RPMALLOC_CDECL malloc_size(void *ptr) { - return rpmalloc_usable_size(ptr); -} - -#ifdef _WIN32 -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL _malloc_base(size_t size) { - return rpmalloc(size); -} -extern inline void RPMALLOC_CDECL _free_base(void *ptr) { rpfree(ptr); } -extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL _calloc_base(size_t count, - size_t size) { - return rpcalloc(count, size); -} -extern inline size_t RPMALLOC_CDECL _msize(void *ptr) { - return rpmalloc_usable_size(ptr); -} -extern inline size_t RPMALLOC_CDECL _msize_base(void *ptr) { - return rpmalloc_usable_size(ptr); -} -extern 
inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL -_realloc_base(void *ptr, size_t size) { - return rprealloc(ptr, size); -} -#endif - -#ifdef _WIN32 -// For Windows, #include in one source file to get the C++ operator -// overrides implemented in your module -#else -// Overload the C++ operators using the mangled names -// (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling) operators -// delete and delete[] -#define RPDEFVIS __attribute__((visibility("default"))) -extern void _ZdlPv(void *p); -void RPDEFVIS _ZdlPv(void *p) { rpfree(p); } -extern void _ZdaPv(void *p); -void RPDEFVIS _ZdaPv(void *p) { rpfree(p); } -#if ARCH_64BIT -// 64-bit operators new and new[], normal and aligned -extern void *_Znwm(uint64_t size); -void *RPDEFVIS _Znwm(uint64_t size) { return rpmalloc(size); } -extern void *_Znam(uint64_t size); -void *RPDEFVIS _Znam(uint64_t size) { return rpmalloc(size); } -extern void *_Znwmm(uint64_t size, uint64_t align); -void *RPDEFVIS _Znwmm(uint64_t size, uint64_t align) { - return rpaligned_alloc(align, size); -} -extern void *_Znamm(uint64_t size, uint64_t align); -void *RPDEFVIS _Znamm(uint64_t size, uint64_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnwmSt11align_val_t(uint64_t size, uint64_t align); -void *RPDEFVIS _ZnwmSt11align_val_t(uint64_t size, uint64_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnamSt11align_val_t(uint64_t size, uint64_t align); -void *RPDEFVIS _ZnamSt11align_val_t(uint64_t size, uint64_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); -void *RPDEFVIS _ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -extern void *_ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); -void *RPDEFVIS _ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -extern void *_ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, - rp_nothrow_t t); -void *RPDEFVIS _ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -extern void *_ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, - rp_nothrow_t t); -void *RPDEFVIS _ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -// 64-bit operators sized delete and delete[], normal and aligned -extern void _ZdlPvm(void *p, uint64_t size); -void RPDEFVIS _ZdlPvm(void *p, uint64_t size) { - rpfree(p); - (void)sizeof(size); -} -extern void _ZdaPvm(void *p, uint64_t size); -void RPDEFVIS _ZdaPvm(void *p, uint64_t size) { - rpfree(p); - (void)sizeof(size); -} -extern void _ZdlPvSt11align_val_t(void *p, uint64_t align); -void RPDEFVIS _ZdlPvSt11align_val_t(void *p, uint64_t align) { - rpfree(p); - (void)sizeof(align); -} -extern void _ZdaPvSt11align_val_t(void *p, uint64_t align); -void RPDEFVIS _ZdaPvSt11align_val_t(void *p, uint64_t align) { - rpfree(p); - (void)sizeof(align); -} -extern void _ZdlPvmSt11align_val_t(void *p, uint64_t size, uint64_t align); -void RPDEFVIS _ZdlPvmSt11align_val_t(void *p, uint64_t size, uint64_t align) { - rpfree(p); - (void)sizeof(size); - (void)sizeof(align); -} -extern void _ZdaPvmSt11align_val_t(void *p, uint64_t size, uint64_t align); -void RPDEFVIS _ZdaPvmSt11align_val_t(void *p, uint64_t size, uint64_t align) { - rpfree(p); - (void)sizeof(size); - (void)sizeof(align); -} -#else -// 
32-bit operators new and new[], normal and aligned -extern void *_Znwj(uint32_t size); -void *RPDEFVIS _Znwj(uint32_t size) { return rpmalloc(size); } -extern void *_Znaj(uint32_t size); -void *RPDEFVIS _Znaj(uint32_t size) { return rpmalloc(size); } -extern void *_Znwjj(uint32_t size, uint32_t align); -void *RPDEFVIS _Znwjj(uint32_t size, uint32_t align) { - return rpaligned_alloc(align, size); -} -extern void *_Znajj(uint32_t size, uint32_t align); -void *RPDEFVIS _Znajj(uint32_t size, uint32_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnwjSt11align_val_t(size_t size, size_t align); -void *RPDEFVIS _ZnwjSt11align_val_t(size_t size, size_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnajSt11align_val_t(size_t size, size_t align); -void *RPDEFVIS _ZnajSt11align_val_t(size_t size, size_t align) { - return rpaligned_alloc(align, size); -} -extern void *_ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t); -void *RPDEFVIS _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -extern void *_ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t); -void *RPDEFVIS _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -extern void *_ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, - rp_nothrow_t t); -void *RPDEFVIS _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -extern void *_ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, - rp_nothrow_t t); -void *RPDEFVIS _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -// 32-bit operators sized delete and delete[], normal and aligned -extern void _ZdlPvj(void *p, uint64_t size); -void RPDEFVIS _ZdlPvj(void *p, uint64_t size) { - rpfree(p); - (void)sizeof(size); -} -extern void _ZdaPvj(void *p, uint64_t size); -void RPDEFVIS _ZdaPvj(void *p, uint64_t size) { - rpfree(p); - (void)sizeof(size); -} -extern void _ZdlPvSt11align_val_t(void *p, uint32_t align); -void RPDEFVIS _ZdlPvSt11align_val_t(void *p, uint64_t a) { - rpfree(p); - (void)sizeof(align); -} -extern void _ZdaPvSt11align_val_t(void *p, uint32_t align); -void RPDEFVIS _ZdaPvSt11align_val_t(void *p, uint64_t a) { - rpfree(p); - (void)sizeof(align); -} -extern void _ZdlPvjSt11align_val_t(void *p, uint32_t size, uint32_t align); -void RPDEFVIS _ZdlPvjSt11align_val_t(void *p, uint64_t size, uint64_t align) { - rpfree(p); - (void)sizeof(size); - (void)sizeof(a); -} -extern void _ZdaPvjSt11align_val_t(void *p, uint32_t size, uint32_t align); -void RPDEFVIS _ZdaPvjSt11align_val_t(void *p, uint64_t size, uint64_t align) { - rpfree(p); - (void)sizeof(size); - (void)sizeof(a); -} -#endif -#endif -#endif - -#if USE_INTERPOSE || USE_ALIAS - -static void *rpmalloc_nothrow(size_t size, rp_nothrow_t t) { - (void)sizeof(t); - return rpmalloc(size); -} -static void *rpaligned_alloc_reverse(size_t size, size_t align) { - return rpaligned_alloc(align, size); -} -static void *rpaligned_alloc_reverse_nothrow(size_t size, size_t align, - rp_nothrow_t t) { - (void)sizeof(t); - return rpaligned_alloc(align, size); -} -static void rpfree_size(void *p, size_t size) { - (void)sizeof(size); - rpfree(p); -} -static void rpfree_aligned(void *p, size_t align) { - (void)sizeof(align); - rpfree(p); -} -static void rpfree_size_aligned(void *p, size_t size, size_t align) { - (void)sizeof(size); - 
(void)sizeof(align); - rpfree(p); -} - -#endif - -#if USE_INTERPOSE - -__attribute__((used)) static const interpose_t macinterpose_malloc[] - __attribute__((section("__DATA, __interpose"))) = { - // new and new[] - MAC_INTERPOSE_PAIR(rpmalloc, _Znwm), - MAC_INTERPOSE_PAIR(rpmalloc, _Znam), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znwmm), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znamm), - MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnwmRKSt9nothrow_t), - MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnamRKSt9nothrow_t), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnwmSt11align_val_t), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnamSt11align_val_t), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow, - _ZnwmSt11align_val_tRKSt9nothrow_t), - MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow, - _ZnamSt11align_val_tRKSt9nothrow_t), - // delete and delete[] - MAC_INTERPOSE_PAIR(rpfree, _ZdlPv), MAC_INTERPOSE_PAIR(rpfree, _ZdaPv), - MAC_INTERPOSE_PAIR(rpfree_size, _ZdlPvm), - MAC_INTERPOSE_PAIR(rpfree_size, _ZdaPvm), - MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdlPvSt11align_val_t), - MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdaPvSt11align_val_t), - MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdlPvmSt11align_val_t), - MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdaPvmSt11align_val_t), - // libc entry points - MAC_INTERPOSE_PAIR(rpmalloc, malloc), - MAC_INTERPOSE_PAIR(rpmalloc, calloc), - MAC_INTERPOSE_PAIR(rprealloc, realloc), - MAC_INTERPOSE_PAIR(rprealloc, reallocf), -#if defined(__MAC_10_15) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_15 - MAC_INTERPOSE_PAIR(rpaligned_alloc, aligned_alloc), -#endif - MAC_INTERPOSE_PAIR(rpmemalign, memalign), - MAC_INTERPOSE_PAIR(rpposix_memalign, posix_memalign), - MAC_INTERPOSE_PAIR(rpfree, free), MAC_INTERPOSE_PAIR(rpfree, cfree), - MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_usable_size), - MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_size)}; - -#endif - -#if USE_ALIAS - -#define RPALIAS(fn) __attribute__((alias(#fn), used, visibility("default"))); - -// Alias the C++ operators using the mangled names -// (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling) - -// operators delete and delete[] -void _ZdlPv(void *p) RPALIAS(rpfree) void _ZdaPv(void *p) RPALIAS(rpfree) - -#if ARCH_64BIT - // 64-bit operators new and new[], normal and aligned - void *_Znwm(uint64_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) - RPALIAS(rpmalloc) void *_Znam(uint64_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) RPALIAS(rpmalloc) void *_Znwmm(uint64_t size, - uint64_t align) - RPALIAS(rpaligned_alloc_reverse) void *_Znamm(uint64_t size, - uint64_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnwmSt11align_val_t( - size_t size, size_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnamSt11align_val_t( - size_t size, size_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnwmRKSt9nothrow_t( - size_t size, rp_nothrow_t t) - RPALIAS(rpmalloc_nothrow) void *_ZnamRKSt9nothrow_t( - size_t size, - rp_nothrow_t t) RPALIAS(rpmalloc_nothrow) void - *_ZnwmSt11align_val_tRKSt9nothrow_t(size_t size, - size_t align, - rp_nothrow_t t) - RPALIAS(rpaligned_alloc_reverse_nothrow) void - *_ZnamSt11align_val_tRKSt9nothrow_t( - size_t size, size_t align, - rp_nothrow_t t) - RPALIAS(rpaligned_alloc_reverse_nothrow) - // 64-bit operators delete and delete[], sized and aligned - void _ZdlPvm(void *p, size_t n) RPALIAS(rpfree_size) void _ZdaPvm(void *p, - size_t n) - RPALIAS(rpfree_size) void _ZdlPvSt11align_val_t(void *p, size_t a) - 
RPALIAS(rpfree_aligned) void _ZdaPvSt11align_val_t(void *p, - size_t a) - RPALIAS(rpfree_aligned) void _ZdlPvmSt11align_val_t(void *p, - size_t n, - size_t a) - RPALIAS(rpfree_size_aligned) void _ZdaPvmSt11align_val_t( - void *p, size_t n, size_t a) - RPALIAS(rpfree_size_aligned) -#else - // 32-bit operators new and new[], normal and aligned - void *_Znwj(uint32_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) - RPALIAS(rpmalloc) void *_Znaj(uint32_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) RPALIAS(rpmalloc) void *_Znwjj(uint32_t size, - uint32_t align) - RPALIAS(rpaligned_alloc_reverse) void *_Znajj(uint32_t size, - uint32_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnwjSt11align_val_t( - size_t size, size_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnajSt11align_val_t( - size_t size, size_t align) - RPALIAS(rpaligned_alloc_reverse) void *_ZnwjRKSt9nothrow_t( - size_t size, rp_nothrow_t t) - RPALIAS(rpmalloc_nothrow) void *_ZnajRKSt9nothrow_t( - size_t size, - rp_nothrow_t t) RPALIAS(rpmalloc_nothrow) void - *_ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, - size_t align, - rp_nothrow_t t) - RPALIAS(rpaligned_alloc_reverse_nothrow) void - *_ZnajSt11align_val_tRKSt9nothrow_t( - size_t size, size_t align, - rp_nothrow_t t) - RPALIAS(rpaligned_alloc_reverse_nothrow) - // 32-bit operators delete and delete[], sized and aligned - void _ZdlPvj(void *p, size_t n) RPALIAS(rpfree_size) void _ZdaPvj(void *p, - size_t n) - RPALIAS(rpfree_size) void _ZdlPvSt11align_val_t(void *p, size_t a) - RPALIAS(rpfree_aligned) void _ZdaPvSt11align_val_t(void *p, - size_t a) - RPALIAS(rpfree_aligned) void _ZdlPvjSt11align_val_t(void *p, - size_t n, - size_t a) - RPALIAS(rpfree_size_aligned) void _ZdaPvjSt11align_val_t( - void *p, size_t n, size_t a) - RPALIAS(rpfree_size_aligned) -#endif - - void *malloc(size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) - RPALIAS(rpmalloc) void *calloc(size_t count, size_t size) - RPALIAS(rpcalloc) void *realloc(void *ptr, size_t size) - RPALIAS(rprealloc) void *reallocf(void *ptr, size_t size) - RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2) - RPALIAS(rprealloc) void *aligned_alloc(size_t alignment, size_t size) - RPALIAS(rpaligned_alloc) void *memalign( - size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2) - RPALIAS(rpmemalign) int posix_memalign(void **memptr, size_t alignment, - size_t size) - RPALIAS(rpposix_memalign) void free(void *ptr) - RPALIAS(rpfree) void cfree(void *ptr) RPALIAS(rpfree) -#if defined(__ANDROID__) || defined(__FreeBSD__) - size_t - malloc_usable_size(const void *ptr) RPALIAS(rpmalloc_usable_size) -#else - size_t - malloc_usable_size(void *ptr) RPALIAS(rpmalloc_usable_size) -#endif - size_t malloc_size(void *ptr) RPALIAS(rpmalloc_usable_size) - -#endif - - static inline size_t _rpmalloc_page_size(void) { - return _memory_page_size; -} - -extern void *RPMALLOC_CDECL reallocarray(void *ptr, size_t count, size_t size); - -extern void *RPMALLOC_CDECL reallocarray(void *ptr, size_t count, size_t size) { - size_t total; -#if ENABLE_VALIDATE_ARGS -#ifdef _MSC_VER - int err = SizeTMult(count, size, &total); - if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#else - int err = __builtin_umull_overflow(count, size, &total); - if (err || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#endif -#else - total = count * size; -#endif - return realloc(ptr, total); -} - -extern inline void *RPMALLOC_CDECL 
valloc(size_t size) { - get_thread_heap(); - return rpaligned_alloc(_rpmalloc_page_size(), size); -} - -extern inline void *RPMALLOC_CDECL pvalloc(size_t size) { - get_thread_heap(); - const size_t page_size = _rpmalloc_page_size(); - const size_t aligned_size = ((size + page_size - 1) / page_size) * page_size; -#if ENABLE_VALIDATE_ARGS - if (aligned_size < size) { - errno = EINVAL; - return 0; - } -#endif - return rpaligned_alloc(_rpmalloc_page_size(), aligned_size); -} - -#endif // ENABLE_OVERRIDE - -#if ENABLE_PRELOAD - -#ifdef _WIN32 - -#if defined(BUILD_DYNAMIC_LINK) && BUILD_DYNAMIC_LINK - -extern __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE instance, - DWORD reason, LPVOID reserved); - -extern __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE instance, - DWORD reason, - LPVOID reserved) { - (void)sizeof(reserved); - (void)sizeof(instance); - if (reason == DLL_PROCESS_ATTACH) - rpmalloc_initialize(); - else if (reason == DLL_PROCESS_DETACH) - rpmalloc_finalize(); - else if (reason == DLL_THREAD_ATTACH) - rpmalloc_thread_initialize(); - else if (reason == DLL_THREAD_DETACH) - rpmalloc_thread_finalize(1); - return TRUE; -} - -// end BUILD_DYNAMIC_LINK -#else - -extern void _global_rpmalloc_init(void) { - rpmalloc_set_main_thread(); - rpmalloc_initialize(); -} - -#if defined(__clang__) || defined(__GNUC__) - -static void __attribute__((constructor)) initializer(void) { - _global_rpmalloc_init(); -} - -#elif defined(_MSC_VER) - -static int _global_rpmalloc_xib(void) { - _global_rpmalloc_init(); - return 0; -} - -#pragma section(".CRT$XIB", read) -__declspec(allocate(".CRT$XIB")) void (*_rpmalloc_module_init)(void) = - _global_rpmalloc_xib; -#if defined(_M_IX86) || defined(__i386__) -#pragma comment(linker, "/include:" \ - "__rpmalloc_module_init") -#else -#pragma comment(linker, "/include:" \ - "_rpmalloc_module_init") -#endif - -#endif - -// end !BUILD_DYNAMIC_LINK -#endif - -#else - -#include -#include -#include -#include - -extern void rpmalloc_set_main_thread(void); - -static pthread_key_t destructor_key; - -static void thread_destructor(void *); - -static void __attribute__((constructor)) initializer(void) { - rpmalloc_set_main_thread(); - rpmalloc_initialize(); - pthread_key_create(&destructor_key, thread_destructor); -} - -static void __attribute__((destructor)) finalizer(void) { rpmalloc_finalize(); } - -typedef struct { - void *(*real_start)(void *); - void *real_arg; -} thread_starter_arg; - -static void *thread_starter(void *argptr) { - thread_starter_arg *arg = argptr; - void *(*real_start)(void *) = arg->real_start; - void *real_arg = arg->real_arg; - rpmalloc_thread_initialize(); - rpfree(argptr); - pthread_setspecific(destructor_key, (void *)1); - return (*real_start)(real_arg); -} - -static void thread_destructor(void *value) { - (void)sizeof(value); - rpmalloc_thread_finalize(1); -} - -#ifdef __APPLE__ - -static int pthread_create_proxy(pthread_t *thread, const pthread_attr_t *attr, - void *(*start_routine)(void *), void *arg) { - rpmalloc_initialize(); - thread_starter_arg *starter_arg = rpmalloc(sizeof(thread_starter_arg)); - starter_arg->real_start = start_routine; - starter_arg->real_arg = arg; - return pthread_create(thread, attr, thread_starter, starter_arg); -} - -MAC_INTERPOSE_SINGLE(pthread_create_proxy, pthread_create); - -#else - -#include - -int pthread_create(pthread_t *thread, const pthread_attr_t *attr, - void *(*start_routine)(void *), void *arg) { -#if defined(__linux__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ - defined(__NetBSD__) 
|| defined(__DragonFly__) || defined(__APPLE__) || \ - defined(__HAIKU__) - char fname[] = "pthread_create"; -#else - char fname[] = "_pthread_create"; -#endif - void *real_pthread_create = dlsym(RTLD_NEXT, fname); - rpmalloc_thread_initialize(); - thread_starter_arg *starter_arg = rpmalloc(sizeof(thread_starter_arg)); - starter_arg->real_start = start_routine; - starter_arg->real_arg = arg; - return (*(int (*)(pthread_t *, const pthread_attr_t *, void *(*)(void *), - void *))real_pthread_create)(thread, attr, thread_starter, - starter_arg); -} - -#endif - -#endif - -#endif - -#if ENABLE_OVERRIDE - -#if defined(__GLIBC__) && defined(__linux__) - -void *__libc_malloc(size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(1) - RPALIAS(rpmalloc) void *__libc_calloc(size_t count, size_t size) - RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2) - RPALIAS(rpcalloc) void *__libc_realloc(void *p, size_t size) - RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2) RPALIAS(rprealloc) void __libc_free(void *p) - RPALIAS(rpfree) void __libc_cfree(void *p) - RPALIAS(rpfree) void *__libc_memalign(size_t align, size_t size) - RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2) - RPALIAS(rpmemalign) int __posix_memalign(void **p, size_t align, - size_t size) - RPALIAS(rpposix_memalign) - - extern void *__libc_valloc(size_t size); -extern void *__libc_pvalloc(size_t size); - -void *__libc_valloc(size_t size) { return valloc(size); } - -void *__libc_pvalloc(size_t size) { return pvalloc(size); } - -#endif - -#endif - -#if (defined(__GNUC__) || defined(__clang__)) -#pragma GCC visibility pop -#endif +//===------------------------ malloc.c ------------------*- C -*-=============// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This library provides a cross-platform lock free thread caching malloc +// implementation in C11. 
+// +// +// This file provides overrides for the standard library malloc entry points for +// C and new/delete operators for C++ It also provides automatic +// initialization/finalization of process and threads +// +//===----------------------------------------------------------------------===// + +#if defined(__TINYC__) +#include +#endif + +#ifndef ARCH_64BIT +#if defined(__LLP64__) || defined(__LP64__) || defined(_WIN64) +#define ARCH_64BIT 1 +_Static_assert(sizeof(size_t) == 8, "Data type size mismatch"); +_Static_assert(sizeof(void *) == 8, "Data type size mismatch"); +#else +#define ARCH_64BIT 0 +_Static_assert(sizeof(size_t) == 4, "Data type size mismatch"); +_Static_assert(sizeof(void *) == 4, "Data type size mismatch"); +#endif +#endif + +#if (defined(__GNUC__) || defined(__clang__)) +#pragma GCC visibility push(default) +#endif + +#define USE_IMPLEMENT 1 +#define USE_INTERPOSE 0 +#define USE_ALIAS 0 + +#if defined(__APPLE__) +#undef USE_INTERPOSE +#define USE_INTERPOSE 1 + +typedef struct interpose_t { + void *new_func; + void *orig_func; +} interpose_t; + +#define MAC_INTERPOSE_PAIR(newf, oldf) {(void *)newf, (void *)oldf} +#define MAC_INTERPOSE_SINGLE(newf, oldf) \ + __attribute__((used)) static const interpose_t macinterpose##newf##oldf \ + __attribute__((section("__DATA, __interpose"))) = \ + MAC_INTERPOSE_PAIR(newf, oldf) + +#endif + +#if !defined(_WIN32) && !defined(__APPLE__) +#undef USE_IMPLEMENT +#undef USE_ALIAS +#define USE_IMPLEMENT 0 +#define USE_ALIAS 1 +#endif + +#ifdef _MSC_VER +#pragma warning(disable : 4100) +#undef malloc +#undef free +#undef calloc +#define RPMALLOC_RESTRICT __declspec(restrict) +#else +#define RPMALLOC_RESTRICT +#endif + +#if ENABLE_OVERRIDE + +typedef struct rp_nothrow_t { + int __dummy; +} rp_nothrow_t; + +#if USE_IMPLEMENT + +extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL malloc(size_t size) { + return rpmalloc(size); +} +extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL calloc(size_t count, + size_t size) { + return rpcalloc(count, size); +} +extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL realloc(void *ptr, + size_t size) { + return rprealloc(ptr, size); +} +extern inline void *RPMALLOC_CDECL reallocf(void *ptr, size_t size) { + return rprealloc(ptr, size); +} +extern inline void *RPMALLOC_CDECL aligned_alloc(size_t alignment, + size_t size) { + return rpaligned_alloc(alignment, size); +} +extern inline void *RPMALLOC_CDECL memalign(size_t alignment, size_t size) { + return rpmemalign(alignment, size); +} +extern inline int RPMALLOC_CDECL posix_memalign(void **memptr, size_t alignment, + size_t size) { + return rpposix_memalign(memptr, alignment, size); +} +extern inline void RPMALLOC_CDECL free(void *ptr) { rpfree(ptr); } +extern inline void RPMALLOC_CDECL cfree(void *ptr) { rpfree(ptr); } +extern inline size_t RPMALLOC_CDECL malloc_usable_size(void *ptr) { + return rpmalloc_usable_size(ptr); +} +extern inline size_t RPMALLOC_CDECL malloc_size(void *ptr) { + return rpmalloc_usable_size(ptr); +} + +#ifdef _WIN32 +extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL _malloc_base(size_t size) { + return rpmalloc(size); +} +extern inline void RPMALLOC_CDECL _free_base(void *ptr) { rpfree(ptr); } +extern inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL _calloc_base(size_t count, + size_t size) { + return rpcalloc(count, size); +} +extern inline size_t RPMALLOC_CDECL _msize(void *ptr) { + return rpmalloc_usable_size(ptr); +} +extern inline size_t RPMALLOC_CDECL _msize_base(void *ptr) { + return rpmalloc_usable_size(ptr); +} +extern 
inline RPMALLOC_RESTRICT void *RPMALLOC_CDECL +_realloc_base(void *ptr, size_t size) { + return rprealloc(ptr, size); +} +#endif + +#ifdef _WIN32 +// For Windows, #include in one source file to get the C++ operator +// overrides implemented in your module +#else +// Overload the C++ operators using the mangled names +// (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling) operators +// delete and delete[] +#define RPDEFVIS __attribute__((visibility("default"))) +extern void _ZdlPv(void *p); +void RPDEFVIS _ZdlPv(void *p) { rpfree(p); } +extern void _ZdaPv(void *p); +void RPDEFVIS _ZdaPv(void *p) { rpfree(p); } +#if ARCH_64BIT +// 64-bit operators new and new[], normal and aligned +extern void *_Znwm(uint64_t size); +void *RPDEFVIS _Znwm(uint64_t size) { return rpmalloc(size); } +extern void *_Znam(uint64_t size); +void *RPDEFVIS _Znam(uint64_t size) { return rpmalloc(size); } +extern void *_Znwmm(uint64_t size, uint64_t align); +void *RPDEFVIS _Znwmm(uint64_t size, uint64_t align) { + return rpaligned_alloc(align, size); +} +extern void *_Znamm(uint64_t size, uint64_t align); +void *RPDEFVIS _Znamm(uint64_t size, uint64_t align) { + return rpaligned_alloc(align, size); +} +extern void *_ZnwmSt11align_val_t(uint64_t size, uint64_t align); +void *RPDEFVIS _ZnwmSt11align_val_t(uint64_t size, uint64_t align) { + return rpaligned_alloc(align, size); +} +extern void *_ZnamSt11align_val_t(uint64_t size, uint64_t align); +void *RPDEFVIS _ZnamSt11align_val_t(uint64_t size, uint64_t align) { + return rpaligned_alloc(align, size); +} +extern void *_ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); +void *RPDEFVIS _ZnwmRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { + (void)sizeof(t); + return rpmalloc(size); +} +extern void *_ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t); +void *RPDEFVIS _ZnamRKSt9nothrow_t(uint64_t size, rp_nothrow_t t) { + (void)sizeof(t); + return rpmalloc(size); +} +extern void *_ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, + rp_nothrow_t t); +void *RPDEFVIS _ZnwmSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, + rp_nothrow_t t) { + (void)sizeof(t); + return rpaligned_alloc(align, size); +} +extern void *_ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, + rp_nothrow_t t); +void *RPDEFVIS _ZnamSt11align_val_tRKSt9nothrow_t(uint64_t size, uint64_t align, + rp_nothrow_t t) { + (void)sizeof(t); + return rpaligned_alloc(align, size); +} +// 64-bit operators sized delete and delete[], normal and aligned +extern void _ZdlPvm(void *p, uint64_t size); +void RPDEFVIS _ZdlPvm(void *p, uint64_t size) { + rpfree(p); + (void)sizeof(size); +} +extern void _ZdaPvm(void *p, uint64_t size); +void RPDEFVIS _ZdaPvm(void *p, uint64_t size) { + rpfree(p); + (void)sizeof(size); +} +extern void _ZdlPvSt11align_val_t(void *p, uint64_t align); +void RPDEFVIS _ZdlPvSt11align_val_t(void *p, uint64_t align) { + rpfree(p); + (void)sizeof(align); +} +extern void _ZdaPvSt11align_val_t(void *p, uint64_t align); +void RPDEFVIS _ZdaPvSt11align_val_t(void *p, uint64_t align) { + rpfree(p); + (void)sizeof(align); +} +extern void _ZdlPvmSt11align_val_t(void *p, uint64_t size, uint64_t align); +void RPDEFVIS _ZdlPvmSt11align_val_t(void *p, uint64_t size, uint64_t align) { + rpfree(p); + (void)sizeof(size); + (void)sizeof(align); +} +extern void _ZdaPvmSt11align_val_t(void *p, uint64_t size, uint64_t align); +void RPDEFVIS _ZdaPvmSt11align_val_t(void *p, uint64_t size, uint64_t align) { + rpfree(p); + (void)sizeof(size); + (void)sizeof(align); +} +#else +// 
32-bit operators new and new[], normal and aligned +extern void *_Znwj(uint32_t size); +void *RPDEFVIS _Znwj(uint32_t size) { return rpmalloc(size); } +extern void *_Znaj(uint32_t size); +void *RPDEFVIS _Znaj(uint32_t size) { return rpmalloc(size); } +extern void *_Znwjj(uint32_t size, uint32_t align); +void *RPDEFVIS _Znwjj(uint32_t size, uint32_t align) { + return rpaligned_alloc(align, size); +} +extern void *_Znajj(uint32_t size, uint32_t align); +void *RPDEFVIS _Znajj(uint32_t size, uint32_t align) { + return rpaligned_alloc(align, size); +} +extern void *_ZnwjSt11align_val_t(size_t size, size_t align); +void *RPDEFVIS _ZnwjSt11align_val_t(size_t size, size_t align) { + return rpaligned_alloc(align, size); +} +extern void *_ZnajSt11align_val_t(size_t size, size_t align); +void *RPDEFVIS _ZnajSt11align_val_t(size_t size, size_t align) { + return rpaligned_alloc(align, size); +} +extern void *_ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t); +void *RPDEFVIS _ZnwjRKSt9nothrow_t(size_t size, rp_nothrow_t t) { + (void)sizeof(t); + return rpmalloc(size); +} +extern void *_ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t); +void *RPDEFVIS _ZnajRKSt9nothrow_t(size_t size, rp_nothrow_t t) { + (void)sizeof(t); + return rpmalloc(size); +} +extern void *_ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, + rp_nothrow_t t); +void *RPDEFVIS _ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, + rp_nothrow_t t) { + (void)sizeof(t); + return rpaligned_alloc(align, size); +} +extern void *_ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, + rp_nothrow_t t); +void *RPDEFVIS _ZnajSt11align_val_tRKSt9nothrow_t(size_t size, size_t align, + rp_nothrow_t t) { + (void)sizeof(t); + return rpaligned_alloc(align, size); +} +// 32-bit operators sized delete and delete[], normal and aligned +extern void _ZdlPvj(void *p, uint32_t size); +void RPDEFVIS _ZdlPvj(void *p, uint32_t size) { + rpfree(p); + (void)sizeof(size); +} +extern void _ZdaPvj(void *p, uint32_t size); +void RPDEFVIS _ZdaPvj(void *p, uint32_t size) { + rpfree(p); + (void)sizeof(size); +} +extern void _ZdlPvSt11align_val_t(void *p, uint32_t align); +void RPDEFVIS _ZdlPvSt11align_val_t(void *p, uint32_t align) { + rpfree(p); + (void)sizeof(align); +} +extern void _ZdaPvSt11align_val_t(void *p, uint32_t align); +void RPDEFVIS _ZdaPvSt11align_val_t(void *p, uint32_t align) { + rpfree(p); + (void)sizeof(align); +} +extern void _ZdlPvjSt11align_val_t(void *p, uint32_t size, uint32_t align); +void RPDEFVIS _ZdlPvjSt11align_val_t(void *p, uint32_t size, uint32_t align) { + rpfree(p); + (void)sizeof(size); + (void)sizeof(align); +} +extern void _ZdaPvjSt11align_val_t(void *p, uint32_t size, uint32_t align); +void RPDEFVIS _ZdaPvjSt11align_val_t(void *p, uint32_t size, uint32_t align) { + rpfree(p); + (void)sizeof(size); + (void)sizeof(align); +} +#endif +#endif +#endif + +#if USE_INTERPOSE || USE_ALIAS + +static void *rpmalloc_nothrow(size_t size, rp_nothrow_t t) { + (void)sizeof(t); + return rpmalloc(size); +} +static void *rpaligned_alloc_reverse(size_t size, size_t align) { + return rpaligned_alloc(align, size); +} +static void *rpaligned_alloc_reverse_nothrow(size_t size, size_t align, + rp_nothrow_t t) { + (void)sizeof(t); + return rpaligned_alloc(align, size); +} +static void rpfree_size(void *p, size_t size) { + (void)sizeof(size); + rpfree(p); +} +static void rpfree_aligned(void *p, size_t align) { + (void)sizeof(align); + rpfree(p); +} +static void rpfree_size_aligned(void *p, size_t size, size_t align) { + (void)sizeof(size); +
(void)sizeof(align); + rpfree(p); +} + +#endif + +#if USE_INTERPOSE + +__attribute__((used)) static const interpose_t macinterpose_malloc[] + __attribute__((section("__DATA, __interpose"))) = { + // new and new[] + MAC_INTERPOSE_PAIR(rpmalloc, _Znwm), + MAC_INTERPOSE_PAIR(rpmalloc, _Znam), + MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znwmm), + MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _Znamm), + MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnwmRKSt9nothrow_t), + MAC_INTERPOSE_PAIR(rpmalloc_nothrow, _ZnamRKSt9nothrow_t), + MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnwmSt11align_val_t), + MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse, _ZnamSt11align_val_t), + MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow, + _ZnwmSt11align_val_tRKSt9nothrow_t), + MAC_INTERPOSE_PAIR(rpaligned_alloc_reverse_nothrow, + _ZnamSt11align_val_tRKSt9nothrow_t), + // delete and delete[] + MAC_INTERPOSE_PAIR(rpfree, _ZdlPv), MAC_INTERPOSE_PAIR(rpfree, _ZdaPv), + MAC_INTERPOSE_PAIR(rpfree_size, _ZdlPvm), + MAC_INTERPOSE_PAIR(rpfree_size, _ZdaPvm), + MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdlPvSt11align_val_t), + MAC_INTERPOSE_PAIR(rpfree_aligned, _ZdaPvSt11align_val_t), + MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdlPvmSt11align_val_t), + MAC_INTERPOSE_PAIR(rpfree_size_aligned, _ZdaPvmSt11align_val_t), + // libc entry points + MAC_INTERPOSE_PAIR(rpmalloc, malloc), + MAC_INTERPOSE_PAIR(rpmalloc, calloc), + MAC_INTERPOSE_PAIR(rprealloc, realloc), + MAC_INTERPOSE_PAIR(rprealloc, reallocf), +#if defined(__MAC_10_15) && __MAC_OS_X_VERSION_MIN_REQUIRED >= __MAC_10_15 + MAC_INTERPOSE_PAIR(rpaligned_alloc, aligned_alloc), +#endif + MAC_INTERPOSE_PAIR(rpmemalign, memalign), + MAC_INTERPOSE_PAIR(rpposix_memalign, posix_memalign), + MAC_INTERPOSE_PAIR(rpfree, free), MAC_INTERPOSE_PAIR(rpfree, cfree), + MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_usable_size), + MAC_INTERPOSE_PAIR(rpmalloc_usable_size, malloc_size)}; + +#endif + +#if USE_ALIAS + +#define RPALIAS(fn) __attribute__((alias(#fn), used, visibility("default"))); + +// Alias the C++ operators using the mangled names +// (https://itanium-cxx-abi.github.io/cxx-abi/abi.html#mangling) + +// operators delete and delete[] +void _ZdlPv(void *p) RPALIAS(rpfree) void _ZdaPv(void *p) RPALIAS(rpfree) + +#if ARCH_64BIT + // 64-bit operators new and new[], normal and aligned + void *_Znwm(uint64_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(1) + RPALIAS(rpmalloc) void *_Znam(uint64_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(1) RPALIAS(rpmalloc) void *_Znwmm(uint64_t size, + uint64_t align) + RPALIAS(rpaligned_alloc_reverse) void *_Znamm(uint64_t size, + uint64_t align) + RPALIAS(rpaligned_alloc_reverse) void *_ZnwmSt11align_val_t( + size_t size, size_t align) + RPALIAS(rpaligned_alloc_reverse) void *_ZnamSt11align_val_t( + size_t size, size_t align) + RPALIAS(rpaligned_alloc_reverse) void *_ZnwmRKSt9nothrow_t( + size_t size, rp_nothrow_t t) + RPALIAS(rpmalloc_nothrow) void *_ZnamRKSt9nothrow_t( + size_t size, + rp_nothrow_t t) RPALIAS(rpmalloc_nothrow) void + *_ZnwmSt11align_val_tRKSt9nothrow_t(size_t size, + size_t align, + rp_nothrow_t t) + RPALIAS(rpaligned_alloc_reverse_nothrow) void + *_ZnamSt11align_val_tRKSt9nothrow_t( + size_t size, size_t align, + rp_nothrow_t t) + RPALIAS(rpaligned_alloc_reverse_nothrow) + // 64-bit operators delete and delete[], sized and aligned + void _ZdlPvm(void *p, size_t n) RPALIAS(rpfree_size) void _ZdaPvm(void *p, + size_t n) + RPALIAS(rpfree_size) void _ZdlPvSt11align_val_t(void *p, size_t a) + 
RPALIAS(rpfree_aligned) void _ZdaPvSt11align_val_t(void *p, + size_t a) + RPALIAS(rpfree_aligned) void _ZdlPvmSt11align_val_t(void *p, + size_t n, + size_t a) + RPALIAS(rpfree_size_aligned) void _ZdaPvmSt11align_val_t( + void *p, size_t n, size_t a) + RPALIAS(rpfree_size_aligned) +#else + // 32-bit operators new and new[], normal and aligned + void *_Znwj(uint32_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(1) + RPALIAS(rpmalloc) void *_Znaj(uint32_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(1) RPALIAS(rpmalloc) void *_Znwjj(uint32_t size, + uint32_t align) + RPALIAS(rpaligned_alloc_reverse) void *_Znajj(uint32_t size, + uint32_t align) + RPALIAS(rpaligned_alloc_reverse) void *_ZnwjSt11align_val_t( + size_t size, size_t align) + RPALIAS(rpaligned_alloc_reverse) void *_ZnajSt11align_val_t( + size_t size, size_t align) + RPALIAS(rpaligned_alloc_reverse) void *_ZnwjRKSt9nothrow_t( + size_t size, rp_nothrow_t t) + RPALIAS(rpmalloc_nothrow) void *_ZnajRKSt9nothrow_t( + size_t size, + rp_nothrow_t t) RPALIAS(rpmalloc_nothrow) void + *_ZnwjSt11align_val_tRKSt9nothrow_t(size_t size, + size_t align, + rp_nothrow_t t) + RPALIAS(rpaligned_alloc_reverse_nothrow) void + *_ZnajSt11align_val_tRKSt9nothrow_t( + size_t size, size_t align, + rp_nothrow_t t) + RPALIAS(rpaligned_alloc_reverse_nothrow) + // 32-bit operators delete and delete[], sized and aligned + void _ZdlPvj(void *p, size_t n) RPALIAS(rpfree_size) void _ZdaPvj(void *p, + size_t n) + RPALIAS(rpfree_size) void _ZdlPvSt11align_val_t(void *p, size_t a) + RPALIAS(rpfree_aligned) void _ZdaPvSt11align_val_t(void *p, + size_t a) + RPALIAS(rpfree_aligned) void _ZdlPvjSt11align_val_t(void *p, + size_t n, + size_t a) + RPALIAS(rpfree_size_aligned) void _ZdaPvjSt11align_val_t( + void *p, size_t n, size_t a) + RPALIAS(rpfree_size_aligned) +#endif + + void *malloc(size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(1) + RPALIAS(rpmalloc) void *calloc(size_t count, size_t size) + RPALIAS(rpcalloc) void *realloc(void *ptr, size_t size) + RPALIAS(rprealloc) void *reallocf(void *ptr, size_t size) + RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2) + RPALIAS(rprealloc) void *aligned_alloc(size_t alignment, size_t size) + RPALIAS(rpaligned_alloc) void *memalign( + size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2) + RPALIAS(rpmemalign) int posix_memalign(void **memptr, size_t alignment, + size_t size) + RPALIAS(rpposix_memalign) void free(void *ptr) + RPALIAS(rpfree) void cfree(void *ptr) RPALIAS(rpfree) +#if defined(__ANDROID__) || defined(__FreeBSD__) + size_t + malloc_usable_size(const void *ptr) RPALIAS(rpmalloc_usable_size) +#else + size_t + malloc_usable_size(void *ptr) RPALIAS(rpmalloc_usable_size) +#endif + size_t malloc_size(void *ptr) RPALIAS(rpmalloc_usable_size) + +#endif + + static inline size_t _rpmalloc_page_size(void) { + return _memory_page_size; +} + +extern void *RPMALLOC_CDECL reallocarray(void *ptr, size_t count, size_t size); + +extern void *RPMALLOC_CDECL reallocarray(void *ptr, size_t count, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#ifdef _MSC_VER + int err = SizeTMult(count, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(count, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = count * size; +#endif + return realloc(ptr, total); +} + +extern inline void *RPMALLOC_CDECL 
valloc(size_t size) { + get_thread_heap(); + return rpaligned_alloc(_rpmalloc_page_size(), size); +} + +extern inline void *RPMALLOC_CDECL pvalloc(size_t size) { + get_thread_heap(); + const size_t page_size = _rpmalloc_page_size(); + const size_t aligned_size = ((size + page_size - 1) / page_size) * page_size; +#if ENABLE_VALIDATE_ARGS + if (aligned_size < size) { + errno = EINVAL; + return 0; + } +#endif + return rpaligned_alloc(_rpmalloc_page_size(), aligned_size); +} + +#endif // ENABLE_OVERRIDE + +#if ENABLE_PRELOAD + +#ifdef _WIN32 + +#if defined(BUILD_DYNAMIC_LINK) && BUILD_DYNAMIC_LINK + +extern __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE instance, + DWORD reason, LPVOID reserved); + +extern __declspec(dllexport) BOOL WINAPI DllMain(HINSTANCE instance, + DWORD reason, + LPVOID reserved) { + (void)sizeof(reserved); + (void)sizeof(instance); + if (reason == DLL_PROCESS_ATTACH) + rpmalloc_initialize(); + else if (reason == DLL_PROCESS_DETACH) + rpmalloc_finalize(); + else if (reason == DLL_THREAD_ATTACH) + rpmalloc_thread_initialize(); + else if (reason == DLL_THREAD_DETACH) + rpmalloc_thread_finalize(1); + return TRUE; +} + +// end BUILD_DYNAMIC_LINK +#else + +extern void _global_rpmalloc_init(void) { + rpmalloc_set_main_thread(); + rpmalloc_initialize(); +} + +#if defined(__clang__) || defined(__GNUC__) + +static void __attribute__((constructor)) initializer(void) { + _global_rpmalloc_init(); +} + +#elif defined(_MSC_VER) + +static int _global_rpmalloc_xib(void) { + _global_rpmalloc_init(); + return 0; +} + +#pragma section(".CRT$XIB", read) +__declspec(allocate(".CRT$XIB")) void (*_rpmalloc_module_init)(void) = + _global_rpmalloc_xib; +#if defined(_M_IX86) || defined(__i386__) +#pragma comment(linker, "/include:" \ + "__rpmalloc_module_init") +#else +#pragma comment(linker, "/include:" \ + "_rpmalloc_module_init") +#endif + +#endif + +// end !BUILD_DYNAMIC_LINK +#endif + +#else + +#include +#include +#include +#include + +extern void rpmalloc_set_main_thread(void); + +static pthread_key_t destructor_key; + +static void thread_destructor(void *); + +static void __attribute__((constructor)) initializer(void) { + rpmalloc_set_main_thread(); + rpmalloc_initialize(); + pthread_key_create(&destructor_key, thread_destructor); +} + +static void __attribute__((destructor)) finalizer(void) { rpmalloc_finalize(); } + +typedef struct { + void *(*real_start)(void *); + void *real_arg; +} thread_starter_arg; + +static void *thread_starter(void *argptr) { + thread_starter_arg *arg = argptr; + void *(*real_start)(void *) = arg->real_start; + void *real_arg = arg->real_arg; + rpmalloc_thread_initialize(); + rpfree(argptr); + pthread_setspecific(destructor_key, (void *)1); + return (*real_start)(real_arg); +} + +static void thread_destructor(void *value) { + (void)sizeof(value); + rpmalloc_thread_finalize(1); +} + +#ifdef __APPLE__ + +static int pthread_create_proxy(pthread_t *thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) { + rpmalloc_initialize(); + thread_starter_arg *starter_arg = rpmalloc(sizeof(thread_starter_arg)); + starter_arg->real_start = start_routine; + starter_arg->real_arg = arg; + return pthread_create(thread, attr, thread_starter, starter_arg); +} + +MAC_INTERPOSE_SINGLE(pthread_create_proxy, pthread_create); + +#else + +#include + +int pthread_create(pthread_t *thread, const pthread_attr_t *attr, + void *(*start_routine)(void *), void *arg) { +#if defined(__linux__) || defined(__FreeBSD__) || defined(__OpenBSD__) || \ + defined(__NetBSD__) 
|| defined(__DragonFly__) || defined(__APPLE__) || \ + defined(__HAIKU__) + char fname[] = "pthread_create"; +#else + char fname[] = "_pthread_create"; +#endif + void *real_pthread_create = dlsym(RTLD_NEXT, fname); + rpmalloc_thread_initialize(); + thread_starter_arg *starter_arg = rpmalloc(sizeof(thread_starter_arg)); + starter_arg->real_start = start_routine; + starter_arg->real_arg = arg; + return (*(int (*)(pthread_t *, const pthread_attr_t *, void *(*)(void *), + void *))real_pthread_create)(thread, attr, thread_starter, + starter_arg); +} + +#endif + +#endif + +#endif + +#if ENABLE_OVERRIDE + +#if defined(__GLIBC__) && defined(__linux__) + +void *__libc_malloc(size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(1) + RPALIAS(rpmalloc) void *__libc_calloc(size_t count, size_t size) + RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2) + RPALIAS(rpcalloc) void *__libc_realloc(void *p, size_t size) + RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2) RPALIAS(rprealloc) void __libc_free(void *p) + RPALIAS(rpfree) void __libc_cfree(void *p) + RPALIAS(rpfree) void *__libc_memalign(size_t align, size_t size) + RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2) + RPALIAS(rpmemalign) int __posix_memalign(void **p, size_t align, + size_t size) + RPALIAS(rpposix_memalign) + + extern void *__libc_valloc(size_t size); +extern void *__libc_pvalloc(size_t size); + +void *__libc_valloc(size_t size) { return valloc(size); } + +void *__libc_pvalloc(size_t size) { return pvalloc(size); } + +#endif + +#endif + +#if (defined(__GNUC__) || defined(__clang__)) +#pragma GCC visibility pop +#endif diff --git a/llvm/lib/Support/rpmalloc/rpmalloc.c b/llvm/lib/Support/rpmalloc/rpmalloc.c index a06d3cdb5b52..0976ec8ae6af 100644 --- a/llvm/lib/Support/rpmalloc/rpmalloc.c +++ b/llvm/lib/Support/rpmalloc/rpmalloc.c @@ -1,3992 +1,3992 @@ -//===---------------------- rpmalloc.c ------------------*- C -*-=============// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// -// -// This library provides a cross-platform lock free thread caching malloc -// implementation in C11. 
-// -//===----------------------------------------------------------------------===// - -#include "rpmalloc.h" - -//////////// -/// -/// Build time configurable limits -/// -////// - -#if defined(__clang__) -#pragma clang diagnostic ignored "-Wunused-macros" -#pragma clang diagnostic ignored "-Wunused-function" -#if __has_warning("-Wreserved-identifier") -#pragma clang diagnostic ignored "-Wreserved-identifier" -#endif -#if __has_warning("-Wstatic-in-inline") -#pragma clang diagnostic ignored "-Wstatic-in-inline" -#endif -#elif defined(__GNUC__) -#pragma GCC diagnostic ignored "-Wunused-macros" -#pragma GCC diagnostic ignored "-Wunused-function" -#endif - -#if !defined(__has_builtin) -#define __has_builtin(b) 0 -#endif - -#if defined(__GNUC__) || defined(__clang__) - -#if __has_builtin(__builtin_memcpy_inline) -#define _rpmalloc_memcpy_const(x, y, s) __builtin_memcpy_inline(x, y, s) -#else -#define _rpmalloc_memcpy_const(x, y, s) \ - do { \ - _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), \ - "len must be a constant integer"); \ - memcpy(x, y, s); \ - } while (0) -#endif - -#if __has_builtin(__builtin_memset_inline) -#define _rpmalloc_memset_const(x, y, s) __builtin_memset_inline(x, y, s) -#else -#define _rpmalloc_memset_const(x, y, s) \ - do { \ - _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), \ - "len must be a constant integer"); \ - memset(x, y, s); \ - } while (0) -#endif -#else -#define _rpmalloc_memcpy_const(x, y, s) memcpy(x, y, s) -#define _rpmalloc_memset_const(x, y, s) memset(x, y, s) -#endif - -#if __has_builtin(__builtin_assume) -#define rpmalloc_assume(cond) __builtin_assume(cond) -#elif defined(__GNUC__) -#define rpmalloc_assume(cond) \ - do { \ - if (!__builtin_expect(cond, 0)) \ - __builtin_unreachable(); \ - } while (0) -#elif defined(_MSC_VER) -#define rpmalloc_assume(cond) __assume(cond) -#else -#define rpmalloc_assume(cond) 0 -#endif - -#ifndef HEAP_ARRAY_SIZE -//! Size of heap hashmap -#define HEAP_ARRAY_SIZE 47 -#endif -#ifndef ENABLE_THREAD_CACHE -//! Enable per-thread cache -#define ENABLE_THREAD_CACHE 1 -#endif -#ifndef ENABLE_GLOBAL_CACHE -//! Enable global cache shared between all threads, requires thread cache -#define ENABLE_GLOBAL_CACHE 1 -#endif -#ifndef ENABLE_VALIDATE_ARGS -//! Enable validation of args to public entry points -#define ENABLE_VALIDATE_ARGS 0 -#endif -#ifndef ENABLE_STATISTICS -//! Enable statistics collection -#define ENABLE_STATISTICS 0 -#endif -#ifndef ENABLE_ASSERTS -//! Enable asserts -#define ENABLE_ASSERTS 0 -#endif -#ifndef ENABLE_OVERRIDE -//! Override standard library malloc/free and new/delete entry points -#define ENABLE_OVERRIDE 0 -#endif -#ifndef ENABLE_PRELOAD -//! Support preloading -#define ENABLE_PRELOAD 0 -#endif -#ifndef DISABLE_UNMAP -//! Disable unmapping memory pages (also enables unlimited cache) -#define DISABLE_UNMAP 0 -#endif -#ifndef ENABLE_UNLIMITED_CACHE -//! Enable unlimited global cache (no unmapping until finalization) -#define ENABLE_UNLIMITED_CACHE 0 -#endif -#ifndef ENABLE_ADAPTIVE_THREAD_CACHE -//! Enable adaptive thread cache size based on use heuristics -#define ENABLE_ADAPTIVE_THREAD_CACHE 0 -#endif -#ifndef DEFAULT_SPAN_MAP_COUNT -//! Default number of spans to map in call to map more virtual memory (default -//! values yield 4MiB here) -#define DEFAULT_SPAN_MAP_COUNT 64 -#endif -#ifndef GLOBAL_CACHE_MULTIPLIER -//! 
Multiplier for global cache -#define GLOBAL_CACHE_MULTIPLIER 8 -#endif - -#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE -#error Must use global cache if unmap is disabled -#endif - -#if DISABLE_UNMAP -#undef ENABLE_UNLIMITED_CACHE -#define ENABLE_UNLIMITED_CACHE 1 -#endif - -#if !ENABLE_GLOBAL_CACHE -#undef ENABLE_UNLIMITED_CACHE -#define ENABLE_UNLIMITED_CACHE 0 -#endif - -#if !ENABLE_THREAD_CACHE -#undef ENABLE_ADAPTIVE_THREAD_CACHE -#define ENABLE_ADAPTIVE_THREAD_CACHE 0 -#endif - -#if defined(_WIN32) || defined(__WIN32__) || defined(_WIN64) -#define PLATFORM_WINDOWS 1 -#define PLATFORM_POSIX 0 -#else -#define PLATFORM_WINDOWS 0 -#define PLATFORM_POSIX 1 -#endif - -/// Platform and arch specifics -#if defined(_MSC_VER) && !defined(__clang__) -#pragma warning(disable : 5105) -#ifndef FORCEINLINE -#define FORCEINLINE inline __forceinline -#endif -#define _Static_assert static_assert -#else -#ifndef FORCEINLINE -#define FORCEINLINE inline __attribute__((__always_inline__)) -#endif -#endif -#if PLATFORM_WINDOWS -#ifndef WIN32_LEAN_AND_MEAN -#define WIN32_LEAN_AND_MEAN -#endif -#include -#if ENABLE_VALIDATE_ARGS -#include -#endif -#else -#include -#include -#include -#include -#if defined(__linux__) || defined(__ANDROID__) -#include -#if !defined(PR_SET_VMA) -#define PR_SET_VMA 0x53564d41 -#define PR_SET_VMA_ANON_NAME 0 -#endif -#endif -#if defined(__APPLE__) -#include -#if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR -#include -#include -#endif -#include -#endif -#if defined(__HAIKU__) || defined(__TINYC__) -#include -#endif -#endif - -#include -#include -#include - -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) -#include -static DWORD fls_key; -#endif - -#if PLATFORM_POSIX -#include -#include -#ifdef __FreeBSD__ -#include -#define MAP_HUGETLB MAP_ALIGNED_SUPER -#ifndef PROT_MAX -#define PROT_MAX(f) 0 -#endif -#else -#define PROT_MAX(f) 0 -#endif -#ifdef __sun -extern int madvise(caddr_t, size_t, int); -#endif -#ifndef MAP_UNINITIALIZED -#define MAP_UNINITIALIZED 0 -#endif -#endif -#include - -#if ENABLE_ASSERTS -#undef NDEBUG -#if defined(_MSC_VER) && !defined(_DEBUG) -#define _DEBUG -#endif -#include -#define RPMALLOC_TOSTRING_M(x) #x -#define RPMALLOC_TOSTRING(x) RPMALLOC_TOSTRING_M(x) -#define rpmalloc_assert(truth, message) \ - do { \ - if (!(truth)) { \ - if (_memory_config.error_callback) { \ - _memory_config.error_callback(message " (" RPMALLOC_TOSTRING( \ - truth) ") at " __FILE__ ":" RPMALLOC_TOSTRING(__LINE__)); \ - } else { \ - assert((truth) && message); \ - } \ - } \ - } while (0) -#else -#define rpmalloc_assert(truth, message) \ - do { \ - } while (0) -#endif -#if ENABLE_STATISTICS -#include -#endif - -////// -/// -/// Atomic access abstraction (since MSVC does not do C11 yet) -/// -////// - -#if defined(_MSC_VER) && !defined(__clang__) - -typedef volatile long atomic32_t; -typedef volatile long long atomic64_t; -typedef volatile void *atomicptr_t; - -static FORCEINLINE int32_t atomic_load32(atomic32_t *src) { return *src; } -static FORCEINLINE void atomic_store32(atomic32_t *dst, int32_t val) { - *dst = val; -} -static FORCEINLINE int32_t atomic_incr32(atomic32_t *val) { - return (int32_t)InterlockedIncrement(val); -} -static FORCEINLINE int32_t atomic_decr32(atomic32_t *val) { - return (int32_t)InterlockedDecrement(val); -} -static FORCEINLINE int32_t atomic_add32(atomic32_t *val, int32_t add) { - return (int32_t)InterlockedExchangeAdd(val, add) + add; -} -static FORCEINLINE int atomic_cas32_acquire(atomic32_t *dst, int32_t val, - int32_t ref) { - 
-  return (InterlockedCompareExchange(dst, val, ref) == ref) ? 1 : 0;
-}
-static FORCEINLINE void atomic_store32_release(atomic32_t *dst, int32_t val) {
-  *dst = val;
-}
-static FORCEINLINE int64_t atomic_load64(atomic64_t *src) { return *src; }
-static FORCEINLINE int64_t atomic_add64(atomic64_t *val, int64_t add) {
-  return (int64_t)InterlockedExchangeAdd64(val, add) + add;
-}
-static FORCEINLINE void *atomic_load_ptr(atomicptr_t *src) {
-  return (void *)*src;
-}
-static FORCEINLINE void atomic_store_ptr(atomicptr_t *dst, void *val) {
-  *dst = val;
-}
-static FORCEINLINE void atomic_store_ptr_release(atomicptr_t *dst, void *val) {
-  *dst = val;
-}
-static FORCEINLINE void *atomic_exchange_ptr_acquire(atomicptr_t *dst,
-                                                     void *val) {
-  return (void *)InterlockedExchangePointer((void *volatile *)dst, val);
-}
-static FORCEINLINE int atomic_cas_ptr(atomicptr_t *dst, void *val, void *ref) {
-  return (InterlockedCompareExchangePointer((void *volatile *)dst, val, ref) ==
-          ref)
-             ? 1
-             : 0;
-}
-
-#define EXPECTED(x) (x)
-#define UNEXPECTED(x) (x)
-
-#else
-
-#include <stdatomic.h>
-
-typedef volatile _Atomic(int32_t) atomic32_t;
-typedef volatile _Atomic(int64_t) atomic64_t;
-typedef volatile _Atomic(void *) atomicptr_t;
-
-static FORCEINLINE int32_t atomic_load32(atomic32_t *src) {
-  return atomic_load_explicit(src, memory_order_relaxed);
-}
-static FORCEINLINE void atomic_store32(atomic32_t *dst, int32_t val) {
-  atomic_store_explicit(dst, val, memory_order_relaxed);
-}
-static FORCEINLINE int32_t atomic_incr32(atomic32_t *val) {
-  return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1;
-}
-static FORCEINLINE int32_t atomic_decr32(atomic32_t *val) {
-  return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1;
-}
-static FORCEINLINE int32_t atomic_add32(atomic32_t *val, int32_t add) {
-  return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add;
-}
-static FORCEINLINE int atomic_cas32_acquire(atomic32_t *dst, int32_t val,
-                                            int32_t ref) {
-  return atomic_compare_exchange_weak_explicit(
-      dst, &ref, val, memory_order_acquire, memory_order_relaxed);
-}
-static FORCEINLINE void atomic_store32_release(atomic32_t *dst, int32_t val) {
-  atomic_store_explicit(dst, val, memory_order_release);
-}
-static FORCEINLINE int64_t atomic_load64(atomic64_t *val) {
-  return atomic_load_explicit(val, memory_order_relaxed);
-}
-static FORCEINLINE int64_t atomic_add64(atomic64_t *val, int64_t add) {
-  return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add;
-}
-static FORCEINLINE void *atomic_load_ptr(atomicptr_t *src) {
-  return atomic_load_explicit(src, memory_order_relaxed);
-}
-static FORCEINLINE void atomic_store_ptr(atomicptr_t *dst, void *val) {
-  atomic_store_explicit(dst, val, memory_order_relaxed);
-}
-static FORCEINLINE void atomic_store_ptr_release(atomicptr_t *dst, void *val) {
-  atomic_store_explicit(dst, val, memory_order_release);
-}
-static FORCEINLINE void *atomic_exchange_ptr_acquire(atomicptr_t *dst,
-                                                     void *val) {
-  return atomic_exchange_explicit(dst, val, memory_order_acquire);
-}
-static FORCEINLINE int atomic_cas_ptr(atomicptr_t *dst, void *val, void *ref) {
-  return atomic_compare_exchange_weak_explicit(
-      dst, &ref, val, memory_order_relaxed, memory_order_relaxed);
-}
-
-#define EXPECTED(x) __builtin_expect((x), 1)
-#define UNEXPECTED(x) __builtin_expect((x), 0)
-
-#endif
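// (Editor's note: illustrative sketch, not part of the original file. The
// EXPECTED/UNEXPECTED hints defined above keep hot paths straight-line by
// telling the compiler which branch to lay out as fall-through; the
// allocator uses them below as in: if (EXPECTED(span != 0)) { ... }.)
static FORCEINLINE int example_branch_hint(void *block) {
  if (EXPECTED(block != 0))
    return 1; // common case: caller already holds a free block
  return 0;   // rare case: caller must refill from a span or cache
}

-////////////
-///
-/// Statistics related functions (evaluate to nothing when statistics not
-/// enabled)
-///
-//////
-
-#if ENABLE_STATISTICS
-#define _rpmalloc_stat_inc(counter)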
atomic_incr32(counter) -#define _rpmalloc_stat_dec(counter) atomic_decr32(counter) -#define _rpmalloc_stat_add(counter, value) \ - atomic_add32(counter, (int32_t)(value)) -#define _rpmalloc_stat_add64(counter, value) \ - atomic_add64(counter, (int64_t)(value)) -#define _rpmalloc_stat_add_peak(counter, value, peak) \ - do { \ - int32_t _cur_count = atomic_add32(counter, (int32_t)(value)); \ - if (_cur_count > (peak)) \ - peak = _cur_count; \ - } while (0) -#define _rpmalloc_stat_sub(counter, value) \ - atomic_add32(counter, -(int32_t)(value)) -#define _rpmalloc_stat_inc_alloc(heap, class_idx) \ - do { \ - int32_t alloc_current = \ - atomic_incr32(&heap->size_class_use[class_idx].alloc_current); \ - if (alloc_current > heap->size_class_use[class_idx].alloc_peak) \ - heap->size_class_use[class_idx].alloc_peak = alloc_current; \ - atomic_incr32(&heap->size_class_use[class_idx].alloc_total); \ - } while (0) -#define _rpmalloc_stat_inc_free(heap, class_idx) \ - do { \ - atomic_decr32(&heap->size_class_use[class_idx].alloc_current); \ - atomic_incr32(&heap->size_class_use[class_idx].free_total); \ - } while (0) -#else -#define _rpmalloc_stat_inc(counter) \ - do { \ - } while (0) -#define _rpmalloc_stat_dec(counter) \ - do { \ - } while (0) -#define _rpmalloc_stat_add(counter, value) \ - do { \ - } while (0) -#define _rpmalloc_stat_add64(counter, value) \ - do { \ - } while (0) -#define _rpmalloc_stat_add_peak(counter, value, peak) \ - do { \ - } while (0) -#define _rpmalloc_stat_sub(counter, value) \ - do { \ - } while (0) -#define _rpmalloc_stat_inc_alloc(heap, class_idx) \ - do { \ - } while (0) -#define _rpmalloc_stat_inc_free(heap, class_idx) \ - do { \ - } while (0) -#endif - -/// -/// Preconfigured limits and sizes -/// - -//! Granularity of a small allocation block (must be power of two) -#define SMALL_GRANULARITY 16 -//! Small granularity shift count -#define SMALL_GRANULARITY_SHIFT 4 -//! Number of small block size classes -#define SMALL_CLASS_COUNT 65 -//! Maximum size of a small block -#define SMALL_SIZE_LIMIT (SMALL_GRANULARITY * (SMALL_CLASS_COUNT - 1)) -//! Granularity of a medium allocation block -#define MEDIUM_GRANULARITY 512 -//! Medium granularity shift count -#define MEDIUM_GRANULARITY_SHIFT 9 -//! Number of medium block size classes -#define MEDIUM_CLASS_COUNT 61 -//! Total number of small + medium size classes -#define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT) -//! Number of large block size classes -#define LARGE_CLASS_COUNT 63 -//! Maximum size of a medium block -#define MEDIUM_SIZE_LIMIT \ - (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT)) -//! Maximum size of a large block -#define LARGE_SIZE_LIMIT \ - ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE) -//! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power -//! of two) -#define SPAN_HEADER_SIZE 128 -//! Number of spans in thread cache -#define MAX_THREAD_SPAN_CACHE 400 -//! Number of spans to transfer between thread and global cache -#define THREAD_SPAN_CACHE_TRANSFER 64 -//! Number of spans in thread cache for large spans (must be greater than -//! LARGE_CLASS_COUNT / 2) -#define MAX_THREAD_SPAN_LARGE_CACHE 100 -//! 
Number of spans to transfer between thread and global cache for large spans
-#define THREAD_SPAN_LARGE_CACHE_TRANSFER 6
-
-_Static_assert((SMALL_GRANULARITY & (SMALL_GRANULARITY - 1)) == 0,
-               "Small granularity must be power of two");
-_Static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0,
-               "Span header size must be power of two");
-
-#if ENABLE_VALIDATE_ARGS
-//! Maximum allocation size to avoid integer overflow
-#undef MAX_ALLOC_SIZE
-#define MAX_ALLOC_SIZE (((size_t)-1) - _memory_span_size)
-#endif
-
-#define pointer_offset(ptr, ofs) (void *)((char *)(ptr) + (ptrdiff_t)(ofs))
-#define pointer_diff(first, second)                                           \
-  (ptrdiff_t)((const char *)(first) - (const char *)(second))
-
-#define INVALID_POINTER ((void *)((uintptr_t)-1))
-
-#define SIZE_CLASS_LARGE SIZE_CLASS_COUNT
-#define SIZE_CLASS_HUGE ((uint32_t)-1)
-
-////////////
-///
-/// Data types
-///
-//////
-
-//! A memory heap, per thread
-typedef struct heap_t heap_t;
-//! Span of memory pages
-typedef struct span_t span_t;
-//! Span list
-typedef struct span_list_t span_list_t;
-//! Span active data
-typedef struct span_active_t span_active_t;
-//! Size class definition
-typedef struct size_class_t size_class_t;
-//! Global cache
-typedef struct global_cache_t global_cache_t;
-
-//! Flag indicating span is the first (master) span of a split superspan
-#define SPAN_FLAG_MASTER 1U
-//! Flag indicating span is a secondary (sub) span of a split superspan
-#define SPAN_FLAG_SUBSPAN 2U
-//! Flag indicating span has blocks with increased alignment
-#define SPAN_FLAG_ALIGNED_BLOCKS 4U
-//! Flag indicating an unmapped master span
-#define SPAN_FLAG_UNMAPPED_MASTER 8U
-
-#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
-struct span_use_t {
-  //! Current number of spans used (actually used, not in cache)
-  atomic32_t current;
-  //! High water mark of spans used
-  atomic32_t high;
-#if ENABLE_STATISTICS
-  //! Number of spans in deferred list
-  atomic32_t spans_deferred;
-  //! Number of spans transitioned to global cache
-  atomic32_t spans_to_global;
-  //! Number of spans transitioned from global cache
-  atomic32_t spans_from_global;
-  //! Number of spans transitioned to thread cache
-  atomic32_t spans_to_cache;
-  //! Number of spans transitioned from thread cache
-  atomic32_t spans_from_cache;
-  //! Number of spans transitioned to reserved state
-  atomic32_t spans_to_reserved;
-  //! Number of spans transitioned from reserved state
-  atomic32_t spans_from_reserved;
-  //! Number of raw memory map calls
-  atomic32_t spans_map_calls;
-#endif
-};
-typedef struct span_use_t span_use_t;
-#endif
-
-#if ENABLE_STATISTICS
-struct size_class_use_t {
-  //! Current number of allocations
-  atomic32_t alloc_current;
-  //! Peak number of allocations
-  int32_t alloc_peak;
-  //! Total number of allocations
-  atomic32_t alloc_total;
-  //! Total number of frees
-  atomic32_t free_total;
-  //! Number of spans in use
-  atomic32_t spans_current;
-  //! Peak number of spans in use
-  int32_t spans_peak;
-  //! Number of spans transitioned to cache
-  atomic32_t spans_to_cache;
-  //! Number of spans transitioned from cache
-  atomic32_t spans_from_cache;
-  //! Number of spans transitioned from reserved state
-  atomic32_t spans_from_reserved;
-  //!
Number of spans mapped
-  atomic32_t spans_map_calls;
-  int32_t unused;
-};
-typedef struct size_class_use_t size_class_use_t;
-#endif
-
-// A span can either represent a single span of memory pages with size declared
-// by the span_map_count configuration variable, or a set of spans in a
-// continuous region, a super span. Any reference to the term "span" usually
-// refers to both a single span or a super span. A super span can further be
-// divided into multiple spans (or, again, super spans), where the first
-// (super)span is the master and subsequent (super)spans are subspans. The
-// master span keeps track of how many subspans are still alive and mapped in
-// virtual memory, and once all subspans and the master have been unmapped, the
-// entire superspan region is released and unmapped (on Windows for example,
-// the entire superspan range has to be released in the same call to release
-// the virtual memory range, but individual subranges can be decommitted
-// individually to reduce physical memory use).
-struct span_t {
-  //! Free list
-  void *free_list;
-  //! Total block count of size class
-  uint32_t block_count;
-  //! Size class
-  uint32_t size_class;
-  //! Index of last block initialized in free list
-  uint32_t free_list_limit;
-  //! Number of used blocks remaining when in partial state
-  uint32_t used_count;
-  //! Deferred free list
-  atomicptr_t free_list_deferred;
-  //! Size of deferred free list, or list of spans when part of a cache list
-  uint32_t list_size;
-  //! Size of a block
-  uint32_t block_size;
-  //! Flags and counters
-  uint32_t flags;
-  //! Number of spans
-  uint32_t span_count;
-  //! Total span counter for master spans
-  uint32_t total_spans;
-  //! Offset from master span for subspans
-  uint32_t offset_from_master;
-  //! Remaining span counter, for master spans
-  atomic32_t remaining_spans;
-  //! Alignment offset
-  uint32_t align_offset;
-  //! Owning heap
-  heap_t *heap;
-  //! Next span
-  span_t *next;
-  //! Previous span
-  span_t *prev;
-};
-_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch");
-
-struct span_cache_t {
-  size_t count;
-  span_t *span[MAX_THREAD_SPAN_CACHE];
-};
-typedef struct span_cache_t span_cache_t;
-
-struct span_large_cache_t {
-  size_t count;
-  span_t *span[MAX_THREAD_SPAN_LARGE_CACHE];
-};
-typedef struct span_large_cache_t span_large_cache_t;
-
-struct heap_size_class_t {
-  //! Free list of active span
-  void *free_list;
-  //! Double linked list of partially used spans with free blocks.
-  //  Previous span pointer in head points to tail span of list.
-  span_t *partial_span;
-  //! Early level cache of fully free spans
-  span_t *cache;
-};
-typedef struct heap_size_class_t heap_size_class_t;
-
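// (Editor's note: illustrative sketch, not part of the original file. The
// offset_from_master field above is what lets any subspan find its master
// span; this mirrors the arithmetic used by _rpmalloc_span_unmap further
// down. _memory_span_size is defined in the globals section just below.)
static span_t *example_span_master(span_t *subspan) {
  // offset_from_master counts whole spans, so scale by the span size and
  // walk backwards from the subspan to reach the master span header.
  return (span_t *)pointer_offset(
      subspan, -(intptr_t)((uintptr_t)subspan->offset_from_master *
                           _memory_span_size));
}

-// Control structure for a heap, either a thread heap or a first class heap if
-// enabled
-struct heap_t {
-  //! Owning thread ID
-  uintptr_t owner_thread;
-  //! Free lists for each size class
-  heap_size_class_t size_class[SIZE_CLASS_COUNT];
-#if ENABLE_THREAD_CACHE
-  //! Arrays of fully freed spans, single span
-  span_cache_t span_cache;
-#endif
-  //! List of deferred free spans (single linked list)
-  atomicptr_t span_free_deferred;
-  //! Number of full spans
-  size_t full_span_count;
-  //! Mapped but unused spans
-  span_t *span_reserve;
-  //! Master span for mapped but unused spans
-  span_t *span_reserve_master;
-  //! Number of mapped but unused spans
-  uint32_t spans_reserved;
-  //! Child count
-  atomic32_t child_count;
-  //! Next heap in id list
-  heap_t *next_heap;
-  //!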
Next heap in orphan list - heap_t *next_orphan; - //! Heap ID - int32_t id; - //! Finalization state flag - int finalize; - //! Master heap owning the memory pages - heap_t *master_heap; -#if ENABLE_THREAD_CACHE - //! Arrays of fully freed spans, large spans with > 1 span count - span_large_cache_t span_large_cache[LARGE_CLASS_COUNT - 1]; -#endif -#if RPMALLOC_FIRST_CLASS_HEAPS - //! Double linked list of fully utilized spans with free blocks for each size - //! class. - // Previous span pointer in head points to tail span of list. - span_t *full_span[SIZE_CLASS_COUNT]; - //! Double linked list of large and huge spans allocated by this heap - span_t *large_huge_span; -#endif -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - //! Current and high water mark of spans used per span count - span_use_t span_use[LARGE_CLASS_COUNT]; -#endif -#if ENABLE_STATISTICS - //! Allocation stats per size class - size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; - //! Number of bytes transitioned thread -> global - atomic64_t thread_to_global; - //! Number of bytes transitioned global -> thread - atomic64_t global_to_thread; -#endif -}; - -// Size class for defining a block size bucket -struct size_class_t { - //! Size of blocks in this class - uint32_t block_size; - //! Number of blocks in each chunk - uint16_t block_count; - //! Class index this class is merged with - uint16_t class_idx; -}; -_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); - -struct global_cache_t { - //! Cache lock - atomic32_t lock; - //! Cache count - uint32_t count; -#if ENABLE_STATISTICS - //! Insert count - size_t insert_count; - //! Extract count - size_t extract_count; -#endif - //! Cached spans - span_t *span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE]; - //! Unlimited cache overflow - span_t *overflow; -}; - -//////////// -/// -/// Global data -/// -////// - -//! Default span size (64KiB) -#define _memory_default_span_size (64 * 1024) -#define _memory_default_span_size_shift 16 -#define _memory_default_span_mask (~((uintptr_t)(_memory_span_size - 1))) - -//! Initialized flag -static int _rpmalloc_initialized; -//! Main thread ID -static uintptr_t _rpmalloc_main_thread_id; -//! Configuration -static rpmalloc_config_t _memory_config; -//! Memory page size -static size_t _memory_page_size; -//! Shift to divide by page size -static size_t _memory_page_size_shift; -//! Granularity at which memory pages are mapped by OS -static size_t _memory_map_granularity; -#if RPMALLOC_CONFIGURABLE -//! Size of a span of memory pages -static size_t _memory_span_size; -//! Shift to divide by span size -static size_t _memory_span_size_shift; -//! Mask to get to start of a memory span -static uintptr_t _memory_span_mask; -#else -//! Hardwired span size -#define _memory_span_size _memory_default_span_size -#define _memory_span_size_shift _memory_default_span_size_shift -#define _memory_span_mask _memory_default_span_mask -#endif -//! Number of spans to map in each map call -static size_t _memory_span_map_count; -//! Number of spans to keep reserved in each heap -static size_t _memory_heap_reserve_count; -//! Global size classes -static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; -//! Run-time size limit of medium blocks -static size_t _memory_medium_size_limit; -//! Heap ID counter -static atomic32_t _memory_heap_id; -//! Huge page support -static int _memory_huge_pages; -#if ENABLE_GLOBAL_CACHE -//! Global span cache -static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; -#endif -//! 
Global reserved spans -static span_t *_memory_global_reserve; -//! Global reserved count -static size_t _memory_global_reserve_count; -//! Global reserved master -static span_t *_memory_global_reserve_master; -//! All heaps -static heap_t *_memory_heaps[HEAP_ARRAY_SIZE]; -//! Used to restrict access to mapping memory for huge pages -static atomic32_t _memory_global_lock; -//! Orphaned heaps -static heap_t *_memory_orphan_heaps; -#if RPMALLOC_FIRST_CLASS_HEAPS -//! Orphaned heaps (first class heaps) -static heap_t *_memory_first_class_orphan_heaps; -#endif -#if ENABLE_STATISTICS -//! Allocations counter -static atomic64_t _allocation_counter; -//! Deallocations counter -static atomic64_t _deallocation_counter; -//! Active heap count -static atomic32_t _memory_active_heaps; -//! Number of currently mapped memory pages -static atomic32_t _mapped_pages; -//! Peak number of concurrently mapped memory pages -static int32_t _mapped_pages_peak; -//! Number of mapped master spans -static atomic32_t _master_spans; -//! Number of unmapped dangling master spans -static atomic32_t _unmapped_master_spans; -//! Running counter of total number of mapped memory pages since start -static atomic32_t _mapped_total; -//! Running counter of total number of unmapped memory pages since start -static atomic32_t _unmapped_total; -//! Number of currently mapped memory pages in OS calls -static atomic32_t _mapped_pages_os; -//! Number of currently allocated pages in huge allocations -static atomic32_t _huge_pages_current; -//! Peak number of currently allocated pages in huge allocations -static int32_t _huge_pages_peak; -#endif - -//////////// -/// -/// Thread local heap and ID -/// -////// - -//! Current thread heap -#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ - defined(__TINYC__) -static pthread_key_t _memory_thread_heap; -#else -#ifdef _MSC_VER -#define _Thread_local __declspec(thread) -#define TLS_MODEL -#else -#ifndef __HAIKU__ -#define TLS_MODEL __attribute__((tls_model("initial-exec"))) -#else -#define TLS_MODEL -#endif -#if !defined(__clang__) && defined(__GNUC__) -#define _Thread_local __thread -#endif -#endif -static _Thread_local heap_t *_memory_thread_heap TLS_MODEL; -#endif - -static inline heap_t *get_thread_heap_raw(void) { -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD - return pthread_getspecific(_memory_thread_heap); -#else - return _memory_thread_heap; -#endif -} - -//! Get the current thread heap -static inline heap_t *get_thread_heap(void) { - heap_t *heap = get_thread_heap_raw(); -#if ENABLE_PRELOAD - if (EXPECTED(heap != 0)) - return heap; - rpmalloc_initialize(); - return get_thread_heap_raw(); -#else - return heap; -#endif -} - -//! 
Fast thread ID -static inline uintptr_t get_thread_id(void) { -#if defined(_WIN32) - return (uintptr_t)((void *)NtCurrentTeb()); -#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__) - uintptr_t tid; -#if defined(__i386__) - __asm__("movl %%gs:0, %0" : "=r"(tid) : :); -#elif defined(__x86_64__) -#if defined(__MACH__) - __asm__("movq %%gs:0, %0" : "=r"(tid) : :); -#else - __asm__("movq %%fs:0, %0" : "=r"(tid) : :); -#endif -#elif defined(__arm__) - __asm__ volatile("mrc p15, 0, %0, c13, c0, 3" : "=r"(tid)); -#elif defined(__aarch64__) -#if defined(__MACH__) - // tpidr_el0 likely unused, always return 0 on iOS - __asm__ volatile("mrs %0, tpidrro_el0" : "=r"(tid)); -#else - __asm__ volatile("mrs %0, tpidr_el0" : "=r"(tid)); -#endif -#else -#error This platform needs implementation of get_thread_id() -#endif - return tid; -#else -#error This platform needs implementation of get_thread_id() -#endif -} - -//! Set the current thread heap -static void set_thread_heap(heap_t *heap) { -#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ - defined(__TINYC__) - pthread_setspecific(_memory_thread_heap, heap); -#else - _memory_thread_heap = heap; -#endif - if (heap) - heap->owner_thread = get_thread_id(); -} - -//! Set main thread ID -extern void rpmalloc_set_main_thread(void); - -void rpmalloc_set_main_thread(void) { - _rpmalloc_main_thread_id = get_thread_id(); -} - -static void _rpmalloc_spin(void) { -#if defined(_MSC_VER) -#if defined(_M_ARM64) - __yield(); -#else - _mm_pause(); -#endif -#elif defined(__x86_64__) || defined(__i386__) - __asm__ volatile("pause" ::: "memory"); -#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7) - __asm__ volatile("yield" ::: "memory"); -#elif defined(__powerpc__) || defined(__powerpc64__) - // No idea if ever been compiled in such archs but ... as precaution - __asm__ volatile("or 27,27,27"); -#elif defined(__sparc__) - __asm__ volatile("rd %ccr, %g0 \n\trd %ccr, %g0 \n\trd %ccr, %g0"); -#else - struct timespec ts = {0}; - nanosleep(&ts, 0); -#endif -} - -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) -static void NTAPI _rpmalloc_thread_destructor(void *value) { -#if ENABLE_OVERRIDE - // If this is called on main thread it means rpmalloc_finalize - // has not been called and shutdown is forced (through _exit) or unclean - if (get_thread_id() == _rpmalloc_main_thread_id) - return; -#endif - if (value) - rpmalloc_thread_finalize(1); -} -#endif - -//////////// -/// -/// Low level memory map/unmap -/// -////// - -static void _rpmalloc_set_name(void *address, size_t size) { -#if defined(__linux__) || defined(__ANDROID__) - const char *name = _memory_huge_pages ? _memory_config.huge_page_name - : _memory_config.page_name; - if (address == MAP_FAILED || !name) - return; - // If the kernel does not support CONFIG_ANON_VMA_NAME or if the call fails - // (e.g. invalid name) it is a no-op basically. - (void)prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (uintptr_t)address, size, - (uintptr_t)name); -#else - (void)sizeof(size); - (void)sizeof(address); -#endif -} - -//! 
Map more virtual memory
-// size is number of bytes to map
-// offset receives the offset in bytes from start of mapped region
-// returns address to start of mapped region to use
-static void *_rpmalloc_mmap(size_t size, size_t *offset) {
-  rpmalloc_assert(!(size % _memory_page_size), "Invalid mmap size");
-  rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size");
-  void *address = _memory_config.memory_map(size, offset);
-  if (EXPECTED(address != 0)) {
-    _rpmalloc_stat_add_peak(&_mapped_pages, (size >> _memory_page_size_shift),
-                            _mapped_pages_peak);
-    _rpmalloc_stat_add(&_mapped_total, (size >> _memory_page_size_shift));
-  }
-  return address;
-}
-
-//! Unmap virtual memory
-// address is the memory address to unmap, as returned from _memory_map
-// size is the number of bytes to unmap, which might be less than full region
-// for a partial unmap
-// offset is the offset in bytes to the actual mapped region, as set by
-// _memory_map
-// release is set to 0 for partial unmap, or size of entire range for a full
-// unmap
-static void _rpmalloc_unmap(void *address, size_t size, size_t offset,
-                            size_t release) {
-  rpmalloc_assert(!release || (release >= size), "Invalid unmap size");
-  rpmalloc_assert(!release || (release >= _memory_page_size),
-                  "Invalid unmap size");
-  if (release) {
-    rpmalloc_assert(!(release % _memory_page_size), "Invalid unmap size");
-    _rpmalloc_stat_sub(&_mapped_pages, (release >> _memory_page_size_shift));
-    _rpmalloc_stat_add(&_unmapped_total, (release >> _memory_page_size_shift));
-  }
-  _memory_config.memory_unmap(address, size, offset, release);
-}
-
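// (Editor's note: illustrative sketch, not part of the original file.
// Condensed restatement of the padding logic in _rpmalloc_mmap_os below:
// over-mapping by one span lets the result be bumped up to the next span
// boundary, and the skipped distance is stored divided by 8 so it fits the
// 32-bit align_offset field of the span header.)
static void *example_trim_to_span_boundary(void *ptr, size_t *offset) {
  size_t misalign = (size_t)((uintptr_t)ptr & ~_memory_span_mask);
  size_t skip = _memory_span_size - misalign; // bytes skipped to realign
  *offset = skip >> 3; // stored in eighths; _rpmalloc_unmap_os shifts it back
  return pointer_offset(ptr, skip);
}

-//! Default implementation to map new pages to virtual memory
-static void *_rpmalloc_mmap_os(size_t size, size_t *offset) {
-  // Either size is a heap (a single page) or a (multiple) span - we only need
-  // to align spans, and only if larger than map granularity
-  size_t padding = ((size >= _memory_span_size) &&
-                    (_memory_span_size > _memory_map_granularity))
-                       ? _memory_span_size
-                       : 0;
-  rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size");
-#if PLATFORM_WINDOWS
-  // Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not
-  // allocated unless/until the virtual addresses are actually accessed"
-  void *ptr = VirtualAlloc(0, size + padding,
-                           (_memory_huge_pages ? MEM_LARGE_PAGES : 0) |
-                               MEM_RESERVE | MEM_COMMIT,
-                           PAGE_READWRITE);
-  if (!ptr) {
-    if (_memory_config.map_fail_callback) {
-      if (_memory_config.map_fail_callback(size + padding))
-        return _rpmalloc_mmap_os(size, offset);
-    } else {
-      rpmalloc_assert(ptr, "Failed to map virtual memory block");
-    }
-    return 0;
-  }
-#else
-  int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED;
-#if defined(__APPLE__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
-  int fd = (int)VM_MAKE_TAG(240U);
-  if (_memory_huge_pages)
-    fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB;
-  void *ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0);
-#elif defined(MAP_HUGETLB)
-  void *ptr = mmap(0, size + padding,
-                   PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE),
-                   (_memory_huge_pages ?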
MAP_HUGETLB : 0) | flags, -1, 0);
-#if defined(MADV_HUGEPAGE)
-  // In some configurations huge page allocations might fail, so we fall back
-  // to normal allocations and promote the region to a transparent huge page
-  if ((ptr == MAP_FAILED || !ptr) && _memory_huge_pages) {
-    ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0);
-    if (ptr && ptr != MAP_FAILED) {
-      int prm = madvise(ptr, size + padding, MADV_HUGEPAGE);
-      (void)prm;
-      rpmalloc_assert((prm == 0), "Failed to promote the page to THP");
-    }
-  }
-#endif
-  _rpmalloc_set_name(ptr, size + padding);
-#elif defined(MAP_ALIGNED)
-  const size_t align =
-      (sizeof(size_t) * 8) - (size_t)(__builtin_clzl(size - 1));
-  void *ptr =
-      mmap(0, size + padding, PROT_READ | PROT_WRITE,
-           (_memory_huge_pages ? MAP_ALIGNED(align) : 0) | flags, -1, 0);
-#elif defined(MAP_ALIGN)
-  caddr_t base = (_memory_huge_pages ? (caddr_t)(4 << 20) : 0);
-  void *ptr = mmap(base, size + padding, PROT_READ | PROT_WRITE,
-                   (_memory_huge_pages ? MAP_ALIGN : 0) | flags, -1, 0);
-#else
-  void *ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0);
-#endif
-  if ((ptr == MAP_FAILED) || !ptr) {
-    if (_memory_config.map_fail_callback) {
-      if (_memory_config.map_fail_callback(size + padding))
-        return _rpmalloc_mmap_os(size, offset);
-    } else if (errno != ENOMEM) {
-      rpmalloc_assert((ptr != MAP_FAILED) && ptr,
-                      "Failed to map virtual memory block");
-    }
-    return 0;
-  }
-#endif
-  _rpmalloc_stat_add(&_mapped_pages_os,
-                     (int32_t)((size + padding) >> _memory_page_size_shift));
-  if (padding) {
-    size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask);
-    rpmalloc_assert(final_padding <= _memory_span_size,
-                    "Internal failure in padding");
-    rpmalloc_assert(final_padding <= padding, "Internal failure in padding");
-    rpmalloc_assert(!(final_padding % 8), "Internal failure in padding");
-    ptr = pointer_offset(ptr, final_padding);
-    *offset = final_padding >> 3;
-  }
-  rpmalloc_assert((size < _memory_span_size) ||
-                      !((uintptr_t)ptr & ~_memory_span_mask),
-                  "Internal failure in padding");
-  return ptr;
-}
-
-//! Default implementation to unmap pages from virtual memory
-static void _rpmalloc_unmap_os(void *address, size_t size, size_t offset,
-                               size_t release) {
-  rpmalloc_assert(release || (offset == 0), "Invalid unmap size");
-  rpmalloc_assert(!release || (release >= _memory_page_size),
-                  "Invalid unmap size");
-  rpmalloc_assert(size >= _memory_page_size, "Invalid unmap size");
-  if (release && offset) {
-    offset <<= 3;
-    address = pointer_offset(address, -(int32_t)offset);
-    if ((release >= _memory_span_size) &&
-        (_memory_span_size > _memory_map_granularity)) {
-      // Padding is always one span size
-      release += _memory_span_size;
-    }
-  }
-#if !DISABLE_UNMAP
-#if PLATFORM_WINDOWS
-  if (!VirtualFree(address, release ? 0 : size,
-                   release ?
MEM_RELEASE : MEM_DECOMMIT)) { - rpmalloc_assert(0, "Failed to unmap virtual memory block"); - } -#else - if (release) { - if (munmap(address, release)) { - rpmalloc_assert(0, "Failed to unmap virtual memory block"); - } - } else { -#if defined(MADV_FREE_REUSABLE) - int ret; - while ((ret = madvise(address, size, MADV_FREE_REUSABLE)) == -1 && - (errno == EAGAIN)) - errno = 0; - if ((ret == -1) && (errno != 0)) { -#elif defined(MADV_DONTNEED) - if (madvise(address, size, MADV_DONTNEED)) { -#elif defined(MADV_PAGEOUT) - if (madvise(address, size, MADV_PAGEOUT)) { -#elif defined(MADV_FREE) - if (madvise(address, size, MADV_FREE)) { -#else - if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { -#endif - rpmalloc_assert(0, "Failed to madvise virtual memory block as free"); - } - } -#endif -#endif - if (release) - _rpmalloc_stat_sub(&_mapped_pages_os, release >> _memory_page_size_shift); -} - -static void _rpmalloc_span_mark_as_subspan_unless_master(span_t *master, - span_t *subspan, - size_t span_count); - -//! Use global reserved spans to fulfill a memory map request (reserve size must -//! be checked by caller) -static span_t *_rpmalloc_global_get_reserved_spans(size_t span_count) { - span_t *span = _memory_global_reserve; - _rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, - span, span_count); - _memory_global_reserve_count -= span_count; - if (_memory_global_reserve_count) - _memory_global_reserve = - (span_t *)pointer_offset(span, span_count << _memory_span_size_shift); - else - _memory_global_reserve = 0; - return span; -} - -//! Store the given spans as global reserve (must only be called from within new -//! heap allocation, not thread safe) -static void _rpmalloc_global_set_reserved_spans(span_t *master, span_t *reserve, - size_t reserve_span_count) { - _memory_global_reserve_master = master; - _memory_global_reserve_count = reserve_span_count; - _memory_global_reserve = reserve; -} - -//////////// -/// -/// Span linked list management -/// -////// - -//! Add a span to double linked list at the head -static void _rpmalloc_span_double_link_list_add(span_t **head, span_t *span) { - if (*head) - (*head)->prev = span; - span->next = *head; - *head = span; -} - -//! Pop head span from double linked list -static void _rpmalloc_span_double_link_list_pop_head(span_t **head, - span_t *span) { - rpmalloc_assert(*head == span, "Linked list corrupted"); - span = *head; - *head = span->next; -} - -//! Remove a span from double linked list -static void _rpmalloc_span_double_link_list_remove(span_t **head, - span_t *span) { - rpmalloc_assert(*head, "Linked list corrupted"); - if (*head == span) { - *head = span->next; - } else { - span_t *next_span = span->next; - span_t *prev_span = span->prev; - prev_span->next = next_span; - if (EXPECTED(next_span != 0)) - next_span->prev = prev_span; - } -} - -//////////// -/// -/// Span control -/// -////// - -static void _rpmalloc_heap_cache_insert(heap_t *heap, span_t *span); - -static void _rpmalloc_heap_finalize(heap_t *heap); - -static void _rpmalloc_heap_set_reserved_spans(heap_t *heap, span_t *master, - span_t *reserve, - size_t reserve_span_count); - -//! Declare the span to be a subspan and store distance from master span and -//! 
span count -static void _rpmalloc_span_mark_as_subspan_unless_master(span_t *master, - span_t *subspan, - size_t span_count) { - rpmalloc_assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER), - "Span master pointer and/or flag mismatch"); - if (subspan != master) { - subspan->flags = SPAN_FLAG_SUBSPAN; - subspan->offset_from_master = - (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> - _memory_span_size_shift); - subspan->align_offset = 0; - } - subspan->span_count = (uint32_t)span_count; -} - -//! Use reserved spans to fulfill a memory map request (reserve size must be -//! checked by caller) -static span_t *_rpmalloc_span_map_from_reserve(heap_t *heap, - size_t span_count) { - // Update the heap span reserve - span_t *span = heap->span_reserve; - heap->span_reserve = - (span_t *)pointer_offset(span, span_count * _memory_span_size); - heap->spans_reserved -= (uint32_t)span_count; - - _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, - span_count); - if (span_count <= LARGE_CLASS_COUNT) - _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_reserved); - - return span; -} - -//! Get the aligned number of spans to map in based on wanted count, configured -//! mapping granularity and the page size -static size_t _rpmalloc_span_align_count(size_t span_count) { - size_t request_count = (span_count > _memory_span_map_count) - ? span_count - : _memory_span_map_count; - if ((_memory_page_size > _memory_span_size) && - ((request_count * _memory_span_size) % _memory_page_size)) - request_count += - _memory_span_map_count - (request_count % _memory_span_map_count); - return request_count; -} - -//! Setup a newly mapped span -static void _rpmalloc_span_initialize(span_t *span, size_t total_span_count, - size_t span_count, size_t align_offset) { - span->total_spans = (uint32_t)total_span_count; - span->span_count = (uint32_t)span_count; - span->align_offset = (uint32_t)align_offset; - span->flags = SPAN_FLAG_MASTER; - atomic_store32(&span->remaining_spans, (int32_t)total_span_count); -} - -static void _rpmalloc_span_unmap(span_t *span); - -//! Map an aligned set of spans, taking configured mapping granularity and the -//! page size into account -static span_t *_rpmalloc_span_map_aligned_count(heap_t *heap, - size_t span_count) { - // If we already have some, but not enough, reserved spans, release those to - // heap cache and map a new full set of spans. 
Otherwise we would waste memory
-  // if page size > span size (huge pages)
-  size_t aligned_span_count = _rpmalloc_span_align_count(span_count);
-  size_t align_offset = 0;
-  span_t *span = (span_t *)_rpmalloc_mmap(
-      aligned_span_count * _memory_span_size, &align_offset);
-  if (!span)
-    return 0;
-  _rpmalloc_span_initialize(span, aligned_span_count, span_count, align_offset);
-  _rpmalloc_stat_inc(&_master_spans);
-  if (span_count <= LARGE_CLASS_COUNT)
-    _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_map_calls);
-  if (aligned_span_count > span_count) {
-    span_t *reserved_spans =
-        (span_t *)pointer_offset(span, span_count * _memory_span_size);
-    size_t reserved_count = aligned_span_count - span_count;
-    if (heap->spans_reserved) {
-      _rpmalloc_span_mark_as_subspan_unless_master(
-          heap->span_reserve_master, heap->span_reserve, heap->spans_reserved);
-      _rpmalloc_heap_cache_insert(heap, heap->span_reserve);
-    }
-    if (reserved_count > _memory_heap_reserve_count) {
-      // If huge pages or eager span map count, the global reserve spin lock is
-      // held by the caller, _rpmalloc_span_map
-      rpmalloc_assert(atomic_load32(&_memory_global_lock) == 1,
-                      "Global spin lock not held as expected");
-      size_t remain_count = reserved_count - _memory_heap_reserve_count;
-      reserved_count = _memory_heap_reserve_count;
-      span_t *remain_span = (span_t *)pointer_offset(
-          reserved_spans, reserved_count * _memory_span_size);
-      if (_memory_global_reserve) {
-        _rpmalloc_span_mark_as_subspan_unless_master(
-            _memory_global_reserve_master, _memory_global_reserve,
-            _memory_global_reserve_count);
-        _rpmalloc_span_unmap(_memory_global_reserve);
-      }
-      _rpmalloc_global_set_reserved_spans(span, remain_span, remain_count);
-    }
-    _rpmalloc_heap_set_reserved_spans(heap, span, reserved_spans,
-                                      reserved_count);
-  }
-  return span;
-}
-
-//! Map in memory pages for the given number of spans (or use previously
-//! reserved pages)
-static span_t *_rpmalloc_span_map(heap_t *heap, size_t span_count) {
-  if (span_count <= heap->spans_reserved)
-    return _rpmalloc_span_map_from_reserve(heap, span_count);
-  span_t *span = 0;
-  int use_global_reserve =
-      (_memory_page_size > _memory_span_size) ||
-      (_memory_span_map_count > _memory_heap_reserve_count);
-  if (use_global_reserve) {
-    // If huge pages, make sure only one thread maps more memory to avoid bloat
-    while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
-      _rpmalloc_spin();
-    if (_memory_global_reserve_count >= span_count) {
-      size_t reserve_count =
-          (!heap->spans_reserved ? _memory_heap_reserve_count : span_count);
-      if (_memory_global_reserve_count < reserve_count)
-        reserve_count = _memory_global_reserve_count;
-      span = _rpmalloc_global_get_reserved_spans(reserve_count);
-      if (span) {
-        if (reserve_count > span_count) {
-          span_t *reserved_span = (span_t *)pointer_offset(
-              span, span_count << _memory_span_size_shift);
-          _rpmalloc_heap_set_reserved_spans(heap, _memory_global_reserve_master,
-                                            reserved_span,
-                                            reserve_count - span_count);
-        }
-        // Already marked as subspan in _rpmalloc_global_get_reserved_spans
-        span->span_count = (uint32_t)span_count;
-      }
-    }
-  }
-  if (!span)
-    span = _rpmalloc_span_map_aligned_count(heap, span_count);
-  if (use_global_reserve)
-    atomic_store32_release(&_memory_global_lock, 0);
-  return span;
-}
-
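// (Editor's note: illustrative sketch, not part of the original file. The
// global lock taken in _rpmalloc_span_map above is just a spin lock built
// from the primitives defined earlier: CAS the flag 0 -> 1 with acquire
// semantics, then store 0 back with release semantics when done.)
static void example_with_global_lock(void (*critical_section)(void)) {
  while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
    _rpmalloc_spin(); // lost the race: back off and retry
  critical_section(); // lock held: at most one thread maps memory here
  atomic_store32_release(&_memory_global_lock, 0);
}

-//! Unmap memory pages for the given number of spans (or mark as unused if no
-//!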
partial unmappings) -static void _rpmalloc_span_unmap(span_t *span) { - rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || - (span->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || - !(span->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - - int is_master = !!(span->flags & SPAN_FLAG_MASTER); - span_t *master = - is_master ? span - : ((span_t *)pointer_offset( - span, -(intptr_t)((uintptr_t)span->offset_from_master * - _memory_span_size))); - rpmalloc_assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); - - size_t span_count = span->span_count; - if (!is_master) { - // Directly unmap subspans (unless huge pages, in which case we defer and - // unmap entire page range with master) - rpmalloc_assert(span->align_offset == 0, "Span align offset corrupted"); - if (_memory_span_size >= _memory_page_size) - _rpmalloc_unmap(span, span_count * _memory_span_size, 0, 0); - } else { - // Special double flag to denote an unmapped master - // It must be kept in memory since span header must be used - span->flags |= - SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN | SPAN_FLAG_UNMAPPED_MASTER; - _rpmalloc_stat_add(&_unmapped_master_spans, 1); - } - - if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) { - // Everything unmapped, unmap the master span with release flag to unmap the - // entire range of the super span - rpmalloc_assert(!!(master->flags & SPAN_FLAG_MASTER) && - !!(master->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - size_t unmap_count = master->span_count; - if (_memory_span_size < _memory_page_size) - unmap_count = master->total_spans; - _rpmalloc_stat_sub(&_master_spans, 1); - _rpmalloc_stat_sub(&_unmapped_master_spans, 1); - _rpmalloc_unmap(master, unmap_count * _memory_span_size, - master->align_offset, - (size_t)master->total_spans * _memory_span_size); - } -} - -//! Move the span (used for small or medium allocations) to the heap thread -//! cache -static void _rpmalloc_span_release_to_cache(heap_t *heap, span_t *span) { - rpmalloc_assert(heap == span->heap, "Span heap pointer corrupted"); - rpmalloc_assert(span->size_class < SIZE_CLASS_COUNT, - "Invalid span size class"); - rpmalloc_assert(span->span_count == 1, "Invalid span count"); -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - atomic_decr32(&heap->span_use[0].current); -#endif - _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); - if (!heap->finalize) { - _rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache); - _rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache); - if (heap->size_class[span->size_class].cache) - _rpmalloc_heap_cache_insert(heap, - heap->size_class[span->size_class].cache); - heap->size_class[span->size_class].cache = span; - } else { - _rpmalloc_span_unmap(span); - } -} - -//! Initialize a (partial) free list up to next system memory page, while -//! 
reserving the first block as allocated, returning number of blocks in list -static uint32_t free_list_partial_init(void **list, void **first_block, - void *page_start, void *block_start, - uint32_t block_count, - uint32_t block_size) { - rpmalloc_assert(block_count, "Internal failure"); - *first_block = block_start; - if (block_count > 1) { - void *free_block = pointer_offset(block_start, block_size); - void *block_end = - pointer_offset(block_start, (size_t)block_size * block_count); - // If block size is less than half a memory page, bound init to next memory - // page boundary - if (block_size < (_memory_page_size >> 1)) { - void *page_end = pointer_offset(page_start, _memory_page_size); - if (page_end < block_end) - block_end = page_end; - } - *list = free_block; - block_count = 2; - void *next_block = pointer_offset(free_block, block_size); - while (next_block < block_end) { - *((void **)free_block) = next_block; - free_block = next_block; - ++block_count; - next_block = pointer_offset(next_block, block_size); - } - *((void **)free_block) = 0; - } else { - *list = 0; - } - return block_count; -} - -//! Initialize an unused span (from cache or mapped) to be new active span, -//! putting the initial free list in heap class free list -static void *_rpmalloc_span_initialize_new(heap_t *heap, - heap_size_class_t *heap_size_class, - span_t *span, uint32_t class_idx) { - rpmalloc_assert(span->span_count == 1, "Internal failure"); - size_class_t *size_class = _memory_size_class + class_idx; - span->size_class = class_idx; - span->heap = heap; - span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; - span->block_size = size_class->block_size; - span->block_count = size_class->block_count; - span->free_list = 0; - span->list_size = 0; - atomic_store_ptr_release(&span->free_list_deferred, 0); - - // Setup free list. 
Only initialize one system page worth of free blocks in
-  // list
-  void *block;
-  span->free_list_limit =
-      free_list_partial_init(&heap_size_class->free_list, &block, span,
-                             pointer_offset(span, SPAN_HEADER_SIZE),
-                             size_class->block_count, size_class->block_size);
-  // Link span as partial if there remain blocks to be initialized as free
-  // list, or full if fully initialized
-  if (span->free_list_limit < span->block_count) {
-    _rpmalloc_span_double_link_list_add(&heap_size_class->partial_span, span);
-    span->used_count = span->free_list_limit;
-  } else {
-#if RPMALLOC_FIRST_CLASS_HEAPS
-    _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span);
-#endif
-    ++heap->full_span_count;
-    span->used_count = span->block_count;
-  }
-  return block;
-}
-
-static void _rpmalloc_span_extract_free_list_deferred(span_t *span) {
-  // We need acquire semantics on the exchange operation since we are
-  // interested in the list size. Refer to
-  // _rpmalloc_deallocate_defer_small_or_medium for further comments on this
-  // dependency
-  do {
-    span->free_list =
-        atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
-  } while (span->free_list == INVALID_POINTER);
-  span->used_count -= span->list_size;
-  span->list_size = 0;
-  atomic_store_ptr_release(&span->free_list_deferred, 0);
-}
-
-static int _rpmalloc_span_is_fully_utilized(span_t *span) {
-  rpmalloc_assert(span->free_list_limit <= span->block_count,
-                  "Span free list corrupted");
-  return !span->free_list && (span->free_list_limit >= span->block_count);
-}
-
-static int _rpmalloc_span_finalize(heap_t *heap, size_t iclass, span_t *span,
-                                   span_t **list_head) {
-  void *free_list = heap->size_class[iclass].free_list;
-  span_t *class_span = (span_t *)((uintptr_t)free_list & _memory_span_mask);
-  if (span == class_span) {
-    // Adopt the heap class free list back into the span free list
-    void *block = span->free_list;
-    void *last_block = 0;
-    while (block) {
-      last_block = block;
-      block = *((void **)block);
-    }
-    uint32_t free_count = 0;
-    block = free_list;
-    while (block) {
-      ++free_count;
-      block = *((void **)block);
-    }
-    if (last_block) {
-      *((void **)last_block) = free_list;
-    } else {
-      span->free_list = free_list;
-    }
-    heap->size_class[iclass].free_list = 0;
-    span->used_count -= free_count;
-  }
-  // If this assert triggers you have memory leaks
-  rpmalloc_assert(span->list_size == span->used_count, "Memory leak detected");
-  if (span->list_size == span->used_count) {
-    _rpmalloc_stat_dec(&heap->span_use[0].current);
-    _rpmalloc_stat_dec(&heap->size_class_use[iclass].spans_current);
-    // This function only used for spans in double linked lists
-    if (list_head)
-      _rpmalloc_span_double_link_list_remove(list_head, span);
-    _rpmalloc_span_unmap(span);
-    return 1;
-  }
-  return 0;
-}
-
-////////////
-///
-/// Global cache
-///
-//////
-
-#if ENABLE_GLOBAL_CACHE
-
-//! Finalize a global cache
-static void _rpmalloc_global_cache_finalize(global_cache_t *cache) {
-  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
-    _rpmalloc_spin();
-
-  for (size_t ispan = 0; ispan < cache->count; ++ispan)
-    _rpmalloc_span_unmap(cache->span[ispan]);
-  cache->count = 0;
-
-  while (cache->overflow) {
-    span_t *span = cache->overflow;
-    cache->overflow = span->next;
-    _rpmalloc_span_unmap(span);
-  }
-
-  atomic_store32_release(&cache->lock, 0);
-}
-
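// (Editor's note: worked numbers, not part of the original file. The insert
// limit computed at the top of the function below caps how many spans the
// global cache may hold per class: for single spans the cap is
// GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE = 8 * 400 = 3200 spans,
// while for e.g. span_count == 4 it is 8 * (100 - 2) = 784 spans; anything
// beyond the cap goes to the overflow list or is unmapped.)

-static void _rpmalloc_global_cache_insert_spans(span_t **span,
-                                                size_t span_count,
-                                                size_t count) {
-  const size_t cache_limit =
-      (span_count == 1) ?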
GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE
-                        : GLOBAL_CACHE_MULTIPLIER *
-                              (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1));
-
-  global_cache_t *cache = &_memory_span_cache[span_count - 1];
-
-  size_t insert_count = count;
-  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
-    _rpmalloc_spin();
-
-#if ENABLE_STATISTICS
-  cache->insert_count += count;
-#endif
-  if ((cache->count + insert_count) > cache_limit)
-    insert_count = cache_limit - cache->count;
-
-  memcpy(cache->span + cache->count, span, sizeof(span_t *) * insert_count);
-  cache->count += (uint32_t)insert_count;
-
-#if ENABLE_UNLIMITED_CACHE
-  while (insert_count < count) {
-#else
-  // Enable unlimited cache if huge pages, or we will leak since it is unlikely
-  // that an entire huge page will be unmapped, and we're unable to partially
-  // decommit a huge page
-  while ((_memory_page_size > _memory_span_size) && (insert_count < count)) {
-#endif
-    span_t *current_span = span[insert_count++];
-    current_span->next = cache->overflow;
-    cache->overflow = current_span;
-  }
-  atomic_store32_release(&cache->lock, 0);
-
-  span_t *keep = 0;
-  for (size_t ispan = insert_count; ispan < count; ++ispan) {
-    span_t *current_span = span[ispan];
-    // Keep master spans that have remaining subspans to avoid dangling them
-    if ((current_span->flags & SPAN_FLAG_MASTER) &&
-        (atomic_load32(&current_span->remaining_spans) >
-         (int32_t)current_span->span_count)) {
-      current_span->next = keep;
-      keep = current_span;
-    } else {
-      _rpmalloc_span_unmap(current_span);
-    }
-  }
-
-  if (keep) {
-    while (!atomic_cas32_acquire(&cache->lock, 1, 0))
-      _rpmalloc_spin();
-
-    size_t islot = 0;
-    while (keep) {
-      for (; islot < cache->count; ++islot) {
-        span_t *current_span = cache->span[islot];
-        if (!(current_span->flags & SPAN_FLAG_MASTER) ||
-            ((current_span->flags & SPAN_FLAG_MASTER) &&
-             (atomic_load32(&current_span->remaining_spans) <=
-              (int32_t)current_span->span_count))) {
-          _rpmalloc_span_unmap(current_span);
-          cache->span[islot] = keep;
-          break;
-        }
-      }
-      if (islot == cache->count)
-        break;
-      keep = keep->next;
-    }
-
-    if (keep) {
-      span_t *tail = keep;
-      while (tail->next)
-        tail = tail->next;
-      tail->next = cache->overflow;
-      cache->overflow = keep;
-    }
-
-    atomic_store32_release(&cache->lock, 0);
-  }
-}
-
-static size_t _rpmalloc_global_cache_extract_spans(span_t **span,
-                                                   size_t span_count,
-                                                   size_t count) {
-  global_cache_t *cache = &_memory_span_cache[span_count - 1];
-
-  size_t extract_count = 0;
-  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
-    _rpmalloc_spin();
-
-#if ENABLE_STATISTICS
-  cache->extract_count += count;
-#endif
-  size_t want = count - extract_count;
-  if (want > cache->count)
-    want = cache->count;
-
-  memcpy(span + extract_count, cache->span + (cache->count - want),
-         sizeof(span_t *) * want);
-  cache->count -= (uint32_t)want;
-  extract_count += want;
-
-  while ((extract_count < count) && cache->overflow) {
-    span_t *current_span = cache->overflow;
-    span[extract_count++] = current_span;
-    cache->overflow = current_span->next;
-  }
-
-#if ENABLE_ASSERTS
-  for (size_t ispan = 0; ispan < extract_count; ++ispan) {
-    rpmalloc_assert(span[ispan]->span_count == span_count,
-                    "Global cache span count mismatch");
-  }
-#endif
-
-  atomic_store32_release(&cache->lock, 0);
-
-  return extract_count;
-}
-
-#endif
-
-////////////
-///
-/// Heap control
-///
-//////
-
-static void _rpmalloc_deallocate_huge(span_t *);
-
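// (Editor's note: illustrative sketch, not part of the original file;
// assumes ENABLE_GLOBAL_CACHE. This mirrors the single-span fallback path
// used below when the thread cache is compiled out: pull one span at a
// time straight from the global cache.)
static span_t *example_refill_one_span(void) {
  span_t *span = 0;
  if (_rpmalloc_global_cache_extract_spans(&span, 1, 1))
    return span; // got a cached span, header still intact
  return 0;      // global cache empty: caller maps new memory instead
}

-//!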
Store the given spans as reserve in the given heap -static void _rpmalloc_heap_set_reserved_spans(heap_t *heap, span_t *master, - span_t *reserve, - size_t reserve_span_count) { - heap->span_reserve_master = master; - heap->span_reserve = reserve; - heap->spans_reserved = (uint32_t)reserve_span_count; -} - -//! Adopt the deferred span cache list, optionally extracting the first single -//! span for immediate re-use -static void _rpmalloc_heap_cache_adopt_deferred(heap_t *heap, - span_t **single_span) { - span_t *span = (span_t *)((void *)atomic_exchange_ptr_acquire( - &heap->span_free_deferred, 0)); - while (span) { - span_t *next_span = (span_t *)span->free_list; - rpmalloc_assert(span->heap == heap, "Span heap pointer corrupted"); - if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { - rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); - --heap->full_span_count; - _rpmalloc_stat_dec(&heap->span_use[0].spans_deferred); -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], - span); -#endif - _rpmalloc_stat_dec(&heap->span_use[0].current); - _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); - if (single_span && !*single_span) - *single_span = span; - else - _rpmalloc_heap_cache_insert(heap, span); - } else { - if (span->size_class == SIZE_CLASS_HUGE) { - _rpmalloc_deallocate_huge(span); - } else { - rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, - "Span size class invalid"); - rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); - --heap->full_span_count; -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_remove(&heap->large_huge_span, span); -#endif - uint32_t idx = span->span_count - 1; - _rpmalloc_stat_dec(&heap->span_use[idx].spans_deferred); - _rpmalloc_stat_dec(&heap->span_use[idx].current); - if (!idx && single_span && !*single_span) - *single_span = span; - else - _rpmalloc_heap_cache_insert(heap, span); - } - } - span = next_span; - } -} - -static void _rpmalloc_heap_unmap(heap_t *heap) { - if (!heap->master_heap) { - if ((heap->finalize > 1) && !atomic_load32(&heap->child_count)) { - span_t *span = (span_t *)((uintptr_t)heap & _memory_span_mask); - _rpmalloc_span_unmap(span); - } - } else { - if (atomic_decr32(&heap->master_heap->child_count) == 0) { - _rpmalloc_heap_unmap(heap->master_heap); - } - } -} - -static void _rpmalloc_heap_global_finalize(heap_t *heap) { - if (heap->finalize++ > 1) { - --heap->finalize; - return; - } - - _rpmalloc_heap_finalize(heap); - -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_cache_t *span_cache; - if (!iclass) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); - span_cache->count = 0; - } -#endif - - if (heap->full_span_count) { - --heap->finalize; - return; - } - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - if (heap->size_class[iclass].free_list || - heap->size_class[iclass].partial_span) { - --heap->finalize; - return; - } - } - // Heap is now completely free, unmap and remove from heap list - size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE; - heap_t *list_heap = _memory_heaps[list_idx]; - if (list_heap == heap) { - _memory_heaps[list_idx] = heap->next_heap; - } else { - while (list_heap->next_heap != heap) - list_heap = list_heap->next_heap; - 
list_heap->next_heap = heap->next_heap; - } - - _rpmalloc_heap_unmap(heap); -} - -//! Insert a single span into thread heap cache, releasing to global cache if -//! overflow -static void _rpmalloc_heap_cache_insert(heap_t *heap, span_t *span) { - if (UNEXPECTED(heap->finalize != 0)) { - _rpmalloc_span_unmap(span); - _rpmalloc_heap_global_finalize(heap); - return; - } -#if ENABLE_THREAD_CACHE - size_t span_count = span->span_count; - _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_to_cache); - if (span_count == 1) { - span_cache_t *span_cache = &heap->span_cache; - span_cache->span[span_cache->count++] = span; - if (span_cache->count == MAX_THREAD_SPAN_CACHE) { - const size_t remain_count = - MAX_THREAD_SPAN_CACHE - THREAD_SPAN_CACHE_TRANSFER; -#if ENABLE_GLOBAL_CACHE - _rpmalloc_stat_add64(&heap->thread_to_global, - THREAD_SPAN_CACHE_TRANSFER * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, - THREAD_SPAN_CACHE_TRANSFER); - _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, - span_count, - THREAD_SPAN_CACHE_TRANSFER); -#else - for (size_t ispan = 0; ispan < THREAD_SPAN_CACHE_TRANSFER; ++ispan) - _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); -#endif - span_cache->count = remain_count; - } - } else { - size_t cache_idx = span_count - 2; - span_large_cache_t *span_cache = heap->span_large_cache + cache_idx; - span_cache->span[span_cache->count++] = span; - const size_t cache_limit = - (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); - if (span_cache->count == cache_limit) { - const size_t transfer_limit = 2 + (cache_limit >> 2); - const size_t transfer_count = - (THREAD_SPAN_LARGE_CACHE_TRANSFER <= transfer_limit - ? THREAD_SPAN_LARGE_CACHE_TRANSFER - : transfer_limit); - const size_t remain_count = cache_limit - transfer_count; -#if ENABLE_GLOBAL_CACHE - _rpmalloc_stat_add64(&heap->thread_to_global, - transfer_count * span_count * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, - transfer_count); - _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, - span_count, transfer_count); -#else - for (size_t ispan = 0; ispan < transfer_count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); -#endif - span_cache->count = remain_count; - } - } -#else - (void)sizeof(heap); - _rpmalloc_span_unmap(span); -#endif -} - -//! Extract the given number of spans from the different cache levels -static span_t *_rpmalloc_heap_thread_cache_extract(heap_t *heap, - size_t span_count) { - span_t *span = 0; -#if ENABLE_THREAD_CACHE - span_cache_t *span_cache; - if (span_count == 1) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (span_count - 2)); - if (span_cache->count) { - _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_cache); - return span_cache->span[--span_cache->count]; - } -#endif - return span; -} - -static span_t *_rpmalloc_heap_thread_cache_deferred_extract(heap_t *heap, - size_t span_count) { - span_t *span = 0; - if (span_count == 1) { - _rpmalloc_heap_cache_adopt_deferred(heap, &span); - } else { - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - span = _rpmalloc_heap_thread_cache_extract(heap, span_count); - } - return span; -} - -static span_t *_rpmalloc_heap_reserved_extract(heap_t *heap, - size_t span_count) { - if (heap->spans_reserved >= span_count) - return _rpmalloc_span_map(heap, span_count); - return 0; -} - -//! 
Extract a span from the global cache -static span_t *_rpmalloc_heap_global_cache_extract(heap_t *heap, - size_t span_count) { -#if ENABLE_GLOBAL_CACHE -#if ENABLE_THREAD_CACHE - span_cache_t *span_cache; - size_t wanted_count; - if (span_count == 1) { - span_cache = &heap->span_cache; - wanted_count = THREAD_SPAN_CACHE_TRANSFER; - } else { - span_cache = (span_cache_t *)(heap->span_large_cache + (span_count - 2)); - wanted_count = THREAD_SPAN_LARGE_CACHE_TRANSFER; - } - span_cache->count = _rpmalloc_global_cache_extract_spans( - span_cache->span, span_count, wanted_count); - if (span_cache->count) { - _rpmalloc_stat_add64(&heap->global_to_thread, - span_count * span_cache->count * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, - span_cache->count); - return span_cache->span[--span_cache->count]; - } -#else - span_t *span = 0; - size_t count = _rpmalloc_global_cache_extract_spans(&span, span_count, 1); - if (count) { - _rpmalloc_stat_add64(&heap->global_to_thread, - span_count * count * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, - count); - return span; - } -#endif -#endif - (void)sizeof(heap); - (void)sizeof(span_count); - return 0; -} - -static void _rpmalloc_inc_span_statistics(heap_t *heap, size_t span_count, - uint32_t class_idx) { - (void)sizeof(heap); - (void)sizeof(span_count); - (void)sizeof(class_idx); -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - uint32_t idx = (uint32_t)span_count - 1; - uint32_t current_count = - (uint32_t)atomic_incr32(&heap->span_use[idx].current); - if (current_count > (uint32_t)atomic_load32(&heap->span_use[idx].high)) - atomic_store32(&heap->span_use[idx].high, (int32_t)current_count); - _rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, - heap->size_class_use[class_idx].spans_peak); -#endif -} - -//! Get a span from one of the cache levels (thread cache, reserved, global -//! cache) or fallback to mapping more memory -static span_t * -_rpmalloc_heap_extract_new_span(heap_t *heap, - heap_size_class_t *heap_size_class, - size_t span_count, uint32_t class_idx) { - span_t *span; -#if ENABLE_THREAD_CACHE - if (heap_size_class && heap_size_class->cache) { - span = heap_size_class->cache; - heap_size_class->cache = - (heap->span_cache.count - ? heap->span_cache.span[--heap->span_cache.count] - : 0); - _rpmalloc_inc_span_statistics(heap, span_count, class_idx); - return span; - } -#endif - (void)sizeof(class_idx); - // Allow 50% overhead to increase cache hits - size_t base_span_count = span_count; - size_t limit_span_count = - (span_count > 2) ? 
(span_count + (span_count >> 1)) : span_count;
-  if (limit_span_count > LARGE_CLASS_COUNT)
-    limit_span_count = LARGE_CLASS_COUNT;
-  do {
-    span = _rpmalloc_heap_thread_cache_extract(heap, span_count);
-    if (EXPECTED(span != 0)) {
-      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
-      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
-      return span;
-    }
-    span = _rpmalloc_heap_thread_cache_deferred_extract(heap, span_count);
-    if (EXPECTED(span != 0)) {
-      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
-      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
-      return span;
-    }
-    span = _rpmalloc_heap_global_cache_extract(heap, span_count);
-    if (EXPECTED(span != 0)) {
-      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
-      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
-      return span;
-    }
-    span = _rpmalloc_heap_reserved_extract(heap, span_count);
-    if (EXPECTED(span != 0)) {
-      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_reserved);
-      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
-      return span;
-    }
-    ++span_count;
-  } while (span_count <= limit_span_count);
-  // Final fallback, map in more virtual memory
-  span = _rpmalloc_span_map(heap, base_span_count);
-  _rpmalloc_inc_span_statistics(heap, base_span_count, class_idx);
-  _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_map_calls);
-  return span;
-}
-
-static void _rpmalloc_heap_initialize(heap_t *heap) {
-  _rpmalloc_memset_const(heap, 0, sizeof(heap_t));
-  // Get a new heap ID
-  heap->id = 1 + atomic_incr32(&_memory_heap_id);
-
-  // Link in heap in heap ID map
-  size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE;
-  heap->next_heap = _memory_heaps[list_idx];
-  _memory_heaps[list_idx] = heap;
-}
-
-static void _rpmalloc_heap_orphan(heap_t *heap, int first_class) {
-  heap->owner_thread = (uintptr_t)-1;
-#if RPMALLOC_FIRST_CLASS_HEAPS
-  heap_t **heap_list =
-      (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps);
-#else
-  (void)sizeof(first_class);
-  heap_t **heap_list = &_memory_orphan_heaps;
-#endif
-  heap->next_orphan = *heap_list;
-  *heap_list = heap;
-}
-
-//! Allocate a new heap from newly mapped memory pages
-static heap_t *_rpmalloc_heap_allocate_new(void) {
-  // Map in pages for 16 heaps. If the page size is greater than the size
-  // required for this, map a page and use the first part for heaps and the
-  // remaining part for spans for allocations.
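/*
 * A minimal, self-contained sketch of the packing arithmetic described in
 * the comment above: heaps are rounded up to 16-byte multiples and packed
 * into whole spans behind a single span header. The sizes used here
 * (heap_size, span_header, span_size) are illustrative assumptions, not
 * rpmalloc's actual values.
 */
#include <stddef.h>
#include <stdio.h>

int main(void) {
  size_t heap_size = 2048;      /* assumed sizeof(heap_t) */
  size_t span_header = 128;     /* assumed sizeof(span_t) */
  size_t span_size = 64 * 1024; /* assumed span size */
  size_t request_heap_count = 16;
  /* Round each heap up to a 16-byte boundary so heaps pack cleanly */
  size_t aligned_heap_size = 16 * ((heap_size + 15) / 16);
  /* Whole spans needed to hold all heaps plus the leading span header */
  size_t heap_span_count = ((aligned_heap_size * request_heap_count) +
                            span_header + span_size - 1) /
                           span_size;
  printf("aligned heap size: %zu, spans needed: %zu\n", aligned_heap_size,
         heap_span_count);
  return 0;
}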
Adds a lot of complexity, but saves a lot of memory on - // systems where page size > 64 spans (4MiB) - size_t heap_size = sizeof(heap_t); - size_t aligned_heap_size = 16 * ((heap_size + 15) / 16); - size_t request_heap_count = 16; - size_t heap_span_count = ((aligned_heap_size * request_heap_count) + - sizeof(span_t) + _memory_span_size - 1) / - _memory_span_size; - size_t block_size = _memory_span_size * heap_span_count; - size_t span_count = heap_span_count; - span_t *span = 0; - // If there are global reserved spans, use these first - if (_memory_global_reserve_count >= heap_span_count) { - span = _rpmalloc_global_get_reserved_spans(heap_span_count); - } - if (!span) { - if (_memory_page_size > block_size) { - span_count = _memory_page_size / _memory_span_size; - block_size = _memory_page_size; - // If using huge pages, make sure to grab enough heaps to avoid - // reallocating a huge page just to serve new heaps - size_t possible_heap_count = - (block_size - sizeof(span_t)) / aligned_heap_size; - if (possible_heap_count >= (request_heap_count * 16)) - request_heap_count *= 16; - else if (possible_heap_count < request_heap_count) - request_heap_count = possible_heap_count; - heap_span_count = ((aligned_heap_size * request_heap_count) + - sizeof(span_t) + _memory_span_size - 1) / - _memory_span_size; - } - - size_t align_offset = 0; - span = (span_t *)_rpmalloc_mmap(block_size, &align_offset); - if (!span) - return 0; - - // Master span will contain the heaps - _rpmalloc_stat_inc(&_master_spans); - _rpmalloc_span_initialize(span, span_count, heap_span_count, align_offset); - } - - size_t remain_size = _memory_span_size - sizeof(span_t); - heap_t *heap = (heap_t *)pointer_offset(span, sizeof(span_t)); - _rpmalloc_heap_initialize(heap); - - // Put extra heaps as orphans - size_t num_heaps = remain_size / aligned_heap_size; - if (num_heaps < request_heap_count) - num_heaps = request_heap_count; - atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); - heap_t *extra_heap = (heap_t *)pointer_offset(heap, aligned_heap_size); - while (num_heaps > 1) { - _rpmalloc_heap_initialize(extra_heap); - extra_heap->master_heap = heap; - _rpmalloc_heap_orphan(extra_heap, 1); - extra_heap = (heap_t *)pointer_offset(extra_heap, aligned_heap_size); - --num_heaps; - } - - if (span_count > heap_span_count) { - // Cap reserved spans - size_t remain_count = span_count - heap_span_count; - size_t reserve_count = - (remain_count > _memory_heap_reserve_count ? _memory_heap_reserve_count - : remain_count); - span_t *remain_span = - (span_t *)pointer_offset(span, heap_span_count * _memory_span_size); - _rpmalloc_heap_set_reserved_spans(heap, span, remain_span, reserve_count); - - if (remain_count > reserve_count) { - // Set to global reserved spans - remain_span = (span_t *)pointer_offset(remain_span, - reserve_count * _memory_span_size); - reserve_count = remain_count - reserve_count; - _rpmalloc_global_set_reserved_spans(span, remain_span, reserve_count); - } - } - - return heap; -} - -static heap_t *_rpmalloc_heap_extract_orphan(heap_t **heap_list) { - heap_t *heap = *heap_list; - *heap_list = (heap ? heap->next_orphan : 0); - return heap; -} - -//! 
Allocate a new heap, potentially reusing a previously orphaned heap -static heap_t *_rpmalloc_heap_allocate(int first_class) { - heap_t *heap = 0; - while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) - _rpmalloc_spin(); - if (first_class == 0) - heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps); -#if RPMALLOC_FIRST_CLASS_HEAPS - if (!heap) - heap = _rpmalloc_heap_extract_orphan(&_memory_first_class_orphan_heaps); -#endif - if (!heap) - heap = _rpmalloc_heap_allocate_new(); - atomic_store32_release(&_memory_global_lock, 0); - if (heap) - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - return heap; -} - -static void _rpmalloc_heap_release(void *heapptr, int first_class, - int release_cache) { - heap_t *heap = (heap_t *)heapptr; - if (!heap) - return; - // Release thread cache spans back to global cache - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - if (release_cache || heap->finalize) { -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_cache_t *span_cache; - if (!iclass) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); - if (!span_cache->count) - continue; -#if ENABLE_GLOBAL_CACHE - if (heap->finalize) { - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); - } else { - _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * - (iclass + 1) * - _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, - span_cache->count); - _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, - span_cache->count); - } -#else - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); -#endif - span_cache->count = 0; - } -#endif - } - - if (get_thread_heap_raw() == heap) - set_thread_heap(0); - -#if ENABLE_STATISTICS - atomic_decr32(&_memory_active_heaps); - rpmalloc_assert(atomic_load32(&_memory_active_heaps) >= 0, - "Still active heaps during finalization"); -#endif - - // If we are forcibly terminating with _exit the state of the - // lock atomic is unknown and it's best to just go ahead and exit - if (get_thread_id() != _rpmalloc_main_thread_id) { - while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) - _rpmalloc_spin(); - } - _rpmalloc_heap_orphan(heap, first_class); - atomic_store32_release(&_memory_global_lock, 0); -} - -static void _rpmalloc_heap_release_raw(void *heapptr, int release_cache) { - _rpmalloc_heap_release(heapptr, 0, release_cache); -} - -static void _rpmalloc_heap_release_raw_fc(void *heapptr) { - _rpmalloc_heap_release_raw(heapptr, 1); -} - -static void _rpmalloc_heap_finalize(heap_t *heap) { - if (heap->spans_reserved) { - span_t *span = _rpmalloc_span_map(heap, heap->spans_reserved); - _rpmalloc_span_unmap(span); - heap->spans_reserved = 0; - } - - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - if (heap->size_class[iclass].cache) - _rpmalloc_span_unmap(heap->size_class[iclass].cache); - heap->size_class[iclass].cache = 0; - span_t *span = heap->size_class[iclass].partial_span; - while (span) { - span_t *next = span->next; - _rpmalloc_span_finalize(heap, iclass, span, - &heap->size_class[iclass].partial_span); - span = next; - } - // If class still has a free list it must be a full span - if (heap->size_class[iclass].free_list) { - span_t *class_span = - (span_t *)((uintptr_t)heap->size_class[iclass].free_list & - _memory_span_mask); 
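/*
 * A minimal, self-contained sketch of the pointer-masking step used just
 * above: because every span is mapped at a span-size-aligned address, the
 * owning span of any block (or free-list pointer) is recovered by clearing
 * the low bits. SPAN_SIZE and the addresses below are illustrative
 * assumptions, not rpmalloc's configuration.
 */
#include <stdint.h>
#include <stdio.h>

#define SPAN_SIZE ((uintptr_t)64 * 1024) /* assumed span size (power of two) */
#define SPAN_MASK (~(SPAN_SIZE - 1))     /* same shape as _memory_span_mask */

static void *span_of(void *block) {
  /* Clear the low bits: every block inside a span maps back to its start */
  return (void *)((uintptr_t)block & SPAN_MASK);
}

int main(void) {
  uintptr_t span_base = 0x40010000;           /* pretend span-aligned base */
  void *block = (void *)(span_base + 0x1234); /* some block inside the span */
  printf("span start: %p\n", span_of(block)); /* prints 0x40010000 */
  return 0;
}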
- span_t **list = 0; -#if RPMALLOC_FIRST_CLASS_HEAPS - list = &heap->full_span[iclass]; -#endif - --heap->full_span_count; - if (!_rpmalloc_span_finalize(heap, iclass, class_span, list)) { - if (list) - _rpmalloc_span_double_link_list_remove(list, class_span); - _rpmalloc_span_double_link_list_add( - &heap->size_class[iclass].partial_span, class_span); - } - } - } - -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_cache_t *span_cache; - if (!iclass) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); - span_cache->count = 0; - } -#endif - rpmalloc_assert(!atomic_load_ptr(&heap->span_free_deferred), - "Heaps still active during finalization"); -} - -//////////// -/// -/// Allocation entry points -/// -////// - -//! Pop first block from a free list -static void *free_list_pop(void **list) { - void *block = *list; - *list = *((void **)block); - return block; -} - -//! Allocate a small/medium sized memory block from the given heap -static void *_rpmalloc_allocate_from_heap_fallback( - heap_t *heap, heap_size_class_t *heap_size_class, uint32_t class_idx) { - span_t *span = heap_size_class->partial_span; - rpmalloc_assume(heap != 0); - if (EXPECTED(span != 0)) { - rpmalloc_assert(span->block_count == - _memory_size_class[span->size_class].block_count, - "Span block count corrupted"); - rpmalloc_assert(!_rpmalloc_span_is_fully_utilized(span), - "Internal failure"); - void *block; - if (span->free_list) { - // Span local free list is not empty, swap to size class free list - block = free_list_pop(&span->free_list); - heap_size_class->free_list = span->free_list; - span->free_list = 0; - } else { - // If the span did not fully initialize free list, link up another page - // worth of blocks - void *block_start = pointer_offset( - span, SPAN_HEADER_SIZE + - ((size_t)span->free_list_limit * span->block_size)); - span->free_list_limit += free_list_partial_init( - &heap_size_class->free_list, &block, - (void *)((uintptr_t)block_start & ~(_memory_page_size - 1)), - block_start, span->block_count - span->free_list_limit, - span->block_size); - } - rpmalloc_assert(span->free_list_limit <= span->block_count, - "Span block count corrupted"); - span->used_count = span->free_list_limit; - - // Swap in deferred free list if present - if (atomic_load_ptr(&span->free_list_deferred)) - _rpmalloc_span_extract_free_list_deferred(span); - - // If span is still not fully utilized keep it in partial list and early - // return block - if (!_rpmalloc_span_is_fully_utilized(span)) - return block; - - // The span is fully utilized, unlink from partial list and add to fully - // utilized list - _rpmalloc_span_double_link_list_pop_head(&heap_size_class->partial_span, - span); -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); -#endif - ++heap->full_span_count; - return block; - } - - // Find a span in one of the cache levels - span = _rpmalloc_heap_extract_new_span(heap, heap_size_class, 1, class_idx); - if (EXPECTED(span != 0)) { - // Mark span as owned by this heap and set base data, return first block - return _rpmalloc_span_initialize_new(heap, heap_size_class, span, - class_idx); - } - - return 0; -} - -//! 
Allocate a small sized memory block from the given heap -static void *_rpmalloc_allocate_small(heap_t *heap, size_t size) { - rpmalloc_assert(heap, "No thread heap"); - // Small sizes have unique size classes - const uint32_t class_idx = - (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); - heap_size_class_t *heap_size_class = heap->size_class + class_idx; - _rpmalloc_stat_inc_alloc(heap, class_idx); - if (EXPECTED(heap_size_class->free_list != 0)) - return free_list_pop(&heap_size_class->free_list); - return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, - class_idx); -} - -//! Allocate a medium sized memory block from the given heap -static void *_rpmalloc_allocate_medium(heap_t *heap, size_t size) { - rpmalloc_assert(heap, "No thread heap"); - // Calculate the size class index and do a dependent lookup of the final class - // index (in case of merged classes) - const uint32_t base_idx = - (uint32_t)(SMALL_CLASS_COUNT + - ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); - const uint32_t class_idx = _memory_size_class[base_idx].class_idx; - heap_size_class_t *heap_size_class = heap->size_class + class_idx; - _rpmalloc_stat_inc_alloc(heap, class_idx); - if (EXPECTED(heap_size_class->free_list != 0)) - return free_list_pop(&heap_size_class->free_list); - return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, - class_idx); -} - -//! Allocate a large sized memory block from the given heap -static void *_rpmalloc_allocate_large(heap_t *heap, size_t size) { - rpmalloc_assert(heap, "No thread heap"); - // Calculate number of needed max sized spans (including header) - // Since this function is never called if size > LARGE_SIZE_LIMIT - // the span_count is guaranteed to be <= LARGE_CLASS_COUNT - size += SPAN_HEADER_SIZE; - size_t span_count = size >> _memory_span_size_shift; - if (size & (_memory_span_size - 1)) - ++span_count; - - // Find a span in one of the cache levels - span_t *span = - _rpmalloc_heap_extract_new_span(heap, 0, span_count, SIZE_CLASS_LARGE); - if (!span) - return span; - - // Mark span as owned by this heap and set base data - rpmalloc_assert(span->span_count >= span_count, "Internal failure"); - span->size_class = SIZE_CLASS_LARGE; - span->heap = heap; - -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); -#endif - ++heap->full_span_count; - - return pointer_offset(span, SPAN_HEADER_SIZE); -} - -//! Allocate a huge block by mapping memory pages directly -static void *_rpmalloc_allocate_huge(heap_t *heap, size_t size) { - rpmalloc_assert(heap, "No thread heap"); - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - size += SPAN_HEADER_SIZE; - size_t num_pages = size >> _memory_page_size_shift; - if (size & (_memory_page_size - 1)) - ++num_pages; - size_t align_offset = 0; - span_t *span = - (span_t *)_rpmalloc_mmap(num_pages * _memory_page_size, &align_offset); - if (!span) - return span; - - // Store page count in span_count - span->size_class = SIZE_CLASS_HUGE; - span->span_count = (uint32_t)num_pages; - span->align_offset = (uint32_t)align_offset; - span->heap = heap; - _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); - -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); -#endif - ++heap->full_span_count; - - return pointer_offset(span, SPAN_HEADER_SIZE); -} - -//! 
Allocate a block of the given size
-static void *_rpmalloc_allocate(heap_t *heap, size_t size) {
-  _rpmalloc_stat_add64(&_allocation_counter, 1);
-  if (EXPECTED(size <= SMALL_SIZE_LIMIT))
-    return _rpmalloc_allocate_small(heap, size);
-  else if (size <= _memory_medium_size_limit)
-    return _rpmalloc_allocate_medium(heap, size);
-  else if (size <= LARGE_SIZE_LIMIT)
-    return _rpmalloc_allocate_large(heap, size);
-  return _rpmalloc_allocate_huge(heap, size);
-}
-
-static void *_rpmalloc_aligned_allocate(heap_t *heap, size_t alignment,
-                                        size_t size) {
-  if (alignment <= SMALL_GRANULARITY)
-    return _rpmalloc_allocate(heap, size);
-
-#if ENABLE_VALIDATE_ARGS
-  if ((size + alignment) < size) {
-    errno = EINVAL;
-    return 0;
-  }
-  if (alignment & (alignment - 1)) {
-    errno = EINVAL;
-    return 0;
-  }
-#endif
-
-  if ((alignment <= SPAN_HEADER_SIZE) &&
-      ((size + SPAN_HEADER_SIZE) < _memory_medium_size_limit)) {
-    // If alignment is less than or equal to the span header size (which is a
-    // power of two), and size rounded up to a multiple of the span header
-    // size is less than size + alignment, then use the natural alignment of
-    // blocks to provide the requested alignment
-    size_t multiple_size = size ? (size + (SPAN_HEADER_SIZE - 1)) &
-                                      ~(uintptr_t)(SPAN_HEADER_SIZE - 1)
-                                : SPAN_HEADER_SIZE;
-    rpmalloc_assert(!(multiple_size % SPAN_HEADER_SIZE),
-                    "Failed alignment calculation");
-    if (multiple_size <= (size + alignment))
-      return _rpmalloc_allocate(heap, multiple_size);
-  }
-
-  void *ptr = 0;
-  size_t align_mask = alignment - 1;
-  if (alignment <= _memory_page_size) {
-    ptr = _rpmalloc_allocate(heap, size + alignment);
-    if ((uintptr_t)ptr & align_mask) {
-      ptr = (void *)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment);
-      // Mark as having aligned blocks
-      span_t *span = (span_t *)((uintptr_t)ptr & _memory_span_mask);
-      span->flags |= SPAN_FLAG_ALIGNED_BLOCKS;
-    }
-    return ptr;
-  }
-
-  // Fallback to mapping new pages for this request. Since pointers passed
-  // to rpfree must be able to reach the start of the span by bitmasking of
-  // the address with the span size, the returned aligned pointer from this
-  // function must be within a span size of the start of the mapped area.
-  // In the worst case this requires us to loop and map pages until we get a
-  // suitable memory address.
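/*
 * A minimal, self-contained sketch of the align-up step this fallback path
 * performs after mapping: bump a pointer to the next alignment boundary.
 * align_up is a hypothetical helper written for illustration; rpmalloc
 * inlines the equivalent expression.
 */
#include <assert.h>
#include <stdint.h>

static void *align_up(void *ptr, uintptr_t alignment) {
  uintptr_t mask = alignment - 1;
  assert((alignment & mask) == 0); /* alignment must be a power of two */
  /* Adding the mask then clearing it rounds up; already-aligned pointers
   * pass through unchanged. */
  return (void *)(((uintptr_t)ptr + mask) & ~mask);
}
/* e.g. align_up((void *)0x1003, 0x1000) yields (void *)0x2000 */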
It also means we can never align to span size - // or greater, since the span header will push alignment more than one - // span size away from span start (thus causing pointer mask to give us - // an invalid span start on free) - if (alignment & align_mask) { - errno = EINVAL; - return 0; - } - if (alignment >= _memory_span_size) { - errno = EINVAL; - return 0; - } - - size_t extra_pages = alignment / _memory_page_size; - - // Since each span has a header, we will at least need one extra memory page - size_t num_pages = 1 + (size / _memory_page_size); - if (size & (_memory_page_size - 1)) - ++num_pages; - - if (extra_pages > num_pages) - num_pages = 1 + extra_pages; - - size_t original_pages = num_pages; - size_t limit_pages = (_memory_span_size / _memory_page_size) * 2; - if (limit_pages < (original_pages * 2)) - limit_pages = original_pages * 2; - - size_t mapped_size, align_offset; - span_t *span; - -retry: - align_offset = 0; - mapped_size = num_pages * _memory_page_size; - - span = (span_t *)_rpmalloc_mmap(mapped_size, &align_offset); - if (!span) { - errno = ENOMEM; - return 0; - } - ptr = pointer_offset(span, SPAN_HEADER_SIZE); - - if ((uintptr_t)ptr & align_mask) - ptr = (void *)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); - - if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || - (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || - (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) { - _rpmalloc_unmap(span, mapped_size, align_offset, mapped_size); - ++num_pages; - if (num_pages > limit_pages) { - errno = EINVAL; - return 0; - } - goto retry; - } - - // Store page count in span_count - span->size_class = SIZE_CLASS_HUGE; - span->span_count = (uint32_t)num_pages; - span->align_offset = (uint32_t)align_offset; - span->heap = heap; - _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); - -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); -#endif - ++heap->full_span_count; - - _rpmalloc_stat_add64(&_allocation_counter, 1); - - return ptr; -} - -//////////// -/// -/// Deallocation entry points -/// -////// - -//! Deallocate the given small/medium memory block in the current thread local -//! 
heap
-static void _rpmalloc_deallocate_direct_small_or_medium(span_t *span,
-                                                        void *block) {
-  heap_t *heap = span->heap;
-  rpmalloc_assert(heap->owner_thread == get_thread_id() ||
-                      !heap->owner_thread || heap->finalize,
-                  "Internal failure");
-  // Add block to free list
-  if (UNEXPECTED(_rpmalloc_span_is_fully_utilized(span))) {
-    span->used_count = span->block_count;
-#if RPMALLOC_FIRST_CLASS_HEAPS
-    _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class],
-                                           span);
-#endif
-    _rpmalloc_span_double_link_list_add(
-        &heap->size_class[span->size_class].partial_span, span);
-    --heap->full_span_count;
-  }
-  *((void **)block) = span->free_list;
-  --span->used_count;
-  span->free_list = block;
-  if (UNEXPECTED(span->used_count == span->list_size)) {
-    // If there are no used blocks it is guaranteed that no other external
-    // thread is accessing the span
-    if (span->used_count) {
-      // Make sure we have synchronized the deferred list and list size by
-      // using acquire semantics, and guarantee that no external thread is
-      // accessing the span concurrently
-      void *free_list;
-      do {
-        free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred,
-                                                INVALID_POINTER);
-      } while (free_list == INVALID_POINTER);
-      atomic_store_ptr_release(&span->free_list_deferred, free_list);
-    }
-    _rpmalloc_span_double_link_list_remove(
-        &heap->size_class[span->size_class].partial_span, span);
-    _rpmalloc_span_release_to_cache(heap, span);
-  }
-}
-
-static void _rpmalloc_deallocate_defer_free_span(heap_t *heap, span_t *span) {
-  if (span->size_class != SIZE_CLASS_HUGE)
-    _rpmalloc_stat_inc(&heap->span_use[span->span_count - 1].spans_deferred);
-  // This list does not need ABA protection; there is no mutable side state
-  do {
-    span->free_list = (void *)atomic_load_ptr(&heap->span_free_deferred);
-  } while (!atomic_cas_ptr(&heap->span_free_deferred, span, span->free_list));
-}
-
-//! Put the block in the deferred free list of the owning span
-static void _rpmalloc_deallocate_defer_small_or_medium(span_t *span,
-                                                       void *block) {
-  // The memory ordering here is a bit tricky: to avoid having to ABA-protect
-  // the deferred free list (which could let the list and its size counter
-  // desynchronize), we need acquire semantics when we successfully take
-  // ownership of the pointer to guarantee that list_size is valid, plus
-  // release semantics on the pointer store
-  void *free_list;
-  do {
-    free_list =
-        atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
-  } while (free_list == INVALID_POINTER);
-  *((void **)block) = free_list;
-  uint32_t free_count = ++span->list_size;
-  int all_deferred_free = (free_count == span->block_count);
-  atomic_store_ptr_release(&span->free_list_deferred, block);
-  if (all_deferred_free) {
-    // Span was completely freed by this block. Due to the INVALID_POINTER spin
-    // lock no other thread can reach this state simultaneously on this span.
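/*
 * A minimal, self-contained sketch of the sentinel-exchange pattern noted in
 * the comment above, rewritten with C11 atomics. SENTINEL plays the role of
 * INVALID_POINTER; the helper names are illustrative, not rpmalloc's API.
 */
#include <stdatomic.h>
#include <stdint.h>

#define SENTINEL ((void *)(uintptr_t)-1)

static void *acquire_deferred_list(_Atomic(void *) *deferred) {
  void *list;
  /* Spin until we observe something other than the sentinel: whoever swapped
   * the sentinel in currently owns the list. Acquire ordering makes the list
   * contents (and its size counter) visible to the new owner. */
  do {
    list = atomic_exchange_explicit(deferred, SENTINEL, memory_order_acquire);
  } while (list == SENTINEL);
  return list;
}

static void release_deferred_list(_Atomic(void *) *deferred, void *head) {
  /* Publish the possibly updated head; release pairs with the acquire. */
  atomic_store_explicit(deferred, head, memory_order_release);
}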
- // Safe to move to owner heap deferred cache - _rpmalloc_deallocate_defer_free_span(span->heap, span); - } -} - -static void _rpmalloc_deallocate_small_or_medium(span_t *span, void *p) { - _rpmalloc_stat_inc_free(span->heap, span->size_class); - if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { - // Realign pointer to block start - void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); - p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); - } - // Check if block belongs to this heap or if deallocation should be deferred -#if RPMALLOC_FIRST_CLASS_HEAPS - int defer = - (span->heap->owner_thread && - (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); -#else - int defer = - ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); -#endif - if (!defer) - _rpmalloc_deallocate_direct_small_or_medium(span, p); - else - _rpmalloc_deallocate_defer_small_or_medium(span, p); -} - -//! Deallocate the given large memory block to the current heap -static void _rpmalloc_deallocate_large(span_t *span) { - rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, "Bad span size class"); - rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || - !(span->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || - (span->flags & SPAN_FLAG_SUBSPAN), - "Span flag corrupted"); - // We must always defer (unless finalizing) if from another heap since we - // cannot touch the list or counters of another heap -#if RPMALLOC_FIRST_CLASS_HEAPS - int defer = - (span->heap->owner_thread && - (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); -#else - int defer = - ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); -#endif - if (defer) { - _rpmalloc_deallocate_defer_free_span(span->heap, span); - return; - } - rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted"); - --span->heap->full_span_count; -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); -#endif -#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS - // Decrease counter - size_t idx = span->span_count - 1; - atomic_decr32(&span->heap->span_use[idx].current); -#endif - heap_t *heap = span->heap; - rpmalloc_assert(heap, "No thread heap"); -#if ENABLE_THREAD_CACHE - const int set_as_reserved = - ((span->span_count > 1) && (heap->span_cache.count == 0) && - !heap->finalize && !heap->spans_reserved); -#else - const int set_as_reserved = - ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved); -#endif - if (set_as_reserved) { - heap->span_reserve = span; - heap->spans_reserved = span->span_count; - if (span->flags & SPAN_FLAG_MASTER) { - heap->span_reserve_master = span; - } else { // SPAN_FLAG_SUBSPAN - span_t *master = (span_t *)pointer_offset( - span, - -(intptr_t)((size_t)span->offset_from_master * _memory_span_size)); - heap->span_reserve_master = master; - rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); - rpmalloc_assert(atomic_load32(&master->remaining_spans) >= - (int32_t)span->span_count, - "Master span count corrupted"); - } - _rpmalloc_stat_inc(&heap->span_use[idx].spans_to_reserved); - } else { - // Insert into cache list - _rpmalloc_heap_cache_insert(heap, span); - } -} - -//! 
Deallocate the given huge span -static void _rpmalloc_deallocate_huge(span_t *span) { - rpmalloc_assert(span->heap, "No span heap"); -#if RPMALLOC_FIRST_CLASS_HEAPS - int defer = - (span->heap->owner_thread && - (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); -#else - int defer = - ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); -#endif - if (defer) { - _rpmalloc_deallocate_defer_free_span(span->heap, span); - return; - } - rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted"); - --span->heap->full_span_count; -#if RPMALLOC_FIRST_CLASS_HEAPS - _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); -#endif - - // Oversized allocation, page count is stored in span_count - size_t num_pages = span->span_count; - _rpmalloc_unmap(span, num_pages * _memory_page_size, span->align_offset, - num_pages * _memory_page_size); - _rpmalloc_stat_sub(&_huge_pages_current, num_pages); -} - -//! Deallocate the given block -static void _rpmalloc_deallocate(void *p) { - _rpmalloc_stat_add64(&_deallocation_counter, 1); - // Grab the span (always at start of span, using span alignment) - span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask); - if (UNEXPECTED(!span)) - return; - if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) - _rpmalloc_deallocate_small_or_medium(span, p); - else if (span->size_class == SIZE_CLASS_LARGE) - _rpmalloc_deallocate_large(span); - else - _rpmalloc_deallocate_huge(span); -} - -//////////// -/// -/// Reallocation entry points -/// -////// - -static size_t _rpmalloc_usable_size(void *p); - -//! Reallocate the given block to the given size -static void *_rpmalloc_reallocate(heap_t *heap, void *p, size_t size, - size_t oldsize, unsigned int flags) { - if (p) { - // Grab the span using guaranteed span alignment - span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask); - if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { - // Small/medium sized block - rpmalloc_assert(span->span_count == 1, "Span counter corrupted"); - void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); - uint32_t block_idx = block_offset / span->block_size; - void *block = - pointer_offset(blocks_start, (size_t)block_idx * span->block_size); - if (!oldsize) - oldsize = - (size_t)((ptrdiff_t)span->block_size - pointer_diff(p, block)); - if ((size_t)span->block_size >= size) { - // Still fits in block, never mind trying to save memory, but preserve - // data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } - } else if (span->size_class == SIZE_CLASS_LARGE) { - // Large block - size_t total_size = size + SPAN_HEADER_SIZE; - size_t num_spans = total_size >> _memory_span_size_shift; - if (total_size & (_memory_span_mask - 1)) - ++num_spans; - size_t current_spans = span->span_count; - void *block = pointer_offset(span, SPAN_HEADER_SIZE); - if (!oldsize) - oldsize = (current_spans * _memory_span_size) - - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; - if ((current_spans >= num_spans) && (total_size >= (oldsize / 2))) { - // Still fits in block, never mind trying to save memory, but preserve - // data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } - } else { - // Oversized block - size_t total_size = size + SPAN_HEADER_SIZE; - size_t num_pages = total_size >> _memory_page_size_shift; - if 
(total_size & (_memory_page_size - 1)) - ++num_pages; - // Page count is stored in span_count - size_t current_pages = span->span_count; - void *block = pointer_offset(span, SPAN_HEADER_SIZE); - if (!oldsize) - oldsize = (current_pages * _memory_page_size) - - (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; - if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) { - // Still fits in block, never mind trying to save memory, but preserve - // data if alignment changed - if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) - memmove(block, p, oldsize); - return block; - } - } - } else { - oldsize = 0; - } - - if (!!(flags & RPMALLOC_GROW_OR_FAIL)) - return 0; - - // Size is greater than block size, need to allocate a new block and - // deallocate the old Avoid hysteresis by overallocating if increase is small - // (below 37%) - size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3); - size_t new_size = - (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size); - void *block = _rpmalloc_allocate(heap, new_size); - if (p && block) { - if (!(flags & RPMALLOC_NO_PRESERVE)) - memcpy(block, p, oldsize < new_size ? oldsize : new_size); - _rpmalloc_deallocate(p); - } - - return block; -} - -static void *_rpmalloc_aligned_reallocate(heap_t *heap, void *ptr, - size_t alignment, size_t size, - size_t oldsize, unsigned int flags) { - if (alignment <= SMALL_GRANULARITY) - return _rpmalloc_reallocate(heap, ptr, size, oldsize, flags); - - int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL); - size_t usablesize = (ptr ? _rpmalloc_usable_size(ptr) : 0); - if ((usablesize >= size) && !((uintptr_t)ptr & (alignment - 1))) { - if (no_alloc || (size >= (usablesize / 2))) - return ptr; - } - // Aligned alloc marks span as having aligned blocks - void *block = - (!no_alloc ? _rpmalloc_aligned_allocate(heap, alignment, size) : 0); - if (EXPECTED(block != 0)) { - if (!(flags & RPMALLOC_NO_PRESERVE) && ptr) { - if (!oldsize) - oldsize = usablesize; - memcpy(block, ptr, oldsize < size ? oldsize : size); - } - _rpmalloc_deallocate(ptr); - } - return block; -} - -//////////// -/// -/// Initialization, finalization and utility -/// -////// - -//! Get the usable size of the given block -static size_t _rpmalloc_usable_size(void *p) { - // Grab the span using guaranteed span alignment - span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask); - if (span->size_class < SIZE_CLASS_COUNT) { - // Small/medium block - void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); - return span->block_size - - ((size_t)pointer_diff(p, blocks_start) % span->block_size); - } - if (span->size_class == SIZE_CLASS_LARGE) { - // Large block - size_t current_spans = span->span_count; - return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span); - } - // Oversized block, page count is stored in span_count - size_t current_pages = span->span_count; - return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span); -} - -//! 
Adjust and optimize the size class properties for the given class -static void _rpmalloc_adjust_size_class(size_t iclass) { - size_t block_size = _memory_size_class[iclass].block_size; - size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; - - _memory_size_class[iclass].block_count = (uint16_t)block_count; - _memory_size_class[iclass].class_idx = (uint16_t)iclass; - - // Check if previous size classes can be merged - if (iclass >= SMALL_CLASS_COUNT) { - size_t prevclass = iclass; - while (prevclass > 0) { - --prevclass; - // A class can be merged if number of pages and number of blocks are equal - if (_memory_size_class[prevclass].block_count == - _memory_size_class[iclass].block_count) - _rpmalloc_memcpy_const(_memory_size_class + prevclass, - _memory_size_class + iclass, - sizeof(_memory_size_class[iclass])); - else - break; - } - } -} - -//! Initialize the allocator and setup global data -extern inline int rpmalloc_initialize(void) { - if (_rpmalloc_initialized) { - rpmalloc_thread_initialize(); - return 0; - } - return rpmalloc_initialize_config(0); -} - -int rpmalloc_initialize_config(const rpmalloc_config_t *config) { - if (_rpmalloc_initialized) { - rpmalloc_thread_initialize(); - return 0; - } - _rpmalloc_initialized = 1; - - if (config) - memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); - else - _rpmalloc_memset_const(&_memory_config, 0, sizeof(rpmalloc_config_t)); - - if (!_memory_config.memory_map || !_memory_config.memory_unmap) { - _memory_config.memory_map = _rpmalloc_mmap_os; - _memory_config.memory_unmap = _rpmalloc_unmap_os; - } - -#if PLATFORM_WINDOWS - SYSTEM_INFO system_info; - memset(&system_info, 0, sizeof(system_info)); - GetSystemInfo(&system_info); - _memory_map_granularity = system_info.dwAllocationGranularity; -#else - _memory_map_granularity = (size_t)sysconf(_SC_PAGESIZE); -#endif - -#if RPMALLOC_CONFIGURABLE - _memory_page_size = _memory_config.page_size; -#else - _memory_page_size = 0; -#endif - _memory_huge_pages = 0; - if (!_memory_page_size) { -#if PLATFORM_WINDOWS - _memory_page_size = system_info.dwPageSize; -#else - _memory_page_size = _memory_map_granularity; - if (_memory_config.enable_huge_pages) { -#if defined(__linux__) - size_t huge_page_size = 0; - FILE *meminfo = fopen("/proc/meminfo", "r"); - if (meminfo) { - char line[128]; - while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) { - line[sizeof(line) - 1] = 0; - if (strstr(line, "Hugepagesize:")) - huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024; - } - fclose(meminfo); - } - if (huge_page_size) { - _memory_huge_pages = 1; - _memory_page_size = huge_page_size; - _memory_map_granularity = huge_page_size; - } -#elif defined(__FreeBSD__) - int rc; - size_t sz = sizeof(rc); - - if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && - rc == 1) { - static size_t defsize = 2 * 1024 * 1024; - int nsize = 0; - size_t sizes[4] = {0}; - _memory_huge_pages = 1; - _memory_page_size = defsize; - if ((nsize = getpagesizes(sizes, 4)) >= 2) { - nsize--; - for (size_t csize = sizes[nsize]; nsize >= 0 && csize; - --nsize, csize = sizes[nsize]) { - //! Unlikely, but as a precaution.. 
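/*
 * A minimal, self-contained sketch of the check the assertion just below
 * relies on: a power of two has exactly one bit set, so clearing the lowest
 * set bit must leave zero. is_power_of_two is a hypothetical helper written
 * for illustration.
 */
#include <stdbool.h>
#include <stddef.h>

static bool is_power_of_two(size_t x) {
  /* x & (x - 1) clears the lowest set bit; zero iff at most one bit is set */
  return x != 0 && (x & (x - 1)) == 0;
}
/* e.g. is_power_of_two(4096) is true, is_power_of_two(12288) is false */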
- rpmalloc_assert(!(csize & (csize - 1)) && !(csize % 1024), - "Invalid page size"); - if (defsize < csize) { - _memory_page_size = csize; - break; - } - } - } - _memory_map_granularity = _memory_page_size; - } -#elif defined(__APPLE__) || defined(__NetBSD__) - _memory_huge_pages = 1; - _memory_page_size = 2 * 1024 * 1024; - _memory_map_granularity = _memory_page_size; -#endif - } -#endif - } else { - if (_memory_config.enable_huge_pages) - _memory_huge_pages = 1; - } - -#if PLATFORM_WINDOWS - if (_memory_config.enable_huge_pages) { - HANDLE token = 0; - size_t large_page_minimum = GetLargePageMinimum(); - if (large_page_minimum) - OpenProcessToken(GetCurrentProcess(), - TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); - if (token) { - LUID luid; - if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { - TOKEN_PRIVILEGES token_privileges; - memset(&token_privileges, 0, sizeof(token_privileges)); - token_privileges.PrivilegeCount = 1; - token_privileges.Privileges[0].Luid = luid; - token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; - if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { - if (GetLastError() == ERROR_SUCCESS) - _memory_huge_pages = 1; - } - } - CloseHandle(token); - } - if (_memory_huge_pages) { - if (large_page_minimum > _memory_page_size) - _memory_page_size = large_page_minimum; - if (large_page_minimum > _memory_map_granularity) - _memory_map_granularity = large_page_minimum; - } - } -#endif - - size_t min_span_size = 256; - size_t max_page_size; -#if UINTPTR_MAX > 0xFFFFFFFF - max_page_size = 4096ULL * 1024ULL * 1024ULL; -#else - max_page_size = 4 * 1024 * 1024; -#endif - if (_memory_page_size < min_span_size) - _memory_page_size = min_span_size; - if (_memory_page_size > max_page_size) - _memory_page_size = max_page_size; - _memory_page_size_shift = 0; - size_t page_size_bit = _memory_page_size; - while (page_size_bit != 1) { - ++_memory_page_size_shift; - page_size_bit >>= 1; - } - _memory_page_size = ((size_t)1 << _memory_page_size_shift); - -#if RPMALLOC_CONFIGURABLE - if (!_memory_config.span_size) { - _memory_span_size = _memory_default_span_size; - _memory_span_size_shift = _memory_default_span_size_shift; - _memory_span_mask = _memory_default_span_mask; - } else { - size_t span_size = _memory_config.span_size; - if (span_size > (256 * 1024)) - span_size = (256 * 1024); - _memory_span_size = 4096; - _memory_span_size_shift = 12; - while (_memory_span_size < span_size) { - _memory_span_size <<= 1; - ++_memory_span_size_shift; - } - _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); - } -#endif - - _memory_span_map_count = - (_memory_config.span_map_count ? _memory_config.span_map_count - : DEFAULT_SPAN_MAP_COUNT); - if ((_memory_span_size * _memory_span_map_count) < _memory_page_size) - _memory_span_map_count = (_memory_page_size / _memory_span_size); - if ((_memory_page_size >= _memory_span_size) && - ((_memory_span_map_count * _memory_span_size) % _memory_page_size)) - _memory_span_map_count = (_memory_page_size / _memory_span_size); - _memory_heap_reserve_count = (_memory_span_map_count > DEFAULT_SPAN_MAP_COUNT) - ? 
DEFAULT_SPAN_MAP_COUNT - : _memory_span_map_count; - - _memory_config.page_size = _memory_page_size; - _memory_config.span_size = _memory_span_size; - _memory_config.span_map_count = _memory_span_map_count; - _memory_config.enable_huge_pages = _memory_huge_pages; - -#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ - defined(__TINYC__) - if (pthread_key_create(&_memory_thread_heap, _rpmalloc_heap_release_raw_fc)) - return -1; -#endif -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - fls_key = FlsAlloc(&_rpmalloc_thread_destructor); -#endif - - // Setup all small and medium size classes - size_t iclass = 0; - _memory_size_class[iclass].block_size = SMALL_GRANULARITY; - _rpmalloc_adjust_size_class(iclass); - for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) { - size_t size = iclass * SMALL_GRANULARITY; - _memory_size_class[iclass].block_size = (uint32_t)size; - _rpmalloc_adjust_size_class(iclass); - } - // At least two blocks per span, then fall back to large allocations - _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1; - if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) - _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; - for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { - size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); - if (size > _memory_medium_size_limit) { - _memory_medium_size_limit = - SMALL_SIZE_LIMIT + (iclass * MEDIUM_GRANULARITY); - break; - } - _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size; - _rpmalloc_adjust_size_class(SMALL_CLASS_COUNT + iclass); - } - - _memory_orphan_heaps = 0; -#if RPMALLOC_FIRST_CLASS_HEAPS - _memory_first_class_orphan_heaps = 0; -#endif -#if ENABLE_STATISTICS - atomic_store32(&_memory_active_heaps, 0); - atomic_store32(&_mapped_pages, 0); - _mapped_pages_peak = 0; - atomic_store32(&_master_spans, 0); - atomic_store32(&_mapped_total, 0); - atomic_store32(&_unmapped_total, 0); - atomic_store32(&_mapped_pages_os, 0); - atomic_store32(&_huge_pages_current, 0); - _huge_pages_peak = 0; -#endif - memset(_memory_heaps, 0, sizeof(_memory_heaps)); - atomic_store32_release(&_memory_global_lock, 0); - - rpmalloc_linker_reference(); - - // Initialize this thread - rpmalloc_thread_initialize(); - return 0; -} - -//! 
Finalize the allocator -void rpmalloc_finalize(void) { - rpmalloc_thread_finalize(1); - // rpmalloc_dump_statistics(stdout); - - if (_memory_global_reserve) { - atomic_add32(&_memory_global_reserve_master->remaining_spans, - -(int32_t)_memory_global_reserve_count); - _memory_global_reserve_master = 0; - _memory_global_reserve_count = 0; - _memory_global_reserve = 0; - } - atomic_store32_release(&_memory_global_lock, 0); - - // Free all thread caches and fully free spans - for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { - heap_t *heap = _memory_heaps[list_idx]; - while (heap) { - heap_t *next_heap = heap->next_heap; - heap->finalize = 1; - _rpmalloc_heap_global_finalize(heap); - heap = next_heap; - } - } - -#if ENABLE_GLOBAL_CACHE - // Free global caches - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) - _rpmalloc_global_cache_finalize(&_memory_span_cache[iclass]); -#endif - -#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD - pthread_key_delete(_memory_thread_heap); -#endif -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - FlsFree(fls_key); - fls_key = 0; -#endif -#if ENABLE_STATISTICS - // If you hit these asserts you probably have memory leaks (perhaps global - // scope data doing dynamic allocations) or double frees in your code - rpmalloc_assert(atomic_load32(&_mapped_pages) == 0, "Memory leak detected"); - rpmalloc_assert(atomic_load32(&_mapped_pages_os) == 0, - "Memory leak detected"); -#endif - - _rpmalloc_initialized = 0; -} - -//! Initialize thread, assign heap -extern inline void rpmalloc_thread_initialize(void) { - if (!get_thread_heap_raw()) { - heap_t *heap = _rpmalloc_heap_allocate(0); - if (heap) { - _rpmalloc_stat_inc(&_memory_active_heaps); - set_thread_heap(heap); -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - FlsSetValue(fls_key, heap); -#endif - } - } -} - -//! Finalize thread, orphan heap -void rpmalloc_thread_finalize(int release_caches) { - heap_t *heap = get_thread_heap_raw(); - if (heap) - _rpmalloc_heap_release_raw(heap, release_caches); - set_thread_heap(0); -#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) - FlsSetValue(fls_key, 0); -#endif -} - -int rpmalloc_is_thread_initialized(void) { - return (get_thread_heap_raw() != 0) ? 
1 : 0; -} - -const rpmalloc_config_t *rpmalloc_config(void) { return &_memory_config; } - -// Extern interface - -extern inline RPMALLOC_ALLOCATOR void *rpmalloc(size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return 0; - } -#endif - heap_t *heap = get_thread_heap(); - return _rpmalloc_allocate(heap, size); -} - -extern inline void rpfree(void *ptr) { _rpmalloc_deallocate(ptr); } - -extern inline RPMALLOC_ALLOCATOR void *rpcalloc(size_t num, size_t size) { - size_t total; -#if ENABLE_VALIDATE_ARGS -#if PLATFORM_WINDOWS - int err = SizeTMult(num, size, &total); - if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#else - int err = __builtin_umull_overflow(num, size, &total); - if (err || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#endif -#else - total = num * size; -#endif - heap_t *heap = get_thread_heap(); - void *block = _rpmalloc_allocate(heap, total); - if (block) - memset(block, 0, total); - return block; -} - -extern inline RPMALLOC_ALLOCATOR void *rprealloc(void *ptr, size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return ptr; - } -#endif - heap_t *heap = get_thread_heap(); - return _rpmalloc_reallocate(heap, ptr, size, 0, 0); -} - -extern RPMALLOC_ALLOCATOR void *rpaligned_realloc(void *ptr, size_t alignment, - size_t size, size_t oldsize, - unsigned int flags) { -#if ENABLE_VALIDATE_ARGS - if ((size + alignment < size) || (alignment > _memory_page_size)) { - errno = EINVAL; - return 0; - } -#endif - heap_t *heap = get_thread_heap(); - return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, oldsize, - flags); -} - -extern RPMALLOC_ALLOCATOR void *rpaligned_alloc(size_t alignment, size_t size) { - heap_t *heap = get_thread_heap(); - return _rpmalloc_aligned_allocate(heap, alignment, size); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpaligned_calloc(size_t alignment, size_t num, size_t size) { - size_t total; -#if ENABLE_VALIDATE_ARGS -#if PLATFORM_WINDOWS - int err = SizeTMult(num, size, &total); - if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#else - int err = __builtin_umull_overflow(num, size, &total); - if (err || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#endif -#else - total = num * size; -#endif - void *block = rpaligned_alloc(alignment, total); - if (block) - memset(block, 0, total); - return block; -} - -extern inline RPMALLOC_ALLOCATOR void *rpmemalign(size_t alignment, - size_t size) { - return rpaligned_alloc(alignment, size); -} - -extern inline int rpposix_memalign(void **memptr, size_t alignment, - size_t size) { - if (memptr) - *memptr = rpaligned_alloc(alignment, size); - else - return EINVAL; - return *memptr ? 0 : ENOMEM; -} - -extern inline size_t rpmalloc_usable_size(void *ptr) { - return (ptr ? 
_rpmalloc_usable_size(ptr) : 0); -} - -extern inline void rpmalloc_thread_collect(void) {} - -void rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats) { - memset(stats, 0, sizeof(rpmalloc_thread_statistics_t)); - heap_t *heap = get_thread_heap_raw(); - if (!heap) - return; - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - size_class_t *size_class = _memory_size_class + iclass; - span_t *span = heap->size_class[iclass].partial_span; - while (span) { - size_t free_count = span->list_size; - size_t block_count = size_class->block_count; - if (span->free_list_limit < block_count) - block_count = span->free_list_limit; - free_count += (block_count - span->used_count); - stats->sizecache += free_count * size_class->block_size; - span = span->next; - } - } - -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_cache_t *span_cache; - if (!iclass) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); - stats->spancache += span_cache->count * (iclass + 1) * _memory_span_size; - } -#endif - - span_t *deferred = (span_t *)atomic_load_ptr(&heap->span_free_deferred); - while (deferred) { - if (deferred->size_class != SIZE_CLASS_HUGE) - stats->spancache += (size_t)deferred->span_count * _memory_span_size; - deferred = (span_t *)deferred->free_list; - } - -#if ENABLE_STATISTICS - stats->thread_to_global = (size_t)atomic_load64(&heap->thread_to_global); - stats->global_to_thread = (size_t)atomic_load64(&heap->global_to_thread); - - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - stats->span_use[iclass].current = - (size_t)atomic_load32(&heap->span_use[iclass].current); - stats->span_use[iclass].peak = - (size_t)atomic_load32(&heap->span_use[iclass].high); - stats->span_use[iclass].to_global = - (size_t)atomic_load32(&heap->span_use[iclass].spans_to_global); - stats->span_use[iclass].from_global = - (size_t)atomic_load32(&heap->span_use[iclass].spans_from_global); - stats->span_use[iclass].to_cache = - (size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache); - stats->span_use[iclass].from_cache = - (size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache); - stats->span_use[iclass].to_reserved = - (size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved); - stats->span_use[iclass].from_reserved = - (size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved); - stats->span_use[iclass].map_calls = - (size_t)atomic_load32(&heap->span_use[iclass].spans_map_calls); - } - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - stats->size_use[iclass].alloc_current = - (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current); - stats->size_use[iclass].alloc_peak = - (size_t)heap->size_class_use[iclass].alloc_peak; - stats->size_use[iclass].alloc_total = - (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_total); - stats->size_use[iclass].free_total = - (size_t)atomic_load32(&heap->size_class_use[iclass].free_total); - stats->size_use[iclass].spans_to_cache = - (size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache); - stats->size_use[iclass].spans_from_cache = - (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache); - stats->size_use[iclass].spans_from_reserved = (size_t)atomic_load32( - &heap->size_class_use[iclass].spans_from_reserved); - stats->size_use[iclass].map_calls = - (size_t)atomic_load32(&heap->size_class_use[iclass].spans_map_calls); - } -#endif -} - -void 
rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats) { - memset(stats, 0, sizeof(rpmalloc_global_statistics_t)); -#if ENABLE_STATISTICS - stats->mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; - stats->mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; - stats->mapped_total = - (size_t)atomic_load32(&_mapped_total) * _memory_page_size; - stats->unmapped_total = - (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; - stats->huge_alloc = - (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; - stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size; -#endif -#if ENABLE_GLOBAL_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - global_cache_t *cache = &_memory_span_cache[iclass]; - while (!atomic_cas32_acquire(&cache->lock, 1, 0)) - _rpmalloc_spin(); - uint32_t count = cache->count; -#if ENABLE_UNLIMITED_CACHE - span_t *current_span = cache->overflow; - while (current_span) { - ++count; - current_span = current_span->next; - } -#endif - atomic_store32_release(&cache->lock, 0); - stats->cached += count * (iclass + 1) * _memory_span_size; - } -#endif -} - -#if ENABLE_STATISTICS - -static void _memory_heap_dump_statistics(heap_t *heap, void *file) { - fprintf(file, "Heap %d stats:\n", heap->id); - fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize " - "BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB " - "FromCacheMiB FromReserveMiB MmapCalls\n"); - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) - continue; - fprintf( - file, - "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu " - "%9u\n", - (uint32_t)iclass, - atomic_load32(&heap->size_class_use[iclass].alloc_current), - heap->size_class_use[iclass].alloc_peak, - atomic_load32(&heap->size_class_use[iclass].alloc_total), - atomic_load32(&heap->size_class_use[iclass].free_total), - _memory_size_class[iclass].block_size, - _memory_size_class[iclass].block_count, - atomic_load32(&heap->size_class_use[iclass].spans_current), - heap->size_class_use[iclass].spans_peak, - ((size_t)heap->size_class_use[iclass].alloc_peak * - (size_t)_memory_size_class[iclass].block_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache) * - _memory_span_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache) * - _memory_span_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32( - &heap->size_class_use[iclass].spans_from_reserved) * - _memory_span_size) / - (size_t)(1024 * 1024), - atomic_load32(&heap->size_class_use[iclass].spans_map_calls)); - } - fprintf(file, "Spans Current Peak Deferred PeakMiB Cached ToCacheMiB " - "FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB " - "FromGlobalMiB MmapCalls\n"); - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - if (!atomic_load32(&heap->span_use[iclass].high) && - !atomic_load32(&heap->span_use[iclass].spans_map_calls)) - continue; - fprintf( - file, - "%4u: %8d %8u %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", - (uint32_t)(iclass + 1), atomic_load32(&heap->span_use[iclass].current), - atomic_load32(&heap->span_use[iclass].high), - atomic_load32(&heap->span_use[iclass].spans_deferred), - ((size_t)atomic_load32(&heap->span_use[iclass].high) * - (size_t)_memory_span_size * (iclass + 1)) / - (size_t)(1024 * 1024), -#if ENABLE_THREAD_CACHE - (unsigned int)(!iclass ? 
heap->span_cache.count - : heap->span_large_cache[iclass - 1].count), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * - (iclass + 1) * _memory_span_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * - (iclass + 1) * _memory_span_size) / - (size_t)(1024 * 1024), -#else - 0, (size_t)0, (size_t)0, -#endif - ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved) * - (iclass + 1) * _memory_span_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved) * - (iclass + 1) * _memory_span_size) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_global) * - (size_t)_memory_span_size * (iclass + 1)) / - (size_t)(1024 * 1024), - ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_global) * - (size_t)_memory_span_size * (iclass + 1)) / - (size_t)(1024 * 1024), - atomic_load32(&heap->span_use[iclass].spans_map_calls)); - } - fprintf(file, "Full spans: %zu\n", heap->full_span_count); - fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); - fprintf( - file, "%17zu %17zu\n", - (size_t)atomic_load64(&heap->thread_to_global) / (size_t)(1024 * 1024), - (size_t)atomic_load64(&heap->global_to_thread) / (size_t)(1024 * 1024)); -} - -#endif - -void rpmalloc_dump_statistics(void *file) { -#if ENABLE_STATISTICS - for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { - heap_t *heap = _memory_heaps[list_idx]; - while (heap) { - int need_dump = 0; - for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); - ++iclass) { - if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) { - rpmalloc_assert( - !atomic_load32(&heap->size_class_use[iclass].free_total), - "Heap statistics counter mismatch"); - rpmalloc_assert( - !atomic_load32(&heap->size_class_use[iclass].spans_map_calls), - "Heap statistics counter mismatch"); - continue; - } - need_dump = 1; - } - for (size_t iclass = 0; !need_dump && (iclass < LARGE_CLASS_COUNT); - ++iclass) { - if (!atomic_load32(&heap->span_use[iclass].high) && - !atomic_load32(&heap->span_use[iclass].spans_map_calls)) - continue; - need_dump = 1; - } - if (need_dump) - _memory_heap_dump_statistics(heap, file); - heap = heap->next_heap; - } - } - fprintf(file, "Global stats:\n"); - size_t huge_current = - (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; - size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size; - fprintf(file, "HugeCurrentMiB HugePeakMiB\n"); - fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), - huge_peak / (size_t)(1024 * 1024)); - -#if ENABLE_GLOBAL_CACHE - fprintf(file, "GlobalCacheMiB\n"); - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - global_cache_t *cache = _memory_span_cache + iclass; - size_t global_cache = (size_t)cache->count * iclass * _memory_span_size; - - size_t global_overflow_cache = 0; - span_t *span = cache->overflow; - while (span) { - global_overflow_cache += iclass * _memory_span_size; - span = span->next; - } - if (global_cache || global_overflow_cache || cache->insert_count || - cache->extract_count) - fprintf(file, - "%4zu: %8zuMiB (%8zuMiB overflow) %14zu insert %14zu extract\n", - iclass + 1, global_cache / (size_t)(1024 * 1024), - global_overflow_cache / (size_t)(1024 * 1024), - cache->insert_count, cache->extract_count); - } -#endif - - size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; - size_t mapped_os = - (size_t)atomic_load32(&_mapped_pages_os) * 
_memory_page_size; - size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; - size_t mapped_total = - (size_t)atomic_load32(&_mapped_total) * _memory_page_size; - size_t unmapped_total = - (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; - fprintf( - file, - "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB\n"); - fprintf(file, "%9zu %11zu %13zu %14zu %16zu\n", - mapped / (size_t)(1024 * 1024), mapped_os / (size_t)(1024 * 1024), - mapped_peak / (size_t)(1024 * 1024), - mapped_total / (size_t)(1024 * 1024), - unmapped_total / (size_t)(1024 * 1024)); - - fprintf(file, "\n"); -#if 0 - int64_t allocated = atomic_load64(&_allocation_counter); - int64_t deallocated = atomic_load64(&_deallocation_counter); - fprintf(file, "Allocation count: %lli\n", allocated); - fprintf(file, "Deallocation count: %lli\n", deallocated); - fprintf(file, "Current allocations: %lli\n", (allocated - deallocated)); - fprintf(file, "Master spans: %d\n", atomic_load32(&_master_spans)); - fprintf(file, "Dangling master spans: %d\n", atomic_load32(&_unmapped_master_spans)); -#endif -#endif - (void)sizeof(file); -} - -#if RPMALLOC_FIRST_CLASS_HEAPS - -extern inline rpmalloc_heap_t *rpmalloc_heap_acquire(void) { - // Must be a pristine heap from newly mapped memory pages, or else memory - // blocks could already be allocated from the heap which would (wrongly) be - // released when heap is cleared with rpmalloc_heap_free_all(). Also heaps - // guaranteed to be pristine from the dedicated orphan list can be used. - heap_t *heap = _rpmalloc_heap_allocate(1); - rpmalloc_assume(heap != NULL); - heap->owner_thread = 0; - _rpmalloc_stat_inc(&_memory_active_heaps); - return heap; -} - -extern inline void rpmalloc_heap_release(rpmalloc_heap_t *heap) { - if (heap) - _rpmalloc_heap_release(heap, 1, 1); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return 0; - } -#endif - return _rpmalloc_allocate(heap, size); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, - size_t size) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return 0; - } -#endif - return _rpmalloc_aligned_allocate(heap, alignment, size); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_calloc(rpmalloc_heap_t *heap, size_t num, size_t size) { - return rpmalloc_heap_aligned_calloc(heap, 0, num, size); -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_calloc(rpmalloc_heap_t *heap, size_t alignment, - size_t num, size_t size) { - size_t total; -#if ENABLE_VALIDATE_ARGS -#if PLATFORM_WINDOWS - int err = SizeTMult(num, size, &total); - if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#else - int err = __builtin_umull_overflow(num, size, &total); - if (err || (total >= MAX_ALLOC_SIZE)) { - errno = EINVAL; - return 0; - } -#endif -#else - total = num * size; -#endif - void *block = _rpmalloc_aligned_allocate(heap, alignment, total); - if (block) - memset(block, 0, total); - return block; -} - -extern inline RPMALLOC_ALLOCATOR void * -rpmalloc_heap_realloc(rpmalloc_heap_t *heap, void *ptr, size_t size, - unsigned int flags) { -#if ENABLE_VALIDATE_ARGS - if (size >= MAX_ALLOC_SIZE) { - errno = EINVAL; - return ptr; - } -#endif - return _rpmalloc_reallocate(heap, ptr, size, 0, flags); -} - -extern inline RPMALLOC_ALLOCATOR void * 
-rpmalloc_heap_aligned_realloc(rpmalloc_heap_t *heap, void *ptr, - size_t alignment, size_t size, - unsigned int flags) { -#if ENABLE_VALIDATE_ARGS - if ((size + alignment < size) || (alignment > _memory_page_size)) { - errno = EINVAL; - return 0; - } -#endif - return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, 0, flags); -} - -extern inline void rpmalloc_heap_free(rpmalloc_heap_t *heap, void *ptr) { - (void)sizeof(heap); - _rpmalloc_deallocate(ptr); -} - -extern inline void rpmalloc_heap_free_all(rpmalloc_heap_t *heap) { - span_t *span; - span_t *next_span; - - _rpmalloc_heap_cache_adopt_deferred(heap, 0); - - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - span = heap->size_class[iclass].partial_span; - while (span) { - next_span = span->next; - _rpmalloc_heap_cache_insert(heap, span); - span = next_span; - } - heap->size_class[iclass].partial_span = 0; - span = heap->full_span[iclass]; - while (span) { - next_span = span->next; - _rpmalloc_heap_cache_insert(heap, span); - span = next_span; - } - - span = heap->size_class[iclass].cache; - if (span) - _rpmalloc_heap_cache_insert(heap, span); - heap->size_class[iclass].cache = 0; - } - memset(heap->size_class, 0, sizeof(heap->size_class)); - memset(heap->full_span, 0, sizeof(heap->full_span)); - - span = heap->large_huge_span; - while (span) { - next_span = span->next; - if (UNEXPECTED(span->size_class == SIZE_CLASS_HUGE)) - _rpmalloc_deallocate_huge(span); - else - _rpmalloc_heap_cache_insert(heap, span); - span = next_span; - } - heap->large_huge_span = 0; - heap->full_span_count = 0; - -#if ENABLE_THREAD_CACHE - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - span_cache_t *span_cache; - if (!iclass) - span_cache = &heap->span_cache; - else - span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); - if (!span_cache->count) - continue; -#if ENABLE_GLOBAL_CACHE - _rpmalloc_stat_add64(&heap->thread_to_global, - span_cache->count * (iclass + 1) * _memory_span_size); - _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, - span_cache->count); - _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, - span_cache->count); -#else - for (size_t ispan = 0; ispan < span_cache->count; ++ispan) - _rpmalloc_span_unmap(span_cache->span[ispan]); -#endif - span_cache->count = 0; - } -#endif - -#if ENABLE_STATISTICS - for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { - atomic_store32(&heap->size_class_use[iclass].alloc_current, 0); - atomic_store32(&heap->size_class_use[iclass].spans_current, 0); - } - for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { - atomic_store32(&heap->span_use[iclass].current, 0); - } -#endif -} - -extern inline void rpmalloc_heap_thread_set_current(rpmalloc_heap_t *heap) { - heap_t *prev_heap = get_thread_heap_raw(); - if (prev_heap != heap) { - set_thread_heap(heap); - if (prev_heap) - rpmalloc_heap_release(prev_heap); - } -} - -extern inline rpmalloc_heap_t *rpmalloc_get_heap_for_ptr(void *ptr) { - // Grab the span, and then the heap from the span - span_t *span = (span_t *)((uintptr_t)ptr & _memory_span_mask); - if (span) { - return span->heap; - } - return 0; -} - -#endif - -#if ENABLE_PRELOAD || ENABLE_OVERRIDE - -#include "malloc.c" - -#endif - -void rpmalloc_linker_reference(void) { (void)sizeof(_rpmalloc_initialized); } +//===---------------------- rpmalloc.c ------------------*- C -*-=============// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +// +// This library provides a cross-platform lock free thread caching malloc +// implementation in C11. +// +//===----------------------------------------------------------------------===// + +#include "rpmalloc.h" + +//////////// +/// +/// Build time configurable limits +/// +////// + +#if defined(__clang__) +#pragma clang diagnostic ignored "-Wunused-macros" +#pragma clang diagnostic ignored "-Wunused-function" +#if __has_warning("-Wreserved-identifier") +#pragma clang diagnostic ignored "-Wreserved-identifier" +#endif +#if __has_warning("-Wstatic-in-inline") +#pragma clang diagnostic ignored "-Wstatic-in-inline" +#endif +#elif defined(__GNUC__) +#pragma GCC diagnostic ignored "-Wunused-macros" +#pragma GCC diagnostic ignored "-Wunused-function" +#endif + +#if !defined(__has_builtin) +#define __has_builtin(b) 0 +#endif + +#if defined(__GNUC__) || defined(__clang__) + +#if __has_builtin(__builtin_memcpy_inline) +#define _rpmalloc_memcpy_const(x, y, s) __builtin_memcpy_inline(x, y, s) +#else +#define _rpmalloc_memcpy_const(x, y, s) \ + do { \ + _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), \ + "len must be a constant integer"); \ + memcpy(x, y, s); \ + } while (0) +#endif + +#if __has_builtin(__builtin_memset_inline) +#define _rpmalloc_memset_const(x, y, s) __builtin_memset_inline(x, y, s) +#else +#define _rpmalloc_memset_const(x, y, s) \ + do { \ + _Static_assert(__builtin_choose_expr(__builtin_constant_p(s), 1, 0), \ + "len must be a constant integer"); \ + memset(x, y, s); \ + } while (0) +#endif +#else +#define _rpmalloc_memcpy_const(x, y, s) memcpy(x, y, s) +#define _rpmalloc_memset_const(x, y, s) memset(x, y, s) +#endif + +#if __has_builtin(__builtin_assume) +#define rpmalloc_assume(cond) __builtin_assume(cond) +#elif defined(__GNUC__) +#define rpmalloc_assume(cond) \ + do { \ + if (!__builtin_expect(cond, 0)) \ + __builtin_unreachable(); \ + } while (0) +#elif defined(_MSC_VER) +#define rpmalloc_assume(cond) __assume(cond) +#else +#define rpmalloc_assume(cond) 0 +#endif + +#ifndef HEAP_ARRAY_SIZE +//! Size of heap hashmap +#define HEAP_ARRAY_SIZE 47 +#endif +#ifndef ENABLE_THREAD_CACHE +//! Enable per-thread cache +#define ENABLE_THREAD_CACHE 1 +#endif +#ifndef ENABLE_GLOBAL_CACHE +//! Enable global cache shared between all threads, requires thread cache +#define ENABLE_GLOBAL_CACHE 1 +#endif +#ifndef ENABLE_VALIDATE_ARGS +//! Enable validation of args to public entry points +#define ENABLE_VALIDATE_ARGS 0 +#endif +#ifndef ENABLE_STATISTICS +//! Enable statistics collection +#define ENABLE_STATISTICS 0 +#endif +#ifndef ENABLE_ASSERTS +//! Enable asserts +#define ENABLE_ASSERTS 0 +#endif +#ifndef ENABLE_OVERRIDE +//! Override standard library malloc/free and new/delete entry points +#define ENABLE_OVERRIDE 0 +#endif +#ifndef ENABLE_PRELOAD +//! Support preloading +#define ENABLE_PRELOAD 0 +#endif +#ifndef DISABLE_UNMAP +//! Disable unmapping memory pages (also enables unlimited cache) +#define DISABLE_UNMAP 0 +#endif +#ifndef ENABLE_UNLIMITED_CACHE +//! Enable unlimited global cache (no unmapping until finalization) +#define ENABLE_UNLIMITED_CACHE 0 +#endif +#ifndef ENABLE_ADAPTIVE_THREAD_CACHE +//! Enable adaptive thread cache size based on use heuristics +#define ENABLE_ADAPTIVE_THREAD_CACHE 0 +#endif +#ifndef DEFAULT_SPAN_MAP_COUNT +//! 
+//! Default number of spans to map in call to map more virtual memory
+//! (default values yield 4MiB here)
+#define DEFAULT_SPAN_MAP_COUNT 64
+#endif
+#ifndef GLOBAL_CACHE_MULTIPLIER
+//! Multiplier for global cache
+#define GLOBAL_CACHE_MULTIPLIER 8
+#endif
+
+#if DISABLE_UNMAP && !ENABLE_GLOBAL_CACHE
+#error Must use global cache if unmap is disabled
+#endif
+
+#if DISABLE_UNMAP
+#undef ENABLE_UNLIMITED_CACHE
+#define ENABLE_UNLIMITED_CACHE 1
+#endif
+
+#if !ENABLE_GLOBAL_CACHE
+#undef ENABLE_UNLIMITED_CACHE
+#define ENABLE_UNLIMITED_CACHE 0
+#endif
+
+#if !ENABLE_THREAD_CACHE
+#undef ENABLE_ADAPTIVE_THREAD_CACHE
+#define ENABLE_ADAPTIVE_THREAD_CACHE 0
+#endif
+
+#if defined(_WIN32) || defined(__WIN32__) || defined(_WIN64)
+#define PLATFORM_WINDOWS 1
+#define PLATFORM_POSIX 0
+#else
+#define PLATFORM_WINDOWS 0
+#define PLATFORM_POSIX 1
+#endif
+
+/// Platform and arch specifics
+#if defined(_MSC_VER) && !defined(__clang__)
+#pragma warning(disable : 5105)
+#ifndef FORCEINLINE
+#define FORCEINLINE inline __forceinline
+#endif
+#define _Static_assert static_assert
+#else
+#ifndef FORCEINLINE
+#define FORCEINLINE inline __attribute__((__always_inline__))
+#endif
+#endif
+#if PLATFORM_WINDOWS
+#ifndef WIN32_LEAN_AND_MEAN
+#define WIN32_LEAN_AND_MEAN
+#endif
+#include <windows.h>
+#if ENABLE_VALIDATE_ARGS
+#include <intsafe.h>
+#endif
+#else
+#include <unistd.h>
+#include <stdio.h>
+#include <stdlib.h>
+#include <time.h>
+#if defined(__linux__) || defined(__ANDROID__)
+#include <sys/prctl.h>
+#if !defined(PR_SET_VMA)
+#define PR_SET_VMA 0x53564d41
+#define PR_SET_VMA_ANON_NAME 0
+#endif
+#endif
+#if defined(__APPLE__)
+#include <TargetConditionals.h>
+#if !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR
+#include <mach/mach_vm.h>
+#include <mach/vm_statistics.h>
+#endif
+#include <pthread.h>
+#endif
+#if defined(__HAIKU__) || defined(__TINYC__)
+#include <pthread.h>
+#endif
+#endif
+
+#include <stdint.h>
+#include <string.h>
+#include <errno.h>
+
+#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK)
+#include <fibersapi.h>
+static DWORD fls_key;
+#endif
+
+#if PLATFORM_POSIX
+#include <sys/mman.h>
+#include <sched.h>
+#ifdef __FreeBSD__
+#include <sys/sysctl.h>
+#define MAP_HUGETLB MAP_ALIGNED_SUPER
+#ifndef PROT_MAX
+#define PROT_MAX(f) 0
+#endif
+#else
+#define PROT_MAX(f) 0
+#endif
+#ifdef __sun
+extern int madvise(caddr_t, size_t, int);
+#endif
+#ifndef MAP_UNINITIALIZED
+#define MAP_UNINITIALIZED 0
+#endif
+#endif
+#include <errno.h>
+
+#if ENABLE_ASSERTS
+#undef NDEBUG
+#if defined(_MSC_VER) && !defined(_DEBUG)
+#define _DEBUG
+#endif
+#include <assert.h>
+#define RPMALLOC_TOSTRING_M(x) #x
+#define RPMALLOC_TOSTRING(x) RPMALLOC_TOSTRING_M(x)
+#define rpmalloc_assert(truth, message)                                       \
+  do {                                                                        \
+    if (!(truth)) {                                                           \
+      if (_memory_config.error_callback) {                                    \
+        _memory_config.error_callback(message " (" RPMALLOC_TOSTRING(         \
+            truth) ") at " __FILE__ ":" RPMALLOC_TOSTRING(__LINE__));         \
+      } else {                                                                \
+        assert((truth) && message);                                           \
+      }                                                                       \
+    }                                                                         \
+  } while (0)
+#else
+#define rpmalloc_assert(truth, message)                                       \
+  do {                                                                        \
+  } while (0)
+#endif
+#if ENABLE_STATISTICS
+#include <stdio.h>
+#endif
+
+//////
+///
+/// Atomic access abstraction (since MSVC does not do C11 yet)
+///
+//////
+
+#if defined(_MSC_VER) && !defined(__clang__)
+
+typedef volatile long atomic32_t;
+typedef volatile long long atomic64_t;
+typedef volatile void *atomicptr_t;
+
+static FORCEINLINE int32_t atomic_load32(atomic32_t *src) { return *src; }
+static FORCEINLINE void atomic_store32(atomic32_t *dst, int32_t val) {
+  *dst = val;
+}
+static FORCEINLINE int32_t atomic_incr32(atomic32_t *val) {
+  return (int32_t)InterlockedIncrement(val);
+}
+static FORCEINLINE int32_t atomic_decr32(atomic32_t *val) {
+  return (int32_t)InterlockedDecrement(val);
+}
+static FORCEINLINE int32_t atomic_add32(atomic32_t *val, int32_t add) {
+  return (int32_t)InterlockedExchangeAdd(val, add) + add;
+}
+static FORCEINLINE int atomic_cas32_acquire(atomic32_t *dst, int32_t val,
+                                            int32_t ref) {
+  return (InterlockedCompareExchange(dst, val, ref) == ref) ? 1 : 0;
+}
+static FORCEINLINE void atomic_store32_release(atomic32_t *dst, int32_t val) {
+  *dst = val;
+}
+static FORCEINLINE int64_t atomic_load64(atomic64_t *src) { return *src; }
+static FORCEINLINE int64_t atomic_add64(atomic64_t *val, int64_t add) {
+  return (int64_t)InterlockedExchangeAdd64(val, add) + add;
+}
+static FORCEINLINE void *atomic_load_ptr(atomicptr_t *src) {
+  return (void *)*src;
+}
+static FORCEINLINE void atomic_store_ptr(atomicptr_t *dst, void *val) {
+  *dst = val;
+}
+static FORCEINLINE void atomic_store_ptr_release(atomicptr_t *dst, void *val) {
+  *dst = val;
+}
+static FORCEINLINE void *atomic_exchange_ptr_acquire(atomicptr_t *dst,
+                                                     void *val) {
+  return (void *)InterlockedExchangePointer((void *volatile *)dst, val);
+}
+static FORCEINLINE int atomic_cas_ptr(atomicptr_t *dst, void *val, void *ref) {
+  return (InterlockedCompareExchangePointer((void *volatile *)dst, val, ref) ==
+          ref)
+             ? 1
+             : 0;
+}
+
+#define EXPECTED(x) (x)
+#define UNEXPECTED(x) (x)
+
+#else
+
+#include <stdatomic.h>
+
+typedef volatile _Atomic(int32_t) atomic32_t;
+typedef volatile _Atomic(int64_t) atomic64_t;
+typedef volatile _Atomic(void *) atomicptr_t;
+
+static FORCEINLINE int32_t atomic_load32(atomic32_t *src) {
+  return atomic_load_explicit(src, memory_order_relaxed);
+}
+static FORCEINLINE void atomic_store32(atomic32_t *dst, int32_t val) {
+  atomic_store_explicit(dst, val, memory_order_relaxed);
+}
+static FORCEINLINE int32_t atomic_incr32(atomic32_t *val) {
+  return atomic_fetch_add_explicit(val, 1, memory_order_relaxed) + 1;
+}
+static FORCEINLINE int32_t atomic_decr32(atomic32_t *val) {
+  return atomic_fetch_add_explicit(val, -1, memory_order_relaxed) - 1;
+}
+static FORCEINLINE int32_t atomic_add32(atomic32_t *val, int32_t add) {
+  return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add;
+}
+static FORCEINLINE int atomic_cas32_acquire(atomic32_t *dst, int32_t val,
+                                            int32_t ref) {
+  return atomic_compare_exchange_weak_explicit(
+      dst, &ref, val, memory_order_acquire, memory_order_relaxed);
+}
+static FORCEINLINE void atomic_store32_release(atomic32_t *dst, int32_t val) {
+  atomic_store_explicit(dst, val, memory_order_release);
+}
+static FORCEINLINE int64_t atomic_load64(atomic64_t *val) {
+  return atomic_load_explicit(val, memory_order_relaxed);
+}
+static FORCEINLINE int64_t atomic_add64(atomic64_t *val, int64_t add) {
+  return atomic_fetch_add_explicit(val, add, memory_order_relaxed) + add;
+}
+static FORCEINLINE void *atomic_load_ptr(atomicptr_t *src) {
+  return atomic_load_explicit(src, memory_order_relaxed);
+}
+static FORCEINLINE void atomic_store_ptr(atomicptr_t *dst, void *val) {
+  atomic_store_explicit(dst, val, memory_order_relaxed);
+}
+static FORCEINLINE void atomic_store_ptr_release(atomicptr_t *dst, void *val) {
+  atomic_store_explicit(dst, val, memory_order_release);
+}
+static FORCEINLINE void *atomic_exchange_ptr_acquire(atomicptr_t *dst,
+                                                     void *val) {
+  return atomic_exchange_explicit(dst, val, memory_order_acquire);
+}
+static FORCEINLINE int atomic_cas_ptr(atomicptr_t *dst, void *val, void *ref) {
+  return atomic_compare_exchange_weak_explicit(
+      dst, &ref, val, memory_order_relaxed, memory_order_relaxed);
+}
+
+#define EXPECTED(x) __builtin_expect((x), 1)
+#define UNEXPECTED(x) __builtin_expect((x), 0)
+
+#endif
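+
+// Illustrative sketch (disabled code, not used by the allocator): the
+// acquire/release primitives above are what rpmalloc's spin locks are built
+// from on both the MSVC and C11 paths. The names example_lock and
+// example_locked_op are placeholders for this sketch only.
+#if 0
+static atomic32_t example_lock; // 0 = unlocked, 1 = locked
+static void example_locked_op(void) {
+  // Acquiring CAS: loop until the lock word flips 0 -> 1; the weak CAS may
+  // fail spuriously, which the loop also absorbs
+  while (!atomic_cas32_acquire(&example_lock, 1, 0))
+    ;
+  // ... critical section: reads/writes protected by the lock ...
+  // Releasing store: publishes the writes above and unlocks
+  atomic_store32_release(&example_lock, 0);
+}
+#endif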
+
+////////////
+///
+/// Statistics related functions (evaluate to nothing when statistics not
+/// enabled)
+///
+//////
+
+#if ENABLE_STATISTICS
+#define _rpmalloc_stat_inc(counter) atomic_incr32(counter)
+#define _rpmalloc_stat_dec(counter) atomic_decr32(counter)
+#define _rpmalloc_stat_add(counter, value)                                    \
+  atomic_add32(counter, (int32_t)(value))
+#define _rpmalloc_stat_add64(counter, value)                                  \
+  atomic_add64(counter, (int64_t)(value))
+#define _rpmalloc_stat_add_peak(counter, value, peak)                         \
+  do {                                                                        \
+    int32_t _cur_count = atomic_add32(counter, (int32_t)(value));             \
+    if (_cur_count > (peak))                                                  \
+      peak = _cur_count;                                                      \
+  } while (0)
+#define _rpmalloc_stat_sub(counter, value)                                    \
+  atomic_add32(counter, -(int32_t)(value))
+#define _rpmalloc_stat_inc_alloc(heap, class_idx)                             \
+  do {                                                                        \
+    int32_t alloc_current =                                                   \
+        atomic_incr32(&heap->size_class_use[class_idx].alloc_current);        \
+    if (alloc_current > heap->size_class_use[class_idx].alloc_peak)           \
+      heap->size_class_use[class_idx].alloc_peak = alloc_current;             \
+    atomic_incr32(&heap->size_class_use[class_idx].alloc_total);              \
+  } while (0)
+#define _rpmalloc_stat_inc_free(heap, class_idx)                              \
+  do {                                                                        \
+    atomic_decr32(&heap->size_class_use[class_idx].alloc_current);            \
+    atomic_incr32(&heap->size_class_use[class_idx].free_total);               \
+  } while (0)
+#else
+#define _rpmalloc_stat_inc(counter)                                           \
+  do {                                                                        \
+  } while (0)
+#define _rpmalloc_stat_dec(counter)                                           \
+  do {                                                                        \
+  } while (0)
+#define _rpmalloc_stat_add(counter, value)                                    \
+  do {                                                                        \
+  } while (0)
+#define _rpmalloc_stat_add64(counter, value)                                  \
+  do {                                                                        \
+  } while (0)
+#define _rpmalloc_stat_add_peak(counter, value, peak)                         \
+  do {                                                                        \
+  } while (0)
+#define _rpmalloc_stat_sub(counter, value)                                    \
+  do {                                                                        \
+  } while (0)
+#define _rpmalloc_stat_inc_alloc(heap, class_idx)                             \
+  do {                                                                        \
+  } while (0)
+#define _rpmalloc_stat_inc_free(heap, class_idx)                              \
+  do {                                                                        \
+  } while (0)
+#endif
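+
+// Illustrative sketch (disabled code): call sites invoke these statistics
+// macros unconditionally; when ENABLE_STATISTICS is 0 they expand to empty
+// do { } while (0) statements and vanish at compile time. example_counter
+// and example_peak are placeholder names for this sketch only.
+#if 0
+static atomic32_t example_counter;
+static int32_t example_peak;
+static void example_track_pages(size_t page_count) {
+  // Adds to the counter and raises the peak watermark if it was exceeded
+  _rpmalloc_stat_add_peak(&example_counter, page_count, example_peak);
+  // ... the tracked resource is in use ...
+  // Later, the matching decrement
+  _rpmalloc_stat_sub(&example_counter, page_count);
+}
+#endif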
+
+///
+/// Preconfigured limits and sizes
+///
+
+//! Granularity of a small allocation block (must be power of two)
+#define SMALL_GRANULARITY 16
+//! Small granularity shift count
+#define SMALL_GRANULARITY_SHIFT 4
+//! Number of small block size classes
+#define SMALL_CLASS_COUNT 65
+//! Maximum size of a small block
+#define SMALL_SIZE_LIMIT (SMALL_GRANULARITY * (SMALL_CLASS_COUNT - 1))
+//! Granularity of a medium allocation block
+#define MEDIUM_GRANULARITY 512
+//! Medium granularity shift count
+#define MEDIUM_GRANULARITY_SHIFT 9
+//! Number of medium block size classes
+#define MEDIUM_CLASS_COUNT 61
+//! Total number of small + medium size classes
+#define SIZE_CLASS_COUNT (SMALL_CLASS_COUNT + MEDIUM_CLASS_COUNT)
+//! Number of large block size classes
+#define LARGE_CLASS_COUNT 63
+//! Maximum size of a medium block
+#define MEDIUM_SIZE_LIMIT                                                     \
+  (SMALL_SIZE_LIMIT + (MEDIUM_GRANULARITY * MEDIUM_CLASS_COUNT))
+//! Maximum size of a large block
+#define LARGE_SIZE_LIMIT                                                      \
+  ((LARGE_CLASS_COUNT * _memory_span_size) - SPAN_HEADER_SIZE)
+//! Size of a span header (must be a multiple of SMALL_GRANULARITY and a power
+//! of two)
+#define SPAN_HEADER_SIZE 128
+//! Number of spans in thread cache
+#define MAX_THREAD_SPAN_CACHE 400
+//! Number of spans to transfer between thread and global cache
+#define THREAD_SPAN_CACHE_TRANSFER 64
+//! Number of spans in thread cache for large spans (must be greater than
+//! LARGE_CLASS_COUNT / 2)
+#define MAX_THREAD_SPAN_LARGE_CACHE 100
+//! Number of spans to transfer between thread and global cache for large
+//! spans
+#define THREAD_SPAN_LARGE_CACHE_TRANSFER 6
+
+_Static_assert((SMALL_GRANULARITY & (SMALL_GRANULARITY - 1)) == 0,
+               "Small granularity must be power of two");
+_Static_assert((SPAN_HEADER_SIZE & (SPAN_HEADER_SIZE - 1)) == 0,
+               "Span header size must be power of two");
+
+#if ENABLE_VALIDATE_ARGS
+//! Maximum allocation size to avoid integer overflow
+#undef MAX_ALLOC_SIZE
+#define MAX_ALLOC_SIZE (((size_t)-1) - _memory_span_size)
+#endif
+
+#define pointer_offset(ptr, ofs) (void *)((char *)(ptr) + (ptrdiff_t)(ofs))
+#define pointer_diff(first, second)                                           \
+  (ptrdiff_t)((const char *)(first) - (const char *)(second))
+
+#define INVALID_POINTER ((void *)((uintptr_t)-1))
+
+#define SIZE_CLASS_LARGE SIZE_CLASS_COUNT
+#define SIZE_CLASS_HUGE ((uint32_t)-1)
+
+////////////
+///
+/// Data types
+///
+//////
+
+//! A memory heap, per thread
+typedef struct heap_t heap_t;
+//! Span of memory pages
+typedef struct span_t span_t;
+//! Span list
+typedef struct span_list_t span_list_t;
+//! Span active data
+typedef struct span_active_t span_active_t;
+//! Size class definition
+typedef struct size_class_t size_class_t;
+//! Global cache
+typedef struct global_cache_t global_cache_t;
+
+//! Flag indicating span is the first (master) span of a split superspan
+#define SPAN_FLAG_MASTER 1U
+//! Flag indicating span is a secondary (sub) span of a split superspan
+#define SPAN_FLAG_SUBSPAN 2U
+//! Flag indicating span has blocks with increased alignment
+#define SPAN_FLAG_ALIGNED_BLOCKS 4U
+//! Flag indicating an unmapped master span
+#define SPAN_FLAG_UNMAPPED_MASTER 8U
+
+#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
+struct span_use_t {
+  //! Current number of spans used (actually used, not in cache)
+  atomic32_t current;
+  //! High water mark of spans used
+  atomic32_t high;
+#if ENABLE_STATISTICS
+  //! Number of spans in deferred list
+  atomic32_t spans_deferred;
+  //! Number of spans transitioned to global cache
+  atomic32_t spans_to_global;
+  //! Number of spans transitioned from global cache
+  atomic32_t spans_from_global;
+  //! Number of spans transitioned to thread cache
+  atomic32_t spans_to_cache;
+  //! Number of spans transitioned from thread cache
+  atomic32_t spans_from_cache;
+  //! Number of spans transitioned to reserved state
+  atomic32_t spans_to_reserved;
+  //! Number of spans transitioned from reserved state
+  atomic32_t spans_from_reserved;
+  //! Number of raw memory map calls
+  atomic32_t spans_map_calls;
+#endif
+};
+typedef struct span_use_t span_use_t;
+#endif
+
+#if ENABLE_STATISTICS
+struct size_class_use_t {
+  //! Current number of allocations
+  atomic32_t alloc_current;
+  //! Peak number of allocations
+  int32_t alloc_peak;
+  //! Total number of allocations
+  atomic32_t alloc_total;
+  //! Total number of frees
+  atomic32_t free_total;
+  //! Number of spans in use
+  atomic32_t spans_current;
+  //! Peak number of spans in use
+  int32_t spans_peak;
+  //! Number of spans transitioned to cache
+  atomic32_t spans_to_cache;
+  //! Number of spans transitioned from cache
+  atomic32_t spans_from_cache;
+  //! Number of spans transitioned from reserved state
+  atomic32_t spans_from_reserved;
+  //! Number of spans mapped
+  atomic32_t spans_map_calls;
+  int32_t unused;
+};
+typedef struct size_class_use_t size_class_use_t;
+#endif
+
+// A span can either represent a single span of memory pages with size declared
+// by the span_map_count configuration variable, or a set of spans in a
+// continuous region, a super span. Any reference to the term "span" usually
+// refers to either a single span or a super span. A super span can further be
+// divided into multiple spans (or, likewise, super spans), where the first
+// (super)span is the master and subsequent (super)spans are subspans. The
+// master span keeps track of how many subspans are still alive and mapped in
+// virtual memory, and once all subspans and the master have been unmapped, the
+// entire superspan region is released and unmapped (on Windows, for example,
+// the entire superspan range has to be released in the same call to release
+// the virtual memory range, but individual subranges can be decommitted
+// individually to reduce physical memory use).
+struct span_t {
+  //! Free list
+  void *free_list;
+  //! Total block count of size class
+  uint32_t block_count;
+  //! Size class
+  uint32_t size_class;
+  //! Index of last block initialized in free list
+  uint32_t free_list_limit;
+  //! Number of used blocks remaining when in partial state
+  uint32_t used_count;
+  //! Deferred free list
+  atomicptr_t free_list_deferred;
+  //! Size of deferred free list, or list of spans when part of a cache list
+  uint32_t list_size;
+  //! Size of a block
+  uint32_t block_size;
+  //! Flags and counters
+  uint32_t flags;
+  //! Number of spans
+  uint32_t span_count;
+  //! Total span counter for master spans
+  uint32_t total_spans;
+  //! Offset from master span for subspans
+  uint32_t offset_from_master;
+  //! Remaining span counter, for master spans
+  atomic32_t remaining_spans;
+  //! Alignment offset
+  uint32_t align_offset;
+  //! Owning heap
+  heap_t *heap;
+  //! Next span
+  span_t *next;
+  //! Previous span
+  span_t *prev;
+};
+_Static_assert(sizeof(span_t) <= SPAN_HEADER_SIZE, "span size mismatch");
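+
+// Illustrative sketch (disabled code): how the master/subspan layout above is
+// navigated. A subspan stores no pointer to its master, only a distance in
+// spans (offset_from_master), so the master header is recovered with address
+// arithmetic, mirroring the computation in _rpmalloc_span_unmap further below
+// (_memory_span_size is defined with the global data later in this file).
+// example_master_of is a placeholder name for this sketch only.
+#if 0
+static span_t *example_master_of(span_t *span) {
+  if (span->flags & SPAN_FLAG_MASTER)
+    return span; // a master span is its own master
+  // offset_from_master counts spans, so scale by the span size in bytes
+  return (span_t *)pointer_offset(
+      span,
+      -(intptr_t)((uintptr_t)span->offset_from_master * _memory_span_size));
+}
+#endif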
+
+struct span_cache_t {
+  size_t count;
+  span_t *span[MAX_THREAD_SPAN_CACHE];
+};
+typedef struct span_cache_t span_cache_t;
+
+struct span_large_cache_t {
+  size_t count;
+  span_t *span[MAX_THREAD_SPAN_LARGE_CACHE];
+};
+typedef struct span_large_cache_t span_large_cache_t;
+
+struct heap_size_class_t {
+  //! Free list of active span
+  void *free_list;
+  //! Double linked list of partially used spans with free blocks.
+  //  Previous span pointer in head points to tail span of list.
+  span_t *partial_span;
+  //! Early level cache of fully free spans
+  span_t *cache;
+};
+typedef struct heap_size_class_t heap_size_class_t;
+
+// Control structure for a heap, either a thread heap or a first class heap if
+// enabled
+struct heap_t {
+  //! Owning thread ID
+  uintptr_t owner_thread;
+  //! Free lists for each size class
+  heap_size_class_t size_class[SIZE_CLASS_COUNT];
+#if ENABLE_THREAD_CACHE
+  //! Arrays of fully freed spans, single span
+  span_cache_t span_cache;
+#endif
+  //! List of deferred free spans (single linked list)
+  atomicptr_t span_free_deferred;
+  //! Number of full spans
+  size_t full_span_count;
+  //! Mapped but unused spans
+  span_t *span_reserve;
+  //! Master span for mapped but unused spans
+  span_t *span_reserve_master;
+  //! Number of mapped but unused spans
+  uint32_t spans_reserved;
+  //! Child count
+  atomic32_t child_count;
+  //! Next heap in id list
+  heap_t *next_heap;
+  //!
Next heap in orphan list + heap_t *next_orphan; + //! Heap ID + int32_t id; + //! Finalization state flag + int finalize; + //! Master heap owning the memory pages + heap_t *master_heap; +#if ENABLE_THREAD_CACHE + //! Arrays of fully freed spans, large spans with > 1 span count + span_large_cache_t span_large_cache[LARGE_CLASS_COUNT - 1]; +#endif +#if RPMALLOC_FIRST_CLASS_HEAPS + //! Double linked list of fully utilized spans with free blocks for each size + //! class. + // Previous span pointer in head points to tail span of list. + span_t *full_span[SIZE_CLASS_COUNT]; + //! Double linked list of large and huge spans allocated by this heap + span_t *large_huge_span; +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + //! Current and high water mark of spans used per span count + span_use_t span_use[LARGE_CLASS_COUNT]; +#endif +#if ENABLE_STATISTICS + //! Allocation stats per size class + size_class_use_t size_class_use[SIZE_CLASS_COUNT + 1]; + //! Number of bytes transitioned thread -> global + atomic64_t thread_to_global; + //! Number of bytes transitioned global -> thread + atomic64_t global_to_thread; +#endif +}; + +// Size class for defining a block size bucket +struct size_class_t { + //! Size of blocks in this class + uint32_t block_size; + //! Number of blocks in each chunk + uint16_t block_count; + //! Class index this class is merged with + uint16_t class_idx; +}; +_Static_assert(sizeof(size_class_t) == 8, "Size class size mismatch"); + +struct global_cache_t { + //! Cache lock + atomic32_t lock; + //! Cache count + uint32_t count; +#if ENABLE_STATISTICS + //! Insert count + size_t insert_count; + //! Extract count + size_t extract_count; +#endif + //! Cached spans + span_t *span[GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE]; + //! Unlimited cache overflow + span_t *overflow; +}; + +//////////// +/// +/// Global data +/// +////// + +//! Default span size (64KiB) +#define _memory_default_span_size (64 * 1024) +#define _memory_default_span_size_shift 16 +#define _memory_default_span_mask (~((uintptr_t)(_memory_span_size - 1))) + +//! Initialized flag +static int _rpmalloc_initialized; +//! Main thread ID +static uintptr_t _rpmalloc_main_thread_id; +//! Configuration +static rpmalloc_config_t _memory_config; +//! Memory page size +static size_t _memory_page_size; +//! Shift to divide by page size +static size_t _memory_page_size_shift; +//! Granularity at which memory pages are mapped by OS +static size_t _memory_map_granularity; +#if RPMALLOC_CONFIGURABLE +//! Size of a span of memory pages +static size_t _memory_span_size; +//! Shift to divide by span size +static size_t _memory_span_size_shift; +//! Mask to get to start of a memory span +static uintptr_t _memory_span_mask; +#else +//! Hardwired span size +#define _memory_span_size _memory_default_span_size +#define _memory_span_size_shift _memory_default_span_size_shift +#define _memory_span_mask _memory_default_span_mask +#endif +//! Number of spans to map in each map call +static size_t _memory_span_map_count; +//! Number of spans to keep reserved in each heap +static size_t _memory_heap_reserve_count; +//! Global size classes +static size_class_t _memory_size_class[SIZE_CLASS_COUNT]; +//! Run-time size limit of medium blocks +static size_t _memory_medium_size_limit; +//! Heap ID counter +static atomic32_t _memory_heap_id; +//! Huge page support +static int _memory_huge_pages; +#if ENABLE_GLOBAL_CACHE +//! Global span cache +static global_cache_t _memory_span_cache[LARGE_CLASS_COUNT]; +#endif +//! 
Global reserved spans +static span_t *_memory_global_reserve; +//! Global reserved count +static size_t _memory_global_reserve_count; +//! Global reserved master +static span_t *_memory_global_reserve_master; +//! All heaps +static heap_t *_memory_heaps[HEAP_ARRAY_SIZE]; +//! Used to restrict access to mapping memory for huge pages +static atomic32_t _memory_global_lock; +//! Orphaned heaps +static heap_t *_memory_orphan_heaps; +#if RPMALLOC_FIRST_CLASS_HEAPS +//! Orphaned heaps (first class heaps) +static heap_t *_memory_first_class_orphan_heaps; +#endif +#if ENABLE_STATISTICS +//! Allocations counter +static atomic64_t _allocation_counter; +//! Deallocations counter +static atomic64_t _deallocation_counter; +//! Active heap count +static atomic32_t _memory_active_heaps; +//! Number of currently mapped memory pages +static atomic32_t _mapped_pages; +//! Peak number of concurrently mapped memory pages +static int32_t _mapped_pages_peak; +//! Number of mapped master spans +static atomic32_t _master_spans; +//! Number of unmapped dangling master spans +static atomic32_t _unmapped_master_spans; +//! Running counter of total number of mapped memory pages since start +static atomic32_t _mapped_total; +//! Running counter of total number of unmapped memory pages since start +static atomic32_t _unmapped_total; +//! Number of currently mapped memory pages in OS calls +static atomic32_t _mapped_pages_os; +//! Number of currently allocated pages in huge allocations +static atomic32_t _huge_pages_current; +//! Peak number of currently allocated pages in huge allocations +static int32_t _huge_pages_peak; +#endif + +//////////// +/// +/// Thread local heap and ID +/// +////// + +//! Current thread heap +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ + defined(__TINYC__) +static pthread_key_t _memory_thread_heap; +#else +#ifdef _MSC_VER +#define _Thread_local __declspec(thread) +#define TLS_MODEL +#else +#ifndef __HAIKU__ +#define TLS_MODEL __attribute__((tls_model("initial-exec"))) +#else +#define TLS_MODEL +#endif +#if !defined(__clang__) && defined(__GNUC__) +#define _Thread_local __thread +#endif +#endif +static _Thread_local heap_t *_memory_thread_heap TLS_MODEL; +#endif + +static inline heap_t *get_thread_heap_raw(void) { +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + return pthread_getspecific(_memory_thread_heap); +#else + return _memory_thread_heap; +#endif +} + +//! Get the current thread heap +static inline heap_t *get_thread_heap(void) { + heap_t *heap = get_thread_heap_raw(); +#if ENABLE_PRELOAD + if (EXPECTED(heap != 0)) + return heap; + rpmalloc_initialize(); + return get_thread_heap_raw(); +#else + return heap; +#endif +} + +//! 
Fast thread ID +static inline uintptr_t get_thread_id(void) { +#if defined(_WIN32) + return (uintptr_t)((void *)NtCurrentTeb()); +#elif (defined(__GNUC__) || defined(__clang__)) && !defined(__CYGWIN__) + uintptr_t tid; +#if defined(__i386__) + __asm__("movl %%gs:0, %0" : "=r"(tid) : :); +#elif defined(__x86_64__) +#if defined(__MACH__) + __asm__("movq %%gs:0, %0" : "=r"(tid) : :); +#else + __asm__("movq %%fs:0, %0" : "=r"(tid) : :); +#endif +#elif defined(__arm__) + __asm__ volatile("mrc p15, 0, %0, c13, c0, 3" : "=r"(tid)); +#elif defined(__aarch64__) +#if defined(__MACH__) + // tpidr_el0 likely unused, always return 0 on iOS + __asm__ volatile("mrs %0, tpidrro_el0" : "=r"(tid)); +#else + __asm__ volatile("mrs %0, tpidr_el0" : "=r"(tid)); +#endif +#else +#error This platform needs implementation of get_thread_id() +#endif + return tid; +#else +#error This platform needs implementation of get_thread_id() +#endif +} + +//! Set the current thread heap +static void set_thread_heap(heap_t *heap) { +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ + defined(__TINYC__) + pthread_setspecific(_memory_thread_heap, heap); +#else + _memory_thread_heap = heap; +#endif + if (heap) + heap->owner_thread = get_thread_id(); +} + +//! Set main thread ID +extern void rpmalloc_set_main_thread(void); + +void rpmalloc_set_main_thread(void) { + _rpmalloc_main_thread_id = get_thread_id(); +} + +static void _rpmalloc_spin(void) { +#if defined(_MSC_VER) +#if defined(_M_ARM64) + __yield(); +#else + _mm_pause(); +#endif +#elif defined(__x86_64__) || defined(__i386__) + __asm__ volatile("pause" ::: "memory"); +#elif defined(__aarch64__) || (defined(__arm__) && __ARM_ARCH >= 7) + __asm__ volatile("yield" ::: "memory"); +#elif defined(__powerpc__) || defined(__powerpc64__) + // No idea if ever been compiled in such archs but ... as precaution + __asm__ volatile("or 27,27,27"); +#elif defined(__sparc__) + __asm__ volatile("rd %ccr, %g0 \n\trd %ccr, %g0 \n\trd %ccr, %g0"); +#else + struct timespec ts = {0}; + nanosleep(&ts, 0); +#endif +} + +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) +static void NTAPI _rpmalloc_thread_destructor(void *value) { +#if ENABLE_OVERRIDE + // If this is called on main thread it means rpmalloc_finalize + // has not been called and shutdown is forced (through _exit) or unclean + if (get_thread_id() == _rpmalloc_main_thread_id) + return; +#endif + if (value) + rpmalloc_thread_finalize(1); +} +#endif + +//////////// +/// +/// Low level memory map/unmap +/// +////// + +static void _rpmalloc_set_name(void *address, size_t size) { +#if defined(__linux__) || defined(__ANDROID__) + const char *name = _memory_huge_pages ? _memory_config.huge_page_name + : _memory_config.page_name; + if (address == MAP_FAILED || !name) + return; + // If the kernel does not support CONFIG_ANON_VMA_NAME or if the call fails + // (e.g. invalid name) it is a no-op basically. + (void)prctl(PR_SET_VMA, PR_SET_VMA_ANON_NAME, (uintptr_t)address, size, + (uintptr_t)name); +#else + (void)sizeof(size); + (void)sizeof(address); +#endif +} + +//! 
Map more virtual memory +// size is number of bytes to map +// offset receives the offset in bytes from start of mapped region +// returns address to start of mapped region to use +static void *_rpmalloc_mmap(size_t size, size_t *offset) { + rpmalloc_assert(!(size % _memory_page_size), "Invalid mmap size"); + rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size"); + void *address = _memory_config.memory_map(size, offset); + if (EXPECTED(address != 0)) { + _rpmalloc_stat_add_peak(&_mapped_pages, (size >> _memory_page_size_shift), + _mapped_pages_peak); + _rpmalloc_stat_add(&_mapped_total, (size >> _memory_page_size_shift)); + } + return address; +} + +//! Unmap virtual memory +// address is the memory address to unmap, as returned from _memory_map +// size is the number of bytes to unmap, which might be less than full region +// for a partial unmap offset is the offset in bytes to the actual mapped +// region, as set by _memory_map release is set to 0 for partial unmap, or size +// of entire range for a full unmap +static void _rpmalloc_unmap(void *address, size_t size, size_t offset, + size_t release) { + rpmalloc_assert(!release || (release >= size), "Invalid unmap size"); + rpmalloc_assert(!release || (release >= _memory_page_size), + "Invalid unmap size"); + if (release) { + rpmalloc_assert(!(release % _memory_page_size), "Invalid unmap size"); + _rpmalloc_stat_sub(&_mapped_pages, (release >> _memory_page_size_shift)); + _rpmalloc_stat_add(&_unmapped_total, (release >> _memory_page_size_shift)); + } + _memory_config.memory_unmap(address, size, offset, release); +} + +//! Default implementation to map new pages to virtual memory +static void *_rpmalloc_mmap_os(size_t size, size_t *offset) { + // Either size is a heap (a single page) or a (multiple) span - we only need + // to align spans, and only if larger than map granularity + size_t padding = ((size >= _memory_span_size) && + (_memory_span_size > _memory_map_granularity)) + ? _memory_span_size + : 0; + rpmalloc_assert(size >= _memory_page_size, "Invalid mmap size"); +#if PLATFORM_WINDOWS + // Ok to MEM_COMMIT - according to MSDN, "actual physical pages are not + // allocated unless/until the virtual addresses are actually accessed" + void *ptr = VirtualAlloc(0, size + padding, + (_memory_huge_pages ? MEM_LARGE_PAGES : 0) | + MEM_RESERVE | MEM_COMMIT, + PAGE_READWRITE); + if (!ptr) { + if (_memory_config.map_fail_callback) { + if (_memory_config.map_fail_callback(size + padding)) + return _rpmalloc_mmap_os(size, offset); + } else { + rpmalloc_assert(ptr, "Failed to map virtual memory block"); + } + return 0; + } +#else + int flags = MAP_PRIVATE | MAP_ANONYMOUS | MAP_UNINITIALIZED; +#if defined(__APPLE__) && !TARGET_OS_IPHONE && !TARGET_OS_SIMULATOR + int fd = (int)VM_MAKE_TAG(240U); + if (_memory_huge_pages) + fd |= VM_FLAGS_SUPERPAGE_SIZE_2MB; + void *ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, fd, 0); +#elif defined(MAP_HUGETLB) + void *ptr = mmap(0, size + padding, + PROT_READ | PROT_WRITE | PROT_MAX(PROT_READ | PROT_WRITE), + (_memory_huge_pages ? 
MAP_HUGETLB : 0) | flags, -1, 0); +#if defined(MADV_HUGEPAGE) + // In some configurations, huge pages allocations might fail thus + // we fallback to normal allocations and promote the region as transparent + // huge page + if ((ptr == MAP_FAILED || !ptr) && _memory_huge_pages) { + ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); + if (ptr && ptr != MAP_FAILED) { + int prm = madvise(ptr, size + padding, MADV_HUGEPAGE); + (void)prm; + rpmalloc_assert((prm == 0), "Failed to promote the page to THP"); + } + } +#endif + _rpmalloc_set_name(ptr, size + padding); +#elif defined(MAP_ALIGNED) + const size_t align = + (sizeof(size_t) * 8) - (size_t)(__builtin_clzl(size - 1)); + void *ptr = + mmap(0, size + padding, PROT_READ | PROT_WRITE, + (_memory_huge_pages ? MAP_ALIGNED(align) : 0) | flags, -1, 0); +#elif defined(MAP_ALIGN) + caddr_t base = (_memory_huge_pages ? (caddr_t)(4 << 20) : 0); + void *ptr = mmap(base, size + padding, PROT_READ | PROT_WRITE, + (_memory_huge_pages ? MAP_ALIGN : 0) | flags, -1, 0); +#else + void *ptr = mmap(0, size + padding, PROT_READ | PROT_WRITE, flags, -1, 0); +#endif + if ((ptr == MAP_FAILED) || !ptr) { + if (_memory_config.map_fail_callback) { + if (_memory_config.map_fail_callback(size + padding)) + return _rpmalloc_mmap_os(size, offset); + } else if (errno != ENOMEM) { + rpmalloc_assert((ptr != MAP_FAILED) && ptr, + "Failed to map virtual memory block"); + } + return 0; + } +#endif + _rpmalloc_stat_add(&_mapped_pages_os, + (int32_t)((size + padding) >> _memory_page_size_shift)); + if (padding) { + size_t final_padding = padding - ((uintptr_t)ptr & ~_memory_span_mask); + rpmalloc_assert(final_padding <= _memory_span_size, + "Internal failure in padding"); + rpmalloc_assert(final_padding <= padding, "Internal failure in padding"); + rpmalloc_assert(!(final_padding % 8), "Internal failure in padding"); + ptr = pointer_offset(ptr, final_padding); + *offset = final_padding >> 3; + } + rpmalloc_assert((size < _memory_span_size) || + !((uintptr_t)ptr & ~_memory_span_mask), + "Internal failure in padding"); + return ptr; +} + +//! Default implementation to unmap pages from virtual memory +static void _rpmalloc_unmap_os(void *address, size_t size, size_t offset, + size_t release) { + rpmalloc_assert(release || (offset == 0), "Invalid unmap size"); + rpmalloc_assert(!release || (release >= _memory_page_size), + "Invalid unmap size"); + rpmalloc_assert(size >= _memory_page_size, "Invalid unmap size"); + if (release && offset) { + offset <<= 3; + address = pointer_offset(address, -(int32_t)offset); + if ((release >= _memory_span_size) && + (_memory_span_size > _memory_map_granularity)) { + // Padding is always one span size + release += _memory_span_size; + } + } +#if !DISABLE_UNMAP +#if PLATFORM_WINDOWS + if (!VirtualFree(address, release ? 0 : size, + release ? 
MEM_RELEASE : MEM_DECOMMIT)) { + rpmalloc_assert(0, "Failed to unmap virtual memory block"); + } +#else + if (release) { + if (munmap(address, release)) { + rpmalloc_assert(0, "Failed to unmap virtual memory block"); + } + } else { +#if defined(MADV_FREE_REUSABLE) + int ret; + while ((ret = madvise(address, size, MADV_FREE_REUSABLE)) == -1 && + (errno == EAGAIN)) + errno = 0; + if ((ret == -1) && (errno != 0)) { +#elif defined(MADV_DONTNEED) + if (madvise(address, size, MADV_DONTNEED)) { +#elif defined(MADV_PAGEOUT) + if (madvise(address, size, MADV_PAGEOUT)) { +#elif defined(MADV_FREE) + if (madvise(address, size, MADV_FREE)) { +#else + if (posix_madvise(address, size, POSIX_MADV_DONTNEED)) { +#endif + rpmalloc_assert(0, "Failed to madvise virtual memory block as free"); + } + } +#endif +#endif + if (release) + _rpmalloc_stat_sub(&_mapped_pages_os, release >> _memory_page_size_shift); +} + +static void _rpmalloc_span_mark_as_subspan_unless_master(span_t *master, + span_t *subspan, + size_t span_count); + +//! Use global reserved spans to fulfill a memory map request (reserve size must +//! be checked by caller) +static span_t *_rpmalloc_global_get_reserved_spans(size_t span_count) { + span_t *span = _memory_global_reserve; + _rpmalloc_span_mark_as_subspan_unless_master(_memory_global_reserve_master, + span, span_count); + _memory_global_reserve_count -= span_count; + if (_memory_global_reserve_count) + _memory_global_reserve = + (span_t *)pointer_offset(span, span_count << _memory_span_size_shift); + else + _memory_global_reserve = 0; + return span; +} + +//! Store the given spans as global reserve (must only be called from within new +//! heap allocation, not thread safe) +static void _rpmalloc_global_set_reserved_spans(span_t *master, span_t *reserve, + size_t reserve_span_count) { + _memory_global_reserve_master = master; + _memory_global_reserve_count = reserve_span_count; + _memory_global_reserve = reserve; +} + +//////////// +/// +/// Span linked list management +/// +////// + +//! Add a span to double linked list at the head +static void _rpmalloc_span_double_link_list_add(span_t **head, span_t *span) { + if (*head) + (*head)->prev = span; + span->next = *head; + *head = span; +} + +//! Pop head span from double linked list +static void _rpmalloc_span_double_link_list_pop_head(span_t **head, + span_t *span) { + rpmalloc_assert(*head == span, "Linked list corrupted"); + span = *head; + *head = span->next; +} + +//! Remove a span from double linked list +static void _rpmalloc_span_double_link_list_remove(span_t **head, + span_t *span) { + rpmalloc_assert(*head, "Linked list corrupted"); + if (*head == span) { + *head = span->next; + } else { + span_t *next_span = span->next; + span_t *prev_span = span->prev; + prev_span->next = next_span; + if (EXPECTED(next_span != 0)) + next_span->prev = prev_span; + } +} + +//////////// +/// +/// Span control +/// +////// + +static void _rpmalloc_heap_cache_insert(heap_t *heap, span_t *span); + +static void _rpmalloc_heap_finalize(heap_t *heap); + +static void _rpmalloc_heap_set_reserved_spans(heap_t *heap, span_t *master, + span_t *reserve, + size_t reserve_span_count); + +//! Declare the span to be a subspan and store distance from master span and +//! 
span count +static void _rpmalloc_span_mark_as_subspan_unless_master(span_t *master, + span_t *subspan, + size_t span_count) { + rpmalloc_assert((subspan != master) || (subspan->flags & SPAN_FLAG_MASTER), + "Span master pointer and/or flag mismatch"); + if (subspan != master) { + subspan->flags = SPAN_FLAG_SUBSPAN; + subspan->offset_from_master = + (uint32_t)((uintptr_t)pointer_diff(subspan, master) >> + _memory_span_size_shift); + subspan->align_offset = 0; + } + subspan->span_count = (uint32_t)span_count; +} + +//! Use reserved spans to fulfill a memory map request (reserve size must be +//! checked by caller) +static span_t *_rpmalloc_span_map_from_reserve(heap_t *heap, + size_t span_count) { + // Update the heap span reserve + span_t *span = heap->span_reserve; + heap->span_reserve = + (span_t *)pointer_offset(span, span_count * _memory_span_size); + heap->spans_reserved -= (uint32_t)span_count; + + _rpmalloc_span_mark_as_subspan_unless_master(heap->span_reserve_master, span, + span_count); + if (span_count <= LARGE_CLASS_COUNT) + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_reserved); + + return span; +} + +//! Get the aligned number of spans to map in based on wanted count, configured +//! mapping granularity and the page size +static size_t _rpmalloc_span_align_count(size_t span_count) { + size_t request_count = (span_count > _memory_span_map_count) + ? span_count + : _memory_span_map_count; + if ((_memory_page_size > _memory_span_size) && + ((request_count * _memory_span_size) % _memory_page_size)) + request_count += + _memory_span_map_count - (request_count % _memory_span_map_count); + return request_count; +} + +//! Setup a newly mapped span +static void _rpmalloc_span_initialize(span_t *span, size_t total_span_count, + size_t span_count, size_t align_offset) { + span->total_spans = (uint32_t)total_span_count; + span->span_count = (uint32_t)span_count; + span->align_offset = (uint32_t)align_offset; + span->flags = SPAN_FLAG_MASTER; + atomic_store32(&span->remaining_spans, (int32_t)total_span_count); +} + +static void _rpmalloc_span_unmap(span_t *span); + +//! Map an aligned set of spans, taking configured mapping granularity and the +//! page size into account +static span_t *_rpmalloc_span_map_aligned_count(heap_t *heap, + size_t span_count) { + // If we already have some, but not enough, reserved spans, release those to + // heap cache and map a new full set of spans. 
+  // Otherwise we would waste memory if page size > span size (huge pages)
+  size_t aligned_span_count = _rpmalloc_span_align_count(span_count);
+  size_t align_offset = 0;
+  span_t *span = (span_t *)_rpmalloc_mmap(
+      aligned_span_count * _memory_span_size, &align_offset);
+  if (!span)
+    return 0;
+  _rpmalloc_span_initialize(span, aligned_span_count, span_count,
+                            align_offset);
+  _rpmalloc_stat_inc(&_master_spans);
+  if (span_count <= LARGE_CLASS_COUNT)
+    _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_map_calls);
+  if (aligned_span_count > span_count) {
+    span_t *reserved_spans =
+        (span_t *)pointer_offset(span, span_count * _memory_span_size);
+    size_t reserved_count = aligned_span_count - span_count;
+    if (heap->spans_reserved) {
+      _rpmalloc_span_mark_as_subspan_unless_master(
+          heap->span_reserve_master, heap->span_reserve, heap->spans_reserved);
+      _rpmalloc_heap_cache_insert(heap, heap->span_reserve);
+    }
+    if (reserved_count > _memory_heap_reserve_count) {
+      // If huge pages or eager span map count, the global reserve spin lock is
+      // held by the caller, _rpmalloc_span_map
+      rpmalloc_assert(atomic_load32(&_memory_global_lock) == 1,
+                      "Global spin lock not held as expected");
+      size_t remain_count = reserved_count - _memory_heap_reserve_count;
+      reserved_count = _memory_heap_reserve_count;
+      span_t *remain_span = (span_t *)pointer_offset(
+          reserved_spans, reserved_count * _memory_span_size);
+      if (_memory_global_reserve) {
+        _rpmalloc_span_mark_as_subspan_unless_master(
+            _memory_global_reserve_master, _memory_global_reserve,
+            _memory_global_reserve_count);
+        _rpmalloc_span_unmap(_memory_global_reserve);
+      }
+      _rpmalloc_global_set_reserved_spans(span, remain_span, remain_count);
+    }
+    _rpmalloc_heap_set_reserved_spans(heap, span, reserved_spans,
+                                      reserved_count);
+  }
+  return span;
+}
+
+//! Map in memory pages for the given number of spans (or use previously
+//! reserved pages)
+static span_t *_rpmalloc_span_map(heap_t *heap, size_t span_count) {
+  if (span_count <= heap->spans_reserved)
+    return _rpmalloc_span_map_from_reserve(heap, span_count);
+  span_t *span = 0;
+  int use_global_reserve =
+      (_memory_page_size > _memory_span_size) ||
+      (_memory_span_map_count > _memory_heap_reserve_count);
+  if (use_global_reserve) {
+    // If huge pages, make sure only one thread maps more memory to avoid bloat
+    while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0))
+      _rpmalloc_spin();
+    if (_memory_global_reserve_count >= span_count) {
+      size_t reserve_count =
+          (!heap->spans_reserved ? _memory_heap_reserve_count : span_count);
+      if (_memory_global_reserve_count < reserve_count)
+        reserve_count = _memory_global_reserve_count;
+      span = _rpmalloc_global_get_reserved_spans(reserve_count);
+      if (span) {
+        if (reserve_count > span_count) {
+          span_t *reserved_span = (span_t *)pointer_offset(
+              span, span_count << _memory_span_size_shift);
+          _rpmalloc_heap_set_reserved_spans(heap,
+                                            _memory_global_reserve_master,
+                                            reserved_span,
+                                            reserve_count - span_count);
+        }
+        // Already marked as subspan in _rpmalloc_global_get_reserved_spans
+        span->span_count = (uint32_t)span_count;
+      }
+    }
+  }
+  if (!span)
+    span = _rpmalloc_span_map_aligned_count(heap, span_count);
+  if (use_global_reserve)
+    atomic_store32_release(&_memory_global_lock, 0);
+  return span;
+}
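+
+// Worked example for the mapping path above, assuming the defaults of 64KiB
+// spans and _memory_span_map_count == 64: a request for a 3-span super span
+// maps 64 spans (4MiB) in a single OS call. The first 3 spans form the
+// returned super span, and the remaining 61 stay reserved (with the heap, or
+// partly handed to the global reserve when they exceed
+// _memory_heap_reserve_count), so subsequent requests can be served without
+// touching the OS at all.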
+
+//! Unmap memory pages for the given number of spans (or mark as unused if no
+//! partial unmappings)
+static void _rpmalloc_span_unmap(span_t *span) {
+  rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) ||
+                      (span->flags & SPAN_FLAG_SUBSPAN),
+                  "Span flag corrupted");
+  rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) ||
+                      !(span->flags & SPAN_FLAG_SUBSPAN),
+                  "Span flag corrupted");
+
+  int is_master = !!(span->flags & SPAN_FLAG_MASTER);
+  span_t *master =
+      is_master ? span
+                : ((span_t *)pointer_offset(
+                      span, -(intptr_t)((uintptr_t)span->offset_from_master *
+                                        _memory_span_size)));
+  rpmalloc_assert(is_master || (span->flags & SPAN_FLAG_SUBSPAN),
+                  "Span flag corrupted");
+  rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted");
+
+  size_t span_count = span->span_count;
+  if (!is_master) {
+    // Directly unmap subspans (unless huge pages, in which case we defer and
+    // unmap the entire page range with the master)
+    rpmalloc_assert(span->align_offset == 0, "Span align offset corrupted");
+    if (_memory_span_size >= _memory_page_size)
+      _rpmalloc_unmap(span, span_count * _memory_span_size, 0, 0);
+  } else {
+    // Special double flag to denote an unmapped master. It must be kept in
+    // memory since the span header must still be used.
+    span->flags |=
+        SPAN_FLAG_MASTER | SPAN_FLAG_SUBSPAN | SPAN_FLAG_UNMAPPED_MASTER;
+    _rpmalloc_stat_add(&_unmapped_master_spans, 1);
+  }
+
+  if (atomic_add32(&master->remaining_spans, -(int32_t)span_count) <= 0) {
+    // Everything unmapped, unmap the master span with release flag to unmap
+    // the entire range of the super span
+    rpmalloc_assert(!!(master->flags & SPAN_FLAG_MASTER) &&
+                        !!(master->flags & SPAN_FLAG_SUBSPAN),
+                    "Span flag corrupted");
+    size_t unmap_count = master->span_count;
+    if (_memory_span_size < _memory_page_size)
+      unmap_count = master->total_spans;
+    _rpmalloc_stat_sub(&_master_spans, 1);
+    _rpmalloc_stat_sub(&_unmapped_master_spans, 1);
+    _rpmalloc_unmap(master, unmap_count * _memory_span_size,
+                    master->align_offset,
+                    (size_t)master->total_spans * _memory_span_size);
+  }
+}
+
+//! Move the span (used for small or medium allocations) to the heap thread
+//! cache
+static void _rpmalloc_span_release_to_cache(heap_t *heap, span_t *span) {
+  rpmalloc_assert(heap == span->heap, "Span heap pointer corrupted");
+  rpmalloc_assert(span->size_class < SIZE_CLASS_COUNT,
+                  "Invalid span size class");
+  rpmalloc_assert(span->span_count == 1, "Invalid span count");
+#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS
+  atomic_decr32(&heap->span_use[0].current);
+#endif
+  _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current);
+  if (!heap->finalize) {
+    _rpmalloc_stat_inc(&heap->span_use[0].spans_to_cache);
+    _rpmalloc_stat_inc(&heap->size_class_use[span->size_class].spans_to_cache);
+    if (heap->size_class[span->size_class].cache)
+      _rpmalloc_heap_cache_insert(heap,
+                                  heap->size_class[span->size_class].cache);
+    heap->size_class[span->size_class].cache = span;
+  } else {
+    _rpmalloc_span_unmap(span);
+  }
+}
+
+//! Initialize a (partial) free list up to next system memory page, while
+//!
reserving the first block as allocated, returning number of blocks in list +static uint32_t free_list_partial_init(void **list, void **first_block, + void *page_start, void *block_start, + uint32_t block_count, + uint32_t block_size) { + rpmalloc_assert(block_count, "Internal failure"); + *first_block = block_start; + if (block_count > 1) { + void *free_block = pointer_offset(block_start, block_size); + void *block_end = + pointer_offset(block_start, (size_t)block_size * block_count); + // If block size is less than half a memory page, bound init to next memory + // page boundary + if (block_size < (_memory_page_size >> 1)) { + void *page_end = pointer_offset(page_start, _memory_page_size); + if (page_end < block_end) + block_end = page_end; + } + *list = free_block; + block_count = 2; + void *next_block = pointer_offset(free_block, block_size); + while (next_block < block_end) { + *((void **)free_block) = next_block; + free_block = next_block; + ++block_count; + next_block = pointer_offset(next_block, block_size); + } + *((void **)free_block) = 0; + } else { + *list = 0; + } + return block_count; +} + +//! Initialize an unused span (from cache or mapped) to be new active span, +//! putting the initial free list in heap class free list +static void *_rpmalloc_span_initialize_new(heap_t *heap, + heap_size_class_t *heap_size_class, + span_t *span, uint32_t class_idx) { + rpmalloc_assert(span->span_count == 1, "Internal failure"); + size_class_t *size_class = _memory_size_class + class_idx; + span->size_class = class_idx; + span->heap = heap; + span->flags &= ~SPAN_FLAG_ALIGNED_BLOCKS; + span->block_size = size_class->block_size; + span->block_count = size_class->block_count; + span->free_list = 0; + span->list_size = 0; + atomic_store_ptr_release(&span->free_list_deferred, 0); + + // Setup free list. 
Only initialize one system page worth of free blocks in
+  // list
+  void *block;
+  span->free_list_limit =
+      free_list_partial_init(&heap_size_class->free_list, &block, span,
+                             pointer_offset(span, SPAN_HEADER_SIZE),
+                             size_class->block_count, size_class->block_size);
+  // Link span as partial if there remain blocks to be initialized as free
+  // list, or full if fully initialized
+  if (span->free_list_limit < span->block_count) {
+    _rpmalloc_span_double_link_list_add(&heap_size_class->partial_span, span);
+    span->used_count = span->free_list_limit;
+  } else {
+#if RPMALLOC_FIRST_CLASS_HEAPS
+    _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span);
+#endif
+    ++heap->full_span_count;
+    span->used_count = span->block_count;
+  }
+  return block;
+}
+
+static void _rpmalloc_span_extract_free_list_deferred(span_t *span) {
+  // We need acquire semantics on the CAS operation since we are interested in
+  // the list size. Refer to _rpmalloc_deallocate_defer_small_or_medium for
+  // further comments on this dependency
+  do {
+    span->free_list =
+        atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER);
+  } while (span->free_list == INVALID_POINTER);
+  span->used_count -= span->list_size;
+  span->list_size = 0;
+  atomic_store_ptr_release(&span->free_list_deferred, 0);
+}
+
+static int _rpmalloc_span_is_fully_utilized(span_t *span) {
+  rpmalloc_assert(span->free_list_limit <= span->block_count,
+                  "Span free list corrupted");
+  return !span->free_list && (span->free_list_limit >= span->block_count);
+}
+
+static int _rpmalloc_span_finalize(heap_t *heap, size_t iclass, span_t *span,
+                                   span_t **list_head) {
+  void *free_list = heap->size_class[iclass].free_list;
+  span_t *class_span = (span_t *)((uintptr_t)free_list & _memory_span_mask);
+  if (span == class_span) {
+    // Adopt the heap class free list back into the span free list
+    void *block = span->free_list;
+    void *last_block = 0;
+    while (block) {
+      last_block = block;
+      block = *((void **)block);
+    }
+    uint32_t free_count = 0;
+    block = free_list;
+    while (block) {
+      ++free_count;
+      block = *((void **)block);
+    }
+    if (last_block) {
+      *((void **)last_block) = free_list;
+    } else {
+      span->free_list = free_list;
+    }
+    heap->size_class[iclass].free_list = 0;
+    span->used_count -= free_count;
+  }
+  // If this assert triggers you have memory leaks
+  rpmalloc_assert(span->list_size == span->used_count, "Memory leak detected");
+  if (span->list_size == span->used_count) {
+    _rpmalloc_stat_dec(&heap->span_use[0].current);
+    _rpmalloc_stat_dec(&heap->size_class_use[iclass].spans_current);
+    // This function is only used for spans in double linked lists
+    if (list_head)
+      _rpmalloc_span_double_link_list_remove(list_head, span);
+    _rpmalloc_span_unmap(span);
+    return 1;
+  }
+  return 0;
+}
+
+////////////
+///
+/// Global cache
+///
+//////
+
+#if ENABLE_GLOBAL_CACHE
+
+//! Finalize a global cache
+static void _rpmalloc_global_cache_finalize(global_cache_t *cache) {
+  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+    _rpmalloc_spin();
+
+  for (size_t ispan = 0; ispan < cache->count; ++ispan)
+    _rpmalloc_span_unmap(cache->span[ispan]);
+  cache->count = 0;
+
+  while (cache->overflow) {
+    span_t *span = cache->overflow;
+    cache->overflow = span->next;
+    _rpmalloc_span_unmap(span);
+  }
+
+  atomic_store32_release(&cache->lock, 0);
+}
+
+static void _rpmalloc_global_cache_insert_spans(span_t **span,
+                                                size_t span_count,
+                                                size_t count) {
+  const size_t cache_limit =
+      (span_count == 1) ? 
GLOBAL_CACHE_MULTIPLIER * MAX_THREAD_SPAN_CACHE
+                        : GLOBAL_CACHE_MULTIPLIER *
+                              (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1));
+
+  global_cache_t *cache = &_memory_span_cache[span_count - 1];
+
+  size_t insert_count = count;
+  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+    _rpmalloc_spin();
+
+#if ENABLE_STATISTICS
+  cache->insert_count += count;
+#endif
+  if ((cache->count + insert_count) > cache_limit)
+    insert_count = cache_limit - cache->count;
+
+  memcpy(cache->span + cache->count, span, sizeof(span_t *) * insert_count);
+  cache->count += (uint32_t)insert_count;
+
+#if ENABLE_UNLIMITED_CACHE
+  while (insert_count < count) {
+#else
+  // Enable unlimited cache if huge pages, or we will leak since it is unlikely
+  // that an entire huge page will be unmapped, and we're unable to partially
+  // decommit a huge page
+  while ((_memory_page_size > _memory_span_size) && (insert_count < count)) {
+#endif
+    span_t *current_span = span[insert_count++];
+    current_span->next = cache->overflow;
+    cache->overflow = current_span;
+  }
+  atomic_store32_release(&cache->lock, 0);
+
+  span_t *keep = 0;
+  for (size_t ispan = insert_count; ispan < count; ++ispan) {
+    span_t *current_span = span[ispan];
+    // Keep master spans that have remaining subspans to avoid dangling them
+    if ((current_span->flags & SPAN_FLAG_MASTER) &&
+        (atomic_load32(&current_span->remaining_spans) >
+         (int32_t)current_span->span_count)) {
+      current_span->next = keep;
+      keep = current_span;
+    } else {
+      _rpmalloc_span_unmap(current_span);
+    }
+  }
+
+  if (keep) {
+    while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+      _rpmalloc_spin();
+
+    size_t islot = 0;
+    while (keep) {
+      for (; islot < cache->count; ++islot) {
+        span_t *current_span = cache->span[islot];
+        if (!(current_span->flags & SPAN_FLAG_MASTER) ||
+            ((current_span->flags & SPAN_FLAG_MASTER) &&
+             (atomic_load32(&current_span->remaining_spans) <=
+              (int32_t)current_span->span_count))) {
+          _rpmalloc_span_unmap(current_span);
+          cache->span[islot] = keep;
+          break;
+        }
+      }
+      if (islot == cache->count)
+        break;
+      keep = keep->next;
+    }
+
+    if (keep) {
+      span_t *tail = keep;
+      while (tail->next)
+        tail = tail->next;
+      tail->next = cache->overflow;
+      cache->overflow = keep;
+    }
+
+    atomic_store32_release(&cache->lock, 0);
+  }
+}
+
+static size_t _rpmalloc_global_cache_extract_spans(span_t **span,
+                                                   size_t span_count,
+                                                   size_t count) {
+  global_cache_t *cache = &_memory_span_cache[span_count - 1];
+
+  size_t extract_count = 0;
+  while (!atomic_cas32_acquire(&cache->lock, 1, 0))
+    _rpmalloc_spin();
+
+#if ENABLE_STATISTICS
+  cache->extract_count += count;
+#endif
+  size_t want = count - extract_count;
+  if (want > cache->count)
+    want = cache->count;
+
+  memcpy(span + extract_count, cache->span + (cache->count - want),
+         sizeof(span_t *) * want);
+  cache->count -= (uint32_t)want;
+  extract_count += want;
+
+  while ((extract_count < count) && cache->overflow) {
+    span_t *current_span = cache->overflow;
+    span[extract_count++] = current_span;
+    cache->overflow = current_span->next;
+  }
+
+#if ENABLE_ASSERTS
+  for (size_t ispan = 0; ispan < extract_count; ++ispan) {
+    rpmalloc_assert(span[ispan]->span_count == span_count,
+                    "Global cache span count mismatch");
+  }
+#endif
+
+  atomic_store32_release(&cache->lock, 0);
+
+  return extract_count;
+}
+
+#endif
+
+////////////
+///
+/// Heap control
+///
+//////
+
+static void _rpmalloc_deallocate_huge(span_t *);
+
+//! 
Store the given spans as reserve in the given heap +static void _rpmalloc_heap_set_reserved_spans(heap_t *heap, span_t *master, + span_t *reserve, + size_t reserve_span_count) { + heap->span_reserve_master = master; + heap->span_reserve = reserve; + heap->spans_reserved = (uint32_t)reserve_span_count; +} + +//! Adopt the deferred span cache list, optionally extracting the first single +//! span for immediate re-use +static void _rpmalloc_heap_cache_adopt_deferred(heap_t *heap, + span_t **single_span) { + span_t *span = (span_t *)((void *)atomic_exchange_ptr_acquire( + &heap->span_free_deferred, 0)); + while (span) { + span_t *next_span = (span_t *)span->free_list; + rpmalloc_assert(span->heap == heap, "Span heap pointer corrupted"); + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { + rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); + --heap->full_span_count; + _rpmalloc_stat_dec(&heap->span_use[0].spans_deferred); +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], + span); +#endif + _rpmalloc_stat_dec(&heap->span_use[0].current); + _rpmalloc_stat_dec(&heap->size_class_use[span->size_class].spans_current); + if (single_span && !*single_span) + *single_span = span; + else + _rpmalloc_heap_cache_insert(heap, span); + } else { + if (span->size_class == SIZE_CLASS_HUGE) { + _rpmalloc_deallocate_huge(span); + } else { + rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, + "Span size class invalid"); + rpmalloc_assert(heap->full_span_count, "Heap span counter corrupted"); + --heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->large_huge_span, span); +#endif + uint32_t idx = span->span_count - 1; + _rpmalloc_stat_dec(&heap->span_use[idx].spans_deferred); + _rpmalloc_stat_dec(&heap->span_use[idx].current); + if (!idx && single_span && !*single_span) + *single_span = span; + else + _rpmalloc_heap_cache_insert(heap, span); + } + } + span = next_span; + } +} + +static void _rpmalloc_heap_unmap(heap_t *heap) { + if (!heap->master_heap) { + if ((heap->finalize > 1) && !atomic_load32(&heap->child_count)) { + span_t *span = (span_t *)((uintptr_t)heap & _memory_span_mask); + _rpmalloc_span_unmap(span); + } + } else { + if (atomic_decr32(&heap->master_heap->child_count) == 0) { + _rpmalloc_heap_unmap(heap->master_heap); + } + } +} + +static void _rpmalloc_heap_global_finalize(heap_t *heap) { + if (heap->finalize++ > 1) { + --heap->finalize; + return; + } + + _rpmalloc_heap_finalize(heap); + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; + } +#endif + + if (heap->full_span_count) { + --heap->finalize; + return; + } + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (heap->size_class[iclass].free_list || + heap->size_class[iclass].partial_span) { + --heap->finalize; + return; + } + } + // Heap is now completely free, unmap and remove from heap list + size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE; + heap_t *list_heap = _memory_heaps[list_idx]; + if (list_heap == heap) { + _memory_heaps[list_idx] = heap->next_heap; + } else { + while (list_heap->next_heap != heap) + list_heap = list_heap->next_heap; + 
list_heap->next_heap = heap->next_heap; + } + + _rpmalloc_heap_unmap(heap); +} + +//! Insert a single span into thread heap cache, releasing to global cache if +//! overflow +static void _rpmalloc_heap_cache_insert(heap_t *heap, span_t *span) { + if (UNEXPECTED(heap->finalize != 0)) { + _rpmalloc_span_unmap(span); + _rpmalloc_heap_global_finalize(heap); + return; + } +#if ENABLE_THREAD_CACHE + size_t span_count = span->span_count; + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_to_cache); + if (span_count == 1) { + span_cache_t *span_cache = &heap->span_cache; + span_cache->span[span_cache->count++] = span; + if (span_cache->count == MAX_THREAD_SPAN_CACHE) { + const size_t remain_count = + MAX_THREAD_SPAN_CACHE - THREAD_SPAN_CACHE_TRANSFER; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, + THREAD_SPAN_CACHE_TRANSFER * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, + THREAD_SPAN_CACHE_TRANSFER); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, + span_count, + THREAD_SPAN_CACHE_TRANSFER); +#else + for (size_t ispan = 0; ispan < THREAD_SPAN_CACHE_TRANSFER; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); +#endif + span_cache->count = remain_count; + } + } else { + size_t cache_idx = span_count - 2; + span_large_cache_t *span_cache = heap->span_large_cache + cache_idx; + span_cache->span[span_cache->count++] = span; + const size_t cache_limit = + (MAX_THREAD_SPAN_LARGE_CACHE - (span_count >> 1)); + if (span_cache->count == cache_limit) { + const size_t transfer_limit = 2 + (cache_limit >> 2); + const size_t transfer_count = + (THREAD_SPAN_LARGE_CACHE_TRANSFER <= transfer_limit + ? THREAD_SPAN_LARGE_CACHE_TRANSFER + : transfer_limit); + const size_t remain_count = cache_limit - transfer_count; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, + transfer_count * span_count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_to_global, + transfer_count); + _rpmalloc_global_cache_insert_spans(span_cache->span + remain_count, + span_count, transfer_count); +#else + for (size_t ispan = 0; ispan < transfer_count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[remain_count + ispan]); +#endif + span_cache->count = remain_count; + } + } +#else + (void)sizeof(heap); + _rpmalloc_span_unmap(span); +#endif +} + +//! Extract the given number of spans from the different cache levels +static span_t *_rpmalloc_heap_thread_cache_extract(heap_t *heap, + size_t span_count) { + span_t *span = 0; +#if ENABLE_THREAD_CACHE + span_cache_t *span_cache; + if (span_count == 1) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (span_count - 2)); + if (span_cache->count) { + _rpmalloc_stat_inc(&heap->span_use[span_count - 1].spans_from_cache); + return span_cache->span[--span_cache->count]; + } +#endif + return span; +} + +static span_t *_rpmalloc_heap_thread_cache_deferred_extract(heap_t *heap, + size_t span_count) { + span_t *span = 0; + if (span_count == 1) { + _rpmalloc_heap_cache_adopt_deferred(heap, &span); + } else { + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + span = _rpmalloc_heap_thread_cache_extract(heap, span_count); + } + return span; +} + +static span_t *_rpmalloc_heap_reserved_extract(heap_t *heap, + size_t span_count) { + if (heap->spans_reserved >= span_count) + return _rpmalloc_span_map(heap, span_count); + return 0; +} + +//! 
Extract a span from the global cache +static span_t *_rpmalloc_heap_global_cache_extract(heap_t *heap, + size_t span_count) { +#if ENABLE_GLOBAL_CACHE +#if ENABLE_THREAD_CACHE + span_cache_t *span_cache; + size_t wanted_count; + if (span_count == 1) { + span_cache = &heap->span_cache; + wanted_count = THREAD_SPAN_CACHE_TRANSFER; + } else { + span_cache = (span_cache_t *)(heap->span_large_cache + (span_count - 2)); + wanted_count = THREAD_SPAN_LARGE_CACHE_TRANSFER; + } + span_cache->count = _rpmalloc_global_cache_extract_spans( + span_cache->span, span_count, wanted_count); + if (span_cache->count) { + _rpmalloc_stat_add64(&heap->global_to_thread, + span_count * span_cache->count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, + span_cache->count); + return span_cache->span[--span_cache->count]; + } +#else + span_t *span = 0; + size_t count = _rpmalloc_global_cache_extract_spans(&span, span_count, 1); + if (count) { + _rpmalloc_stat_add64(&heap->global_to_thread, + span_count * count * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[span_count - 1].spans_from_global, + count); + return span; + } +#endif +#endif + (void)sizeof(heap); + (void)sizeof(span_count); + return 0; +} + +static void _rpmalloc_inc_span_statistics(heap_t *heap, size_t span_count, + uint32_t class_idx) { + (void)sizeof(heap); + (void)sizeof(span_count); + (void)sizeof(class_idx); +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + uint32_t idx = (uint32_t)span_count - 1; + uint32_t current_count = + (uint32_t)atomic_incr32(&heap->span_use[idx].current); + if (current_count > (uint32_t)atomic_load32(&heap->span_use[idx].high)) + atomic_store32(&heap->span_use[idx].high, (int32_t)current_count); + _rpmalloc_stat_add_peak(&heap->size_class_use[class_idx].spans_current, 1, + heap->size_class_use[class_idx].spans_peak); +#endif +} + +//! Get a span from one of the cache levels (thread cache, reserved, global +//! cache) or fallback to mapping more memory +static span_t * +_rpmalloc_heap_extract_new_span(heap_t *heap, + heap_size_class_t *heap_size_class, + size_t span_count, uint32_t class_idx) { + span_t *span; +#if ENABLE_THREAD_CACHE + if (heap_size_class && heap_size_class->cache) { + span = heap_size_class->cache; + heap_size_class->cache = + (heap->span_cache.count + ? heap->span_cache.span[--heap->span_cache.count] + : 0); + _rpmalloc_inc_span_statistics(heap, span_count, class_idx); + return span; + } +#endif + (void)sizeof(class_idx); + // Allow 50% overhead to increase cache hits + size_t base_span_count = span_count; + size_t limit_span_count = + (span_count > 2) ? 
(span_count + (span_count >> 1)) : span_count;
+  if (limit_span_count > LARGE_CLASS_COUNT)
+    limit_span_count = LARGE_CLASS_COUNT;
+  do {
+    span = _rpmalloc_heap_thread_cache_extract(heap, span_count);
+    if (EXPECTED(span != 0)) {
+      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+      return span;
+    }
+    span = _rpmalloc_heap_thread_cache_deferred_extract(heap, span_count);
+    if (EXPECTED(span != 0)) {
+      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+      return span;
+    }
+    span = _rpmalloc_heap_global_cache_extract(heap, span_count);
+    if (EXPECTED(span != 0)) {
+      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_cache);
+      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+      return span;
+    }
+    span = _rpmalloc_heap_reserved_extract(heap, span_count);
+    if (EXPECTED(span != 0)) {
+      _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_from_reserved);
+      _rpmalloc_inc_span_statistics(heap, span_count, class_idx);
+      return span;
+    }
+    ++span_count;
+  } while (span_count <= limit_span_count);
+  // Final fallback, map in more virtual memory
+  span = _rpmalloc_span_map(heap, base_span_count);
+  _rpmalloc_inc_span_statistics(heap, base_span_count, class_idx);
+  _rpmalloc_stat_inc(&heap->size_class_use[class_idx].spans_map_calls);
+  return span;
+}
+
+static void _rpmalloc_heap_initialize(heap_t *heap) {
+  _rpmalloc_memset_const(heap, 0, sizeof(heap_t));
+  // Get a new heap ID
+  heap->id = 1 + atomic_incr32(&_memory_heap_id);
+
+  // Link in heap in heap ID map
+  size_t list_idx = (size_t)heap->id % HEAP_ARRAY_SIZE;
+  heap->next_heap = _memory_heaps[list_idx];
+  _memory_heaps[list_idx] = heap;
+}
+
+static void _rpmalloc_heap_orphan(heap_t *heap, int first_class) {
+  heap->owner_thread = (uintptr_t)-1;
+#if RPMALLOC_FIRST_CLASS_HEAPS
+  heap_t **heap_list =
+      (first_class ? &_memory_first_class_orphan_heaps : &_memory_orphan_heaps);
+#else
+  (void)sizeof(first_class);
+  heap_t **heap_list = &_memory_orphan_heaps;
+#endif
+  heap->next_orphan = *heap_list;
+  *heap_list = heap;
+}
+
+//! Allocate a new heap from newly mapped memory pages
+static heap_t *_rpmalloc_heap_allocate_new(void) {
+  // Map in pages for 16 heaps. If page size is greater than required size for
+  // this, map a page and use first part for heaps and remaining part for spans
+  // for allocations. 
Adds a lot of complexity, but saves a lot of memory on + // systems where page size > 64 spans (4MiB) + size_t heap_size = sizeof(heap_t); + size_t aligned_heap_size = 16 * ((heap_size + 15) / 16); + size_t request_heap_count = 16; + size_t heap_span_count = ((aligned_heap_size * request_heap_count) + + sizeof(span_t) + _memory_span_size - 1) / + _memory_span_size; + size_t block_size = _memory_span_size * heap_span_count; + size_t span_count = heap_span_count; + span_t *span = 0; + // If there are global reserved spans, use these first + if (_memory_global_reserve_count >= heap_span_count) { + span = _rpmalloc_global_get_reserved_spans(heap_span_count); + } + if (!span) { + if (_memory_page_size > block_size) { + span_count = _memory_page_size / _memory_span_size; + block_size = _memory_page_size; + // If using huge pages, make sure to grab enough heaps to avoid + // reallocating a huge page just to serve new heaps + size_t possible_heap_count = + (block_size - sizeof(span_t)) / aligned_heap_size; + if (possible_heap_count >= (request_heap_count * 16)) + request_heap_count *= 16; + else if (possible_heap_count < request_heap_count) + request_heap_count = possible_heap_count; + heap_span_count = ((aligned_heap_size * request_heap_count) + + sizeof(span_t) + _memory_span_size - 1) / + _memory_span_size; + } + + size_t align_offset = 0; + span = (span_t *)_rpmalloc_mmap(block_size, &align_offset); + if (!span) + return 0; + + // Master span will contain the heaps + _rpmalloc_stat_inc(&_master_spans); + _rpmalloc_span_initialize(span, span_count, heap_span_count, align_offset); + } + + size_t remain_size = _memory_span_size - sizeof(span_t); + heap_t *heap = (heap_t *)pointer_offset(span, sizeof(span_t)); + _rpmalloc_heap_initialize(heap); + + // Put extra heaps as orphans + size_t num_heaps = remain_size / aligned_heap_size; + if (num_heaps < request_heap_count) + num_heaps = request_heap_count; + atomic_store32(&heap->child_count, (int32_t)num_heaps - 1); + heap_t *extra_heap = (heap_t *)pointer_offset(heap, aligned_heap_size); + while (num_heaps > 1) { + _rpmalloc_heap_initialize(extra_heap); + extra_heap->master_heap = heap; + _rpmalloc_heap_orphan(extra_heap, 1); + extra_heap = (heap_t *)pointer_offset(extra_heap, aligned_heap_size); + --num_heaps; + } + + if (span_count > heap_span_count) { + // Cap reserved spans + size_t remain_count = span_count - heap_span_count; + size_t reserve_count = + (remain_count > _memory_heap_reserve_count ? _memory_heap_reserve_count + : remain_count); + span_t *remain_span = + (span_t *)pointer_offset(span, heap_span_count * _memory_span_size); + _rpmalloc_heap_set_reserved_spans(heap, span, remain_span, reserve_count); + + if (remain_count > reserve_count) { + // Set to global reserved spans + remain_span = (span_t *)pointer_offset(remain_span, + reserve_count * _memory_span_size); + reserve_count = remain_count - reserve_count; + _rpmalloc_global_set_reserved_spans(span, remain_span, reserve_count); + } + } + + return heap; +} + +static heap_t *_rpmalloc_heap_extract_orphan(heap_t **heap_list) { + heap_t *heap = *heap_list; + *heap_list = (heap ? heap->next_orphan : 0); + return heap; +} + +//! 
Allocate a new heap, potentially reusing a previously orphaned heap +static heap_t *_rpmalloc_heap_allocate(int first_class) { + heap_t *heap = 0; + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + if (first_class == 0) + heap = _rpmalloc_heap_extract_orphan(&_memory_orphan_heaps); +#if RPMALLOC_FIRST_CLASS_HEAPS + if (!heap) + heap = _rpmalloc_heap_extract_orphan(&_memory_first_class_orphan_heaps); +#endif + if (!heap) + heap = _rpmalloc_heap_allocate_new(); + atomic_store32_release(&_memory_global_lock, 0); + if (heap) + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + return heap; +} + +static void _rpmalloc_heap_release(void *heapptr, int first_class, + int release_cache) { + heap_t *heap = (heap_t *)heapptr; + if (!heap) + return; + // Release thread cache spans back to global cache + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + if (release_cache || heap->finalize) { +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + if (!span_cache->count) + continue; +#if ENABLE_GLOBAL_CACHE + if (heap->finalize) { + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + } else { + _rpmalloc_stat_add64(&heap->thread_to_global, span_cache->count * + (iclass + 1) * + _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, + span_cache->count); + _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, + span_cache->count); + } +#else + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); +#endif + span_cache->count = 0; + } +#endif + } + + if (get_thread_heap_raw() == heap) + set_thread_heap(0); + +#if ENABLE_STATISTICS + atomic_decr32(&_memory_active_heaps); + rpmalloc_assert(atomic_load32(&_memory_active_heaps) >= 0, + "Still active heaps during finalization"); +#endif + + // If we are forcibly terminating with _exit the state of the + // lock atomic is unknown and it's best to just go ahead and exit + if (get_thread_id() != _rpmalloc_main_thread_id) { + while (!atomic_cas32_acquire(&_memory_global_lock, 1, 0)) + _rpmalloc_spin(); + } + _rpmalloc_heap_orphan(heap, first_class); + atomic_store32_release(&_memory_global_lock, 0); +} + +static void _rpmalloc_heap_release_raw(void *heapptr, int release_cache) { + _rpmalloc_heap_release(heapptr, 0, release_cache); +} + +static void _rpmalloc_heap_release_raw_fc(void *heapptr) { + _rpmalloc_heap_release_raw(heapptr, 1); +} + +static void _rpmalloc_heap_finalize(heap_t *heap) { + if (heap->spans_reserved) { + span_t *span = _rpmalloc_span_map(heap, heap->spans_reserved); + _rpmalloc_span_unmap(span); + heap->spans_reserved = 0; + } + + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (heap->size_class[iclass].cache) + _rpmalloc_span_unmap(heap->size_class[iclass].cache); + heap->size_class[iclass].cache = 0; + span_t *span = heap->size_class[iclass].partial_span; + while (span) { + span_t *next = span->next; + _rpmalloc_span_finalize(heap, iclass, span, + &heap->size_class[iclass].partial_span); + span = next; + } + // If class still has a free list it must be a full span + if (heap->size_class[iclass].free_list) { + span_t *class_span = + (span_t *)((uintptr_t)heap->size_class[iclass].free_list & + _memory_span_mask); 
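+      // The remaining blocks on the heap class free list belong to this span;
+      // _rpmalloc_span_finalize adopts them back into the span free list so
+      // its leak check (list_size == used_count) sees an accurate used count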
+ span_t **list = 0; +#if RPMALLOC_FIRST_CLASS_HEAPS + list = &heap->full_span[iclass]; +#endif + --heap->full_span_count; + if (!_rpmalloc_span_finalize(heap, iclass, class_span, list)) { + if (list) + _rpmalloc_span_double_link_list_remove(list, class_span); + _rpmalloc_span_double_link_list_add( + &heap->size_class[iclass].partial_span, class_span); + } + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); + span_cache->count = 0; + } +#endif + rpmalloc_assert(!atomic_load_ptr(&heap->span_free_deferred), + "Heaps still active during finalization"); +} + +//////////// +/// +/// Allocation entry points +/// +////// + +//! Pop first block from a free list +static void *free_list_pop(void **list) { + void *block = *list; + *list = *((void **)block); + return block; +} + +//! Allocate a small/medium sized memory block from the given heap +static void *_rpmalloc_allocate_from_heap_fallback( + heap_t *heap, heap_size_class_t *heap_size_class, uint32_t class_idx) { + span_t *span = heap_size_class->partial_span; + rpmalloc_assume(heap != 0); + if (EXPECTED(span != 0)) { + rpmalloc_assert(span->block_count == + _memory_size_class[span->size_class].block_count, + "Span block count corrupted"); + rpmalloc_assert(!_rpmalloc_span_is_fully_utilized(span), + "Internal failure"); + void *block; + if (span->free_list) { + // Span local free list is not empty, swap to size class free list + block = free_list_pop(&span->free_list); + heap_size_class->free_list = span->free_list; + span->free_list = 0; + } else { + // If the span did not fully initialize free list, link up another page + // worth of blocks + void *block_start = pointer_offset( + span, SPAN_HEADER_SIZE + + ((size_t)span->free_list_limit * span->block_size)); + span->free_list_limit += free_list_partial_init( + &heap_size_class->free_list, &block, + (void *)((uintptr_t)block_start & ~(_memory_page_size - 1)), + block_start, span->block_count - span->free_list_limit, + span->block_size); + } + rpmalloc_assert(span->free_list_limit <= span->block_count, + "Span block count corrupted"); + span->used_count = span->free_list_limit; + + // Swap in deferred free list if present + if (atomic_load_ptr(&span->free_list_deferred)) + _rpmalloc_span_extract_free_list_deferred(span); + + // If span is still not fully utilized keep it in partial list and early + // return block + if (!_rpmalloc_span_is_fully_utilized(span)) + return block; + + // The span is fully utilized, unlink from partial list and add to fully + // utilized list + _rpmalloc_span_double_link_list_pop_head(&heap_size_class->partial_span, + span); +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->full_span[class_idx], span); +#endif + ++heap->full_span_count; + return block; + } + + // Find a span in one of the cache levels + span = _rpmalloc_heap_extract_new_span(heap, heap_size_class, 1, class_idx); + if (EXPECTED(span != 0)) { + // Mark span as owned by this heap and set base data, return first block + return _rpmalloc_span_initialize_new(heap, heap_size_class, span, + class_idx); + } + + return 0; +} + +//! 
Allocate a small sized memory block from the given heap +static void *_rpmalloc_allocate_small(heap_t *heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + // Small sizes have unique size classes + const uint32_t class_idx = + (uint32_t)((size + (SMALL_GRANULARITY - 1)) >> SMALL_GRANULARITY_SHIFT); + heap_size_class_t *heap_size_class = heap->size_class + class_idx; + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap_size_class->free_list != 0)) + return free_list_pop(&heap_size_class->free_list); + return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, + class_idx); +} + +//! Allocate a medium sized memory block from the given heap +static void *_rpmalloc_allocate_medium(heap_t *heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + // Calculate the size class index and do a dependent lookup of the final class + // index (in case of merged classes) + const uint32_t base_idx = + (uint32_t)(SMALL_CLASS_COUNT + + ((size - (SMALL_SIZE_LIMIT + 1)) >> MEDIUM_GRANULARITY_SHIFT)); + const uint32_t class_idx = _memory_size_class[base_idx].class_idx; + heap_size_class_t *heap_size_class = heap->size_class + class_idx; + _rpmalloc_stat_inc_alloc(heap, class_idx); + if (EXPECTED(heap_size_class->free_list != 0)) + return free_list_pop(&heap_size_class->free_list); + return _rpmalloc_allocate_from_heap_fallback(heap, heap_size_class, + class_idx); +} + +//! Allocate a large sized memory block from the given heap +static void *_rpmalloc_allocate_large(heap_t *heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + // Calculate number of needed max sized spans (including header) + // Since this function is never called if size > LARGE_SIZE_LIMIT + // the span_count is guaranteed to be <= LARGE_CLASS_COUNT + size += SPAN_HEADER_SIZE; + size_t span_count = size >> _memory_span_size_shift; + if (size & (_memory_span_size - 1)) + ++span_count; + + // Find a span in one of the cache levels + span_t *span = + _rpmalloc_heap_extract_new_span(heap, 0, span_count, SIZE_CLASS_LARGE); + if (!span) + return span; + + // Mark span as owned by this heap and set base data + rpmalloc_assert(span->span_count >= span_count, "Internal failure"); + span->size_class = SIZE_CLASS_LARGE; + span->heap = heap; + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! Allocate a huge block by mapping memory pages directly +static void *_rpmalloc_allocate_huge(heap_t *heap, size_t size) { + rpmalloc_assert(heap, "No thread heap"); + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + size += SPAN_HEADER_SIZE; + size_t num_pages = size >> _memory_page_size_shift; + if (size & (_memory_page_size - 1)) + ++num_pages; + size_t align_offset = 0; + span_t *span = + (span_t *)_rpmalloc_mmap(num_pages * _memory_page_size, &align_offset); + if (!span) + return span; + + // Store page count in span_count + span->size_class = SIZE_CLASS_HUGE; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + span->heap = heap; + _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; + + return pointer_offset(span, SPAN_HEADER_SIZE); +} + +//! 
Allocate a block of the given size +static void *_rpmalloc_allocate(heap_t *heap, size_t size) { + _rpmalloc_stat_add64(&_allocation_counter, 1); + if (EXPECTED(size <= SMALL_SIZE_LIMIT)) + return _rpmalloc_allocate_small(heap, size); + else if (size <= _memory_medium_size_limit) + return _rpmalloc_allocate_medium(heap, size); + else if (size <= LARGE_SIZE_LIMIT) + return _rpmalloc_allocate_large(heap, size); + return _rpmalloc_allocate_huge(heap, size); +} + +static void *_rpmalloc_aligned_allocate(heap_t *heap, size_t alignment, + size_t size) { + if (alignment <= SMALL_GRANULARITY) + return _rpmalloc_allocate(heap, size); + +#if ENABLE_VALIDATE_ARGS + if ((size + alignment) < size) { + errno = EINVAL; + return 0; + } + if (alignment & (alignment - 1)) { + errno = EINVAL; + return 0; + } +#endif + + if ((alignment <= SPAN_HEADER_SIZE) && + ((size + SPAN_HEADER_SIZE) < _memory_medium_size_limit)) { + // If alignment is less or equal to span header size (which is power of + // two), and size aligned to span header size multiples is less than size + + // alignment, then use natural alignment of blocks to provide alignment + size_t multiple_size = size ? (size + (SPAN_HEADER_SIZE - 1)) & + ~(uintptr_t)(SPAN_HEADER_SIZE - 1) + : SPAN_HEADER_SIZE; + rpmalloc_assert(!(multiple_size % SPAN_HEADER_SIZE), + "Failed alignment calculation"); + if (multiple_size <= (size + alignment)) + return _rpmalloc_allocate(heap, multiple_size); + } + + void *ptr = 0; + size_t align_mask = alignment - 1; + if (alignment <= _memory_page_size) { + ptr = _rpmalloc_allocate(heap, size + alignment); + if ((uintptr_t)ptr & align_mask) { + ptr = (void *)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); + // Mark as having aligned blocks + span_t *span = (span_t *)((uintptr_t)ptr & _memory_span_mask); + span->flags |= SPAN_FLAG_ALIGNED_BLOCKS; + } + return ptr; + } + + // Fallback to mapping new pages for this request. Since pointers passed + // to rpfree must be able to reach the start of the span by bitmasking of + // the address with the span size, the returned aligned pointer from this + // function must be with a span size of the start of the mapped area. + // In worst case this requires us to loop and map pages until we get a + // suitable memory address. 
It also means we can never align to span size + // or greater, since the span header will push alignment more than one + // span size away from span start (thus causing pointer mask to give us + // an invalid span start on free) + if (alignment & align_mask) { + errno = EINVAL; + return 0; + } + if (alignment >= _memory_span_size) { + errno = EINVAL; + return 0; + } + + size_t extra_pages = alignment / _memory_page_size; + + // Since each span has a header, we will at least need one extra memory page + size_t num_pages = 1 + (size / _memory_page_size); + if (size & (_memory_page_size - 1)) + ++num_pages; + + if (extra_pages > num_pages) + num_pages = 1 + extra_pages; + + size_t original_pages = num_pages; + size_t limit_pages = (_memory_span_size / _memory_page_size) * 2; + if (limit_pages < (original_pages * 2)) + limit_pages = original_pages * 2; + + size_t mapped_size, align_offset; + span_t *span; + +retry: + align_offset = 0; + mapped_size = num_pages * _memory_page_size; + + span = (span_t *)_rpmalloc_mmap(mapped_size, &align_offset); + if (!span) { + errno = ENOMEM; + return 0; + } + ptr = pointer_offset(span, SPAN_HEADER_SIZE); + + if ((uintptr_t)ptr & align_mask) + ptr = (void *)(((uintptr_t)ptr & ~(uintptr_t)align_mask) + alignment); + + if (((size_t)pointer_diff(ptr, span) >= _memory_span_size) || + (pointer_offset(ptr, size) > pointer_offset(span, mapped_size)) || + (((uintptr_t)ptr & _memory_span_mask) != (uintptr_t)span)) { + _rpmalloc_unmap(span, mapped_size, align_offset, mapped_size); + ++num_pages; + if (num_pages > limit_pages) { + errno = EINVAL; + return 0; + } + goto retry; + } + + // Store page count in span_count + span->size_class = SIZE_CLASS_HUGE; + span->span_count = (uint32_t)num_pages; + span->align_offset = (uint32_t)align_offset; + span->heap = heap; + _rpmalloc_stat_add_peak(&_huge_pages_current, num_pages, _huge_pages_peak); + +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_add(&heap->large_huge_span, span); +#endif + ++heap->full_span_count; + + _rpmalloc_stat_add64(&_allocation_counter, 1); + + return ptr; +} + +//////////// +/// +/// Deallocation entry points +/// +////// + +//! Deallocate the given small/medium memory block in the current thread local +//! 
heap +static void _rpmalloc_deallocate_direct_small_or_medium(span_t *span, + void *block) { + heap_t *heap = span->heap; + rpmalloc_assert(heap->owner_thread == get_thread_id() || + !heap->owner_thread || heap->finalize, + "Internal failure"); + // Add block to free list + if (UNEXPECTED(_rpmalloc_span_is_fully_utilized(span))) { + span->used_count = span->block_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&heap->full_span[span->size_class], + span); +#endif + _rpmalloc_span_double_link_list_add( + &heap->size_class[span->size_class].partial_span, span); + --heap->full_span_count; + } + *((void **)block) = span->free_list; + --span->used_count; + span->free_list = block; + if (UNEXPECTED(span->used_count == span->list_size)) { + // If there are no used blocks it is guaranteed that no other external + // thread is accessing the span + if (span->used_count) { + // Make sure we have synchronized the deferred list and list size by using + // acquire semantics and guarantee that no external thread is accessing + // span concurrently + void *free_list; + do { + free_list = atomic_exchange_ptr_acquire(&span->free_list_deferred, + INVALID_POINTER); + } while (free_list == INVALID_POINTER); + atomic_store_ptr_release(&span->free_list_deferred, free_list); + } + _rpmalloc_span_double_link_list_remove( + &heap->size_class[span->size_class].partial_span, span); + _rpmalloc_span_release_to_cache(heap, span); + } +} + +static void _rpmalloc_deallocate_defer_free_span(heap_t *heap, span_t *span) { + if (span->size_class != SIZE_CLASS_HUGE) + _rpmalloc_stat_inc(&heap->span_use[span->span_count - 1].spans_deferred); + // This list does not need ABA protection, no mutable side state + do { + span->free_list = (void *)atomic_load_ptr(&heap->span_free_deferred); + } while (!atomic_cas_ptr(&heap->span_free_deferred, span, span->free_list)); +} + +//! Put the block in the deferred free list of the owning span +static void _rpmalloc_deallocate_defer_small_or_medium(span_t *span, + void *block) { + // The memory ordering here is a bit tricky, to avoid having to ABA protect + // the deferred free list to avoid desynchronization of list and list size + // we need to have acquire semantics on successful CAS of the pointer to + // guarantee the list_size variable validity + release semantics on pointer + // store + void *free_list; + do { + free_list = + atomic_exchange_ptr_acquire(&span->free_list_deferred, INVALID_POINTER); + } while (free_list == INVALID_POINTER); + *((void **)block) = free_list; + uint32_t free_count = ++span->list_size; + int all_deferred_free = (free_count == span->block_count); + atomic_store_ptr_release(&span->free_list_deferred, block); + if (all_deferred_free) { + // Span was completely freed by this block. Due to the INVALID_POINTER spin + // lock no other thread can reach this state simultaneously on this span. 
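+    // (list_size is only incremented while the deferred list pointer is parked
+    // at INVALID_POINTER, so only one thread can see the count reach
+    // block_count.)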
+ // Safe to move to owner heap deferred cache + _rpmalloc_deallocate_defer_free_span(span->heap, span); + } +} + +static void _rpmalloc_deallocate_small_or_medium(span_t *span, void *p) { + _rpmalloc_stat_inc_free(span->heap, span->size_class); + if (span->flags & SPAN_FLAG_ALIGNED_BLOCKS) { + // Realign pointer to block start + void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + p = pointer_offset(p, -(int32_t)(block_offset % span->block_size)); + } + // Check if block belongs to this heap or if deallocation should be deferred +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = + (span->heap->owner_thread && + (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = + ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (!defer) + _rpmalloc_deallocate_direct_small_or_medium(span, p); + else + _rpmalloc_deallocate_defer_small_or_medium(span, p); +} + +//! Deallocate the given large memory block to the current heap +static void _rpmalloc_deallocate_large(span_t *span) { + rpmalloc_assert(span->size_class == SIZE_CLASS_LARGE, "Bad span size class"); + rpmalloc_assert(!(span->flags & SPAN_FLAG_MASTER) || + !(span->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + rpmalloc_assert((span->flags & SPAN_FLAG_MASTER) || + (span->flags & SPAN_FLAG_SUBSPAN), + "Span flag corrupted"); + // We must always defer (unless finalizing) if from another heap since we + // cannot touch the list or counters of another heap +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = + (span->heap->owner_thread && + (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = + ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (defer) { + _rpmalloc_deallocate_defer_free_span(span->heap, span); + return; + } + rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted"); + --span->heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); +#endif +#if ENABLE_ADAPTIVE_THREAD_CACHE || ENABLE_STATISTICS + // Decrease counter + size_t idx = span->span_count - 1; + atomic_decr32(&span->heap->span_use[idx].current); +#endif + heap_t *heap = span->heap; + rpmalloc_assert(heap, "No thread heap"); +#if ENABLE_THREAD_CACHE + const int set_as_reserved = + ((span->span_count > 1) && (heap->span_cache.count == 0) && + !heap->finalize && !heap->spans_reserved); +#else + const int set_as_reserved = + ((span->span_count > 1) && !heap->finalize && !heap->spans_reserved); +#endif + if (set_as_reserved) { + heap->span_reserve = span; + heap->spans_reserved = span->span_count; + if (span->flags & SPAN_FLAG_MASTER) { + heap->span_reserve_master = span; + } else { // SPAN_FLAG_SUBSPAN + span_t *master = (span_t *)pointer_offset( + span, + -(intptr_t)((size_t)span->offset_from_master * _memory_span_size)); + heap->span_reserve_master = master; + rpmalloc_assert(master->flags & SPAN_FLAG_MASTER, "Span flag corrupted"); + rpmalloc_assert(atomic_load32(&master->remaining_spans) >= + (int32_t)span->span_count, + "Master span count corrupted"); + } + _rpmalloc_stat_inc(&heap->span_use[idx].spans_to_reserved); + } else { + // Insert into cache list + _rpmalloc_heap_cache_insert(heap, span); + } +} + +//! 
Deallocate the given huge span +static void _rpmalloc_deallocate_huge(span_t *span) { + rpmalloc_assert(span->heap, "No span heap"); +#if RPMALLOC_FIRST_CLASS_HEAPS + int defer = + (span->heap->owner_thread && + (span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#else + int defer = + ((span->heap->owner_thread != get_thread_id()) && !span->heap->finalize); +#endif + if (defer) { + _rpmalloc_deallocate_defer_free_span(span->heap, span); + return; + } + rpmalloc_assert(span->heap->full_span_count, "Heap span counter corrupted"); + --span->heap->full_span_count; +#if RPMALLOC_FIRST_CLASS_HEAPS + _rpmalloc_span_double_link_list_remove(&span->heap->large_huge_span, span); +#endif + + // Oversized allocation, page count is stored in span_count + size_t num_pages = span->span_count; + _rpmalloc_unmap(span, num_pages * _memory_page_size, span->align_offset, + num_pages * _memory_page_size); + _rpmalloc_stat_sub(&_huge_pages_current, num_pages); +} + +//! Deallocate the given block +static void _rpmalloc_deallocate(void *p) { + _rpmalloc_stat_add64(&_deallocation_counter, 1); + // Grab the span (always at start of span, using span alignment) + span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask); + if (UNEXPECTED(!span)) + return; + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) + _rpmalloc_deallocate_small_or_medium(span, p); + else if (span->size_class == SIZE_CLASS_LARGE) + _rpmalloc_deallocate_large(span); + else + _rpmalloc_deallocate_huge(span); +} + +//////////// +/// +/// Reallocation entry points +/// +////// + +static size_t _rpmalloc_usable_size(void *p); + +//! Reallocate the given block to the given size +static void *_rpmalloc_reallocate(heap_t *heap, void *p, size_t size, + size_t oldsize, unsigned int flags) { + if (p) { + // Grab the span using guaranteed span alignment + span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask); + if (EXPECTED(span->size_class < SIZE_CLASS_COUNT)) { + // Small/medium sized block + rpmalloc_assert(span->span_count == 1, "Span counter corrupted"); + void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE); + uint32_t block_offset = (uint32_t)pointer_diff(p, blocks_start); + uint32_t block_idx = block_offset / span->block_size; + void *block = + pointer_offset(blocks_start, (size_t)block_idx * span->block_size); + if (!oldsize) + oldsize = + (size_t)((ptrdiff_t)span->block_size - pointer_diff(p, block)); + if ((size_t)span->block_size >= size) { + // Still fits in block, never mind trying to save memory, but preserve + // data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } else if (span->size_class == SIZE_CLASS_LARGE) { + // Large block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_spans = total_size >> _memory_span_size_shift; + if (total_size & (_memory_span_mask - 1)) + ++num_spans; + size_t current_spans = span->span_count; + void *block = pointer_offset(span, SPAN_HEADER_SIZE); + if (!oldsize) + oldsize = (current_spans * _memory_span_size) - + (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE; + if ((current_spans >= num_spans) && (total_size >= (oldsize / 2))) { + // Still fits in block, never mind trying to save memory, but preserve + // data if alignment changed + if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE)) + memmove(block, p, oldsize); + return block; + } + } else { + // Oversized block + size_t total_size = size + SPAN_HEADER_SIZE; + size_t num_pages = total_size >> _memory_page_size_shift; + if 
(total_size & (_memory_page_size - 1))
+        ++num_pages;
+      // Page count is stored in span_count
+      size_t current_pages = span->span_count;
+      void *block = pointer_offset(span, SPAN_HEADER_SIZE);
+      if (!oldsize)
+        oldsize = (current_pages * _memory_page_size) -
+                  (size_t)pointer_diff(p, block) - SPAN_HEADER_SIZE;
+      if ((current_pages >= num_pages) && (num_pages >= (current_pages / 2))) {
+        // Still fits in block, never mind trying to save memory, but preserve
+        // data if alignment changed
+        if ((p != block) && !(flags & RPMALLOC_NO_PRESERVE))
+          memmove(block, p, oldsize);
+        return block;
+      }
+    }
+  } else {
+    oldsize = 0;
+  }
+
+  if (!!(flags & RPMALLOC_GROW_OR_FAIL))
+    return 0;
+
+  // Size is greater than block size, need to allocate a new block and
+  // deallocate the old. Avoid hysteresis by overallocating if the increase
+  // is small (below 37%)
+  size_t lower_bound = oldsize + (oldsize >> 2) + (oldsize >> 3);
+  size_t new_size =
+      (size > lower_bound) ? size : ((size > oldsize) ? lower_bound : size);
+  void *block = _rpmalloc_allocate(heap, new_size);
+  if (p && block) {
+    if (!(flags & RPMALLOC_NO_PRESERVE))
+      memcpy(block, p, oldsize < new_size ? oldsize : new_size);
+    _rpmalloc_deallocate(p);
+  }
+
+  return block;
+}
+
+static void *_rpmalloc_aligned_reallocate(heap_t *heap, void *ptr,
+                                          size_t alignment, size_t size,
+                                          size_t oldsize, unsigned int flags) {
+  if (alignment <= SMALL_GRANULARITY)
+    return _rpmalloc_reallocate(heap, ptr, size, oldsize, flags);
+
+  int no_alloc = !!(flags & RPMALLOC_GROW_OR_FAIL);
+  size_t usablesize = (ptr ? _rpmalloc_usable_size(ptr) : 0);
+  if ((usablesize >= size) && !((uintptr_t)ptr & (alignment - 1))) {
+    if (no_alloc || (size >= (usablesize / 2)))
+      return ptr;
+  }
+  // Aligned alloc marks span as having aligned blocks
+  void *block =
+      (!no_alloc ? _rpmalloc_aligned_allocate(heap, alignment, size) : 0);
+  if (EXPECTED(block != 0)) {
+    if (!(flags & RPMALLOC_NO_PRESERVE) && ptr) {
+      if (!oldsize)
+        oldsize = usablesize;
+      memcpy(block, ptr, oldsize < size ? oldsize : size);
+    }
+    _rpmalloc_deallocate(ptr);
+  }
+  return block;
+}
+
+////////////
+///
+/// Initialization, finalization and utility
+///
+//////
+
+//! Get the usable size of the given block
+static size_t _rpmalloc_usable_size(void *p) {
+  // Grab the span using guaranteed span alignment
+  span_t *span = (span_t *)((uintptr_t)p & _memory_span_mask);
+  if (span->size_class < SIZE_CLASS_COUNT) {
+    // Small/medium block
+    void *blocks_start = pointer_offset(span, SPAN_HEADER_SIZE);
+    return span->block_size -
+           ((size_t)pointer_diff(p, blocks_start) % span->block_size);
+  }
+  if (span->size_class == SIZE_CLASS_LARGE) {
+    // Large block
+    size_t current_spans = span->span_count;
+    return (current_spans * _memory_span_size) - (size_t)pointer_diff(p, span);
+  }
+  // Oversized block, page count is stored in span_count
+  size_t current_pages = span->span_count;
+  return (current_pages * _memory_page_size) - (size_t)pointer_diff(p, span);
+}
+
+//! 
Adjust and optimize the size class properties for the given class +static void _rpmalloc_adjust_size_class(size_t iclass) { + size_t block_size = _memory_size_class[iclass].block_size; + size_t block_count = (_memory_span_size - SPAN_HEADER_SIZE) / block_size; + + _memory_size_class[iclass].block_count = (uint16_t)block_count; + _memory_size_class[iclass].class_idx = (uint16_t)iclass; + + // Check if previous size classes can be merged + if (iclass >= SMALL_CLASS_COUNT) { + size_t prevclass = iclass; + while (prevclass > 0) { + --prevclass; + // A class can be merged if number of pages and number of blocks are equal + if (_memory_size_class[prevclass].block_count == + _memory_size_class[iclass].block_count) + _rpmalloc_memcpy_const(_memory_size_class + prevclass, + _memory_size_class + iclass, + sizeof(_memory_size_class[iclass])); + else + break; + } + } +} + +//! Initialize the allocator and setup global data +extern inline int rpmalloc_initialize(void) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + return rpmalloc_initialize_config(0); +} + +int rpmalloc_initialize_config(const rpmalloc_config_t *config) { + if (_rpmalloc_initialized) { + rpmalloc_thread_initialize(); + return 0; + } + _rpmalloc_initialized = 1; + + if (config) + memcpy(&_memory_config, config, sizeof(rpmalloc_config_t)); + else + _rpmalloc_memset_const(&_memory_config, 0, sizeof(rpmalloc_config_t)); + + if (!_memory_config.memory_map || !_memory_config.memory_unmap) { + _memory_config.memory_map = _rpmalloc_mmap_os; + _memory_config.memory_unmap = _rpmalloc_unmap_os; + } + +#if PLATFORM_WINDOWS + SYSTEM_INFO system_info; + memset(&system_info, 0, sizeof(system_info)); + GetSystemInfo(&system_info); + _memory_map_granularity = system_info.dwAllocationGranularity; +#else + _memory_map_granularity = (size_t)sysconf(_SC_PAGESIZE); +#endif + +#if RPMALLOC_CONFIGURABLE + _memory_page_size = _memory_config.page_size; +#else + _memory_page_size = 0; +#endif + _memory_huge_pages = 0; + if (!_memory_page_size) { +#if PLATFORM_WINDOWS + _memory_page_size = system_info.dwPageSize; +#else + _memory_page_size = _memory_map_granularity; + if (_memory_config.enable_huge_pages) { +#if defined(__linux__) + size_t huge_page_size = 0; + FILE *meminfo = fopen("/proc/meminfo", "r"); + if (meminfo) { + char line[128]; + while (!huge_page_size && fgets(line, sizeof(line) - 1, meminfo)) { + line[sizeof(line) - 1] = 0; + if (strstr(line, "Hugepagesize:")) + huge_page_size = (size_t)strtol(line + 13, 0, 10) * 1024; + } + fclose(meminfo); + } + if (huge_page_size) { + _memory_huge_pages = 1; + _memory_page_size = huge_page_size; + _memory_map_granularity = huge_page_size; + } +#elif defined(__FreeBSD__) + int rc; + size_t sz = sizeof(rc); + + if (sysctlbyname("vm.pmap.pg_ps_enabled", &rc, &sz, NULL, 0) == 0 && + rc == 1) { + static size_t defsize = 2 * 1024 * 1024; + int nsize = 0; + size_t sizes[4] = {0}; + _memory_huge_pages = 1; + _memory_page_size = defsize; + if ((nsize = getpagesizes(sizes, 4)) >= 2) { + nsize--; + for (size_t csize = sizes[nsize]; nsize >= 0 && csize; + --nsize, csize = sizes[nsize]) { + //! Unlikely, but as a precaution.. 
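+          // (each size reported by getpagesizes() is expected to be a power
+          // of two and a multiple of 1KiB; only sizes larger than the 2MiB
+          // default are adopted below)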
+ rpmalloc_assert(!(csize & (csize - 1)) && !(csize % 1024), + "Invalid page size"); + if (defsize < csize) { + _memory_page_size = csize; + break; + } + } + } + _memory_map_granularity = _memory_page_size; + } +#elif defined(__APPLE__) || defined(__NetBSD__) + _memory_huge_pages = 1; + _memory_page_size = 2 * 1024 * 1024; + _memory_map_granularity = _memory_page_size; +#endif + } +#endif + } else { + if (_memory_config.enable_huge_pages) + _memory_huge_pages = 1; + } + +#if PLATFORM_WINDOWS + if (_memory_config.enable_huge_pages) { + HANDLE token = 0; + size_t large_page_minimum = GetLargePageMinimum(); + if (large_page_minimum) + OpenProcessToken(GetCurrentProcess(), + TOKEN_ADJUST_PRIVILEGES | TOKEN_QUERY, &token); + if (token) { + LUID luid; + if (LookupPrivilegeValue(0, SE_LOCK_MEMORY_NAME, &luid)) { + TOKEN_PRIVILEGES token_privileges; + memset(&token_privileges, 0, sizeof(token_privileges)); + token_privileges.PrivilegeCount = 1; + token_privileges.Privileges[0].Luid = luid; + token_privileges.Privileges[0].Attributes = SE_PRIVILEGE_ENABLED; + if (AdjustTokenPrivileges(token, FALSE, &token_privileges, 0, 0, 0)) { + if (GetLastError() == ERROR_SUCCESS) + _memory_huge_pages = 1; + } + } + CloseHandle(token); + } + if (_memory_huge_pages) { + if (large_page_minimum > _memory_page_size) + _memory_page_size = large_page_minimum; + if (large_page_minimum > _memory_map_granularity) + _memory_map_granularity = large_page_minimum; + } + } +#endif + + size_t min_span_size = 256; + size_t max_page_size; +#if UINTPTR_MAX > 0xFFFFFFFF + max_page_size = 4096ULL * 1024ULL * 1024ULL; +#else + max_page_size = 4 * 1024 * 1024; +#endif + if (_memory_page_size < min_span_size) + _memory_page_size = min_span_size; + if (_memory_page_size > max_page_size) + _memory_page_size = max_page_size; + _memory_page_size_shift = 0; + size_t page_size_bit = _memory_page_size; + while (page_size_bit != 1) { + ++_memory_page_size_shift; + page_size_bit >>= 1; + } + _memory_page_size = ((size_t)1 << _memory_page_size_shift); + +#if RPMALLOC_CONFIGURABLE + if (!_memory_config.span_size) { + _memory_span_size = _memory_default_span_size; + _memory_span_size_shift = _memory_default_span_size_shift; + _memory_span_mask = _memory_default_span_mask; + } else { + size_t span_size = _memory_config.span_size; + if (span_size > (256 * 1024)) + span_size = (256 * 1024); + _memory_span_size = 4096; + _memory_span_size_shift = 12; + while (_memory_span_size < span_size) { + _memory_span_size <<= 1; + ++_memory_span_size_shift; + } + _memory_span_mask = ~(uintptr_t)(_memory_span_size - 1); + } +#endif + + _memory_span_map_count = + (_memory_config.span_map_count ? _memory_config.span_map_count + : DEFAULT_SPAN_MAP_COUNT); + if ((_memory_span_size * _memory_span_map_count) < _memory_page_size) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + if ((_memory_page_size >= _memory_span_size) && + ((_memory_span_map_count * _memory_span_size) % _memory_page_size)) + _memory_span_map_count = (_memory_page_size / _memory_span_size); + _memory_heap_reserve_count = (_memory_span_map_count > DEFAULT_SPAN_MAP_COUNT) + ? 
DEFAULT_SPAN_MAP_COUNT + : _memory_span_map_count; + + _memory_config.page_size = _memory_page_size; + _memory_config.span_size = _memory_span_size; + _memory_config.span_map_count = _memory_span_map_count; + _memory_config.enable_huge_pages = _memory_huge_pages; + +#if ((defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD) || \ + defined(__TINYC__) + if (pthread_key_create(&_memory_thread_heap, _rpmalloc_heap_release_raw_fc)) + return -1; +#endif +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + fls_key = FlsAlloc(&_rpmalloc_thread_destructor); +#endif + + // Setup all small and medium size classes + size_t iclass = 0; + _memory_size_class[iclass].block_size = SMALL_GRANULARITY; + _rpmalloc_adjust_size_class(iclass); + for (iclass = 1; iclass < SMALL_CLASS_COUNT; ++iclass) { + size_t size = iclass * SMALL_GRANULARITY; + _memory_size_class[iclass].block_size = (uint32_t)size; + _rpmalloc_adjust_size_class(iclass); + } + // At least two blocks per span, then fall back to large allocations + _memory_medium_size_limit = (_memory_span_size - SPAN_HEADER_SIZE) >> 1; + if (_memory_medium_size_limit > MEDIUM_SIZE_LIMIT) + _memory_medium_size_limit = MEDIUM_SIZE_LIMIT; + for (iclass = 0; iclass < MEDIUM_CLASS_COUNT; ++iclass) { + size_t size = SMALL_SIZE_LIMIT + ((iclass + 1) * MEDIUM_GRANULARITY); + if (size > _memory_medium_size_limit) { + _memory_medium_size_limit = + SMALL_SIZE_LIMIT + (iclass * MEDIUM_GRANULARITY); + break; + } + _memory_size_class[SMALL_CLASS_COUNT + iclass].block_size = (uint32_t)size; + _rpmalloc_adjust_size_class(SMALL_CLASS_COUNT + iclass); + } + + _memory_orphan_heaps = 0; +#if RPMALLOC_FIRST_CLASS_HEAPS + _memory_first_class_orphan_heaps = 0; +#endif +#if ENABLE_STATISTICS + atomic_store32(&_memory_active_heaps, 0); + atomic_store32(&_mapped_pages, 0); + _mapped_pages_peak = 0; + atomic_store32(&_master_spans, 0); + atomic_store32(&_mapped_total, 0); + atomic_store32(&_unmapped_total, 0); + atomic_store32(&_mapped_pages_os, 0); + atomic_store32(&_huge_pages_current, 0); + _huge_pages_peak = 0; +#endif + memset(_memory_heaps, 0, sizeof(_memory_heaps)); + atomic_store32_release(&_memory_global_lock, 0); + + rpmalloc_linker_reference(); + + // Initialize this thread + rpmalloc_thread_initialize(); + return 0; +} + +//! 
Finalize the allocator +void rpmalloc_finalize(void) { + rpmalloc_thread_finalize(1); + // rpmalloc_dump_statistics(stdout); + + if (_memory_global_reserve) { + atomic_add32(&_memory_global_reserve_master->remaining_spans, + -(int32_t)_memory_global_reserve_count); + _memory_global_reserve_master = 0; + _memory_global_reserve_count = 0; + _memory_global_reserve = 0; + } + atomic_store32_release(&_memory_global_lock, 0); + + // Free all thread caches and fully free spans + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t *heap = _memory_heaps[list_idx]; + while (heap) { + heap_t *next_heap = heap->next_heap; + heap->finalize = 1; + _rpmalloc_heap_global_finalize(heap); + heap = next_heap; + } + } + +#if ENABLE_GLOBAL_CACHE + // Free global caches + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) + _rpmalloc_global_cache_finalize(&_memory_span_cache[iclass]); +#endif + +#if (defined(__APPLE__) || defined(__HAIKU__)) && ENABLE_PRELOAD + pthread_key_delete(_memory_thread_heap); +#endif +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsFree(fls_key); + fls_key = 0; +#endif +#if ENABLE_STATISTICS + // If you hit these asserts you probably have memory leaks (perhaps global + // scope data doing dynamic allocations) or double frees in your code + rpmalloc_assert(atomic_load32(&_mapped_pages) == 0, "Memory leak detected"); + rpmalloc_assert(atomic_load32(&_mapped_pages_os) == 0, + "Memory leak detected"); +#endif + + _rpmalloc_initialized = 0; +} + +//! Initialize thread, assign heap +extern inline void rpmalloc_thread_initialize(void) { + if (!get_thread_heap_raw()) { + heap_t *heap = _rpmalloc_heap_allocate(0); + if (heap) { + _rpmalloc_stat_inc(&_memory_active_heaps); + set_thread_heap(heap); +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, heap); +#endif + } + } +} + +//! Finalize thread, orphan heap +void rpmalloc_thread_finalize(int release_caches) { + heap_t *heap = get_thread_heap_raw(); + if (heap) + _rpmalloc_heap_release_raw(heap, release_caches); + set_thread_heap(0); +#if defined(_WIN32) && (!defined(BUILD_DYNAMIC_LINK) || !BUILD_DYNAMIC_LINK) + FlsSetValue(fls_key, 0); +#endif +} + +int rpmalloc_is_thread_initialized(void) { + return (get_thread_heap_raw() != 0) ? 
1 : 0; +} + +const rpmalloc_config_t *rpmalloc_config(void) { return &_memory_config; } + +// Extern interface + +extern inline RPMALLOC_ALLOCATOR void *rpmalloc(size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + heap_t *heap = get_thread_heap(); + return _rpmalloc_allocate(heap, size); +} + +extern inline void rpfree(void *ptr) { _rpmalloc_deallocate(ptr); } + +extern inline RPMALLOC_ALLOCATOR void *rpcalloc(size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + heap_t *heap = get_thread_heap(); + void *block = _rpmalloc_allocate(heap, total); + if (block) + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void *rprealloc(void *ptr, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + heap_t *heap = get_thread_heap(); + return _rpmalloc_reallocate(heap, ptr, size, 0, 0); +} + +extern RPMALLOC_ALLOCATOR void *rpaligned_realloc(void *ptr, size_t alignment, + size_t size, size_t oldsize, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + heap_t *heap = get_thread_heap(); + return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, oldsize, + flags); +} + +extern RPMALLOC_ALLOCATOR void *rpaligned_alloc(size_t alignment, size_t size) { + heap_t *heap = get_thread_heap(); + return _rpmalloc_aligned_allocate(heap, alignment, size); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpaligned_calloc(size_t alignment, size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + void *block = rpaligned_alloc(alignment, total); + if (block) + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void *rpmemalign(size_t alignment, + size_t size) { + return rpaligned_alloc(alignment, size); +} + +extern inline int rpposix_memalign(void **memptr, size_t alignment, + size_t size) { + if (memptr) + *memptr = rpaligned_alloc(alignment, size); + else + return EINVAL; + return *memptr ? 0 : ENOMEM; +} + +extern inline size_t rpmalloc_usable_size(void *ptr) { + return (ptr ? 
_rpmalloc_usable_size(ptr) : 0); +} + +extern inline void rpmalloc_thread_collect(void) {} + +void rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats) { + memset(stats, 0, sizeof(rpmalloc_thread_statistics_t)); + heap_t *heap = get_thread_heap_raw(); + if (!heap) + return; + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + size_class_t *size_class = _memory_size_class + iclass; + span_t *span = heap->size_class[iclass].partial_span; + while (span) { + size_t free_count = span->list_size; + size_t block_count = size_class->block_count; + if (span->free_list_limit < block_count) + block_count = span->free_list_limit; + free_count += (block_count - span->used_count); + stats->sizecache += free_count * size_class->block_size; + span = span->next; + } + } + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + stats->spancache += span_cache->count * (iclass + 1) * _memory_span_size; + } +#endif + + span_t *deferred = (span_t *)atomic_load_ptr(&heap->span_free_deferred); + while (deferred) { + if (deferred->size_class != SIZE_CLASS_HUGE) + stats->spancache += (size_t)deferred->span_count * _memory_span_size; + deferred = (span_t *)deferred->free_list; + } + +#if ENABLE_STATISTICS + stats->thread_to_global = (size_t)atomic_load64(&heap->thread_to_global); + stats->global_to_thread = (size_t)atomic_load64(&heap->global_to_thread); + + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + stats->span_use[iclass].current = + (size_t)atomic_load32(&heap->span_use[iclass].current); + stats->span_use[iclass].peak = + (size_t)atomic_load32(&heap->span_use[iclass].high); + stats->span_use[iclass].to_global = + (size_t)atomic_load32(&heap->span_use[iclass].spans_to_global); + stats->span_use[iclass].from_global = + (size_t)atomic_load32(&heap->span_use[iclass].spans_from_global); + stats->span_use[iclass].to_cache = + (size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache); + stats->span_use[iclass].from_cache = + (size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache); + stats->span_use[iclass].to_reserved = + (size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved); + stats->span_use[iclass].from_reserved = + (size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved); + stats->span_use[iclass].map_calls = + (size_t)atomic_load32(&heap->span_use[iclass].spans_map_calls); + } + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + stats->size_use[iclass].alloc_current = + (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_current); + stats->size_use[iclass].alloc_peak = + (size_t)heap->size_class_use[iclass].alloc_peak; + stats->size_use[iclass].alloc_total = + (size_t)atomic_load32(&heap->size_class_use[iclass].alloc_total); + stats->size_use[iclass].free_total = + (size_t)atomic_load32(&heap->size_class_use[iclass].free_total); + stats->size_use[iclass].spans_to_cache = + (size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache); + stats->size_use[iclass].spans_from_cache = + (size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache); + stats->size_use[iclass].spans_from_reserved = (size_t)atomic_load32( + &heap->size_class_use[iclass].spans_from_reserved); + stats->size_use[iclass].map_calls = + (size_t)atomic_load32(&heap->size_class_use[iclass].spans_map_calls); + } +#endif +} + +void 
rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats) { + memset(stats, 0, sizeof(rpmalloc_global_statistics_t)); +#if ENABLE_STATISTICS + stats->mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + stats->mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; + stats->mapped_total = + (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + stats->unmapped_total = + (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; + stats->huge_alloc = + (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; + stats->huge_alloc_peak = (size_t)_huge_pages_peak * _memory_page_size; +#endif +#if ENABLE_GLOBAL_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + global_cache_t *cache = &_memory_span_cache[iclass]; + while (!atomic_cas32_acquire(&cache->lock, 1, 0)) + _rpmalloc_spin(); + uint32_t count = cache->count; +#if ENABLE_UNLIMITED_CACHE + span_t *current_span = cache->overflow; + while (current_span) { + ++count; + current_span = current_span->next; + } +#endif + atomic_store32_release(&cache->lock, 0); + stats->cached += count * (iclass + 1) * _memory_span_size; + } +#endif +} + +#if ENABLE_STATISTICS + +static void _memory_heap_dump_statistics(heap_t *heap, void *file) { + fprintf(file, "Heap %d stats:\n", heap->id); + fprintf(file, "Class CurAlloc PeakAlloc TotAlloc TotFree BlkSize " + "BlkCount SpansCur SpansPeak PeakAllocMiB ToCacheMiB " + "FromCacheMiB FromReserveMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) + continue; + fprintf( + file, + "%3u: %10u %10u %10u %10u %8u %8u %8d %9d %13zu %11zu %12zu %14zu " + "%9u\n", + (uint32_t)iclass, + atomic_load32(&heap->size_class_use[iclass].alloc_current), + heap->size_class_use[iclass].alloc_peak, + atomic_load32(&heap->size_class_use[iclass].alloc_total), + atomic_load32(&heap->size_class_use[iclass].free_total), + _memory_size_class[iclass].block_size, + _memory_size_class[iclass].block_count, + atomic_load32(&heap->size_class_use[iclass].spans_current), + heap->size_class_use[iclass].spans_peak, + ((size_t)heap->size_class_use[iclass].alloc_peak * + (size_t)_memory_size_class[iclass].block_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_to_cache) * + _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->size_class_use[iclass].spans_from_cache) * + _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32( + &heap->size_class_use[iclass].spans_from_reserved) * + _memory_span_size) / + (size_t)(1024 * 1024), + atomic_load32(&heap->size_class_use[iclass].spans_map_calls)); + } + fprintf(file, "Spans Current Peak Deferred PeakMiB Cached ToCacheMiB " + "FromCacheMiB ToReserveMiB FromReserveMiB ToGlobalMiB " + "FromGlobalMiB MmapCalls\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + if (!atomic_load32(&heap->span_use[iclass].high) && + !atomic_load32(&heap->span_use[iclass].spans_map_calls)) + continue; + fprintf( + file, + "%4u: %8d %8u %8u %8zu %7u %11zu %12zu %12zu %14zu %11zu %13zu %10u\n", + (uint32_t)(iclass + 1), atomic_load32(&heap->span_use[iclass].current), + atomic_load32(&heap->span_use[iclass].high), + atomic_load32(&heap->span_use[iclass].spans_deferred), + ((size_t)atomic_load32(&heap->span_use[iclass].high) * + (size_t)_memory_span_size * (iclass + 1)) / + (size_t)(1024 * 1024), +#if ENABLE_THREAD_CACHE + (unsigned int)(!iclass ? 
heap->span_cache.count + : heap->span_large_cache[iclass - 1].count), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_cache) * + (iclass + 1) * _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_cache) * + (iclass + 1) * _memory_span_size) / + (size_t)(1024 * 1024), +#else + 0, (size_t)0, (size_t)0, +#endif + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_reserved) * + (iclass + 1) * _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_reserved) * + (iclass + 1) * _memory_span_size) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_to_global) * + (size_t)_memory_span_size * (iclass + 1)) / + (size_t)(1024 * 1024), + ((size_t)atomic_load32(&heap->span_use[iclass].spans_from_global) * + (size_t)_memory_span_size * (iclass + 1)) / + (size_t)(1024 * 1024), + atomic_load32(&heap->span_use[iclass].spans_map_calls)); + } + fprintf(file, "Full spans: %zu\n", heap->full_span_count); + fprintf(file, "ThreadToGlobalMiB GlobalToThreadMiB\n"); + fprintf( + file, "%17zu %17zu\n", + (size_t)atomic_load64(&heap->thread_to_global) / (size_t)(1024 * 1024), + (size_t)atomic_load64(&heap->global_to_thread) / (size_t)(1024 * 1024)); +} + +#endif + +void rpmalloc_dump_statistics(void *file) { +#if ENABLE_STATISTICS + for (size_t list_idx = 0; list_idx < HEAP_ARRAY_SIZE; ++list_idx) { + heap_t *heap = _memory_heaps[list_idx]; + while (heap) { + int need_dump = 0; + for (size_t iclass = 0; !need_dump && (iclass < SIZE_CLASS_COUNT); + ++iclass) { + if (!atomic_load32(&heap->size_class_use[iclass].alloc_total)) { + rpmalloc_assert( + !atomic_load32(&heap->size_class_use[iclass].free_total), + "Heap statistics counter mismatch"); + rpmalloc_assert( + !atomic_load32(&heap->size_class_use[iclass].spans_map_calls), + "Heap statistics counter mismatch"); + continue; + } + need_dump = 1; + } + for (size_t iclass = 0; !need_dump && (iclass < LARGE_CLASS_COUNT); + ++iclass) { + if (!atomic_load32(&heap->span_use[iclass].high) && + !atomic_load32(&heap->span_use[iclass].spans_map_calls)) + continue; + need_dump = 1; + } + if (need_dump) + _memory_heap_dump_statistics(heap, file); + heap = heap->next_heap; + } + } + fprintf(file, "Global stats:\n"); + size_t huge_current = + (size_t)atomic_load32(&_huge_pages_current) * _memory_page_size; + size_t huge_peak = (size_t)_huge_pages_peak * _memory_page_size; + fprintf(file, "HugeCurrentMiB HugePeakMiB\n"); + fprintf(file, "%14zu %11zu\n", huge_current / (size_t)(1024 * 1024), + huge_peak / (size_t)(1024 * 1024)); + +#if ENABLE_GLOBAL_CACHE + fprintf(file, "GlobalCacheMiB\n"); + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + global_cache_t *cache = _memory_span_cache + iclass; + size_t global_cache = (size_t)cache->count * iclass * _memory_span_size; + + size_t global_overflow_cache = 0; + span_t *span = cache->overflow; + while (span) { + global_overflow_cache += iclass * _memory_span_size; + span = span->next; + } + if (global_cache || global_overflow_cache || cache->insert_count || + cache->extract_count) + fprintf(file, + "%4zu: %8zuMiB (%8zuMiB overflow) %14zu insert %14zu extract\n", + iclass + 1, global_cache / (size_t)(1024 * 1024), + global_overflow_cache / (size_t)(1024 * 1024), + cache->insert_count, cache->extract_count); + } +#endif + + size_t mapped = (size_t)atomic_load32(&_mapped_pages) * _memory_page_size; + size_t mapped_os = + (size_t)atomic_load32(&_mapped_pages_os) * 
_memory_page_size; + size_t mapped_peak = (size_t)_mapped_pages_peak * _memory_page_size; + size_t mapped_total = + (size_t)atomic_load32(&_mapped_total) * _memory_page_size; + size_t unmapped_total = + (size_t)atomic_load32(&_unmapped_total) * _memory_page_size; + fprintf( + file, + "MappedMiB MappedOSMiB MappedPeakMiB MappedTotalMiB UnmappedTotalMiB\n"); + fprintf(file, "%9zu %11zu %13zu %14zu %16zu\n", + mapped / (size_t)(1024 * 1024), mapped_os / (size_t)(1024 * 1024), + mapped_peak / (size_t)(1024 * 1024), + mapped_total / (size_t)(1024 * 1024), + unmapped_total / (size_t)(1024 * 1024)); + + fprintf(file, "\n"); +#if 0 + int64_t allocated = atomic_load64(&_allocation_counter); + int64_t deallocated = atomic_load64(&_deallocation_counter); + fprintf(file, "Allocation count: %lli\n", allocated); + fprintf(file, "Deallocation count: %lli\n", deallocated); + fprintf(file, "Current allocations: %lli\n", (allocated - deallocated)); + fprintf(file, "Master spans: %d\n", atomic_load32(&_master_spans)); + fprintf(file, "Dangling master spans: %d\n", atomic_load32(&_unmapped_master_spans)); +#endif +#endif + (void)sizeof(file); +} + +#if RPMALLOC_FIRST_CLASS_HEAPS + +extern inline rpmalloc_heap_t *rpmalloc_heap_acquire(void) { + // Must be a pristine heap from newly mapped memory pages, or else memory + // blocks could already be allocated from the heap which would (wrongly) be + // released when heap is cleared with rpmalloc_heap_free_all(). Also heaps + // guaranteed to be pristine from the dedicated orphan list can be used. + heap_t *heap = _rpmalloc_heap_allocate(1); + rpmalloc_assume(heap != NULL); + heap->owner_thread = 0; + _rpmalloc_stat_inc(&_memory_active_heaps); + return heap; +} + +extern inline void rpmalloc_heap_release(rpmalloc_heap_t *heap) { + if (heap) + _rpmalloc_heap_release(heap, 1, 1); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_allocate(heap, size); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, + size_t size) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_aligned_allocate(heap, alignment, size); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_calloc(rpmalloc_heap_t *heap, size_t num, size_t size) { + return rpmalloc_heap_aligned_calloc(heap, 0, num, size); +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_aligned_calloc(rpmalloc_heap_t *heap, size_t alignment, + size_t num, size_t size) { + size_t total; +#if ENABLE_VALIDATE_ARGS +#if PLATFORM_WINDOWS + int err = SizeTMult(num, size, &total); + if ((err != S_OK) || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#else + int err = __builtin_umull_overflow(num, size, &total); + if (err || (total >= MAX_ALLOC_SIZE)) { + errno = EINVAL; + return 0; + } +#endif +#else + total = num * size; +#endif + void *block = _rpmalloc_aligned_allocate(heap, alignment, total); + if (block) + memset(block, 0, total); + return block; +} + +extern inline RPMALLOC_ALLOCATOR void * +rpmalloc_heap_realloc(rpmalloc_heap_t *heap, void *ptr, size_t size, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if (size >= MAX_ALLOC_SIZE) { + errno = EINVAL; + return ptr; + } +#endif + return _rpmalloc_reallocate(heap, ptr, size, 0, flags); +} + +extern inline RPMALLOC_ALLOCATOR void * 
+rpmalloc_heap_aligned_realloc(rpmalloc_heap_t *heap, void *ptr, + size_t alignment, size_t size, + unsigned int flags) { +#if ENABLE_VALIDATE_ARGS + if ((size + alignment < size) || (alignment > _memory_page_size)) { + errno = EINVAL; + return 0; + } +#endif + return _rpmalloc_aligned_reallocate(heap, ptr, alignment, size, 0, flags); +} + +extern inline void rpmalloc_heap_free(rpmalloc_heap_t *heap, void *ptr) { + (void)sizeof(heap); + _rpmalloc_deallocate(ptr); +} + +extern inline void rpmalloc_heap_free_all(rpmalloc_heap_t *heap) { + span_t *span; + span_t *next_span; + + _rpmalloc_heap_cache_adopt_deferred(heap, 0); + + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + span = heap->size_class[iclass].partial_span; + while (span) { + next_span = span->next; + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + heap->size_class[iclass].partial_span = 0; + span = heap->full_span[iclass]; + while (span) { + next_span = span->next; + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + + span = heap->size_class[iclass].cache; + if (span) + _rpmalloc_heap_cache_insert(heap, span); + heap->size_class[iclass].cache = 0; + } + memset(heap->size_class, 0, sizeof(heap->size_class)); + memset(heap->full_span, 0, sizeof(heap->full_span)); + + span = heap->large_huge_span; + while (span) { + next_span = span->next; + if (UNEXPECTED(span->size_class == SIZE_CLASS_HUGE)) + _rpmalloc_deallocate_huge(span); + else + _rpmalloc_heap_cache_insert(heap, span); + span = next_span; + } + heap->large_huge_span = 0; + heap->full_span_count = 0; + +#if ENABLE_THREAD_CACHE + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + span_cache_t *span_cache; + if (!iclass) + span_cache = &heap->span_cache; + else + span_cache = (span_cache_t *)(heap->span_large_cache + (iclass - 1)); + if (!span_cache->count) + continue; +#if ENABLE_GLOBAL_CACHE + _rpmalloc_stat_add64(&heap->thread_to_global, + span_cache->count * (iclass + 1) * _memory_span_size); + _rpmalloc_stat_add(&heap->span_use[iclass].spans_to_global, + span_cache->count); + _rpmalloc_global_cache_insert_spans(span_cache->span, iclass + 1, + span_cache->count); +#else + for (size_t ispan = 0; ispan < span_cache->count; ++ispan) + _rpmalloc_span_unmap(span_cache->span[ispan]); +#endif + span_cache->count = 0; + } +#endif + +#if ENABLE_STATISTICS + for (size_t iclass = 0; iclass < SIZE_CLASS_COUNT; ++iclass) { + atomic_store32(&heap->size_class_use[iclass].alloc_current, 0); + atomic_store32(&heap->size_class_use[iclass].spans_current, 0); + } + for (size_t iclass = 0; iclass < LARGE_CLASS_COUNT; ++iclass) { + atomic_store32(&heap->span_use[iclass].current, 0); + } +#endif +} + +extern inline void rpmalloc_heap_thread_set_current(rpmalloc_heap_t *heap) { + heap_t *prev_heap = get_thread_heap_raw(); + if (prev_heap != heap) { + set_thread_heap(heap); + if (prev_heap) + rpmalloc_heap_release(prev_heap); + } +} + +extern inline rpmalloc_heap_t *rpmalloc_get_heap_for_ptr(void *ptr) { + // Grab the span, and then the heap from the span + span_t *span = (span_t *)((uintptr_t)ptr & _memory_span_mask); + if (span) { + return span->heap; + } + return 0; +} + +#endif + +#if ENABLE_PRELOAD || ENABLE_OVERRIDE + +#include "malloc.c" + +#endif + +void rpmalloc_linker_reference(void) { (void)sizeof(_rpmalloc_initialized); } diff --git a/llvm/lib/Support/rpmalloc/rpmalloc.h b/llvm/lib/Support/rpmalloc/rpmalloc.h index 3911c53b779b..5b7fe1ff4286 100644 --- a/llvm/lib/Support/rpmalloc/rpmalloc.h +++ 
b/llvm/lib/Support/rpmalloc/rpmalloc.h
@@ -1,428 +1,428 @@
-//===---------------------- rpmalloc.h ------------------*- C -*-=============//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This library provides a cross-platform lock free thread caching malloc
-// implementation in C11.
-//
-//===----------------------------------------------------------------------===//
-
-#pragma once
-
-#include <stddef.h>
-
-#ifdef __cplusplus
-extern "C" {
-#endif
-
-#if defined(__clang__) || defined(__GNUC__)
-#define RPMALLOC_EXPORT __attribute__((visibility("default")))
-#define RPMALLOC_ALLOCATOR
-#if (defined(__clang_major__) && (__clang_major__ < 4)) || \
-    (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD)
-#define RPMALLOC_ATTRIB_MALLOC
-#define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
-#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
-#else
-#define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__))
-#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size)))
-#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) \
-  __attribute__((alloc_size(count, size)))
-#endif
-#define RPMALLOC_CDECL
-#elif defined(_MSC_VER)
-#define RPMALLOC_EXPORT
-#define RPMALLOC_ALLOCATOR __declspec(allocator) __declspec(restrict)
-#define RPMALLOC_ATTRIB_MALLOC
-#define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
-#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
-#define RPMALLOC_CDECL __cdecl
-#else
-#define RPMALLOC_EXPORT
-#define RPMALLOC_ALLOCATOR
-#define RPMALLOC_ATTRIB_MALLOC
-#define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
-#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
-#define RPMALLOC_CDECL
-#endif
-
-//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce
-// a very small overhead due to some size calculations not being compile time
-// constants
-#ifndef RPMALLOC_CONFIGURABLE
-#define RPMALLOC_CONFIGURABLE 0
-#endif
-
-//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_*
-//! functions).
-// Will introduce a very small overhead to track fully allocated spans in heaps
-#ifndef RPMALLOC_FIRST_CLASS_HEAPS
-#define RPMALLOC_FIRST_CLASS_HEAPS 0
-#endif
-
-//! Flag to rpaligned_realloc to not preserve content in reallocation
-#define RPMALLOC_NO_PRESERVE 1
-//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be
-//! done in-place,
-// in which case the original pointer is still valid (just like a call to
-// realloc which fails to allocate a new block).
-#define RPMALLOC_GROW_OR_FAIL 2
-
-typedef struct rpmalloc_global_statistics_t {
-  //! Current amount of virtual memory mapped, all of which might not have been
-  //! committed (only if ENABLE_STATISTICS=1)
-  size_t mapped;
-  //! Peak amount of virtual memory mapped, all of which might not have been
-  //! committed (only if ENABLE_STATISTICS=1)
-  size_t mapped_peak;
-  //! Current amount of memory in global caches for small and medium sizes
-  //! (<32KiB)
-  size_t cached;
-  //! Current amount of memory allocated in huge allocations, i.e. larger than
-  //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1)
-  size_t huge_alloc;
-  //! Peak amount of memory allocated in huge allocations, i.e. larger than
-  //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1)
-  size_t huge_alloc_peak;
-  //! 
Total amount of memory mapped since initialization (only if - //! ENABLE_STATISTICS=1) - size_t mapped_total; - //! Total amount of memory unmapped since initialization (only if - //! ENABLE_STATISTICS=1) - size_t unmapped_total; -} rpmalloc_global_statistics_t; - -typedef struct rpmalloc_thread_statistics_t { - //! Current number of bytes available in thread size class caches for small - //! and medium sizes (<32KiB) - size_t sizecache; - //! Current number of bytes available in thread span caches for small and - //! medium sizes (<32KiB) - size_t spancache; - //! Total number of bytes transitioned from thread cache to global cache (only - //! if ENABLE_STATISTICS=1) - size_t thread_to_global; - //! Total number of bytes transitioned from global cache to thread cache (only - //! if ENABLE_STATISTICS=1) - size_t global_to_thread; - //! Per span count statistics (only if ENABLE_STATISTICS=1) - struct { - //! Currently used number of spans - size_t current; - //! High water mark of spans used - size_t peak; - //! Number of spans transitioned to global cache - size_t to_global; - //! Number of spans transitioned from global cache - size_t from_global; - //! Number of spans transitioned to thread cache - size_t to_cache; - //! Number of spans transitioned from thread cache - size_t from_cache; - //! Number of spans transitioned to reserved state - size_t to_reserved; - //! Number of spans transitioned from reserved state - size_t from_reserved; - //! Number of raw memory map calls (not hitting the reserve spans but - //! resulting in actual OS mmap calls) - size_t map_calls; - } span_use[64]; - //! Per size class statistics (only if ENABLE_STATISTICS=1) - struct { - //! Current number of allocations - size_t alloc_current; - //! Peak number of allocations - size_t alloc_peak; - //! Total number of allocations - size_t alloc_total; - //! Total number of frees - size_t free_total; - //! Number of spans transitioned to cache - size_t spans_to_cache; - //! Number of spans transitioned from cache - size_t spans_from_cache; - //! Number of spans transitioned from reserved state - size_t spans_from_reserved; - //! Number of raw memory map calls (not hitting the reserve spans but - //! resulting in actual OS mmap calls) - size_t map_calls; - } size_use[128]; -} rpmalloc_thread_statistics_t; - -typedef struct rpmalloc_config_t { - //! Map memory pages for the given number of bytes. The returned address MUST - //! be - // aligned to the rpmalloc span size, which will always be a power of two. - // Optionally the function can store an alignment offset in the offset - // variable in case it performs alignment and the returned pointer is offset - // from the actual start of the memory region due to this alignment. The - // alignment offset will be passed to the memory unmap function. The - // alignment offset MUST NOT be larger than 65535 (storable in an uint16_t), - // if it is you must use natural alignment to shift it into 16 bits. If you - // set a memory_map function, you must also set a memory_unmap function or - // else the default implementation will be used for both. This function must - // be thread safe, it can be called by multiple threads simultaneously. - void *(*memory_map)(size_t size, size_t *offset); - //! Unmap the memory pages starting at address and spanning the given number - //! of bytes. - // If release is set to non-zero, the unmap is for an entire span range as - // returned by a previous call to memory_map and that the entire range should - // be released. 
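For illustration, a minimal POSIX sketch of such a callback pair (hypothetical names example_map/example_unmap; assumes a 64KiB span size and mmap/munmap availability, and always over-allocates one span for alignment, unlike the library's default implementation, which pads only when needed):

#include <stdint.h>
#include <sys/mman.h>

#define EXAMPLE_SPAN_SIZE ((size_t)65536) // assumed span size (power of two)

// Map size bytes aligned to the span size; store the trim distance in *offset.
static void *example_map(size_t size, size_t *offset) {
  void *ptr = mmap(0, size + EXAMPLE_SPAN_SIZE, PROT_READ | PROT_WRITE,
                   MAP_PRIVATE | MAP_ANONYMOUS, -1, 0);
  if (ptr == MAP_FAILED)
    return 0;
  uintptr_t addr = (uintptr_t)ptr;
  uintptr_t aligned =
      (addr + (EXAMPLE_SPAN_SIZE - 1)) & ~(uintptr_t)(EXAMPLE_SPAN_SIZE - 1);
  *offset = (size_t)(aligned - addr); // always fits in 16 bits for 64KiB spans
  return (void *)aligned;
}

// For a full release, undo the alignment offset and unmap the whole region;
// for a partial unmap (release == 0), just decommit the pages.
static void example_unmap(void *address, size_t size, size_t offset,
                          size_t release) {
  if (release)
    munmap((void *)((uintptr_t)address - offset), release + EXAMPLE_SPAN_SIZE);
  else
    madvise(address, size, MADV_DONTNEED);
}
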
The release argument holds the size of the entire span range. - // If release is set to 0, the unmap is a partial decommit of a subset of the - // mapped memory range. If you set a memory_unmap function, you must also set - // a memory_map function or else the default implementation will be used for - // both. This function must be thread safe, it can be called by multiple - // threads simultaneously. - void (*memory_unmap)(void *address, size_t size, size_t offset, - size_t release); - //! Called when an assert fails, if asserts are enabled. Will use the standard - //! assert() - // if this is not set. - void (*error_callback)(const char *message); - //! Called when a call to map memory pages fails (out of memory). If this - //! callback is - // not set or returns zero the library will return a null pointer in the - // allocation call. If this callback returns non-zero the map call will be - // retried. The argument passed is the number of bytes that was requested in - // the map call. Only used if the default system memory map function is used - // (memory_map callback is not set). - int (*map_fail_callback)(size_t size); - //! Size of memory pages. The page size MUST be a power of two. All memory - //! mapping - // requests to memory_map will be made with size set to a multiple of the - // page size. Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system - // page size is used. - size_t page_size; - //! Size of a span of memory blocks. MUST be a power of two, and in - //! [4096,262144] - // range (unless 0 - set to 0 to use the default span size). Used if - // RPMALLOC_CONFIGURABLE is defined to 1. - size_t span_size; - //! Number of spans to map at each request to map new virtual memory blocks. - //! This can - // be used to minimize the system call overhead at the cost of virtual memory - // address space. The extra mapped pages will not be written until actually - // used, so physical committed memory should not be affected in the default - // implementation. Will be aligned to a multiple of spans that match memory - // page size in case of huge pages. - size_t span_map_count; - //! Enable use of large/huge pages. If this flag is set to non-zero and page - //! size is - // zero, the allocator will try to enable huge pages and auto detect the - // configuration. If this is set to non-zero and page_size is also non-zero, - // the allocator will assume huge pages have been configured and enabled - // prior to initializing the allocator. For Windows, see - // https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support - // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt - int enable_huge_pages; - //! Respectively allocated pages and huge allocated pages names for systems - // supporting it to be able to distinguish among anonymous regions. - const char *page_name; - const char *huge_page_name; -} rpmalloc_config_t; - -//! Initialize allocator with default configuration -RPMALLOC_EXPORT int rpmalloc_initialize(void); - -//! Initialize allocator with given configuration -RPMALLOC_EXPORT int rpmalloc_initialize_config(const rpmalloc_config_t *config); - -//! Get allocator configuration -RPMALLOC_EXPORT const rpmalloc_config_t *rpmalloc_config(void); - -//! Finalize allocator -RPMALLOC_EXPORT void rpmalloc_finalize(void); - -//! Initialize allocator for calling thread -RPMALLOC_EXPORT void rpmalloc_thread_initialize(void); - -//! Finalize allocator for calling thread -RPMALLOC_EXPORT void rpmalloc_thread_finalize(int release_caches); - -//! 
Perform deferred deallocations pending for the calling thread heap -RPMALLOC_EXPORT void rpmalloc_thread_collect(void); - -//! Query if allocator is initialized for calling thread -RPMALLOC_EXPORT int rpmalloc_is_thread_initialized(void); - -//! Get per-thread statistics -RPMALLOC_EXPORT void -rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats); - -//! Get global statistics -RPMALLOC_EXPORT void -rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats); - -//! Dump all statistics in human readable format to file (should be a FILE*) -RPMALLOC_EXPORT void rpmalloc_dump_statistics(void *file); - -//! Allocate a memory block of at least the given size -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1); - -//! Free the given memory block -RPMALLOC_EXPORT void rpfree(void *ptr); - -//! Allocate a memory block of at least the given size and zero initialize it -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2); - -//! Reallocate the given block to at least the given size -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rprealloc(void *ptr, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Reallocate the given block to at least the given size and alignment, -// with optional control flags (see RPMALLOC_NO_PRESERVE). -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpaligned_realloc(void *ptr, size_t alignment, size_t size, size_t oldsize, - unsigned int flags) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(3); - -//! Allocate a memory block of at least the given size and alignment. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Allocate a memory block of at least the given size and alignment, and zero -//! initialize it. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpaligned_calloc(size_t alignment, size_t num, - size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); - -//! Allocate a memory block of at least the given size and alignment. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Allocate a memory block of at least the given size and alignment. -// Alignment must be a power of two and a multiple of sizeof(void*), -// and should ideally be less than memory page size. 
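As a usage sketch of the basic interface above (assuming the header is reachable as "rpmalloc.h"):

#include <assert.h>
#include <string.h>
#include "rpmalloc.h" // assumed include path

int main(void) {
  if (rpmalloc_initialize()) // non-zero return signals failure;
    return 1;                // success also initializes the calling thread

  char *buf = rpmalloc(64); // at least 64 bytes, possibly more
  assert(rpmalloc_usable_size(buf) >= 64);
  memset(buf, 0, 64);
  buf = rprealloc(buf, 4096); // may move the block
  rpfree(buf);

  rpmalloc_finalize(); // also finalizes the calling thread
  return 0;
}

Additional threads would bracket their lifetime with rpmalloc_thread_initialize() and rpmalloc_thread_finalize(1).
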
A caveat of rpmalloc -// internals is that this must also be strictly less than the span size -// (default 64KiB) -RPMALLOC_EXPORT int rpposix_memalign(void **memptr, size_t alignment, - size_t size); - -//! Query the usable size of the given memory block (from given pointer to the -//! end of block) -RPMALLOC_EXPORT size_t rpmalloc_usable_size(void *ptr); - -//! Dummy empty function for forcing linker symbol inclusion -RPMALLOC_EXPORT void rpmalloc_linker_reference(void); - -#if RPMALLOC_FIRST_CLASS_HEAPS - -//! Heap type -typedef struct heap_t rpmalloc_heap_t; - -//! Acquire a new heap. Will reuse existing released heaps or allocate memory -//! for a new heap -// if none available. Heap API is implemented with the strict assumption that -// only one single thread will call heap functions for a given heap at any -// given time, no functions are thread safe. -RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_heap_acquire(void); - -//! Release a heap (does NOT free the memory allocated by the heap, use -//! rpmalloc_heap_free_all before destroying the heap). -// Releasing a heap will enable it to be reused by other threads. Safe to pass -// a null pointer. -RPMALLOC_EXPORT void rpmalloc_heap_release(rpmalloc_heap_t *heap); - -//! Allocate a memory block of at least the given size using the given heap. -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(2); - -//! Allocate a memory block of at least the given size using the given heap. The -//! returned -// block will have the requested alignment. Alignment must be a power of two -// and a multiple of sizeof(void*), and should ideally be less than memory page -// size. A caveat of rpmalloc internals is that this must also be strictly less -// than the span size (default 64KiB). -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, - size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(3); - -//! Allocate a memory block of at least the given size using the given heap and -//! zero initialize it. -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_calloc(rpmalloc_heap_t *heap, size_t num, - size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); - -//! Allocate a memory block of at least the given size using the given heap and -//! zero initialize it. The returned -// block will have the requested alignment. Alignment must either be zero, or a -// power of two and a multiple of sizeof(void*), and should ideally be less -// than memory page size. A caveat of rpmalloc internals is that this must also -// be strictly less than the span size (default 64KiB). -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_aligned_calloc(rpmalloc_heap_t *heap, size_t alignment, - size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); - -//! Reallocate the given block to at least the given size. The memory block MUST -//! be allocated -// by the same heap given to this function. -RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * -rpmalloc_heap_realloc(rpmalloc_heap_t *heap, void *ptr, size_t size, - unsigned int flags) RPMALLOC_ATTRIB_MALLOC - RPMALLOC_ATTRIB_ALLOC_SIZE(3); - -//! Reallocate the given block to at least the given size. The memory block MUST -//! be allocated -// by the same heap given to this function. The returned block will have the -// requested alignment. 
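A short sketch of the heap API (assumes the allocator is already initialized and the library was built with RPMALLOC_FIRST_CLASS_HEAPS=1):

#include "rpmalloc.h" // assumed include path

void heap_example(void) {
  rpmalloc_heap_t *heap = rpmalloc_heap_acquire();
  void *a = rpmalloc_heap_alloc(heap, 128);
  void *b = rpmalloc_heap_aligned_alloc(heap, 64, 1024); // 64-byte aligned
  rpmalloc_heap_free(heap, a);  // individual free on the owning heap
  (void)b;                      // b is reclaimed by the bulk free below
  rpmalloc_heap_free_all(heap); // releases every block owned by the heap
  rpmalloc_heap_release(heap);  // heap may now be reused by another thread
}

Per the contract above, only one thread may use a given heap at a time.
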
Alignment must be either zero, or a
-// multiple of sizeof(void*), and should ideally be less than memory page size.
-// A caveat of rpmalloc internals is that this must also be strictly less than
-// the span size (default 64KiB).
-RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void *rpmalloc_heap_aligned_realloc(
-    rpmalloc_heap_t *heap, void *ptr, size_t alignment, size_t size,
-    unsigned int flags) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(4);
-
-//! Free the given memory block from the given heap. The memory block MUST be
-//! allocated
-// by the same heap given to this function.
-RPMALLOC_EXPORT void rpmalloc_heap_free(rpmalloc_heap_t *heap, void *ptr);
-
-//! Free all memory allocated by the heap
-RPMALLOC_EXPORT void rpmalloc_heap_free_all(rpmalloc_heap_t *heap);
-
-//! Set the given heap as the current heap for the calling thread. A heap MUST
-//! only be current heap
-// for a single thread, a heap can never be shared between multiple threads.
-// The previous current heap for the calling thread is released to be reused by
-// other threads.
-RPMALLOC_EXPORT void rpmalloc_heap_thread_set_current(rpmalloc_heap_t *heap);
-
-//! Returns which heap the given pointer is allocated on
-RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_get_heap_for_ptr(void *ptr);
-
-#endif
-
-#ifdef __cplusplus
-}
-#endif
+//===---------------------- rpmalloc.h ------------------*- C -*-=============//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This library provides a cross-platform lock free thread caching malloc
+// implementation in C11.
+//
+//===----------------------------------------------------------------------===//
+
+#pragma once
+
+#include <stddef.h>
+
+#ifdef __cplusplus
+extern "C" {
+#endif
+
+#if defined(__clang__) || defined(__GNUC__)
+#define RPMALLOC_EXPORT __attribute__((visibility("default")))
+#define RPMALLOC_ALLOCATOR
+#if (defined(__clang_major__) && (__clang_major__ < 4)) || \
+    (defined(__GNUC__) && defined(ENABLE_PRELOAD) && ENABLE_PRELOAD)
+#define RPMALLOC_ATTRIB_MALLOC
+#define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
+#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
+#else
+#define RPMALLOC_ATTRIB_MALLOC __attribute__((__malloc__))
+#define RPMALLOC_ATTRIB_ALLOC_SIZE(size) __attribute__((alloc_size(size)))
+#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size) \
+  __attribute__((alloc_size(count, size)))
+#endif
+#define RPMALLOC_CDECL
+#elif defined(_MSC_VER)
+#define RPMALLOC_EXPORT
+#define RPMALLOC_ALLOCATOR __declspec(allocator) __declspec(restrict)
+#define RPMALLOC_ATTRIB_MALLOC
+#define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
+#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
+#define RPMALLOC_CDECL __cdecl
+#else
+#define RPMALLOC_EXPORT
+#define RPMALLOC_ALLOCATOR
+#define RPMALLOC_ATTRIB_MALLOC
+#define RPMALLOC_ATTRIB_ALLOC_SIZE(size)
+#define RPMALLOC_ATTRIB_ALLOC_SIZE2(count, size)
+#define RPMALLOC_CDECL
+#endif
+
+//! Define RPMALLOC_CONFIGURABLE to enable configuring sizes. Will introduce
+// a very small overhead due to some size calculations not being compile time
+// constants
+#ifndef RPMALLOC_CONFIGURABLE
+#define RPMALLOC_CONFIGURABLE 0
+#endif
+
+//! Define RPMALLOC_FIRST_CLASS_HEAPS to enable heap based API (rpmalloc_heap_*
+//! functions).
+// Will introduce a very small overhead to track fully allocated spans in heaps
+#ifndef RPMALLOC_FIRST_CLASS_HEAPS
+#define RPMALLOC_FIRST_CLASS_HEAPS 0
+#endif
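
Both macros are compile-time switches. A consumer sketch (hypothetical translation unit; it assumes the same definitions are also passed when compiling rpmalloc.c itself, e.g. on the compiler command line, so the header and the implementation agree):

// Hypothetical consumer TU; rpmalloc.c must see the same definitions.
#define RPMALLOC_CONFIGURABLE 1      // honor rpmalloc_config_t sizes
#define RPMALLOC_FIRST_CLASS_HEAPS 1 // expose the rpmalloc_heap_* API
#include "rpmalloc.h"                // assumed include path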
+
+//! Flag to rpaligned_realloc to not preserve content in reallocation
+#define RPMALLOC_NO_PRESERVE 1
+//! Flag to rpaligned_realloc to fail and return null pointer if grow cannot be
+//! done in-place,
+// in which case the original pointer is still valid (just like a call to
+// realloc which fails to allocate a new block).
+#define RPMALLOC_GROW_OR_FAIL 2
+
+typedef struct rpmalloc_global_statistics_t {
+  //! Current amount of virtual memory mapped, all of which might not have been
+  //! committed (only if ENABLE_STATISTICS=1)
+  size_t mapped;
+  //! Peak amount of virtual memory mapped, all of which might not have been
+  //! committed (only if ENABLE_STATISTICS=1)
+  size_t mapped_peak;
+  //! Current amount of memory in global caches for small and medium sizes
+  //! (<32KiB)
+  size_t cached;
+  //! Current amount of memory allocated in huge allocations, i.e. larger than
+  //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1)
+  size_t huge_alloc;
+  //! Peak amount of memory allocated in huge allocations, i.e. larger than
+  //! LARGE_SIZE_LIMIT which is 2MiB by default (only if ENABLE_STATISTICS=1)
+  size_t huge_alloc_peak;
+  //! Total amount of memory mapped since initialization (only if
+  //! ENABLE_STATISTICS=1)
+  size_t mapped_total;
+  //! Total amount of memory unmapped since initialization (only if
+  //! ENABLE_STATISTICS=1)
+  size_t unmapped_total;
+} rpmalloc_global_statistics_t;
+
+typedef struct rpmalloc_thread_statistics_t {
+  //! Current number of bytes available in thread size class caches for small
+  //! and medium sizes (<32KiB)
+  size_t sizecache;
+  //! Current number of bytes available in thread span caches for small and
+  //! medium sizes (<32KiB)
+  size_t spancache;
+  //! Total number of bytes transitioned from thread cache to global cache (only
+  //! if ENABLE_STATISTICS=1)
+  size_t thread_to_global;
+  //! Total number of bytes transitioned from global cache to thread cache (only
+  //! if ENABLE_STATISTICS=1)
+  size_t global_to_thread;
+  //! Per span count statistics (only if ENABLE_STATISTICS=1)
+  struct {
+    //! Currently used number of spans
+    size_t current;
+    //! High water mark of spans used
+    size_t peak;
+    //! Number of spans transitioned to global cache
+    size_t to_global;
+    //! Number of spans transitioned from global cache
+    size_t from_global;
+    //! Number of spans transitioned to thread cache
+    size_t to_cache;
+    //! Number of spans transitioned from thread cache
+    size_t from_cache;
+    //! Number of spans transitioned to reserved state
+    size_t to_reserved;
+    //! Number of spans transitioned from reserved state
+    size_t from_reserved;
+    //! Number of raw memory map calls (not hitting the reserve spans but
+    //! resulting in actual OS mmap calls)
+    size_t map_calls;
+  } span_use[64];
+  //! Per size class statistics (only if ENABLE_STATISTICS=1)
+  struct {
+    //! Current number of allocations
+    size_t alloc_current;
+    //! Peak number of allocations
+    size_t alloc_peak;
+    //! Total number of allocations
+    size_t alloc_total;
+    //! Total number of frees
+    size_t free_total;
+    //! Number of spans transitioned to cache
+    size_t spans_to_cache;
+    //! Number of spans transitioned from cache
+    size_t spans_from_cache;
+    //! Number of spans transitioned from reserved state
+    size_t spans_from_reserved;
+    //! Number of raw memory map calls (not hitting the reserve spans but
+    //! 
resulting in actual OS mmap calls) + size_t map_calls; + } size_use[128]; +} rpmalloc_thread_statistics_t; + +typedef struct rpmalloc_config_t { + //! Map memory pages for the given number of bytes. The returned address MUST + //! be + // aligned to the rpmalloc span size, which will always be a power of two. + // Optionally the function can store an alignment offset in the offset + // variable in case it performs alignment and the returned pointer is offset + // from the actual start of the memory region due to this alignment. The + // alignment offset will be passed to the memory unmap function. The + // alignment offset MUST NOT be larger than 65535 (storable in an uint16_t), + // if it is you must use natural alignment to shift it into 16 bits. If you + // set a memory_map function, you must also set a memory_unmap function or + // else the default implementation will be used for both. This function must + // be thread safe, it can be called by multiple threads simultaneously. + void *(*memory_map)(size_t size, size_t *offset); + //! Unmap the memory pages starting at address and spanning the given number + //! of bytes. + // If release is set to non-zero, the unmap is for an entire span range as + // returned by a previous call to memory_map and that the entire range should + // be released. The release argument holds the size of the entire span range. + // If release is set to 0, the unmap is a partial decommit of a subset of the + // mapped memory range. If you set a memory_unmap function, you must also set + // a memory_map function or else the default implementation will be used for + // both. This function must be thread safe, it can be called by multiple + // threads simultaneously. + void (*memory_unmap)(void *address, size_t size, size_t offset, + size_t release); + //! Called when an assert fails, if asserts are enabled. Will use the standard + //! assert() + // if this is not set. + void (*error_callback)(const char *message); + //! Called when a call to map memory pages fails (out of memory). If this + //! callback is + // not set or returns zero the library will return a null pointer in the + // allocation call. If this callback returns non-zero the map call will be + // retried. The argument passed is the number of bytes that was requested in + // the map call. Only used if the default system memory map function is used + // (memory_map callback is not set). + int (*map_fail_callback)(size_t size); + //! Size of memory pages. The page size MUST be a power of two. All memory + //! mapping + // requests to memory_map will be made with size set to a multiple of the + // page size. Used if RPMALLOC_CONFIGURABLE is defined to 1, otherwise system + // page size is used. + size_t page_size; + //! Size of a span of memory blocks. MUST be a power of two, and in + //! [4096,262144] + // range (unless 0 - set to 0 to use the default span size). Used if + // RPMALLOC_CONFIGURABLE is defined to 1. + size_t span_size; + //! Number of spans to map at each request to map new virtual memory blocks. + //! This can + // be used to minimize the system call overhead at the cost of virtual memory + // address space. The extra mapped pages will not be written until actually + // used, so physical committed memory should not be affected in the default + // implementation. Will be aligned to a multiple of spans that match memory + // page size in case of huge pages. + size_t span_map_count; + //! Enable use of large/huge pages. If this flag is set to non-zero and page + //! 
size is + // zero, the allocator will try to enable huge pages and auto detect the + // configuration. If this is set to non-zero and page_size is also non-zero, + // the allocator will assume huge pages have been configured and enabled + // prior to initializing the allocator. For Windows, see + // https://docs.microsoft.com/en-us/windows/desktop/memory/large-page-support + // For Linux, see https://www.kernel.org/doc/Documentation/vm/hugetlbpage.txt + int enable_huge_pages; + //! Respectively allocated pages and huge allocated pages names for systems + // supporting it to be able to distinguish among anonymous regions. + const char *page_name; + const char *huge_page_name; +} rpmalloc_config_t; + +//! Initialize allocator with default configuration +RPMALLOC_EXPORT int rpmalloc_initialize(void); + +//! Initialize allocator with given configuration +RPMALLOC_EXPORT int rpmalloc_initialize_config(const rpmalloc_config_t *config); + +//! Get allocator configuration +RPMALLOC_EXPORT const rpmalloc_config_t *rpmalloc_config(void); + +//! Finalize allocator +RPMALLOC_EXPORT void rpmalloc_finalize(void); + +//! Initialize allocator for calling thread +RPMALLOC_EXPORT void rpmalloc_thread_initialize(void); + +//! Finalize allocator for calling thread +RPMALLOC_EXPORT void rpmalloc_thread_finalize(int release_caches); + +//! Perform deferred deallocations pending for the calling thread heap +RPMALLOC_EXPORT void rpmalloc_thread_collect(void); + +//! Query if allocator is initialized for calling thread +RPMALLOC_EXPORT int rpmalloc_is_thread_initialized(void); + +//! Get per-thread statistics +RPMALLOC_EXPORT void +rpmalloc_thread_statistics(rpmalloc_thread_statistics_t *stats); + +//! Get global statistics +RPMALLOC_EXPORT void +rpmalloc_global_statistics(rpmalloc_global_statistics_t *stats); + +//! Dump all statistics in human readable format to file (should be a FILE*) +RPMALLOC_EXPORT void rpmalloc_dump_statistics(void *file); + +//! Allocate a memory block of at least the given size +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc(size_t size) RPMALLOC_ATTRIB_MALLOC RPMALLOC_ATTRIB_ALLOC_SIZE(1); + +//! Free the given memory block +RPMALLOC_EXPORT void rpfree(void *ptr); + +//! Allocate a memory block of at least the given size and zero initialize it +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpcalloc(size_t num, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(1, 2); + +//! Reallocate the given block to at least the given size +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rprealloc(void *ptr, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Reallocate the given block to at least the given size and alignment, +// with optional control flags (see RPMALLOC_NO_PRESERVE). +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpaligned_realloc(void *ptr, size_t alignment, size_t size, size_t oldsize, + unsigned int flags) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. 
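A configuration sketch tying these fields together (field values are illustrative only; zeroed fields keep their defaults, e.g. page_size = 0 auto-detects the system page size):

#include <string.h>
#include "rpmalloc.h" // assumed include path

int init_with_config(void) {
  rpmalloc_config_t config;
  memset(&config, 0, sizeof(config));
  config.enable_huge_pages = 1; // page_size left 0: auto-detect configuration
  config.span_map_count = 64;   // batch 64 spans per mapping request
  if (rpmalloc_initialize_config(&config))
    return -1;
  // Effective values (after clamping and alignment) can be read back:
  return rpmalloc_config()->enable_huge_pages ? 0 : 1;
}
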
A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpaligned_alloc(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment, and zero +//! initialize it. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpaligned_calloc(size_t alignment, size_t num, + size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE2(2, 3); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmemalign(size_t alignment, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size and alignment. +// Alignment must be a power of two and a multiple of sizeof(void*), +// and should ideally be less than memory page size. A caveat of rpmalloc +// internals is that this must also be strictly less than the span size +// (default 64KiB) +RPMALLOC_EXPORT int rpposix_memalign(void **memptr, size_t alignment, + size_t size); + +//! Query the usable size of the given memory block (from given pointer to the +//! end of block) +RPMALLOC_EXPORT size_t rpmalloc_usable_size(void *ptr); + +//! Dummy empty function for forcing linker symbol inclusion +RPMALLOC_EXPORT void rpmalloc_linker_reference(void); + +#if RPMALLOC_FIRST_CLASS_HEAPS + +//! Heap type +typedef struct heap_t rpmalloc_heap_t; + +//! Acquire a new heap. Will reuse existing released heaps or allocate memory +//! for a new heap +// if none available. Heap API is implemented with the strict assumption that +// only one single thread will call heap functions for a given heap at any +// given time, no functions are thread safe. +RPMALLOC_EXPORT rpmalloc_heap_t *rpmalloc_heap_acquire(void); + +//! Release a heap (does NOT free the memory allocated by the heap, use +//! rpmalloc_heap_free_all before destroying the heap). +// Releasing a heap will enable it to be reused by other threads. Safe to pass +// a null pointer. +RPMALLOC_EXPORT void rpmalloc_heap_release(rpmalloc_heap_t *heap); + +//! Allocate a memory block of at least the given size using the given heap. +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_alloc(rpmalloc_heap_t *heap, size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(2); + +//! Allocate a memory block of at least the given size using the given heap. The +//! returned +// block will have the requested alignment. Alignment must be a power of two +// and a multiple of sizeof(void*), and should ideally be less than memory page +// size. A caveat of rpmalloc internals is that this must also be strictly less +// than the span size (default 64KiB). +RPMALLOC_EXPORT RPMALLOC_ALLOCATOR void * +rpmalloc_heap_aligned_alloc(rpmalloc_heap_t *heap, size_t alignment, + size_t size) RPMALLOC_ATTRIB_MALLOC + RPMALLOC_ATTRIB_ALLOC_SIZE(3); + +//! Allocate a memory block of at least the given size using the given heap and +//! 
diff --git a/llvm/lib/Support/rpmalloc/rpnew.h b/llvm/lib/Support/rpmalloc/rpnew.h
index d8303c6f9565..a18f0799d56d 100644
--- a/llvm/lib/Support/rpmalloc/rpnew.h
+++ b/llvm/lib/Support/rpmalloc/rpnew.h
@@ -1,113 +1,113 @@
-//===-------------------------- rpnew.h -----------------*- C -*-=============//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-//
-// This library provides a cross-platform lock free thread caching malloc
-// implementation in C11.
-//
-//===----------------------------------------------------------------------===//
-
-#ifdef __cplusplus
-
-#include <new>
-#include <rpmalloc.h>
-
-#ifndef __CRTDECL
-#define __CRTDECL
-#endif
-
-extern void __CRTDECL operator delete(void *p) noexcept { rpfree(p); }
-
-extern void __CRTDECL operator delete[](void *p) noexcept { rpfree(p); }
-
-extern void *__CRTDECL operator new(std::size_t size) noexcept(false) {
-  return rpmalloc(size);
-}
-
-extern void *__CRTDECL operator new[](std::size_t size) noexcept(false) {
-  return rpmalloc(size);
-}
-
-extern void *__CRTDECL operator new(std::size_t size,
-                                    const std::nothrow_t &tag) noexcept {
-  (void)sizeof(tag);
-  return rpmalloc(size);
-}
-
-extern void *__CRTDECL operator new[](std::size_t size,
-                                      const std::nothrow_t &tag) noexcept {
-  (void)sizeof(tag);
-  return rpmalloc(size);
-}
-
-#if (__cplusplus >= 201402L || _MSC_VER >= 1916)
-
-extern void __CRTDECL operator delete(void *p, std::size_t size) noexcept {
-  (void)sizeof(size);
-  rpfree(p);
-}
-
-extern void __CRTDECL operator delete[](void *p, std::size_t size) noexcept {
-  (void)sizeof(size);
-  rpfree(p);
-}
-
-#endif
-
-#if (__cplusplus > 201402L || defined(__cpp_aligned_new))
-
-extern void __CRTDECL operator delete(void *p,
-                                      std::align_val_t align) noexcept {
-  (void)sizeof(align);
-  rpfree(p);
-}
-
-extern void __CRTDECL operator delete[](void *p,
-                                        std::align_val_t align) noexcept {
-  (void)sizeof(align);
-  rpfree(p);
-}
-
-extern void __CRTDECL operator delete(void *p, std::size_t size,
-                                      std::align_val_t align) noexcept {
-  (void)sizeof(size);
-  (void)sizeof(align);
-  rpfree(p);
-}
-
-extern void __CRTDECL operator delete[](void *p, std::size_t size,
-                                        std::align_val_t align) noexcept {
-  (void)sizeof(size);
-  (void)sizeof(align);
-  rpfree(p);
-}
-
-extern void *__CRTDECL operator new(std::size_t size,
-                                    std::align_val_t align) noexcept(false) {
-  return rpaligned_alloc(static_cast<size_t>(align), size);
-}
-
-extern void *__CRTDECL operator new[](std::size_t size,
-                                      std::align_val_t align) noexcept(false) {
-  return rpaligned_alloc(static_cast<size_t>(align), size);
-}
-
-extern void *__CRTDECL operator new(std::size_t size, std::align_val_t align,
-                                    const std::nothrow_t &tag) noexcept {
-  (void)sizeof(tag);
-  return rpaligned_alloc(static_cast<size_t>(align), size);
-}
-
-extern void *__CRTDECL operator new[](std::size_t size, std::align_val_t align,
-                                      const std::nothrow_t &tag) noexcept {
-  (void)sizeof(tag);
-  return rpaligned_alloc(static_cast<size_t>(align), size);
-}
-
-#endif
-
-#endif
+//===-------------------------- rpnew.h -----------------*- C -*-=============//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+//
+// This library provides a cross-platform lock free thread caching malloc
+// implementation in C11.
+//
+//===----------------------------------------------------------------------===//
+
+#ifdef __cplusplus
+
+#include <new>
+#include <rpmalloc.h>
+
+#ifndef __CRTDECL
+#define __CRTDECL
+#endif
+
+extern void __CRTDECL operator delete(void *p) noexcept { rpfree(p); }
+
+extern void __CRTDECL operator delete[](void *p) noexcept { rpfree(p); }
+
+extern void *__CRTDECL operator new(std::size_t size) noexcept(false) {
+  return rpmalloc(size);
+}
+
+extern void *__CRTDECL operator new[](std::size_t size) noexcept(false) {
+  return rpmalloc(size);
+}
+
+extern void *__CRTDECL operator new(std::size_t size,
+                                    const std::nothrow_t &tag) noexcept {
+  (void)sizeof(tag);
+  return rpmalloc(size);
+}
+
+extern void *__CRTDECL operator new[](std::size_t size,
+                                      const std::nothrow_t &tag) noexcept {
+  (void)sizeof(tag);
+  return rpmalloc(size);
+}
+
+#if (__cplusplus >= 201402L || _MSC_VER >= 1916)
+
+extern void __CRTDECL operator delete(void *p, std::size_t size) noexcept {
+  (void)sizeof(size);
+  rpfree(p);
+}
+
+extern void __CRTDECL operator delete[](void *p, std::size_t size) noexcept {
+  (void)sizeof(size);
+  rpfree(p);
+}
+
+#endif
+
+#if (__cplusplus > 201402L || defined(__cpp_aligned_new))
+
+extern void __CRTDECL operator delete(void *p,
+                                      std::align_val_t align) noexcept {
+  (void)sizeof(align);
+  rpfree(p);
+}
+
+extern void __CRTDECL operator delete[](void *p,
+                                        std::align_val_t align) noexcept {
+  (void)sizeof(align);
+  rpfree(p);
+}
+
+extern void __CRTDECL operator delete(void *p, std::size_t size,
+                                      std::align_val_t align) noexcept {
+  (void)sizeof(size);
+  (void)sizeof(align);
+  rpfree(p);
+}
+
+extern void __CRTDECL operator delete[](void *p, std::size_t size,
+                                        std::align_val_t align) noexcept {
+  (void)sizeof(size);
+  (void)sizeof(align);
+  rpfree(p);
+}
+
+extern void *__CRTDECL operator new(std::size_t size,
+                                    std::align_val_t align) noexcept(false) {
+  return rpaligned_alloc(static_cast<size_t>(align), size);
+}
+
+extern void *__CRTDECL operator new[](std::size_t size,
+                                      std::align_val_t align) noexcept(false) {
+  return rpaligned_alloc(static_cast<size_t>(align), size);
+}
+
+extern void *__CRTDECL operator new(std::size_t size, std::align_val_t align,
+                                    const std::nothrow_t &tag) noexcept {
+  (void)sizeof(tag);
+  return rpaligned_alloc(static_cast<size_t>(align), size);
+}
+
+extern void *__CRTDECL operator new[](std::size_t size, std::align_val_t align,
+                                      const std::nothrow_t &tag) noexcept {
+  (void)sizeof(tag);
+  return rpaligned_alloc(static_cast<size_t>(align), size);
+}
+
+#endif
+
+#endif
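In effect, any C++ translation unit that includes rpnew.h routes the global new and delete operators through rpmalloc. A small sketch of the intended use (illustrative only; the file name demo.cpp is hypothetical, and it assumes rpnew.h and rpmalloc.h are on the include path):

  // demo.cpp
  #include <rpmalloc.h>
  #include <rpnew.h> // provides the replacement operator new/delete above

  int main() {
    rpmalloc_initialize();
    int *v = new int[16]; // serviced by rpmalloc
    delete[] v;           // serviced by rpfree
    rpmalloc_finalize();
    return 0;
  }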
diff --git a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
index d315d9bd16f4..d32dda2a67c9 100644
--- a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
+++ b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp
@@ -1,65 +1,65 @@
-//===- DXILFinalizeLinkage.cpp - Finalize linkage of functions ------------===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-
-#include "DXILFinalizeLinkage.h"
-#include "DirectX.h"
-#include "llvm/Analysis/DXILResource.h"
-#include "llvm/IR/Function.h"
-#include "llvm/IR/GlobalValue.h"
-#include "llvm/IR/Metadata.h"
-#include "llvm/IR/Module.h"
-
-#define DEBUG_TYPE "dxil-finalize-linkage"
-
-using namespace llvm;
-
-static bool finalizeLinkage(Module &M) {
-  SmallPtrSet<Function *, 8> EntriesAndExports;
-
-  // Find all entry points and export functions
-  for (Function &EF : M.functions()) {
-    if (!EF.hasFnAttribute("hlsl.shader") && !EF.hasFnAttribute("hlsl.export"))
-      continue;
-    EntriesAndExports.insert(&EF);
-  }
-
-  for (Function &F : M.functions()) {
-    if (F.getLinkage() == GlobalValue::ExternalLinkage &&
-        !EntriesAndExports.contains(&F)) {
-      F.setLinkage(GlobalValue::InternalLinkage);
-    }
-  }
-
-  return false;
-}
-
-PreservedAnalyses DXILFinalizeLinkage::run(Module &M,
-                                           ModuleAnalysisManager &AM) {
-  if (finalizeLinkage(M))
-    return PreservedAnalyses::none();
-  return PreservedAnalyses::all();
-}
-
-bool DXILFinalizeLinkageLegacy::runOnModule(Module &M) {
-  return finalizeLinkage(M);
-}
-
-void DXILFinalizeLinkageLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
-  AU.addPreserved();
-}
-
-char DXILFinalizeLinkageLegacy::ID = 0;
-
-INITIALIZE_PASS_BEGIN(DXILFinalizeLinkageLegacy, DEBUG_TYPE,
-                      "DXIL Finalize Linkage", false, false)
-INITIALIZE_PASS_END(DXILFinalizeLinkageLegacy, DEBUG_TYPE,
-                    "DXIL Finalize Linkage", false, false)
-
-ModulePass *llvm::createDXILFinalizeLinkageLegacyPass() {
-  return new DXILFinalizeLinkageLegacy();
-}
+//===- DXILFinalizeLinkage.cpp - Finalize linkage of functions ------------===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+//
+//===----------------------------------------------------------------------===//
+
+#include "DXILFinalizeLinkage.h"
+#include "DirectX.h"
+#include "llvm/Analysis/DXILResource.h"
+#include "llvm/IR/Function.h"
+#include "llvm/IR/GlobalValue.h"
+#include "llvm/IR/Metadata.h"
+#include "llvm/IR/Module.h"
+
+#define DEBUG_TYPE "dxil-finalize-linkage"
+
+using namespace llvm;
+
+static bool finalizeLinkage(Module &M) {
+  SmallPtrSet<Function *, 8> EntriesAndExports;
+
+  // Find all entry points and export functions
+  for (Function &EF : M.functions()) {
+    if (!EF.hasFnAttribute("hlsl.shader") && !EF.hasFnAttribute("hlsl.export"))
+      continue;
+    EntriesAndExports.insert(&EF);
+  }
+
+  for (Function &F : M.functions()) {
+    if (F.getLinkage() == GlobalValue::ExternalLinkage &&
+        !EntriesAndExports.contains(&F)) {
+      F.setLinkage(GlobalValue::InternalLinkage);
+    }
+  }
+
+  return false;
+}
+
+PreservedAnalyses DXILFinalizeLinkage::run(Module &M,
+                                           ModuleAnalysisManager &AM) {
+  if (finalizeLinkage(M))
+    return PreservedAnalyses::none();
+  return PreservedAnalyses::all();
+}
+
+bool DXILFinalizeLinkageLegacy::runOnModule(Module &M) {
+  return finalizeLinkage(M);
+}
+
+void DXILFinalizeLinkageLegacy::getAnalysisUsage(AnalysisUsage &AU) const {
+  AU.addPreserved();
+}
+
+char DXILFinalizeLinkageLegacy::ID = 0;
+
+INITIALIZE_PASS_BEGIN(DXILFinalizeLinkageLegacy, DEBUG_TYPE,
+                      "DXIL Finalize Linkage", false, false)
+INITIALIZE_PASS_END(DXILFinalizeLinkageLegacy, DEBUG_TYPE,
+                    "DXIL Finalize Linkage", false, false)
+
+ModulePass *llvm::createDXILFinalizeLinkageLegacyPass() {
+  return new DXILFinalizeLinkageLegacy();
+}
diff --git a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
index 8ea31401121b..9844fd394aa4 100644
--- a/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
+++ b/llvm/lib/Target/DirectX/DirectXTargetTransformInfo.cpp
@@ -1,38 +1,38 @@
-//===- DirectXTargetTransformInfo.cpp - DirectX TTI ---------------*- C++
-//-*-===//
-//
-// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
-// See https://llvm.org/LICENSE.txt for license information.
-// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
-//
-//===----------------------------------------------------------------------===//
-///
-//===----------------------------------------------------------------------===//
-
-#include "DirectXTargetTransformInfo.h"
-#include "llvm/IR/Intrinsics.h"
-#include "llvm/IR/IntrinsicsDirectX.h"
-
-using namespace llvm;
-
-bool DirectXTTIImpl::isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID,
-                                                        unsigned ScalarOpdIdx) {
-  switch (ID) {
-  case Intrinsic::dx_wave_readlane:
-    return ScalarOpdIdx == 1;
-  default:
-    return false;
-  }
-}
-
-bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable(
-    Intrinsic::ID ID) const {
-  switch (ID) {
-  case Intrinsic::dx_frac:
-  case Intrinsic::dx_rsqrt:
-  case Intrinsic::dx_wave_readlane:
-    return true;
-  default:
-    return false;
-  }
-}
+//===- DirectXTargetTransformInfo.cpp - DirectX TTI ---------------*- C++
+//-*-===//
+//
+// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions.
+// See https://llvm.org/LICENSE.txt for license information.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// +/// +//===----------------------------------------------------------------------===// + +#include "DirectXTargetTransformInfo.h" +#include "llvm/IR/Intrinsics.h" +#include "llvm/IR/IntrinsicsDirectX.h" + +using namespace llvm; + +bool DirectXTTIImpl::isTargetIntrinsicWithScalarOpAtArg(Intrinsic::ID ID, + unsigned ScalarOpdIdx) { + switch (ID) { + case Intrinsic::dx_wave_readlane: + return ScalarOpdIdx == 1; + default: + return false; + } +} + +bool DirectXTTIImpl::isTargetIntrinsicTriviallyScalarizable( + Intrinsic::ID ID) const { + switch (ID) { + case Intrinsic::dx_frac: + case Intrinsic::dx_rsqrt: + case Intrinsic::dx_wave_readlane: + return true; + default: + return false; + } +} diff --git a/llvm/test/CodeGen/DirectX/atan2.ll b/llvm/test/CodeGen/DirectX/atan2.ll index 9d86f87f3ed5..b2c650d11626 100644 --- a/llvm/test/CodeGen/DirectX/atan2.ll +++ b/llvm/test/CodeGen/DirectX/atan2.ll @@ -1,87 +1,87 @@ -; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK -; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK - -; Make sure correct dxil expansions for atan2 are generated for float and half. - -define noundef float @atan2_float(float noundef %y, float noundef %x) { -entry: -; CHECK: [[DIV:%.+]] = fdiv float %y, %x -; EXPCHECK: [[ATAN:%.+]] = call float @llvm.atan.f32(float [[DIV]]) -; DOPCHECK: [[ATAN:%.+]] = call float @dx.op.unary.f32(i32 17, float [[DIV]]) -; CHECK-DAG: [[ADD_PI:%.+]] = fadd float [[ATAN]], 0x400921FB60000000 -; CHECK-DAG: [[SUB_PI:%.+]] = fsub float [[ATAN]], 0x400921FB60000000 -; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt float %x, 0.000000e+00 -; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq float %x, 0.000000e+00 -; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge float %y, 0.000000e+00 -; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt float %y, 0.000000e+00 -; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]] -; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], float [[ADD_PI]], float [[ATAN]] -; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]] -; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], float [[SUB_PI]], float [[SELECT_ADD_PI]] -; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]] -; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], float 0xBFF921FB60000000, float [[SELECT_SUB_PI]] -; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]] -; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], float 0x3FF921FB60000000, float [[SELECT_NEGHPI]] -; CHECK: ret float [[SELECT_HPI]] - %elt.atan2 = call float @llvm.atan2.f32(float %y, float %x) - ret float %elt.atan2 -} - -define noundef half @atan2_half(half noundef %y, half noundef %x) { -entry: -; CHECK: [[DIV:%.+]] = fdiv half %y, %x -; EXPCHECK: [[ATAN:%.+]] = call half @llvm.atan.f16(half [[DIV]]) -; DOPCHECK: [[ATAN:%.+]] = call half @dx.op.unary.f16(i32 17, half [[DIV]]) -; CHECK-DAG: [[ADD_PI:%.+]] = fadd half [[ATAN]], 0xH4248 -; CHECK-DAG: [[SUB_PI:%.+]] = fsub half [[ATAN]], 0xH4248 -; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt half %x, 0xH0000 -; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq half %x, 0xH0000 -; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge half %y, 0xH0000 -; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt half %y, 0xH0000 -; CHECK: [[XLT0_AND_YGE0:%.+]] = and 
i1 [[X_LT_0]], [[Y_GE_0]] -; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], half [[ADD_PI]], half [[ATAN]] -; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]] -; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], half [[SUB_PI]], half [[SELECT_ADD_PI]] -; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]] -; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], half 0xHBE48, half [[SELECT_SUB_PI]] -; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]] -; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], half 0xH3E48, half [[SELECT_NEGHPI]] -; CHECK: ret half [[SELECT_HPI]] - %elt.atan2 = call half @llvm.atan2.f16(half %y, half %x) - ret half %elt.atan2 -} - -define noundef <4 x float> @atan2_float4(<4 x float> noundef %y, <4 x float> noundef %x) { -entry: -; Just Expansion, no scalarization or lowering: -; EXPCHECK: [[DIV:%.+]] = fdiv <4 x float> %y, %x -; EXPCHECK: [[ATAN:%.+]] = call <4 x float> @llvm.atan.v4f32(<4 x float> [[DIV]]) -; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <4 x float> [[ATAN]], -; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <4 x float> [[ATAN]], -; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <4 x float> %x, zeroinitializer -; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <4 x float> %x, zeroinitializer -; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <4 x float> %y, zeroinitializer -; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <4 x float> %y, zeroinitializer -; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_GE_0]] -; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <4 x i1> [[XLT0_AND_YGE0]], <4 x float> [[ADD_PI]], <4 x float> [[ATAN]] -; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_LT_0]] -; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <4 x i1> [[XLT0_AND_YLT0]], <4 x float> [[SUB_PI]], <4 x float> [[SELECT_ADD_PI]] -; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_LT_0]] -; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <4 x i1> [[XEQ0_AND_YLT0]], <4 x float> , <4 x float> [[SELECT_SUB_PI]] -; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_GE_0]] -; EXPCHECK: [[SELECT_HPI:%.+]] = select <4 x i1> [[XEQ0_AND_YGE0]], <4 x float> , <4 x float> [[SELECT_NEGHPI]] -; EXPCHECK: ret <4 x float> [[SELECT_HPI]] - -; Scalarization occurs after expansion, so atan scalarization is tested separately. -; Expansion, scalarization and lowering: -; Just make sure this expands to exactly 4 scalar DXIL atan (OpCode=17) calls. -; DOPCHECK-COUNT-4: call float @dx.op.unary.f32(i32 17, float %{{.*}}) -; DOPCHECK-NOT: call float @dx.op.unary.f32(i32 17, - - %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %y, <4 x float> %x) - ret <4 x float> %elt.atan2 -} - -declare half @llvm.atan2.f16(half, half) -declare float @llvm.atan2.f32(float, float) -declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>) +; RUN: opt -S -dxil-intrinsic-expansion -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK +; RUN: opt -S -dxil-intrinsic-expansion -scalarizer -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK + +; Make sure correct dxil expansions for atan2 are generated for float and half. 
+ +define noundef float @atan2_float(float noundef %y, float noundef %x) { +entry: +; CHECK: [[DIV:%.+]] = fdiv float %y, %x +; EXPCHECK: [[ATAN:%.+]] = call float @llvm.atan.f32(float [[DIV]]) +; DOPCHECK: [[ATAN:%.+]] = call float @dx.op.unary.f32(i32 17, float [[DIV]]) +; CHECK-DAG: [[ADD_PI:%.+]] = fadd float [[ATAN]], 0x400921FB60000000 +; CHECK-DAG: [[SUB_PI:%.+]] = fsub float [[ATAN]], 0x400921FB60000000 +; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt float %x, 0.000000e+00 +; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq float %x, 0.000000e+00 +; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge float %y, 0.000000e+00 +; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt float %y, 0.000000e+00 +; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]] +; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], float [[ADD_PI]], float [[ATAN]] +; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]] +; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], float [[SUB_PI]], float [[SELECT_ADD_PI]] +; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]] +; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], float 0xBFF921FB60000000, float [[SELECT_SUB_PI]] +; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]] +; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], float 0x3FF921FB60000000, float [[SELECT_NEGHPI]] +; CHECK: ret float [[SELECT_HPI]] + %elt.atan2 = call float @llvm.atan2.f32(float %y, float %x) + ret float %elt.atan2 +} + +define noundef half @atan2_half(half noundef %y, half noundef %x) { +entry: +; CHECK: [[DIV:%.+]] = fdiv half %y, %x +; EXPCHECK: [[ATAN:%.+]] = call half @llvm.atan.f16(half [[DIV]]) +; DOPCHECK: [[ATAN:%.+]] = call half @dx.op.unary.f16(i32 17, half [[DIV]]) +; CHECK-DAG: [[ADD_PI:%.+]] = fadd half [[ATAN]], 0xH4248 +; CHECK-DAG: [[SUB_PI:%.+]] = fsub half [[ATAN]], 0xH4248 +; CHECK-DAG: [[X_LT_0:%.+]] = fcmp olt half %x, 0xH0000 +; CHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq half %x, 0xH0000 +; CHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge half %y, 0xH0000 +; CHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt half %y, 0xH0000 +; CHECK: [[XLT0_AND_YGE0:%.+]] = and i1 [[X_LT_0]], [[Y_GE_0]] +; CHECK: [[SELECT_ADD_PI:%.+]] = select i1 [[XLT0_AND_YGE0]], half [[ADD_PI]], half [[ATAN]] +; CHECK: [[XLT0_AND_YLT0:%.+]] = and i1 [[X_LT_0]], [[Y_LT_0]] +; CHECK: [[SELECT_SUB_PI:%.+]] = select i1 [[XLT0_AND_YLT0]], half [[SUB_PI]], half [[SELECT_ADD_PI]] +; CHECK: [[XEQ0_AND_YLT0:%.+]] = and i1 [[X_EQ_0]], [[Y_LT_0]] +; CHECK: [[SELECT_NEGHPI:%.+]] = select i1 [[XEQ0_AND_YLT0]], half 0xHBE48, half [[SELECT_SUB_PI]] +; CHECK: [[XEQ0_AND_YGE0:%.+]] = and i1 [[X_EQ_0]], [[Y_GE_0]] +; CHECK: [[SELECT_HPI:%.+]] = select i1 [[XEQ0_AND_YGE0]], half 0xH3E48, half [[SELECT_NEGHPI]] +; CHECK: ret half [[SELECT_HPI]] + %elt.atan2 = call half @llvm.atan2.f16(half %y, half %x) + ret half %elt.atan2 +} + +define noundef <4 x float> @atan2_float4(<4 x float> noundef %y, <4 x float> noundef %x) { +entry: +; Just Expansion, no scalarization or lowering: +; EXPCHECK: [[DIV:%.+]] = fdiv <4 x float> %y, %x +; EXPCHECK: [[ATAN:%.+]] = call <4 x float> @llvm.atan.v4f32(<4 x float> [[DIV]]) +; EXPCHECK-DAG: [[ADD_PI:%.+]] = fadd <4 x float> [[ATAN]], +; EXPCHECK-DAG: [[SUB_PI:%.+]] = fsub <4 x float> [[ATAN]], +; EXPCHECK-DAG: [[X_LT_0:%.+]] = fcmp olt <4 x float> %x, zeroinitializer +; EXPCHECK-DAG: [[X_EQ_0:%.+]] = fcmp oeq <4 x float> %x, zeroinitializer +; EXPCHECK-DAG: [[Y_GE_0:%.+]] = fcmp oge <4 x float> %y, zeroinitializer +; EXPCHECK-DAG: [[Y_LT_0:%.+]] = fcmp olt <4 x float> %y, zeroinitializer 
+; EXPCHECK: [[XLT0_AND_YGE0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_GE_0]] +; EXPCHECK: [[SELECT_ADD_PI:%.+]] = select <4 x i1> [[XLT0_AND_YGE0]], <4 x float> [[ADD_PI]], <4 x float> [[ATAN]] +; EXPCHECK: [[XLT0_AND_YLT0:%.+]] = and <4 x i1> [[X_LT_0]], [[Y_LT_0]] +; EXPCHECK: [[SELECT_SUB_PI:%.+]] = select <4 x i1> [[XLT0_AND_YLT0]], <4 x float> [[SUB_PI]], <4 x float> [[SELECT_ADD_PI]] +; EXPCHECK: [[XEQ0_AND_YLT0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_LT_0]] +; EXPCHECK: [[SELECT_NEGHPI:%.+]] = select <4 x i1> [[XEQ0_AND_YLT0]], <4 x float> , <4 x float> [[SELECT_SUB_PI]] +; EXPCHECK: [[XEQ0_AND_YGE0:%.+]] = and <4 x i1> [[X_EQ_0]], [[Y_GE_0]] +; EXPCHECK: [[SELECT_HPI:%.+]] = select <4 x i1> [[XEQ0_AND_YGE0]], <4 x float> , <4 x float> [[SELECT_NEGHPI]] +; EXPCHECK: ret <4 x float> [[SELECT_HPI]] + +; Scalarization occurs after expansion, so atan scalarization is tested separately. +; Expansion, scalarization and lowering: +; Just make sure this expands to exactly 4 scalar DXIL atan (OpCode=17) calls. +; DOPCHECK-COUNT-4: call float @dx.op.unary.f32(i32 17, float %{{.*}}) +; DOPCHECK-NOT: call float @dx.op.unary.f32(i32 17, + + %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %y, <4 x float> %x) + ret <4 x float> %elt.atan2 +} + +declare half @llvm.atan2.f16(half, half) +declare float @llvm.atan2.f32(float, float) +declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/DirectX/atan2_error.ll b/llvm/test/CodeGen/DirectX/atan2_error.ll index 372934098b7c..9b66f9f1dd45 100644 --- a/llvm/test/CodeGen/DirectX/atan2_error.ll +++ b/llvm/test/CodeGen/DirectX/atan2_error.ll @@ -1,11 +1,11 @@ -; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s - -; DXIL operation atan does not support double overload type -; CHECK: in function atan2_double -; CHECK-SAME: Cannot create ATan operation: Invalid overload type - -define noundef double @atan2_double(double noundef %a, double noundef %b) #0 { -entry: - %1 = call double @llvm.atan2.f64(double %a, double %b) - ret double %1 -} +; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s + +; DXIL operation atan does not support double overload type +; CHECK: in function atan2_double +; CHECK-SAME: Cannot create ATan operation: Invalid overload type + +define noundef double @atan2_double(double noundef %a, double noundef %b) #0 { +entry: + %1 = call double @llvm.atan2.f64(double %a, double %b) + ret double %1 +} diff --git a/llvm/test/CodeGen/DirectX/cross.ll b/llvm/test/CodeGen/DirectX/cross.ll index 6ec3ec4d3594..6153cf7cddc9 100644 --- a/llvm/test/CodeGen/DirectX/cross.ll +++ b/llvm/test/CodeGen/DirectX/cross.ll @@ -1,56 +1,56 @@ -; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s - -; Make sure dxil operation function calls for cross are generated for half/float. 
- -declare <3 x half> @llvm.dx.cross.v3f16(<3 x half>, <3 x half>) -declare <3 x float> @llvm.dx.cross.v3f32(<3 x float>, <3 x float>) - -define noundef <3 x half> @test_cross_half3(<3 x half> noundef %p0, <3 x half> noundef %p1) { -entry: - ; CHECK: %x0 = extractelement <3 x half> %p0, i64 0 - ; CHECK: %x1 = extractelement <3 x half> %p0, i64 1 - ; CHECK: %x2 = extractelement <3 x half> %p0, i64 2 - ; CHECK: %y0 = extractelement <3 x half> %p1, i64 0 - ; CHECK: %y1 = extractelement <3 x half> %p1, i64 1 - ; CHECK: %y2 = extractelement <3 x half> %p1, i64 2 - ; CHECK: %0 = fmul half %x1, %y2 - ; CHECK: %1 = fmul half %x2, %y1 - ; CHECK: %hlsl.cross1 = fsub half %0, %1 - ; CHECK: %2 = fmul half %x2, %y0 - ; CHECK: %3 = fmul half %x0, %y2 - ; CHECK: %hlsl.cross2 = fsub half %2, %3 - ; CHECK: %4 = fmul half %x0, %y1 - ; CHECK: %5 = fmul half %x1, %y0 - ; CHECK: %hlsl.cross3 = fsub half %4, %5 - ; CHECK: %6 = insertelement <3 x half> undef, half %hlsl.cross1, i64 0 - ; CHECK: %7 = insertelement <3 x half> %6, half %hlsl.cross2, i64 1 - ; CHECK: %8 = insertelement <3 x half> %7, half %hlsl.cross3, i64 2 - ; CHECK: ret <3 x half> %8 - %hlsl.cross = call <3 x half> @llvm.dx.cross.v3f16(<3 x half> %p0, <3 x half> %p1) - ret <3 x half> %hlsl.cross -} - -define noundef <3 x float> @test_cross_float3(<3 x float> noundef %p0, <3 x float> noundef %p1) { -entry: - ; CHECK: %x0 = extractelement <3 x float> %p0, i64 0 - ; CHECK: %x1 = extractelement <3 x float> %p0, i64 1 - ; CHECK: %x2 = extractelement <3 x float> %p0, i64 2 - ; CHECK: %y0 = extractelement <3 x float> %p1, i64 0 - ; CHECK: %y1 = extractelement <3 x float> %p1, i64 1 - ; CHECK: %y2 = extractelement <3 x float> %p1, i64 2 - ; CHECK: %0 = fmul float %x1, %y2 - ; CHECK: %1 = fmul float %x2, %y1 - ; CHECK: %hlsl.cross1 = fsub float %0, %1 - ; CHECK: %2 = fmul float %x2, %y0 - ; CHECK: %3 = fmul float %x0, %y2 - ; CHECK: %hlsl.cross2 = fsub float %2, %3 - ; CHECK: %4 = fmul float %x0, %y1 - ; CHECK: %5 = fmul float %x1, %y0 - ; CHECK: %hlsl.cross3 = fsub float %4, %5 - ; CHECK: %6 = insertelement <3 x float> undef, float %hlsl.cross1, i64 0 - ; CHECK: %7 = insertelement <3 x float> %6, float %hlsl.cross2, i64 1 - ; CHECK: %8 = insertelement <3 x float> %7, float %hlsl.cross3, i64 2 - ; CHECK: ret <3 x float> %8 - %hlsl.cross = call <3 x float> @llvm.dx.cross.v3f32(<3 x float> %p0, <3 x float> %p1) - ret <3 x float> %hlsl.cross -} +; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s + +; Make sure dxil operation function calls for cross are generated for half/float. 
+ +declare <3 x half> @llvm.dx.cross.v3f16(<3 x half>, <3 x half>) +declare <3 x float> @llvm.dx.cross.v3f32(<3 x float>, <3 x float>) + +define noundef <3 x half> @test_cross_half3(<3 x half> noundef %p0, <3 x half> noundef %p1) { +entry: + ; CHECK: %x0 = extractelement <3 x half> %p0, i64 0 + ; CHECK: %x1 = extractelement <3 x half> %p0, i64 1 + ; CHECK: %x2 = extractelement <3 x half> %p0, i64 2 + ; CHECK: %y0 = extractelement <3 x half> %p1, i64 0 + ; CHECK: %y1 = extractelement <3 x half> %p1, i64 1 + ; CHECK: %y2 = extractelement <3 x half> %p1, i64 2 + ; CHECK: %0 = fmul half %x1, %y2 + ; CHECK: %1 = fmul half %x2, %y1 + ; CHECK: %hlsl.cross1 = fsub half %0, %1 + ; CHECK: %2 = fmul half %x2, %y0 + ; CHECK: %3 = fmul half %x0, %y2 + ; CHECK: %hlsl.cross2 = fsub half %2, %3 + ; CHECK: %4 = fmul half %x0, %y1 + ; CHECK: %5 = fmul half %x1, %y0 + ; CHECK: %hlsl.cross3 = fsub half %4, %5 + ; CHECK: %6 = insertelement <3 x half> undef, half %hlsl.cross1, i64 0 + ; CHECK: %7 = insertelement <3 x half> %6, half %hlsl.cross2, i64 1 + ; CHECK: %8 = insertelement <3 x half> %7, half %hlsl.cross3, i64 2 + ; CHECK: ret <3 x half> %8 + %hlsl.cross = call <3 x half> @llvm.dx.cross.v3f16(<3 x half> %p0, <3 x half> %p1) + ret <3 x half> %hlsl.cross +} + +define noundef <3 x float> @test_cross_float3(<3 x float> noundef %p0, <3 x float> noundef %p1) { +entry: + ; CHECK: %x0 = extractelement <3 x float> %p0, i64 0 + ; CHECK: %x1 = extractelement <3 x float> %p0, i64 1 + ; CHECK: %x2 = extractelement <3 x float> %p0, i64 2 + ; CHECK: %y0 = extractelement <3 x float> %p1, i64 0 + ; CHECK: %y1 = extractelement <3 x float> %p1, i64 1 + ; CHECK: %y2 = extractelement <3 x float> %p1, i64 2 + ; CHECK: %0 = fmul float %x1, %y2 + ; CHECK: %1 = fmul float %x2, %y1 + ; CHECK: %hlsl.cross1 = fsub float %0, %1 + ; CHECK: %2 = fmul float %x2, %y0 + ; CHECK: %3 = fmul float %x0, %y2 + ; CHECK: %hlsl.cross2 = fsub float %2, %3 + ; CHECK: %4 = fmul float %x0, %y1 + ; CHECK: %5 = fmul float %x1, %y0 + ; CHECK: %hlsl.cross3 = fsub float %4, %5 + ; CHECK: %6 = insertelement <3 x float> undef, float %hlsl.cross1, i64 0 + ; CHECK: %7 = insertelement <3 x float> %6, float %hlsl.cross2, i64 1 + ; CHECK: %8 = insertelement <3 x float> %7, float %hlsl.cross3, i64 2 + ; CHECK: ret <3 x float> %8 + %hlsl.cross = call <3 x float> @llvm.dx.cross.v3f32(<3 x float> %p0, <3 x float> %p1) + ret <3 x float> %hlsl.cross +} diff --git a/llvm/test/CodeGen/DirectX/finalize_linkage.ll b/llvm/test/CodeGen/DirectX/finalize_linkage.ll index 0ee8a5f44593..b6da9f6cb392 100644 --- a/llvm/test/CodeGen/DirectX/finalize_linkage.ll +++ b/llvm/test/CodeGen/DirectX/finalize_linkage.ll @@ -1,64 +1,64 @@ -; RUN: opt -S -dxil-finalize-linkage -mtriple=dxil-unknown-shadermodel6.5-compute %s | FileCheck %s -; RUN: llc %s --filetype=asm -o - | FileCheck %s --check-prefixes=CHECK-LLC - -target triple = "dxilv1.5-pc-shadermodel6.5-compute" - -; DXILFinalizeLinkage changes linkage of all functions that are not -; entry points or exported function to internal. 
- -; CHECK: define internal void @"?f1@@YAXXZ"() -define void @"?f1@@YAXXZ"() #0 { -entry: - ret void -} - -; CHECK: define internal void @"?f2@@YAXXZ"() -define void @"?f2@@YAXXZ"() #0 { -entry: - ret void -} - -; CHECK: define internal void @"?f3@@YAXXZ"() -define void @"?f3@@YAXXZ"() #0 { -entry: - ret void -} - -; CHECK: define internal void @"?foo@@YAXXZ"() -define void @"?foo@@YAXXZ"() #0 { -entry: - call void @"?f2@@YAXXZ"() #3 - ret void -} - -; Exported function - do not change linkage -; CHECK: define void @"?bar@@YAXXZ"() -define void @"?bar@@YAXXZ"() #1 { -entry: - call void @"?f3@@YAXXZ"() #3 - ret void -} - -; CHECK: define internal void @"?main@@YAXXZ"() #0 -define internal void @"?main@@YAXXZ"() #0 { -entry: - call void @"?foo@@YAXXZ"() #3 - call void @"?bar@@YAXXZ"() #3 - ret void -} - -; Entry point function - do not change linkage -; CHECK: define void @main() #2 -define void @main() #2 { -entry: - call void @"?main@@YAXXZ"() - ret void -} - -attributes #0 = { convergent noinline nounwind optnone} -attributes #1 = { convergent noinline nounwind optnone "hlsl.export"} -attributes #2 = { convergent "hlsl.numthreads"="4,1,1" "hlsl.shader"="compute"} -attributes #3 = { convergent } - -; Make sure "hlsl.export" attribute is stripped by llc -; CHECK-LLC-NOT: "hlsl.export" +; RUN: opt -S -dxil-finalize-linkage -mtriple=dxil-unknown-shadermodel6.5-compute %s | FileCheck %s +; RUN: llc %s --filetype=asm -o - | FileCheck %s --check-prefixes=CHECK-LLC + +target triple = "dxilv1.5-pc-shadermodel6.5-compute" + +; DXILFinalizeLinkage changes linkage of all functions that are not +; entry points or exported function to internal. + +; CHECK: define internal void @"?f1@@YAXXZ"() +define void @"?f1@@YAXXZ"() #0 { +entry: + ret void +} + +; CHECK: define internal void @"?f2@@YAXXZ"() +define void @"?f2@@YAXXZ"() #0 { +entry: + ret void +} + +; CHECK: define internal void @"?f3@@YAXXZ"() +define void @"?f3@@YAXXZ"() #0 { +entry: + ret void +} + +; CHECK: define internal void @"?foo@@YAXXZ"() +define void @"?foo@@YAXXZ"() #0 { +entry: + call void @"?f2@@YAXXZ"() #3 + ret void +} + +; Exported function - do not change linkage +; CHECK: define void @"?bar@@YAXXZ"() +define void @"?bar@@YAXXZ"() #1 { +entry: + call void @"?f3@@YAXXZ"() #3 + ret void +} + +; CHECK: define internal void @"?main@@YAXXZ"() #0 +define internal void @"?main@@YAXXZ"() #0 { +entry: + call void @"?foo@@YAXXZ"() #3 + call void @"?bar@@YAXXZ"() #3 + ret void +} + +; Entry point function - do not change linkage +; CHECK: define void @main() #2 +define void @main() #2 { +entry: + call void @"?main@@YAXXZ"() + ret void +} + +attributes #0 = { convergent noinline nounwind optnone} +attributes #1 = { convergent noinline nounwind optnone "hlsl.export"} +attributes #2 = { convergent "hlsl.numthreads"="4,1,1" "hlsl.shader"="compute"} +attributes #3 = { convergent } + +; Make sure "hlsl.export" attribute is stripped by llc +; CHECK-LLC-NOT: "hlsl.export" diff --git a/llvm/test/CodeGen/DirectX/normalize.ll b/llvm/test/CodeGen/DirectX/normalize.ll index 2aba9d5f74d7..de106be12437 100644 --- a/llvm/test/CodeGen/DirectX/normalize.ll +++ b/llvm/test/CodeGen/DirectX/normalize.ll @@ -1,112 +1,112 @@ -; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK -; RUN: opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK - -; Make sure dxil operation function calls for normalize are generated for half/float. 
- -declare half @llvm.dx.normalize.f16(half) -declare <2 x half> @llvm.dx.normalize.v2f16(<2 x half>) -declare <3 x half> @llvm.dx.normalize.v3f16(<3 x half>) -declare <4 x half> @llvm.dx.normalize.v4f16(<4 x half>) - -declare float @llvm.dx.normalize.f32(float) -declare <2 x float> @llvm.dx.normalize.v2f32(<2 x float>) -declare <3 x float> @llvm.dx.normalize.v3f32(<3 x float>) -declare <4 x float> @llvm.dx.normalize.v4f32(<4 x float>) - -define noundef half @test_normalize_half(half noundef %p0) { -entry: - ; CHECK: fdiv half %p0, %p0 - %hlsl.normalize = call half @llvm.dx.normalize.f16(half %p0) - ret half %hlsl.normalize -} - -define noundef <2 x half> @test_normalize_half2(<2 x half> noundef %p0) { -entry: - ; EXPCHECK: [[doth2:%.*]] = call half @llvm.dx.dot2.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) - ; DOPCHECK: [[doth2:%.*]] = call half @dx.op.dot2.f16(i32 54, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth2]]) - ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth2]]) - ; CHECK: [[splatinserth2:%.*]] = insertelement <2 x half> poison, half [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] = shufflevector <2 x half> [[splatinserth2]], <2 x half> poison, <2 x i32> zeroinitializer - ; CHECK: fmul <2 x half> %p0, [[splat]] - - %hlsl.normalize = call <2 x half> @llvm.dx.normalize.v2f16(<2 x half> %p0) - ret <2 x half> %hlsl.normalize -} - -define noundef <3 x half> @test_normalize_half3(<3 x half> noundef %p0) { -entry: - ; EXPCHECK: [[doth3:%.*]] = call half @llvm.dx.dot3.v3f16(<3 x half> %{{.*}}, <3 x half> %{{.*}}) - ; DOPCHECK: [[doth3:%.*]] = call half @dx.op.dot3.f16(i32 55, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth3]]) - ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth3]]) - ; CHECK: [[splatinserth3:%.*]] = insertelement <3 x half> poison, half [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <3 x half> [[splatinserth3]], <3 x half> poison, <3 x i32> zeroinitializer - ; CHECK: fmul <3 x half> %p0, %.splat - - %hlsl.normalize = call <3 x half> @llvm.dx.normalize.v3f16(<3 x half> %p0) - ret <3 x half> %hlsl.normalize -} - -define noundef <4 x half> @test_normalize_half4(<4 x half> noundef %p0) { -entry: - ; EXPCHECK: [[doth4:%.*]] = call half @llvm.dx.dot4.v4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}) - ; DOPCHECK: [[doth4:%.*]] = call half @dx.op.dot4.f16(i32 56, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth4]]) - ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth4]]) - ; CHECK: [[splatinserth4:%.*]] = insertelement <4 x half> poison, half [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <4 x half> [[splatinserth4]], <4 x half> poison, <4 x i32> zeroinitializer - ; CHECK: fmul <4 x half> %p0, %.splat - - %hlsl.normalize = call <4 x half> @llvm.dx.normalize.v4f16(<4 x half> %p0) - ret <4 x half> %hlsl.normalize -} - -define noundef float @test_normalize_float(float noundef %p0) { -entry: - ; CHECK: fdiv float %p0, %p0 - %hlsl.normalize = call float @llvm.dx.normalize.f32(float %p0) - ret float %hlsl.normalize -} - -define noundef <2 x float> @test_normalize_float2(<2 x float> noundef %p0) { -entry: - ; EXPCHECK: [[dotf2:%.*]] = call float @llvm.dx.dot2.v2f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}) - ; DOPCHECK: [[dotf2:%.*]] = call float @dx.op.dot2.f32(i32 
54, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf2]]) - ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf2]]) - ; CHECK: [[splatinsertf2:%.*]] = insertelement <2 x float> poison, float [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <2 x float> [[splatinsertf2]], <2 x float> poison, <2 x i32> zeroinitializer - ; CHECK: fmul <2 x float> %p0, %.splat - - %hlsl.normalize = call <2 x float> @llvm.dx.normalize.v2f32(<2 x float> %p0) - ret <2 x float> %hlsl.normalize -} - -define noundef <3 x float> @test_normalize_float3(<3 x float> noundef %p0) { -entry: - ; EXPCHECK: [[dotf3:%.*]] = call float @llvm.dx.dot3.v3f32(<3 x float> %{{.*}}, <3 x float> %{{.*}}) - ; DOPCHECK: [[dotf3:%.*]] = call float @dx.op.dot3.f32(i32 55, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf3]]) - ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf3]]) - ; CHECK: [[splatinsertf3:%.*]] = insertelement <3 x float> poison, float [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <3 x float> [[splatinsertf3]], <3 x float> poison, <3 x i32> zeroinitializer - ; CHECK: fmul <3 x float> %p0, %.splat - - %hlsl.normalize = call <3 x float> @llvm.dx.normalize.v3f32(<3 x float> %p0) - ret <3 x float> %hlsl.normalize -} - -define noundef <4 x float> @test_normalize_float4(<4 x float> noundef %p0) { -entry: - ; EXPCHECK: [[dotf4:%.*]] = call float @llvm.dx.dot4.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}) - ; DOPCHECK: [[dotf4:%.*]] = call float @dx.op.dot4.f32(i32 56, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) - ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf4]]) - ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf4]]) - ; CHECK: [[splatinsertf4:%.*]] = insertelement <4 x float> poison, float [[rsqrt]], i64 0 - ; CHECK: [[splat:%.*]] shufflevector <4 x float> [[splatinsertf4]], <4 x float> poison, <4 x i32> zeroinitializer - ; CHECK: fmul <4 x float> %p0, %.splat - - %hlsl.normalize = call <4 x float> @llvm.dx.normalize.v4f32(<4 x float> %p0) - ret <4 x float> %hlsl.normalize -} +; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefixes=CHECK,EXPCHECK +; RUN: opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s --check-prefixes=CHECK,DOPCHECK + +; Make sure dxil operation function calls for normalize are generated for half/float. 
+ +declare half @llvm.dx.normalize.f16(half) +declare <2 x half> @llvm.dx.normalize.v2f16(<2 x half>) +declare <3 x half> @llvm.dx.normalize.v3f16(<3 x half>) +declare <4 x half> @llvm.dx.normalize.v4f16(<4 x half>) + +declare float @llvm.dx.normalize.f32(float) +declare <2 x float> @llvm.dx.normalize.v2f32(<2 x float>) +declare <3 x float> @llvm.dx.normalize.v3f32(<3 x float>) +declare <4 x float> @llvm.dx.normalize.v4f32(<4 x float>) + +define noundef half @test_normalize_half(half noundef %p0) { +entry: + ; CHECK: fdiv half %p0, %p0 + %hlsl.normalize = call half @llvm.dx.normalize.f16(half %p0) + ret half %hlsl.normalize +} + +define noundef <2 x half> @test_normalize_half2(<2 x half> noundef %p0) { +entry: + ; EXPCHECK: [[doth2:%.*]] = call half @llvm.dx.dot2.v2f16(<2 x half> %{{.*}}, <2 x half> %{{.*}}) + ; DOPCHECK: [[doth2:%.*]] = call half @dx.op.dot2.f16(i32 54, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth2]]) + ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth2]]) + ; CHECK: [[splatinserth2:%.*]] = insertelement <2 x half> poison, half [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] = shufflevector <2 x half> [[splatinserth2]], <2 x half> poison, <2 x i32> zeroinitializer + ; CHECK: fmul <2 x half> %p0, [[splat]] + + %hlsl.normalize = call <2 x half> @llvm.dx.normalize.v2f16(<2 x half> %p0) + ret <2 x half> %hlsl.normalize +} + +define noundef <3 x half> @test_normalize_half3(<3 x half> noundef %p0) { +entry: + ; EXPCHECK: [[doth3:%.*]] = call half @llvm.dx.dot3.v3f16(<3 x half> %{{.*}}, <3 x half> %{{.*}}) + ; DOPCHECK: [[doth3:%.*]] = call half @dx.op.dot3.f16(i32 55, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth3]]) + ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth3]]) + ; CHECK: [[splatinserth3:%.*]] = insertelement <3 x half> poison, half [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <3 x half> [[splatinserth3]], <3 x half> poison, <3 x i32> zeroinitializer + ; CHECK: fmul <3 x half> %p0, %.splat + + %hlsl.normalize = call <3 x half> @llvm.dx.normalize.v3f16(<3 x half> %p0) + ret <3 x half> %hlsl.normalize +} + +define noundef <4 x half> @test_normalize_half4(<4 x half> noundef %p0) { +entry: + ; EXPCHECK: [[doth4:%.*]] = call half @llvm.dx.dot4.v4f16(<4 x half> %{{.*}}, <4 x half> %{{.*}}) + ; DOPCHECK: [[doth4:%.*]] = call half @dx.op.dot4.f16(i32 56, half %{{.*}}, half %{{.*}}, half %{{.*}}, half %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call half @llvm.dx.rsqrt.f16(half [[doth4]]) + ; DOPCHECK: [[rsqrt:%.*]] = call half @dx.op.unary.f16(i32 25, half [[doth4]]) + ; CHECK: [[splatinserth4:%.*]] = insertelement <4 x half> poison, half [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <4 x half> [[splatinserth4]], <4 x half> poison, <4 x i32> zeroinitializer + ; CHECK: fmul <4 x half> %p0, %.splat + + %hlsl.normalize = call <4 x half> @llvm.dx.normalize.v4f16(<4 x half> %p0) + ret <4 x half> %hlsl.normalize +} + +define noundef float @test_normalize_float(float noundef %p0) { +entry: + ; CHECK: fdiv float %p0, %p0 + %hlsl.normalize = call float @llvm.dx.normalize.f32(float %p0) + ret float %hlsl.normalize +} + +define noundef <2 x float> @test_normalize_float2(<2 x float> noundef %p0) { +entry: + ; EXPCHECK: [[dotf2:%.*]] = call float @llvm.dx.dot2.v2f32(<2 x float> %{{.*}}, <2 x float> %{{.*}}) + ; DOPCHECK: [[dotf2:%.*]] = call float @dx.op.dot2.f32(i32 
54, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf2]]) + ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf2]]) + ; CHECK: [[splatinsertf2:%.*]] = insertelement <2 x float> poison, float [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <2 x float> [[splatinsertf2]], <2 x float> poison, <2 x i32> zeroinitializer + ; CHECK: fmul <2 x float> %p0, %.splat + + %hlsl.normalize = call <2 x float> @llvm.dx.normalize.v2f32(<2 x float> %p0) + ret <2 x float> %hlsl.normalize +} + +define noundef <3 x float> @test_normalize_float3(<3 x float> noundef %p0) { +entry: + ; EXPCHECK: [[dotf3:%.*]] = call float @llvm.dx.dot3.v3f32(<3 x float> %{{.*}}, <3 x float> %{{.*}}) + ; DOPCHECK: [[dotf3:%.*]] = call float @dx.op.dot3.f32(i32 55, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf3]]) + ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf3]]) + ; CHECK: [[splatinsertf3:%.*]] = insertelement <3 x float> poison, float [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <3 x float> [[splatinsertf3]], <3 x float> poison, <3 x i32> zeroinitializer + ; CHECK: fmul <3 x float> %p0, %.splat + + %hlsl.normalize = call <3 x float> @llvm.dx.normalize.v3f32(<3 x float> %p0) + ret <3 x float> %hlsl.normalize +} + +define noundef <4 x float> @test_normalize_float4(<4 x float> noundef %p0) { +entry: + ; EXPCHECK: [[dotf4:%.*]] = call float @llvm.dx.dot4.v4f32(<4 x float> %{{.*}}, <4 x float> %{{.*}}) + ; DOPCHECK: [[dotf4:%.*]] = call float @dx.op.dot4.f32(i32 56, float %{{.*}}, float %{{.*}}, float %{{.*}}, float %{{.*}}) + ; EXPCHECK: [[rsqrt:%.*]] = call float @llvm.dx.rsqrt.f32(float [[dotf4]]) + ; DOPCHECK: [[rsqrt:%.*]] = call float @dx.op.unary.f32(i32 25, float [[dotf4]]) + ; CHECK: [[splatinsertf4:%.*]] = insertelement <4 x float> poison, float [[rsqrt]], i64 0 + ; CHECK: [[splat:%.*]] shufflevector <4 x float> [[splatinsertf4]], <4 x float> poison, <4 x i32> zeroinitializer + ; CHECK: fmul <4 x float> %p0, %.splat + + %hlsl.normalize = call <4 x float> @llvm.dx.normalize.v4f32(<4 x float> %p0) + ret <4 x float> %hlsl.normalize +} diff --git a/llvm/test/CodeGen/DirectX/normalize_error.ll b/llvm/test/CodeGen/DirectX/normalize_error.ll index 35a91c0cdc24..3041d2ecdd92 100644 --- a/llvm/test/CodeGen/DirectX/normalize_error.ll +++ b/llvm/test/CodeGen/DirectX/normalize_error.ll @@ -1,10 +1,10 @@ -; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s - -; DXIL operation normalize does not support double overload type -; CHECK: Cannot create Dot2 operation: Invalid overload type - -define noundef <2 x double> @test_normalize_double2(<2 x double> noundef %p0) { -entry: - %hlsl.normalize = call <2 x double> @llvm.dx.normalize.v2f32(<2 x double> %p0) - ret <2 x double> %hlsl.normalize -} +; RUN: not opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library %s 2>&1 | FileCheck %s + +; DXIL operation normalize does not support double overload type +; CHECK: Cannot create Dot2 operation: Invalid overload type + +define noundef <2 x double> @test_normalize_double2(<2 x double> noundef %p0) { +entry: + %hlsl.normalize = call <2 x double> @llvm.dx.normalize.v2f32(<2 x double> %p0) + ret <2 x double> %hlsl.normalize +} diff --git a/llvm/test/CodeGen/DirectX/step.ll b/llvm/test/CodeGen/DirectX/step.ll 
index 1c9894026c62..6a9b5bf71da8 100644 --- a/llvm/test/CodeGen/DirectX/step.ll +++ b/llvm/test/CodeGen/DirectX/step.ll @@ -1,78 +1,78 @@ -; RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefix=CHECK -; RUN: opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s --check-prefix=CHECK - -; Make sure dxil operation function calls for step are generated for half/float. - -declare half @llvm.dx.step.f16(half, half) -declare <2 x half> @llvm.dx.step.v2f16(<2 x half>, <2 x half>) -declare <3 x half> @llvm.dx.step.v3f16(<3 x half>, <3 x half>) -declare <4 x half> @llvm.dx.step.v4f16(<4 x half>, <4 x half>) - -declare float @llvm.dx.step.f32(float, float) -declare <2 x float> @llvm.dx.step.v2f32(<2 x float>, <2 x float>) -declare <3 x float> @llvm.dx.step.v3f32(<3 x float>, <3 x float>) -declare <4 x float> @llvm.dx.step.v4f32(<4 x float>, <4 x float>) - -define noundef half @test_step_half(half noundef %p0, half noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt half %p1, %p0 - ; CHECK: %1 = select i1 %0, half 0xH0000, half 0xH3C00 - %hlsl.step = call half @llvm.dx.step.f16(half %p0, half %p1) - ret half %hlsl.step -} - -define noundef <2 x half> @test_step_half2(<2 x half> noundef %p0, <2 x half> noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt <2 x half> %p1, %p0 - ; CHECK: %1 = select <2 x i1> %0, <2 x half> zeroinitializer, <2 x half> - %hlsl.step = call <2 x half> @llvm.dx.step.v2f16(<2 x half> %p0, <2 x half> %p1) - ret <2 x half> %hlsl.step -} - -define noundef <3 x half> @test_step_half3(<3 x half> noundef %p0, <3 x half> noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt <3 x half> %p1, %p0 - ; CHECK: %1 = select <3 x i1> %0, <3 x half> zeroinitializer, <3 x half> - %hlsl.step = call <3 x half> @llvm.dx.step.v3f16(<3 x half> %p0, <3 x half> %p1) - ret <3 x half> %hlsl.step -} - -define noundef <4 x half> @test_step_half4(<4 x half> noundef %p0, <4 x half> noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt <4 x half> %p1, %p0 - ; CHECK: %1 = select <4 x i1> %0, <4 x half> zeroinitializer, <4 x half> - %hlsl.step = call <4 x half> @llvm.dx.step.v4f16(<4 x half> %p0, <4 x half> %p1) - ret <4 x half> %hlsl.step -} - -define noundef float @test_step_float(float noundef %p0, float noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt float %p1, %p0 - ; CHECK: %1 = select i1 %0, float 0.000000e+00, float 1.000000e+00 - %hlsl.step = call float @llvm.dx.step.f32(float %p0, float %p1) - ret float %hlsl.step -} - -define noundef <2 x float> @test_step_float2(<2 x float> noundef %p0, <2 x float> noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt <2 x float> %p1, %p0 - ; CHECK: %1 = select <2 x i1> %0, <2 x float> zeroinitializer, <2 x float> - %hlsl.step = call <2 x float> @llvm.dx.step.v2f32(<2 x float> %p0, <2 x float> %p1) - ret <2 x float> %hlsl.step -} - -define noundef <3 x float> @test_step_float3(<3 x float> noundef %p0, <3 x float> noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt <3 x float> %p1, %p0 - ; CHECK: %1 = select <3 x i1> %0, <3 x float> zeroinitializer, <3 x float> - %hlsl.step = call <3 x float> @llvm.dx.step.v3f32(<3 x float> %p0, <3 x float> %p1) - ret <3 x float> %hlsl.step -} - -define noundef <4 x float> @test_step_float4(<4 x float> noundef %p0, <4 x float> noundef %p1) { -entry: - ; CHECK: %0 = fcmp olt <4 x float> %p1, %p0 - ; CHECK: %1 = select <4 x i1> %0, <4 x float> zeroinitializer, <4 x float> - %hlsl.step = call <4 x float> @llvm.dx.step.v4f32(<4 x float> %p0, <4 x float> %p1) - ret <4 x float> %hlsl.step -} +; 
RUN: opt -S -dxil-intrinsic-expansion < %s | FileCheck %s --check-prefix=CHECK +; RUN: opt -S -dxil-intrinsic-expansion -dxil-op-lower -mtriple=dxil-pc-shadermodel6.3-library < %s | FileCheck %s --check-prefix=CHECK + +; Make sure dxil operation function calls for step are generated for half/float. + +declare half @llvm.dx.step.f16(half, half) +declare <2 x half> @llvm.dx.step.v2f16(<2 x half>, <2 x half>) +declare <3 x half> @llvm.dx.step.v3f16(<3 x half>, <3 x half>) +declare <4 x half> @llvm.dx.step.v4f16(<4 x half>, <4 x half>) + +declare float @llvm.dx.step.f32(float, float) +declare <2 x float> @llvm.dx.step.v2f32(<2 x float>, <2 x float>) +declare <3 x float> @llvm.dx.step.v3f32(<3 x float>, <3 x float>) +declare <4 x float> @llvm.dx.step.v4f32(<4 x float>, <4 x float>) + +define noundef half @test_step_half(half noundef %p0, half noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt half %p1, %p0 + ; CHECK: %1 = select i1 %0, half 0xH0000, half 0xH3C00 + %hlsl.step = call half @llvm.dx.step.f16(half %p0, half %p1) + ret half %hlsl.step +} + +define noundef <2 x half> @test_step_half2(<2 x half> noundef %p0, <2 x half> noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt <2 x half> %p1, %p0 + ; CHECK: %1 = select <2 x i1> %0, <2 x half> zeroinitializer, <2 x half> + %hlsl.step = call <2 x half> @llvm.dx.step.v2f16(<2 x half> %p0, <2 x half> %p1) + ret <2 x half> %hlsl.step +} + +define noundef <3 x half> @test_step_half3(<3 x half> noundef %p0, <3 x half> noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt <3 x half> %p1, %p0 + ; CHECK: %1 = select <3 x i1> %0, <3 x half> zeroinitializer, <3 x half> + %hlsl.step = call <3 x half> @llvm.dx.step.v3f16(<3 x half> %p0, <3 x half> %p1) + ret <3 x half> %hlsl.step +} + +define noundef <4 x half> @test_step_half4(<4 x half> noundef %p0, <4 x half> noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt <4 x half> %p1, %p0 + ; CHECK: %1 = select <4 x i1> %0, <4 x half> zeroinitializer, <4 x half> + %hlsl.step = call <4 x half> @llvm.dx.step.v4f16(<4 x half> %p0, <4 x half> %p1) + ret <4 x half> %hlsl.step +} + +define noundef float @test_step_float(float noundef %p0, float noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt float %p1, %p0 + ; CHECK: %1 = select i1 %0, float 0.000000e+00, float 1.000000e+00 + %hlsl.step = call float @llvm.dx.step.f32(float %p0, float %p1) + ret float %hlsl.step +} + +define noundef <2 x float> @test_step_float2(<2 x float> noundef %p0, <2 x float> noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt <2 x float> %p1, %p0 + ; CHECK: %1 = select <2 x i1> %0, <2 x float> zeroinitializer, <2 x float> + %hlsl.step = call <2 x float> @llvm.dx.step.v2f32(<2 x float> %p0, <2 x float> %p1) + ret <2 x float> %hlsl.step +} + +define noundef <3 x float> @test_step_float3(<3 x float> noundef %p0, <3 x float> noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt <3 x float> %p1, %p0 + ; CHECK: %1 = select <3 x i1> %0, <3 x float> zeroinitializer, <3 x float> + %hlsl.step = call <3 x float> @llvm.dx.step.v3f32(<3 x float> %p0, <3 x float> %p1) + ret <3 x float> %hlsl.step +} + +define noundef <4 x float> @test_step_float4(<4 x float> noundef %p0, <4 x float> noundef %p1) { +entry: + ; CHECK: %0 = fcmp olt <4 x float> %p1, %p0 + ; CHECK: %1 = select <4 x i1> %0, <4 x float> zeroinitializer, <4 x float> + %hlsl.step = call <4 x float> @llvm.dx.step.v4f32(<4 x float> %p0, <4 x float> %p1) + ret <4 x float> %hlsl.step +} diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll index bdbfc133efa2..a0306bae4a22 
100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/atan2.ll @@ -1,49 +1,49 @@ -; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 - -define noundef float @atan2_float(float noundef %a, float noundef %b) { -entry: -; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] -; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] -; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] - %elt.atan2 = call float @llvm.atan2.f32(float %a, float %b) - ret float %elt.atan2 -} - -define noundef half @atan2_half(half noundef %a, half noundef %b) { -entry: -; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] -; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] -; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] - %elt.atan2 = call half @llvm.atan2.f16(half %a, half %b) - ret half %elt.atan2 -} - -define noundef <4 x float> @atan2_float4(<4 x float> noundef %a, <4 x float> noundef %b) { -entry: - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] - %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %a, <4 x float> %b) - ret <4 x float> %elt.atan2 -} - -define noundef <4 x half> @atan2_half4(<4 x half> noundef %a, <4 x half> noundef %b) { -entry: - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] - %elt.atan2 = call <4 x half> @llvm.atan2.v4f16(<4 x half> %a, <4 x half> %b) - ret <4 x half> %elt.atan2 -} - -declare half @llvm.atan2.f16(half, half) -declare float @llvm.atan2.f32(float, float) -declare <4 x half> @llvm.atan2.v4f16(<4 x half>, <4 x half>) -declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>) +; RUN: llc -verify-machineinstrs -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 + +define noundef float @atan2_float(float noundef %a, float noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] + %elt.atan2 = call float @llvm.atan2.f32(float %a, float %b) + ret float %elt.atan2 +} + +define noundef half @atan2_half(half noundef %a, half noundef %b) { +entry: +; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] +; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] + %elt.atan2 = call half @llvm.atan2.f16(half 
%a, half %b) + ret half %elt.atan2 +} + +define noundef <4 x float> @atan2_float4(<4 x float> noundef %a, <4 x float> noundef %b) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] + %elt.atan2 = call <4 x float> @llvm.atan2.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %elt.atan2 +} + +define noundef <4 x half> @atan2_half4(<4 x half> noundef %a, <4 x half> noundef %b) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Atan2 %[[#arg0]] %[[#arg1]] + %elt.atan2 = call <4 x half> @llvm.atan2.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %elt.atan2 +} + +declare half @llvm.atan2.f16(half, half) +declare float @llvm.atan2.f32(float, float) +declare <4 x half> @llvm.atan2.v4f16(<4 x half>, <4 x half>) +declare <4 x float> @llvm.atan2.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll index 2e0eb8c429ac..7c06c14bb968 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/cross.ll @@ -1,33 +1,33 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for cross are lowered correctly. - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec3_float_16:]] = OpTypeVector %[[#float_16]] 3 -; CHECK-DAG: %[[#vec3_float_32:]] = OpTypeVector %[[#float_32]] 3 - -define noundef <3 x half> @cross_half4(<3 x half> noundef %a, <3 x half> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec3_float_16]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_16]] - ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_16]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]] - %hlsl.cross = call <3 x half> @llvm.spv.cross.v4f16(<3 x half> %a, <3 x half> %b) - ret <3 x half> %hlsl.cross -} - -define noundef <3 x float> @cross_float4(<3 x float> noundef %a, <3 x float> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec3_float_32]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_32]] - ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_32]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]] - %hlsl.cross = call <3 x float> @llvm.spv.cross.v4f32(<3 x float> %a, <3 x float> %b) - ret <3 x float> %hlsl.cross -} - -declare <3 x half> @llvm.spv.cross.v4f16(<3 x half>, <3 x half>) -declare <3 x float> @llvm.spv.cross.v4f32(<3 x float>, <3 x float>) +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for cross are lowered correctly. 
+ +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec3_float_16:]] = OpTypeVector %[[#float_16]] 3 +; CHECK-DAG: %[[#vec3_float_32:]] = OpTypeVector %[[#float_32]] 3 + +define noundef <3 x half> @cross_half4(<3 x half> noundef %a, <3 x half> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec3_float_16]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_16]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_16]] + ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_16]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]] + %hlsl.cross = call <3 x half> @llvm.spv.cross.v4f16(<3 x half> %a, <3 x half> %b) + ret <3 x half> %hlsl.cross +} + +define noundef <3 x float> @cross_float4(<3 x float> noundef %a, <3 x float> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec3_float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec3_float_32]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec3_float_32]] + ; CHECK: %[[#]] = OpExtInst %[[#vec3_float_32]] %[[#op_ext_glsl]] Cross %[[#arg0]] %[[#arg1]] + %hlsl.cross = call <3 x float> @llvm.spv.cross.v4f32(<3 x float> %a, <3 x float> %b) + ret <3 x float> %hlsl.cross +} + +declare <3 x half> @llvm.spv.cross.v4f16(<3 x half>, <3 x half>) +declare <3 x float> @llvm.spv.cross.v4f32(<3 x float>, <3 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll index b4a9d8e0664b..df1ef3a7287c 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/length.ll @@ -1,29 +1,29 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for length are lowered correctly. - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 - -define noundef half @length_half4(<4 x half> noundef %a) { -entry: - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Length %[[#arg0]] - %hlsl.length = call half @llvm.spv.length.v4f16(<4 x half> %a) - ret half %hlsl.length -} - -define noundef float @length_float4(<4 x float> noundef %a) { -entry: - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] - ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Length %[[#arg0]] - %hlsl.length = call float @llvm.spv.length.v4f32(<4 x float> %a) - ret float %hlsl.length -} - -declare half @llvm.spv.length.v4f16(<4 x half>) -declare float @llvm.spv.length.v4f32(<4 x float>) +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for length are lowered correctly. 
+ +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + +define noundef half @length_half4(<4 x half> noundef %a) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#]] = OpExtInst %[[#float_16]] %[[#op_ext_glsl]] Length %[[#arg0]] + %hlsl.length = call half @llvm.spv.length.v4f16(<4 x half> %a) + ret half %hlsl.length +} + +define noundef float @length_float4(<4 x float> noundef %a) { +entry: + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#]] + ; CHECK: %[[#]] = OpExtInst %[[#float_32]] %[[#op_ext_glsl]] Length %[[#arg0]] + %hlsl.length = call float @llvm.spv.length.v4f32(<4 x float> %a) + ret float %hlsl.length +} + +declare half @llvm.spv.length.v4f16(<4 x half>) +declare float @llvm.spv.length.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll index fa73b9c2a4d3..4659b5146e43 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/normalize.ll @@ -1,31 +1,31 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for normalize are lowered correctly. - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 - -define noundef <4 x half> @normalize_half4(<4 x half> noundef %a) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Normalize %[[#arg0]] - %hlsl.normalize = call <4 x half> @llvm.spv.normalize.v4f16(<4 x half> %a) - ret <4 x half> %hlsl.normalize -} - -define noundef <4 x float> @normalize_float4(<4 x float> noundef %a) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Normalize %[[#arg0]] - %hlsl.normalize = call <4 x float> @llvm.spv.normalize.v4f32(<4 x float> %a) - ret <4 x float> %hlsl.normalize -} - -declare <4 x half> @llvm.spv.normalize.v4f16(<4 x half>) -declare <4 x float> @llvm.spv.normalize.v4f32(<4 x float>) +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for normalize are lowered correctly. 
+ +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + +define noundef <4 x half> @normalize_half4(<4 x half> noundef %a) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Normalize %[[#arg0]] + %hlsl.normalize = call <4 x half> @llvm.spv.normalize.v4f16(<4 x half> %a) + ret <4 x half> %hlsl.normalize +} + +define noundef <4 x float> @normalize_float4(<4 x float> noundef %a) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Normalize %[[#arg0]] + %hlsl.normalize = call <4 x float> @llvm.spv.normalize.v4f32(<4 x float> %a) + ret <4 x float> %hlsl.normalize +} + +declare <4 x half> @llvm.spv.normalize.v4f16(<4 x half>) +declare <4 x float> @llvm.spv.normalize.v4f32(<4 x float>) diff --git a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll index bb50d8c790f8..7c0ee9398d15 100644 --- a/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll +++ b/llvm/test/CodeGen/SPIRV/hlsl-intrinsics/step.ll @@ -1,33 +1,33 @@ -; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s -; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} - -; Make sure SPIRV operation function calls for step are lowered correctly. - -; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" -; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 -; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 -; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 -; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 - -define noundef <4 x half> @step_half4(<4 x half> noundef %a, <4 x half> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Step %[[#arg0]] %[[#arg1]] - %hlsl.step = call <4 x half> @llvm.spv.step.v4f16(<4 x half> %a, <4 x half> %b) - ret <4 x half> %hlsl.step -} - -define noundef <4 x float> @step_float4(<4 x float> noundef %a, <4 x float> noundef %b) { -entry: - ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] - ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] - ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] - ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Step %[[#arg0]] %[[#arg1]] - %hlsl.step = call <4 x float> @llvm.spv.step.v4f32(<4 x float> %a, <4 x float> %b) - ret <4 x float> %hlsl.step -} - -declare <4 x half> @llvm.spv.step.v4f16(<4 x half>, <4 x half>) -declare <4 x float> @llvm.spv.step.v4f32(<4 x float>, <4 x float>) +; RUN: llc -O0 -mtriple=spirv-unknown-unknown %s -o - | FileCheck %s +; RUN: %if spirv-tools %{ llc -O0 -mtriple=spirv-unknown-unknown %s -o - -filetype=obj | spirv-val %} + +; Make sure SPIRV operation function calls for step are lowered correctly. 
+ +; CHECK-DAG: %[[#op_ext_glsl:]] = OpExtInstImport "GLSL.std.450" +; CHECK-DAG: %[[#float_32:]] = OpTypeFloat 32 +; CHECK-DAG: %[[#float_16:]] = OpTypeFloat 16 +; CHECK-DAG: %[[#vec4_float_16:]] = OpTypeVector %[[#float_16]] 4 +; CHECK-DAG: %[[#vec4_float_32:]] = OpTypeVector %[[#float_32]] 4 + +define noundef <4 x half> @step_half4(<4 x half> noundef %a, <4 x half> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_16]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_16]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_16]] %[[#op_ext_glsl]] Step %[[#arg0]] %[[#arg1]] + %hlsl.step = call <4 x half> @llvm.spv.step.v4f16(<4 x half> %a, <4 x half> %b) + ret <4 x half> %hlsl.step +} + +define noundef <4 x float> @step_float4(<4 x float> noundef %a, <4 x float> noundef %b) { +entry: + ; CHECK: %[[#]] = OpFunction %[[#vec4_float_32]] None %[[#]] + ; CHECK: %[[#arg0:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#arg1:]] = OpFunctionParameter %[[#vec4_float_32]] + ; CHECK: %[[#]] = OpExtInst %[[#vec4_float_32]] %[[#op_ext_glsl]] Step %[[#arg0]] %[[#arg1]] + %hlsl.step = call <4 x float> @llvm.spv.step.v4f32(<4 x float> %a, <4 x float> %b) + ret <4 x float> %hlsl.step +} + +declare <4 x half> @llvm.spv.step.v4f16(<4 x half>, <4 x half>) +declare <4 x float> @llvm.spv.step.v4f32(<4 x float>, <4 x float>) diff --git a/llvm/test/Demangle/ms-placeholder-return-type.test b/llvm/test/Demangle/ms-placeholder-return-type.test index 18038e636c8d..a656400fe140 100644 --- a/llvm/test/Demangle/ms-placeholder-return-type.test +++ b/llvm/test/Demangle/ms-placeholder-return-type.test @@ -1,18 +1,18 @@ -; RUN: llvm-undname < %s | FileCheck %s - -; CHECK-NOT: Invalid mangled name - -?TestNonTemplateAuto@@YA@XZ -; CHECK: __cdecl TestNonTemplateAuto(void) - -??$AutoT@X@@YA?A_PXZ -; CHECK: auto __cdecl AutoT(void) - -??$AutoT@X@@YA?B_PXZ -; CHECK: auto const __cdecl AutoT(void) - -??$AutoT@X@@YA?A_TXZ -; CHECK: decltype(auto) __cdecl AutoT(void) - -??$AutoT@X@@YA?B_TXZ -; CHECK: decltype(auto) const __cdecl AutoT(void) +; RUN: llvm-undname < %s | FileCheck %s + +; CHECK-NOT: Invalid mangled name + +?TestNonTemplateAuto@@YA@XZ +; CHECK: __cdecl TestNonTemplateAuto(void) + +??$AutoT@X@@YA?A_PXZ +; CHECK: auto __cdecl AutoT(void) + +??$AutoT@X@@YA?B_PXZ +; CHECK: auto const __cdecl AutoT(void) + +??$AutoT@X@@YA?A_TXZ +; CHECK: decltype(auto) __cdecl AutoT(void) + +??$AutoT@X@@YA?B_TXZ +; CHECK: decltype(auto) const __cdecl AutoT(void) diff --git a/llvm/test/FileCheck/dos-style-eol.txt b/llvm/test/FileCheck/dos-style-eol.txt index 4252aad4d3e7..52184f465c3f 100644 --- a/llvm/test/FileCheck/dos-style-eol.txt +++ b/llvm/test/FileCheck/dos-style-eol.txt @@ -1,11 +1,11 @@ -// Test for using FileCheck on DOS style end-of-line -// This test was deliberately committed with DOS style end of line. -// Don't change line endings! -// RUN: FileCheck -input-file %s %s -// RUN: FileCheck --strict-whitespace -input-file %s %s - -LINE 1 -; CHECK: {{^}}LINE 1{{$}} - -LINE 2 +// Test for using FileCheck on DOS style end-of-line +// This test was deliberately committed with DOS style end of line. +// Don't change line endings! 
+// RUN: FileCheck -input-file %s %s +// RUN: FileCheck --strict-whitespace -input-file %s %s + +LINE 1 +; CHECK: {{^}}LINE 1{{$}} + +LINE 2 ; CHECK: {{^}}LINE 2{{$}} \ No newline at end of file diff --git a/llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri b/llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri index 72d23d041ae8..857c4ff87b6c 100644 --- a/llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri +++ b/llvm/test/tools/llvm-ar/Inputs/mri-crlf.mri @@ -1,4 +1,4 @@ -; this file intentionally has crlf line endings -create crlf.a -addmod foo.txt -end +; this file intentionally has crlf line endings +create crlf.a +addmod foo.txt +end diff --git a/llvm/test/tools/llvm-cvtres/Inputs/languages.rc b/llvm/test/tools/llvm-cvtres/Inputs/languages.rc index 081b3a77bebc..82031d0e2083 100644 --- a/llvm/test/tools/llvm-cvtres/Inputs/languages.rc +++ b/llvm/test/tools/llvm-cvtres/Inputs/languages.rc @@ -1,36 +1,36 @@ -#include "windows.h" - -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US -randomdat RCDATA -{ - "this is a random bit of data that means nothing\0", - 0x23a9, - 0x140e, - 194292, -} - -LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED -randomdat RCDATA -{ - "zhe4 shi4 yi1ge4 sui2ji1 de shu4ju4, zhe4 yi4wei4zhe shen2me\0", - 0x23a9, - 0x140e, - 194292, -} - -LANGUAGE LANG_GERMAN, SUBLANG_GERMAN_LUXEMBOURG -randomdat RCDATA -{ - "Dies ist ein zufälliges Bit von Daten, die nichts bedeutet\0", - 0x23a9, - 0x140e, - 194292, -} - -LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED -myaccelerators ACCELERATORS -{ - "^C", 999, VIRTKEY, ALT - "D", 1100, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} +#include "windows.h" + +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US +randomdat RCDATA +{ + "this is a random bit of data that means nothing\0", + 0x23a9, + 0x140e, + 194292, +} + +LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED +randomdat RCDATA +{ + "zhe4 shi4 yi1ge4 sui2ji1 de shu4ju4, zhe4 yi4wei4zhe shen2me\0", + 0x23a9, + 0x140e, + 194292, +} + +LANGUAGE LANG_GERMAN, SUBLANG_GERMAN_LUXEMBOURG +randomdat RCDATA +{ + "Dies ist ein zufälliges Bit von Daten, die nichts bedeutet\0", + 0x23a9, + 0x140e, + 194292, +} + +LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED +myaccelerators ACCELERATORS +{ + "^C", 999, VIRTKEY, ALT + "D", 1100, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} diff --git a/llvm/test/tools/llvm-cvtres/Inputs/test_resource.rc b/llvm/test/tools/llvm-cvtres/Inputs/test_resource.rc index 5ca097baa0f7..494849f57a0a 100644 --- a/llvm/test/tools/llvm-cvtres/Inputs/test_resource.rc +++ b/llvm/test/tools/llvm-cvtres/Inputs/test_resource.rc @@ -1,50 +1,50 @@ -#include "windows.h" - -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US - -myaccelerators ACCELERATORS -{ - "^C", 999, VIRTKEY, ALT - "D", 1100, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} - -cursor BITMAP "cursor_small.bmp" -okay BITMAP "okay_small.bmp" - -14432 MENU -LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED -{ - MENUITEM "yu", 100 - MENUITEM "shala", 101 - MENUITEM "kaoya", 102 -} - -testdialog DIALOG 10, 10, 200, 300 -STYLE WS_POPUP | WS_BORDER -CAPTION "Test" -{ - CTEXT "Continue:", 1, 10, 10, 230, 14 - PUSHBUTTON "&OK", 2, 66, 134, 161, 13 -} - -12 ACCELERATORS -{ - "X", 164, VIRTKEY, ALT - "H", 5678, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} - -"eat" MENU -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS -{ - MENUITEM "fish", 100 - MENUITEM "salad", 101 - MENUITEM "duck", 102 -} - - -myresource stringarray { - "this is a user defined resource\0", - "it contains many strings\0", +#include "windows.h" + +LANGUAGE 
LANG_ENGLISH, SUBLANG_ENGLISH_US + +myaccelerators ACCELERATORS +{ + "^C", 999, VIRTKEY, ALT + "D", 1100, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} + +cursor BITMAP "cursor_small.bmp" +okay BITMAP "okay_small.bmp" + +14432 MENU +LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED +{ + MENUITEM "yu", 100 + MENUITEM "shala", 101 + MENUITEM "kaoya", 102 +} + +testdialog DIALOG 10, 10, 200, 300 +STYLE WS_POPUP | WS_BORDER +CAPTION "Test" +{ + CTEXT "Continue:", 1, 10, 10, 230, 14 + PUSHBUTTON "&OK", 2, 66, 134, 161, 13 +} + +12 ACCELERATORS +{ + "X", 164, VIRTKEY, ALT + "H", 5678, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} + +"eat" MENU +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS +{ + MENUITEM "fish", 100 + MENUITEM "salad", 101 + MENUITEM "duck", 102 +} + + +myresource stringarray { + "this is a user defined resource\0", + "it contains many strings\0", } \ No newline at end of file diff --git a/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc b/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc index bb79dca399c2..c700b587af64 100644 --- a/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc +++ b/llvm/test/tools/llvm-rc/Inputs/dialog-with-menu.rc @@ -1,16 +1,16 @@ -101 DIALOG 0, 0, 362, 246 -STYLE 0x40l | 0x0004l | 0x0008l | 0x0800l | 0x00020000l | - 0x00010000l | 0x80000000l | 0x10000000l | 0x02000000l | 0x00C00000l | - 0x00080000l | 0x00040000l -CAPTION "MakeNSISW" -MENU 104 -FONT 8, "MS Shell Dlg" -BEGIN - CONTROL "",202,"RichEdit20A",0x0004l | 0x0040l | - 0x0100l | 0x0800l | 0x00008000 | - 0x00010000l | 0x00800000l | 0x00200000l,7,22,348,190 - CONTROL "",-1,"Static",0x00000010l,7,220,346,1 - LTEXT "",200,7,230,200,12,0x08000000l - DEFPUSHBUTTON "Test &Installer",203,230,226,60,15,0x08000000l | 0x00010000l - PUSHBUTTON "&Close",2,296,226,49,15,0x00010000l -END +101 DIALOG 0, 0, 362, 246 +STYLE 0x40l | 0x0004l | 0x0008l | 0x0800l | 0x00020000l | + 0x00010000l | 0x80000000l | 0x10000000l | 0x02000000l | 0x00C00000l | + 0x00080000l | 0x00040000l +CAPTION "MakeNSISW" +MENU 104 +FONT 8, "MS Shell Dlg" +BEGIN + CONTROL "",202,"RichEdit20A",0x0004l | 0x0040l | + 0x0100l | 0x0800l | 0x00008000 | + 0x00010000l | 0x00800000l | 0x00200000l,7,22,348,190 + CONTROL "",-1,"Static",0x00000010l,7,220,346,1 + LTEXT "",200,7,230,200,12,0x08000000l + DEFPUSHBUTTON "Test &Installer",203,230,226,60,15,0x08000000l | 0x00010000l + PUSHBUTTON "&Close",2,296,226,49,15,0x00010000l +END diff --git a/llvm/test/tools/llvm-readobj/COFF/Inputs/resources/test_resource.rc b/llvm/test/tools/llvm-readobj/COFF/Inputs/resources/test_resource.rc index fd616520dbe1..6ad56bc02d73 100644 --- a/llvm/test/tools/llvm-readobj/COFF/Inputs/resources/test_resource.rc +++ b/llvm/test/tools/llvm-readobj/COFF/Inputs/resources/test_resource.rc @@ -1,44 +1,44 @@ -#include "windows.h" - -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US - -myaccelerators ACCELERATORS -{ - "^C", 999, VIRTKEY, ALT - "D", 1100, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} - -cursor BITMAP "cursor_small.bmp" -okay BITMAP "okay_small.bmp" - -14432 MENU -LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED -{ - MENUITEM "yu", 100 - MENUITEM "shala", 101 - MENUITEM "kaoya", 102 -} - -testdialog DIALOG 10, 10, 200, 300 -STYLE WS_POPUP | WS_BORDER -CAPTION "Test" -{ - CTEXT "Continue:", 1, 10, 10, 230, 14 - PUSHBUTTON "&OK", 2, 66, 134, 161, 13 -} - -12 ACCELERATORS -{ - "X", 164, VIRTKEY, ALT - "H", 5678, VIRTKEY, CONTROL, SHIFT - "^R", 444, ASCII, NOINVERT -} - -"eat" MENU -LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS -{ - MENUITEM "fish", 100 
- MENUITEM "salad", 101 - MENUITEM "duck", 102 -} +#include "windows.h" + +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_US + +myaccelerators ACCELERATORS +{ + "^C", 999, VIRTKEY, ALT + "D", 1100, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} + +cursor BITMAP "cursor_small.bmp" +okay BITMAP "okay_small.bmp" + +14432 MENU +LANGUAGE LANG_CHINESE, SUBLANG_CHINESE_SIMPLIFIED +{ + MENUITEM "yu", 100 + MENUITEM "shala", 101 + MENUITEM "kaoya", 102 +} + +testdialog DIALOG 10, 10, 200, 300 +STYLE WS_POPUP | WS_BORDER +CAPTION "Test" +{ + CTEXT "Continue:", 1, 10, 10, 230, 14 + PUSHBUTTON "&OK", 2, 66, 134, 161, 13 +} + +12 ACCELERATORS +{ + "X", 164, VIRTKEY, ALT + "H", 5678, VIRTKEY, CONTROL, SHIFT + "^R", 444, ASCII, NOINVERT +} + +"eat" MENU +LANGUAGE LANG_ENGLISH, SUBLANG_ENGLISH_AUS +{ + MENUITEM "fish", 100 + MENUITEM "salad", 101 + MENUITEM "duck", 102 +} diff --git a/llvm/unittests/Support/ModRefTest.cpp b/llvm/unittests/Support/ModRefTest.cpp index 35107e50b32d..f77e7e39e14e 100644 --- a/llvm/unittests/Support/ModRefTest.cpp +++ b/llvm/unittests/Support/ModRefTest.cpp @@ -1,27 +1,27 @@ -//===- llvm/unittest/Support/ModRefTest.cpp - ModRef tests ----------------===// -// -// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. -// See https://llvm.org/LICENSE.txt for license information. -// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception -// -//===----------------------------------------------------------------------===// - -#include "llvm/Support/ModRef.h" -#include "llvm/ADT/SmallString.h" -#include "llvm/Support/raw_ostream.h" -#include "gtest/gtest.h" -#include <string> - -using namespace llvm; - -namespace { - -// Verify that printing a MemoryEffects does not end with a ,. -TEST(ModRefTest, PrintMemoryEffects) { - std::string S; - raw_string_ostream OS(S); - OS << MemoryEffects::none(); - EXPECT_EQ(S, "ArgMem: NoModRef, InaccessibleMem: NoModRef, Other: NoModRef"); -} - -} // namespace +//===- llvm/unittest/Support/ModRefTest.cpp - ModRef tests ----------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "llvm/Support/ModRef.h" +#include "llvm/ADT/SmallString.h" +#include "llvm/Support/raw_ostream.h" +#include "gtest/gtest.h" +#include <string> + +using namespace llvm; + +namespace { + +// Verify that printing a MemoryEffects does not end with a ,.
+TEST(ModRefTest, PrintMemoryEffects) { + std::string S; + raw_string_ostream OS(S); + OS << MemoryEffects::none(); + EXPECT_EQ(S, "ArgMem: NoModRef, InaccessibleMem: NoModRef, Other: NoModRef"); +} + +} // namespace diff --git a/llvm/utils/LLVMVisualizers/llvm.natvis b/llvm/utils/LLVMVisualizers/llvm.natvis index d83ae8013c51..03ca2d33a80b 100644 --- a/llvm/utils/LLVMVisualizers/llvm.natvis +++ b/llvm/utils/LLVMVisualizers/llvm.natvis @@ -1,408 +1,408 @@ - - - - - empty - {(value_type*)BeginX,[Size]} - {Size} elements - Uninitialized - - Size - Capacity - - Size - (value_type*)BeginX - - - - - - {U.VAL} - Cannot visualize APInts longer than 64 bits - - - {Data,[Length]} - {Length} elements - Uninitialized - - Length - - Length - Data - - - - - {(const char*)BeginX,[Size]s8} - (const char*)BeginX,[Size] - - Size - Capacity - - Size - (char*)BeginX - - - - - - {First,[Last - First]s8} - - - - {Data,[Length]s8} - Data,[Length]s8 - - Length - - Length - Data - - - - - - {($T1)*(intptr_t *)Data} - - - - - - {($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask)} - {($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask)} - {$T6::IntMask}: {($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask)} [{($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask)}] - - ($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask) - ($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask) - - - - - {($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask)} - {((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask)} - {$T5::IntMask}: {($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask)} [{((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask)}] - - ($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask) - ((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask) - - - - - - {($T4)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask)} - - - {($T5)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask)} - - Unexpected index in PointerUnion: {(*(intptr_t *)Val.Value.Data>>$T2::InfoTy::IntShift) & $T2::InfoTy::IntMask} - - "$T4",s8b - - ($T4)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask) - - "$T5",s8b - - ($T5)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask) - - - - - - {{ empty }} - {{ head={Head} }} - - - Head - Next - this - - - - - - empty - RefPtr [1 ref] {*Obj} - RefPtr [{Obj->RefCount} refs] {*Obj} - - Obj->RefCount - Obj - - - - - {{ [Small Mode] size={NumNonEmpty}, capacity={CurArraySize} }} - {{ [Big Mode] size={NumNonEmpty}, capacity={CurArraySize} }} - - NumNonEmpty - CurArraySize - - NumNonEmpty - ($T1*)CurArray - - - - - - empty - {{ size={NumEntries}, buckets={NumBuckets} }} - - NumEntries - NumBuckets - - NumBuckets - Buckets - - - - - - {{ size={NumItems}, buckets={NumBuckets} }} - - NumItems - NumBuckets - - NumBuckets - (MapEntryTy**)TheTable - - - - - - empty - ({this+1,s8}, {second}) - - this+1,s - second - - - - - {Data} - - - - None - {Storage.value} - - Storage.value - - - - - Error - {*((storage_type *)TStorage.buffer)} - - *((storage_type *)TStorage.buffer) - *((error_type *)ErrorStorage.buffer) - - - - - - - {{little endian value = {*(($T1*)(unsigned char *)Value.buffer)} }} - - (unsigned char *)Value.buffer,1 - (unsigned char *)Value.buffer,2 - (unsigned char *)Value.buffer,4 - (unsigned char *)Value.buffer,8 - - - - - - {{ big endian value = {*(unsigned char *)Value.buffer} }} - {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 8) - | ($T1)(*((unsigned char *)Value.buffer+1))} }} - {{ big endian value = 
{(($T1)(*(unsigned char *)Value.buffer) << 24) - | (($T1)(*((unsigned char *)Value.buffer+1)) << 16) - | (($T1)(*((unsigned char *)Value.buffer+2)) << 8) - | ($T1)(*((unsigned char *)Value.buffer+3))} }} - {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 56) - | (($T1)(*((unsigned char *)Value.buffer+1)) << 48) - | (($T1)(*((unsigned char *)Value.buffer+2)) << 40) - | (($T1)(*((unsigned char *)Value.buffer+3)) << 32) - | (($T1)(*((unsigned char *)Value.buffer+4)) << 24) - | (($T1)(*((unsigned char *)Value.buffer+5)) << 16) - | (($T1)(*((unsigned char *)Value.buffer+6)) << 8) - | ($T1)(*((unsigned char *)Value.buffer+7))} }} - - (unsigned char *)Value.buffer,1 - (unsigned char *)Value.buffer,2 - (unsigned char *)Value.buffer,4 - (unsigned char *)Value.buffer,8 - - - - - {ID} - - ID - - SubclassData - - *ContainedTys - - {NumContainedTys - 1} - - - NumContainedTys - 1 - ContainedTys + 1 - - - - SubclassData == 1 - - (SubclassData & llvm::StructType::SCDB_HasBody) != 0 - (SubclassData & llvm::StructType::SCDB_Packed) != 0 - (SubclassData & llvm::StructType::SCDB_IsLiteral) != 0 - (SubclassData & llvm::StructType::SCDB_IsSized) != 0 - - {NumContainedTys} - - - NumContainedTys - ContainedTys - - - - - *ContainedTys - ((llvm::ArrayType*)this)->NumElements - - *ContainedTys - ((llvm::VectorType*)this)->ElementQuantity - - *ContainedTys - ((llvm::VectorType*)this)->ElementQuantity - - SubclassData - *ContainedTys - - Context - - - - - $(Type) {*Value} - - - - $(Type) {(llvm::ISD::NodeType)this->NodeType} - - - NumOperands - OperandList - - - - - - i{Val.BitWidth} {Val.VAL} - - - - {IDAndSubclassData >> 8}bit integer type - - - - $(Type) {*VTy} {this->getName()} {SubclassData} - $(Type) {*VTy} anon {SubclassData} - - (Instruction*)this - (User*)this - - UseList - Next - Prev.Value & 3 == 3 ? 
(User*)(this + 1) : (User*)(this + 2) - - - - - - - Val - - - - - - - $(Type) {*VTy} {this->getName()} {SubclassData} - $(Type) {*VTy} anon {SubclassData} - - (Value*)this,nd - *VTy - - NumUserOperands - (llvm::Use*)this - NumUserOperands - - - NumUserOperands - *((llvm::Use**)this - 1) - - - - - - {getOpcodeName(SubclassID - InstructionVal)} - - (User*)this,nd - - - - - {this->getName()} {(LinkageTypes)Linkage} {(VisibilityTypes)Visibility} {(DLLStorageClassTypes)DllStorageClass} {(llvm::GlobalValue::ThreadLocalMode) ThreadLocal} - - - - - - - this - Next - this - - - - - - - pImpl - - - - - {ModuleID,s8} {TargetTriple} - - - - $(Type) {PassID} {Kind} - - + + + + + empty + {(value_type*)BeginX,[Size]} + {Size} elements + Uninitialized + + Size + Capacity + + Size + (value_type*)BeginX + + + + + + {U.VAL} + Cannot visualize APInts longer than 64 bits + + + {Data,[Length]} + {Length} elements + Uninitialized + + Length + + Length + Data + + + + + {(const char*)BeginX,[Size]s8} + (const char*)BeginX,[Size] + + Size + Capacity + + Size + (char*)BeginX + + + + + + {First,[Last - First]s8} + + + + {Data,[Length]s8} + Data,[Length]s8 + + Length + + Length + Data + + + + + + {($T1)*(intptr_t *)Data} + + + + + + {($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask)} + {($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask)} + {$T6::IntMask}: {($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask)} [{($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask)}] + + ($T1)(*(intptr_t *)Value.Data & $T6::PointerBitMask) + ($T4)((*(intptr_t *)Value.Data >> $T6::IntShift) & $T6::IntMask) + + + + + {($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask)} + {((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask)} + {$T5::IntMask}: {($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask)} [{((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask)}] + + ($T1)(*(intptr_t *)Value.Data & $T5::PointerBitMask) + ((*(intptr_t *)Value.Data >> $T5::IntShift) & $T5::IntMask) + + + + + + {($T4)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask)} + + + {($T5)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask)} + + Unexpected index in PointerUnion: {(*(intptr_t *)Val.Value.Data>>$T2::InfoTy::IntShift) & $T2::InfoTy::IntMask} + + "$T4",s8b + + ($T4)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask) + + "$T5",s8b + + ($T5)(*(intptr_t *)Val.Value.Data & $T2::InfoTy::PointerBitMask) + + + + + + {{ empty }} + {{ head={Head} }} + + + Head + Next + this + + + + + + empty + RefPtr [1 ref] {*Obj} + RefPtr [{Obj->RefCount} refs] {*Obj} + + Obj->RefCount + Obj + + + + + {{ [Small Mode] size={NumNonEmpty}, capacity={CurArraySize} }} + {{ [Big Mode] size={NumNonEmpty}, capacity={CurArraySize} }} + + NumNonEmpty + CurArraySize + + NumNonEmpty + ($T1*)CurArray + + + + + + empty + {{ size={NumEntries}, buckets={NumBuckets} }} + + NumEntries + NumBuckets + + NumBuckets + Buckets + + + + + + {{ size={NumItems}, buckets={NumBuckets} }} + + NumItems + NumBuckets + + NumBuckets + (MapEntryTy**)TheTable + + + + + + empty + ({this+1,s8}, {second}) + + this+1,s + second + + + + + {Data} + + + + None + {Storage.value} + + Storage.value + + + + + Error + {*((storage_type *)TStorage.buffer)} + + *((storage_type *)TStorage.buffer) + *((error_type *)ErrorStorage.buffer) + + + + + + + {{little endian value = {*(($T1*)(unsigned char *)Value.buffer)} }} + + (unsigned char *)Value.buffer,1 + (unsigned char *)Value.buffer,2 + (unsigned char *)Value.buffer,4 + (unsigned char *)Value.buffer,8 + + + + + + {{ big 
endian value = {*(unsigned char *)Value.buffer} }} + {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 8) + | ($T1)(*((unsigned char *)Value.buffer+1))} }} + {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 24) + | (($T1)(*((unsigned char *)Value.buffer+1)) << 16) + | (($T1)(*((unsigned char *)Value.buffer+2)) << 8) + | ($T1)(*((unsigned char *)Value.buffer+3))} }} + {{ big endian value = {(($T1)(*(unsigned char *)Value.buffer) << 56) + | (($T1)(*((unsigned char *)Value.buffer+1)) << 48) + | (($T1)(*((unsigned char *)Value.buffer+2)) << 40) + | (($T1)(*((unsigned char *)Value.buffer+3)) << 32) + | (($T1)(*((unsigned char *)Value.buffer+4)) << 24) + | (($T1)(*((unsigned char *)Value.buffer+5)) << 16) + | (($T1)(*((unsigned char *)Value.buffer+6)) << 8) + | ($T1)(*((unsigned char *)Value.buffer+7))} }} + + (unsigned char *)Value.buffer,1 + (unsigned char *)Value.buffer,2 + (unsigned char *)Value.buffer,4 + (unsigned char *)Value.buffer,8 + + + + + {ID} + + ID + + SubclassData + + *ContainedTys + + {NumContainedTys - 1} + + + NumContainedTys - 1 + ContainedTys + 1 + + + + SubclassData == 1 + + (SubclassData & llvm::StructType::SCDB_HasBody) != 0 + (SubclassData & llvm::StructType::SCDB_Packed) != 0 + (SubclassData & llvm::StructType::SCDB_IsLiteral) != 0 + (SubclassData & llvm::StructType::SCDB_IsSized) != 0 + + {NumContainedTys} + + + NumContainedTys + ContainedTys + + + + + *ContainedTys + ((llvm::ArrayType*)this)->NumElements + + *ContainedTys + ((llvm::VectorType*)this)->ElementQuantity + + *ContainedTys + ((llvm::VectorType*)this)->ElementQuantity + + SubclassData + *ContainedTys + + Context + + + + + $(Type) {*Value} + + + + $(Type) {(llvm::ISD::NodeType)this->NodeType} + + + NumOperands + OperandList + + + + + + i{Val.BitWidth} {Val.VAL} + + + + {IDAndSubclassData >> 8}bit integer type + + + + $(Type) {*VTy} {this->getName()} {SubclassData} + $(Type) {*VTy} anon {SubclassData} + + (Instruction*)this + (User*)this + + UseList + Next + Prev.Value & 3 == 3 ? (User*)(this + 1) : (User*)(this + 2) + + + + + + + Val + + + + + + + $(Type) {*VTy} {this->getName()} {SubclassData} + $(Type) {*VTy} anon {SubclassData} + + (Value*)this,nd + *VTy + + NumUserOperands + (llvm::Use*)this - NumUserOperands + + + NumUserOperands + *((llvm::Use**)this - 1) + + + + + + {getOpcodeName(SubclassID - InstructionVal)} + + (User*)this,nd + + + + + {this->getName()} {(LinkageTypes)Linkage} {(VisibilityTypes)Visibility} {(DLLStorageClassTypes)DllStorageClass} {(llvm::GlobalValue::ThreadLocalMode) ThreadLocal} + + + + + + + this + Next + this + + + + + + + pImpl + + + + + {ModuleID,s8} {TargetTriple} + + + + $(Type) {PassID} {Kind} + + diff --git a/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.dos b/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.dos index 7a0560654c5c..0f25621c787e 100644 --- a/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.dos +++ b/llvm/utils/lit/tests/Inputs/shtest-shell/diff-in.dos @@ -1,3 +1,3 @@ -In this file, the -sequence "\r\n" -terminates lines. +In this file, the +sequence "\r\n" +terminates lines. 
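NOTE: the DXIL step tests earlier in this patch all assert one expansion shape: step(p0, p1) yields 0.0 where p1 < p0 and 1.0 elsewhere, so -dxil-intrinsic-expansion rewrites each llvm.dx.step call into an fcmp olt followed by a select. A minimal standalone sketch of that shape, runnable through opt -S (the function name @step_f32_expanded is illustrative, not taken from the patch):

; Sketch (not from the patch): scalar form of the expanded step pattern.
; Returns 0.0 when %p1 < %p0, and 1.0 otherwise.
define float @step_f32_expanded(float %p0, float %p1) {
entry:
  %cmp = fcmp olt float %p1, %p0
  %sel = select i1 %cmp, float 0.000000e+00, float 1.000000e+00
  ret float %sel
}

The vector tests check the same pattern element-wise, with zeroinitializer and an all-ones vector as the two select arms, while the SPIR-V step test above leaves the call intact and lowers it to the GLSL.std.450 Step extended instruction instead.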
diff --git a/llvm/utils/release/build_llvm_release.bat b/llvm/utils/release/build_llvm_release.bat index dd041d7d384e..3718673ae7a2 100755 --- a/llvm/utils/release/build_llvm_release.bat +++ b/llvm/utils/release/build_llvm_release.bat @@ -1,515 +1,515 @@ -@echo off -setlocal enabledelayedexpansion - -goto begin - -:usage -echo Script for building the LLVM installer on Windows, -echo used for the releases at https://github.com/llvm/llvm-project/releases -echo. -echo Usage: build_llvm_release.bat --version ^ [--x86,--x64, --arm64] [--skip-checkout] [--local-python] -echo. -echo Options: -echo --version: [required] version to build -echo --help: display this help -echo --x86: build and test x86 variant -echo --x64: build and test x64 variant -echo --arm64: build and test arm64 variant -echo --skip-checkout: use local git checkout instead of downloading src.zip -echo --local-python: use installed Python and does not try to use a specific version (3.10) -echo. -echo Note: At least one variant to build is required. -echo. -echo Example: build_llvm_release.bat --version 15.0.0 --x86 --x64 -exit /b 1 - -:begin - -::============================================================================== -:: parse args -set version= -set help= -set x86= -set x64= -set arm64= -set skip-checkout= -set local-python= -call :parse_args %* - -if "%help%" NEQ "" goto usage - -if "%version%" == "" ( - echo --version option is required - echo ============================= - goto usage -) - -if "%arm64%" == "" if "%x64%" == "" if "%x86%" == "" ( - echo nothing to build! - echo choose one or several variants from: --x86 --x64 --arm64 - exit /b 1 -) - -::============================================================================== -:: check prerequisites -REM Note: -REM 7zip versions 21.x and higher will try to extract the symlinks in -REM llvm's git archive, which requires running as administrator. - -REM Check 7-zip version and/or administrator permissions. -for /f "delims=" %%i in ('7z.exe ^| findstr /r "2[1-9].[0-9][0-9]"') do set version_7z=%%i -if not "%version_7z%"=="" ( - REM Unique temporary filename to use by the 'mklink' command. - set "link_name=%temp%\%username%_%random%_%random%.tmp" - - REM As the 'mklink' requires elevated permissions, the symbolic link - REM creation will fail if the script is not running as administrator. - mklink /d "!link_name!" . 1>nul 2>nul - if errorlevel 1 ( - echo. - echo Script requires administrator permissions, or a 7-zip version 20.x or older. - echo Current version is "%version_7z%" - exit /b 1 - ) else ( - REM Remove the temporary symbolic link. - rd "!link_name!" - ) -) - -REM Prerequisites: -REM -REM Visual Studio 2019, CMake, Ninja, GNUWin32, SWIG, Python 3, -REM NSIS with the strlen_8192 patch, -REM Perl (for the OpenMP run-time). -REM -REM -REM For LLDB, SWIG version 4.1.1 should be used. 
-REM - -:: Detect Visual Studio -set vsinstall= -set vswhere=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe - -if "%VSINSTALLDIR%" NEQ "" ( - echo using enabled Visual Studio installation - set "vsinstall=%VSINSTALLDIR%" -) else ( - echo using vswhere to detect Visual Studio installation - FOR /F "delims=" %%r IN ('^""%vswhere%" -nologo -latest -products "*" -all -property installationPath^"') DO set vsinstall=%%r -) -set "vsdevcmd=%vsinstall%\Common7\Tools\VsDevCmd.bat" - -if not exist "%vsdevcmd%" ( - echo Can't find any installation of Visual Studio - exit /b 1 -) -echo Using VS devcmd: %vsdevcmd% - -::============================================================================== -:: start echoing what we do -@echo on - -set python32_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python310-32 -set python64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python310 -set pythonarm64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python311-arm64 - -set revision=llvmorg-%version% -set package_version=%version% -set build_dir=%cd%\llvm_package_%package_version% - -echo Revision: %revision% -echo Package version: %package_version% -echo Build dir: %build_dir% -echo. - -if exist %build_dir% ( - echo Build directory already exists: %build_dir% - exit /b 1 -) -mkdir %build_dir% -cd %build_dir% || exit /b 1 - -if "%skip-checkout%" == "true" ( - echo Using local source - set llvm_src=%~dp0..\..\.. -) else ( - echo Checking out %revision% - curl -L https://github.com/llvm/llvm-project/archive/%revision%.zip -o src.zip || exit /b 1 - 7z x src.zip || exit /b 1 - mv llvm-project-* llvm-project || exit /b 1 - set llvm_src=%build_dir%\llvm-project -) - -curl -O https://gitlab.gnome.org/GNOME/libxml2/-/archive/v2.9.12/libxml2-v2.9.12.tar.gz || exit /b 1 -tar zxf libxml2-v2.9.12.tar.gz - -REM Setting CMAKE_CL_SHOWINCLUDES_PREFIX to work around PR27226. -REM Common flags for all builds. -set common_compiler_flags=-DLIBXML_STATIC -set common_cmake_flags=^ - -DCMAKE_BUILD_TYPE=Release ^ - -DLLVM_ENABLE_ASSERTIONS=OFF ^ - -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON ^ - -DLLVM_TARGETS_TO_BUILD="AArch64;ARM;X86" ^ - -DLLVM_BUILD_LLVM_C_DYLIB=ON ^ - -DCMAKE_INSTALL_UCRT_LIBRARIES=ON ^ - -DPython3_FIND_REGISTRY=NEVER ^ - -DPACKAGE_VERSION=%package_version% ^ - -DLLDB_RELOCATABLE_PYTHON=1 ^ - -DLLDB_EMBED_PYTHON_HOME=OFF ^ - -DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: " ^ - -DLLVM_ENABLE_LIBXML2=FORCE_ON ^ - -DLLDB_ENABLE_LIBXML2=OFF ^ - -DCLANG_ENABLE_LIBXML2=OFF ^ - -DCMAKE_C_FLAGS="%common_compiler_flags%" ^ - -DCMAKE_CXX_FLAGS="%common_compiler_flags%" ^ - -DLLVM_ENABLE_RPMALLOC=ON ^ - -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;compiler-rt;lldb;openmp" - -set cmake_profile_flags="" - -REM Preserve original path -set OLDPATH=%PATH% - -REM Build the 32-bits and/or 64-bits binaries. -if "%x86%" == "true" call :do_build_32 || exit /b 1 -if "%x64%" == "true" call :do_build_64 || exit /b 1 -if "%arm64%" == "true" call :do_build_arm64 || exit /b 1 -exit /b 0 - -::============================================================================== -:: Build 32-bits binaries. -::============================================================================== -:do_build_32 -call :set_environment %python32_dir% || exit /b 1 -call "%vsdevcmd%" -arch=x86 || exit /b 1 -@echo on -mkdir build32_stage0 -cd build32_stage0 -call :do_build_libxml || exit /b 1 - -REM Stage0 binaries directory; used in stage1. 
-set "stage0_bin_dir=%build_dir%/build32_stage0/bin" -set cmake_flags=^ - %common_cmake_flags% ^ - -DLLVM_ENABLE_RPMALLOC=OFF ^ - -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^ - -DPYTHON_HOME=%PYTHONHOME% ^ - -DPython3_ROOT_DIR=%PYTHONHOME% ^ - -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ - -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib - -cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 -ninja || ninja || ninja || exit /b 1 -REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 -REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 -ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 -REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 -cd.. - -REM CMake expects the paths that specifies the compiler and linker to be -REM with forward slash. -set all_cmake_flags=^ - %cmake_flags% ^ - -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^ - -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^ - -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe -set cmake_flags=%all_cmake_flags:\=/% - -mkdir build32 -cd build32 -cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 -ninja || ninja || ninja || exit /b 1 -REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 -REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 -ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 -REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 -ninja package || exit /b 1 -cd .. - -exit /b 0 -::============================================================================== - -::============================================================================== -:: Build 64-bits binaries. -::============================================================================== -:do_build_64 -call :set_environment %python64_dir% || exit /b 1 -call "%vsdevcmd%" -arch=amd64 || exit /b 1 -@echo on -mkdir build64_stage0 -cd build64_stage0 -call :do_build_libxml || exit /b 1 - -REM Stage0 binaries directory; used in stage1. -set "stage0_bin_dir=%build_dir%/build64_stage0/bin" -set cmake_flags=^ - %common_cmake_flags% ^ - -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^ - -DPYTHON_HOME=%PYTHONHOME% ^ - -DPython3_ROOT_DIR=%PYTHONHOME% ^ - -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ - -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib - -cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 -ninja || ninja || ninja || exit /b 1 -ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 -ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 -ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 -ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 -ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1 -cd.. - -REM CMake expects the paths that specifies the compiler and linker to be -REM with forward slash. 
-set all_cmake_flags=^ - %cmake_flags% ^ - -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^ - -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^ - -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe -set cmake_flags=%all_cmake_flags:\=/% - - -mkdir build64 -cd build64 -call :do_generate_profile || exit /b 1 -cmake -GNinja %cmake_flags% %cmake_profile_flags% %llvm_src%\llvm || exit /b 1 -ninja || ninja || ninja || exit /b 1 -ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1 -ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1 -ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1 -ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1 -ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1 -ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1 -ninja package || exit /b 1 - -:: generate tarball with install toolchain only off -set filename=clang+llvm-%version%-x86_64-pc-windows-msvc -cmake -GNinja %cmake_flags% %cmake_profile_flags% -DLLVM_INSTALL_TOOLCHAIN_ONLY=OFF ^ - -DCMAKE_INSTALL_PREFIX=%build_dir%/%filename% ..\llvm-project\llvm || exit /b 1 -ninja install || exit /b 1 -:: check llvm_config is present & returns something -%build_dir%/%filename%/bin/llvm-config.exe --bindir || exit /b 1 -cd .. -7z a -ttar -so %filename%.tar %filename% | 7z a -txz -si %filename%.tar.xz - -exit /b 0 -::============================================================================== - -::============================================================================== -:: Build arm64 binaries. -::============================================================================== -:do_build_arm64 -call :set_environment %pythonarm64_dir% || exit /b 1 -call "%vsdevcmd%" -host_arch=x64 -arch=arm64 || exit /b 1 -@echo on -mkdir build_arm64_stage0 -cd build_arm64_stage0 -call :do_build_libxml || exit /b 1 - -REM Stage0 binaries directory; used in stage1. -set "stage0_bin_dir=%build_dir%/build_arm64_stage0/bin" -set cmake_flags=^ - %common_cmake_flags% ^ - -DCLANG_DEFAULT_LINKER=lld ^ - -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^ - -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib ^ - -DPython3_ROOT_DIR=%PYTHONHOME% ^ - -DCOMPILER_RT_BUILD_PROFILE=OFF ^ - -DCOMPILER_RT_BUILD_SANITIZERS=OFF - -REM We need to build stage0 compiler-rt with clang-cl (msvc lacks some builtins). -cmake -GNinja %cmake_flags% ^ - -DCMAKE_C_COMPILER=clang-cl.exe ^ - -DCMAKE_CXX_COMPILER=clang-cl.exe ^ - %llvm_src%\llvm || exit /b 1 -ninja || exit /b 1 -::ninja check-llvm || exit /b 1 -::ninja check-clang || exit /b 1 -::ninja check-lld || exit /b 1 -::ninja check-sanitizer || exit /b 1 -::ninja check-clang-tools || exit /b 1 -::ninja check-clangd || exit /b 1 -cd.. - -REM CMake expects the paths that specifies the compiler and linker to be -REM with forward slash. -REM CPACK_SYSTEM_NAME is set to have a correct name for installer generated. 
-set all_cmake_flags=^ - %cmake_flags% ^ - -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ - -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^ - -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^ - -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe ^ - -DCPACK_SYSTEM_NAME=woa64 -set cmake_flags=%all_cmake_flags:\=/% - -mkdir build_arm64 -cd build_arm64 -cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 -ninja || exit /b 1 -REM Check but do not fail on errors. -ninja check-lldb -::ninja check-llvm || exit /b 1 -::ninja check-clang || exit /b 1 -::ninja check-lld || exit /b 1 -::ninja check-sanitizer || exit /b 1 -::ninja check-clang-tools || exit /b 1 -::ninja check-clangd || exit /b 1 -ninja package || exit /b 1 -cd .. - -exit /b 0 -::============================================================================== -:: -::============================================================================== -:: Set PATH and some environment variables. -::============================================================================== -:set_environment -REM Restore original path -set PATH=%OLDPATH% - -set python_dir=%1 - -REM Set Python environment -if "%local-python%" == "true" ( - FOR /F "delims=" %%i IN ('where python.exe ^| head -1') DO set python_exe=%%i - set PYTHONHOME=!python_exe:~0,-11! -) else ( - %python_dir%/python.exe --version || exit /b 1 - set PYTHONHOME=%python_dir% -) -set PATH=%PYTHONHOME%;%PATH% - -set "VSCMD_START_DIR=%build_dir%" - -exit /b 0 - -::============================================================================= - -::============================================================================== -:: Build libxml. -::============================================================================== -:do_build_libxml -mkdir libxmlbuild -cd libxmlbuild -cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install ^ - -DBUILD_SHARED_LIBS=OFF -DLIBXML2_WITH_C14N=OFF -DLIBXML2_WITH_CATALOG=OFF ^ - -DLIBXML2_WITH_DEBUG=OFF -DLIBXML2_WITH_DOCB=OFF -DLIBXML2_WITH_FTP=OFF ^ - -DLIBXML2_WITH_HTML=OFF -DLIBXML2_WITH_HTTP=OFF -DLIBXML2_WITH_ICONV=OFF ^ - -DLIBXML2_WITH_ICU=OFF -DLIBXML2_WITH_ISO8859X=OFF -DLIBXML2_WITH_LEGACY=OFF ^ - -DLIBXML2_WITH_LZMA=OFF -DLIBXML2_WITH_MEM_DEBUG=OFF -DLIBXML2_WITH_MODULES=OFF ^ - -DLIBXML2_WITH_OUTPUT=ON -DLIBXML2_WITH_PATTERN=OFF -DLIBXML2_WITH_PROGRAMS=OFF ^ - -DLIBXML2_WITH_PUSH=OFF -DLIBXML2_WITH_PYTHON=OFF -DLIBXML2_WITH_READER=OFF ^ - -DLIBXML2_WITH_REGEXPS=OFF -DLIBXML2_WITH_RUN_DEBUG=OFF -DLIBXML2_WITH_SAX1=OFF ^ - -DLIBXML2_WITH_SCHEMAS=OFF -DLIBXML2_WITH_SCHEMATRON=OFF -DLIBXML2_WITH_TESTS=OFF ^ - -DLIBXML2_WITH_THREADS=ON -DLIBXML2_WITH_THREAD_ALLOC=OFF -DLIBXML2_WITH_TREE=ON ^ - -DLIBXML2_WITH_VALID=OFF -DLIBXML2_WITH_WRITER=OFF -DLIBXML2_WITH_XINCLUDE=OFF ^ - -DLIBXML2_WITH_XPATH=OFF -DLIBXML2_WITH_XPTR=OFF -DLIBXML2_WITH_ZLIB=OFF ^ - -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded ^ - ../../libxml2-v2.9.12 || exit /b 1 -ninja install || exit /b 1 -set libxmldir=%cd%\install -set "libxmldir=%libxmldir:\=/%" -cd .. -exit /b 0 - -::============================================================================== -:: Generate a PGO profile. -::============================================================================== -:do_generate_profile -REM Build Clang with instrumentation. 
-mkdir instrument
-cd instrument
-cmake -GNinja %cmake_flags% -DLLVM_TARGETS_TO_BUILD=Native ^
-    -DLLVM_BUILD_INSTRUMENTED=IR %llvm_src%\llvm || exit /b 1
-ninja clang || ninja clang || ninja clang || exit /b 1
-set instrumented_clang=%cd:\=/%/bin/clang-cl.exe
-cd ..
-REM Use that to build part of llvm to generate a profile.
-mkdir train
-cd train
-cmake -GNinja %cmake_flags% ^
-    -DCMAKE_C_COMPILER=%instrumented_clang% ^
-    -DCMAKE_CXX_COMPILER=%instrumented_clang% ^
-    -DLLVM_ENABLE_PROJECTS=clang ^
-    -DLLVM_TARGETS_TO_BUILD=Native ^
-    %llvm_src%\llvm || exit /b 1
-REM Drop profiles generated from running cmake; those are not representative.
-del ..\instrument\profiles\*.profraw
-ninja tools/clang/lib/Sema/CMakeFiles/obj.clangSema.dir/Sema.cpp.obj
-cd ..
-set profile=%cd:\=/%/profile.profdata
-%stage0_bin_dir%\llvm-profdata merge -output=%profile% instrument\profiles\*.profraw || exit /b 1
-set common_compiler_flags=%common_compiler_flags% -Wno-backend-plugin
-set cmake_profile_flags=-DLLVM_PROFDATA_FILE=%profile% ^
-    -DCMAKE_C_FLAGS="%common_compiler_flags%" ^
-    -DCMAKE_CXX_FLAGS="%common_compiler_flags%"
-exit /b 0
-
-::=============================================================================
-:: Parse command line arguments.
-:: The format for the arguments is:
-::   Boolean: --option
-::   Value: --option<sep>value
-::     with <sep> being: space, colon, semicolon or equal sign
-::
-:: Command line usage example:
-::   my-batch-file.bat --build --type=release --version 123
-:: It will create 3 variables:
-::   'build' with the value 'true'
-::   'type' with the value 'release'
-::   'version' with the value '123'
-::
-:: Usage:
-::   set "build="
-::   set "type="
-::   set "version="
-::
-::   REM Parse arguments.
-::   call :parse_args %*
-::
-::   if defined build (
-::     ...
-::   )
-::   if %type%=='release' (
-::     ...
-::   )
-::   if %version%=='123' (
-::     ...
-::   )
-::=============================================================================
-:parse_args
-  set "arg_name="
-  :parse_args_start
-  if "%1" == "" (
-    :: Set a seen boolean argument.
-    if "%arg_name%" neq "" (
-      set "%arg_name%=true"
-    )
-    goto :parse_args_done
-  )
-  set aux=%1
-  if "%aux:~0,2%" == "--" (
-    :: Set a seen boolean argument.
-    if "%arg_name%" neq "" (
-      set "%arg_name%=true"
-    )
-    set "arg_name=%aux:~2,250%"
-  ) else (
-    set "%arg_name%=%1"
-    set "arg_name="
-  )
-  shift
-  goto :parse_args_start
-
-:parse_args_done
-exit /b 0
+@echo off
+setlocal enabledelayedexpansion
+
+goto begin
+
+:usage
+echo Script for building the LLVM installer on Windows,
+echo used for the releases at https://github.com/llvm/llvm-project/releases
+echo.
+echo Usage: build_llvm_release.bat --version ^<version^> [--x86, --x64, --arm64] [--skip-checkout] [--local-python]
+echo.
+echo Options:
+echo --version: [required] version to build
+echo --help: display this help
+echo --x86: build and test the x86 variant
+echo --x64: build and test the x64 variant
+echo --arm64: build and test the arm64 variant
+echo --skip-checkout: use a local git checkout instead of downloading src.zip
+echo --local-python: use the installed Python instead of a specific version (3.10)
+echo.
+echo Note: At least one variant to build is required.
+echo.
+echo Example: build_llvm_release.bat --version 15.0.0 --x86 --x64
+exit /b 1
+
+:begin
+
+::==============================================================================
+:: parse args
+set version=
+set help=
+set x86=
+set x64=
+set arm64=
+set skip-checkout=
+set local-python=
+call :parse_args %*
+
+if "%help%" NEQ "" goto usage
+
+if "%version%" == "" (
+    echo --version option is required
+    echo =============================
+    goto usage
+)
+
+if "%arm64%" == "" if "%x64%" == "" if "%x86%" == "" (
+    echo Nothing to build!
+    echo Choose one or more variants from: --x86 --x64 --arm64
+    exit /b 1
+)
+
+::==============================================================================
+:: check prerequisites
+REM Note:
+REM   7-zip versions 21.x and higher will try to extract the symlinks in
+REM   llvm's git archive, which requires running as administrator.
+
+REM Check 7-zip version and/or administrator permissions.
+for /f "delims=" %%i in ('7z.exe ^| findstr /r "2[1-9].[0-9][0-9]"') do set version_7z=%%i
+if not "%version_7z%"=="" (
+    REM Unique temporary filename to use by the 'mklink' command.
+    set "link_name=%temp%\%username%_%random%_%random%.tmp"
+
+    REM As 'mklink' requires elevated permissions, creating the symbolic link
+    REM will fail if the script is not running as administrator.
+    mklink /d "!link_name!" . 1>nul 2>nul
+    if errorlevel 1 (
+        echo.
+        echo Script requires administrator permissions, or a 7-zip version 20.x or older.
+        echo Current version is "%version_7z%"
+        exit /b 1
+    ) else (
+        REM Remove the temporary symbolic link.
+        rd "!link_name!"
+    )
+)
+
+REM Prerequisites:
+REM
+REM   Visual Studio 2019, CMake, Ninja, GNUWin32, SWIG, Python 3,
+REM   NSIS with the strlen_8192 patch,
+REM   Perl (for the OpenMP run-time).
+REM
+REM
+REM   For LLDB, SWIG version 4.1.1 should be used.
+REM
+
+:: Detect Visual Studio
+set vsinstall=
+set vswhere=%ProgramFiles(x86)%\Microsoft Visual Studio\Installer\vswhere.exe
+
+if "%VSINSTALLDIR%" NEQ "" (
+  echo Using the active Visual Studio installation
+  set "vsinstall=%VSINSTALLDIR%"
+) else (
+  echo Using vswhere to detect the Visual Studio installation
+  FOR /F "delims=" %%r IN ('^""%vswhere%" -nologo -latest -products "*" -all -property installationPath^"') DO set vsinstall=%%r
+)
+set "vsdevcmd=%vsinstall%\Common7\Tools\VsDevCmd.bat"
+
+if not exist "%vsdevcmd%" (
+  echo Can't find any installation of Visual Studio
+  exit /b 1
+)
+echo Using VS devcmd: %vsdevcmd%
+
+::==============================================================================
+:: start echoing what we do
+@echo on
+
+set python32_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python310-32
+set python64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python310
+set pythonarm64_dir=C:\Users\%USERNAME%\AppData\Local\Programs\Python\Python311-arm64
+
+set revision=llvmorg-%version%
+set package_version=%version%
+set build_dir=%cd%\llvm_package_%package_version%
+
+echo Revision: %revision%
+echo Package version: %package_version%
+echo Build dir: %build_dir%
+echo.
+
+if exist %build_dir% (
+  echo Build directory already exists: %build_dir%
+  exit /b 1
+)
+mkdir %build_dir%
+cd %build_dir% || exit /b 1
+
+if "%skip-checkout%" == "true" (
+  echo Using local source
+  set llvm_src=%~dp0..\..\..
+) else (
+  echo Checking out %revision%
+  curl -L https://github.com/llvm/llvm-project/archive/%revision%.zip -o src.zip || exit /b 1
+  7z x src.zip || exit /b 1
+  mv llvm-project-* llvm-project || exit /b 1
+  set llvm_src=%build_dir%\llvm-project
+)
+
+curl -O https://gitlab.gnome.org/GNOME/libxml2/-/archive/v2.9.12/libxml2-v2.9.12.tar.gz || exit /b 1
+tar zxf libxml2-v2.9.12.tar.gz
+
+REM Setting CMAKE_CL_SHOWINCLUDES_PREFIX to work around PR27226.
+REM Common flags for all builds.
+set common_compiler_flags=-DLIBXML_STATIC
+set common_cmake_flags=^
+  -DCMAKE_BUILD_TYPE=Release ^
+  -DLLVM_ENABLE_ASSERTIONS=OFF ^
+  -DLLVM_INSTALL_TOOLCHAIN_ONLY=ON ^
+  -DLLVM_TARGETS_TO_BUILD="AArch64;ARM;X86" ^
+  -DLLVM_BUILD_LLVM_C_DYLIB=ON ^
+  -DCMAKE_INSTALL_UCRT_LIBRARIES=ON ^
+  -DPython3_FIND_REGISTRY=NEVER ^
+  -DPACKAGE_VERSION=%package_version% ^
+  -DLLDB_RELOCATABLE_PYTHON=1 ^
+  -DLLDB_EMBED_PYTHON_HOME=OFF ^
+  -DCMAKE_CL_SHOWINCLUDES_PREFIX="Note: including file: " ^
+  -DLLVM_ENABLE_LIBXML2=FORCE_ON ^
+  -DLLDB_ENABLE_LIBXML2=OFF ^
+  -DCLANG_ENABLE_LIBXML2=OFF ^
+  -DCMAKE_C_FLAGS="%common_compiler_flags%" ^
+  -DCMAKE_CXX_FLAGS="%common_compiler_flags%" ^
+  -DLLVM_ENABLE_RPMALLOC=ON ^
+  -DLLVM_ENABLE_PROJECTS="clang;clang-tools-extra;lld;compiler-rt;lldb;openmp"
+
+set cmake_profile_flags=""
+
+REM Preserve original path
+set OLDPATH=%PATH%
+
+REM Build the 32-bit and/or 64-bit binaries.
+if "%x86%" == "true" call :do_build_32 || exit /b 1
+if "%x64%" == "true" call :do_build_64 || exit /b 1
+if "%arm64%" == "true" call :do_build_arm64 || exit /b 1
+exit /b 0
+
+::==============================================================================
+:: Build 32-bit binaries.
+::==============================================================================
+:do_build_32
+call :set_environment %python32_dir% || exit /b 1
+call "%vsdevcmd%" -arch=x86 || exit /b 1
+@echo on
+mkdir build32_stage0
+cd build32_stage0
+call :do_build_libxml || exit /b 1
+
+REM Stage0 binaries directory; used in stage1.
+set "stage0_bin_dir=%build_dir%/build32_stage0/bin"
+set cmake_flags=^
+  %common_cmake_flags% ^
+  -DLLVM_ENABLE_RPMALLOC=OFF ^
+  -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^
+  -DPYTHON_HOME=%PYTHONHOME% ^
+  -DPython3_ROOT_DIR=%PYTHONHOME% ^
+  -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
+  -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib
+
+cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
+ninja || ninja || ninja || exit /b 1
+REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
+REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
+cd ..
+
+REM CMake expects the paths that specify the compiler and linker to be
+REM given with forward slashes.
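+REM The %all_cmake_flags:\=/% expansion below uses cmd's substring-replacement
+REM syntax (%var:old=new%) to rewrite every backslash in the accumulated flags
+REM to a forward slash before the stage1 configure.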
+set all_cmake_flags=^
+  %cmake_flags% ^
+  -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
+  -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^
+  -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe
+set cmake_flags=%all_cmake_flags:\=/%
+
+mkdir build32
+cd build32
+cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
+ninja || ninja || ninja || exit /b 1
+REM ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
+REM ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+REM ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
+ninja package || exit /b 1
+cd ..
+
+exit /b 0
+::==============================================================================
+
+::==============================================================================
+:: Build 64-bit binaries.
+::==============================================================================
+:do_build_64
+call :set_environment %python64_dir% || exit /b 1
+call "%vsdevcmd%" -arch=amd64 || exit /b 1
+@echo on
+mkdir build64_stage0
+cd build64_stage0
+call :do_build_libxml || exit /b 1
+
+REM Stage0 binaries directory; used in stage1.
+set "stage0_bin_dir=%build_dir%/build64_stage0/bin"
+set cmake_flags=^
+  %common_cmake_flags% ^
+  -DLLDB_TEST_COMPILER=%stage0_bin_dir%/clang.exe ^
+  -DPYTHON_HOME=%PYTHONHOME% ^
+  -DPython3_ROOT_DIR=%PYTHONHOME% ^
+  -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
+  -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib
+
+cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1
+ninja || ninja || ninja || exit /b 1
+ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
+ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
+ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1
+cd ..
+
+REM CMake expects the paths that specify the compiler and linker to be
+REM given with forward slashes.
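+REM Stage1 is configured with the stage0 clang-cl, lld-link and llvm-lib
+REM built above, so the shipped binaries are produced by the toolchain being
+REM released rather than by the host compiler.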
+set all_cmake_flags=^
+  %cmake_flags% ^
+  -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^
+  -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^
+  -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^
+  -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe
+set cmake_flags=%all_cmake_flags:\=/%
+
+mkdir build64
+cd build64
+call :do_generate_profile || exit /b 1
+cmake -GNinja %cmake_flags% %cmake_profile_flags% %llvm_src%\llvm || exit /b 1
+ninja || ninja || ninja || exit /b 1
+ninja check-llvm || ninja check-llvm || ninja check-llvm || exit /b 1
+ninja check-clang || ninja check-clang || ninja check-clang || exit /b 1
+ninja check-lld || ninja check-lld || ninja check-lld || exit /b 1
+ninja check-sanitizer || ninja check-sanitizer || ninja check-sanitizer || exit /b 1
+ninja check-clang-tools || ninja check-clang-tools || ninja check-clang-tools || exit /b 1
+ninja check-clangd || ninja check-clangd || ninja check-clangd || exit /b 1
+ninja package || exit /b 1
+
+:: Generate the tarball with LLVM_INSTALL_TOOLCHAIN_ONLY turned off.
+set filename=clang+llvm-%version%-x86_64-pc-windows-msvc
+cmake -GNinja %cmake_flags% %cmake_profile_flags% -DLLVM_INSTALL_TOOLCHAIN_ONLY=OFF ^
+    -DCMAKE_INSTALL_PREFIX=%build_dir%/%filename% ..\llvm-project\llvm || exit /b 1
+ninja install || exit /b 1
+:: Check that llvm-config is present and returns something.
+%build_dir%/%filename%/bin/llvm-config.exe --bindir || exit /b 1
+cd ..
+7z a -ttar -so %filename%.tar %filename% | 7z a -txz -si %filename%.tar.xz
+
+exit /b 0
+::==============================================================================
+
+::==============================================================================
+:: Build arm64 binaries.
+::==============================================================================
+:do_build_arm64
+call :set_environment %pythonarm64_dir% || exit /b 1
+call "%vsdevcmd%" -host_arch=x64 -arch=arm64 || exit /b 1
+@echo on
+mkdir build_arm64_stage0
+cd build_arm64_stage0
+call :do_build_libxml || exit /b 1
+
+REM Stage0 binaries directory; used in stage1.
+set "stage0_bin_dir=%build_dir%/build_arm64_stage0/bin"
+set cmake_flags=^
+  %common_cmake_flags% ^
+  -DCLANG_DEFAULT_LINKER=lld ^
+  -DLIBXML2_INCLUDE_DIR=%libxmldir%/include/libxml2 ^
+  -DLIBXML2_LIBRARIES=%libxmldir%/lib/libxml2s.lib ^
+  -DPython3_ROOT_DIR=%PYTHONHOME% ^
+  -DCOMPILER_RT_BUILD_PROFILE=OFF ^
+  -DCOMPILER_RT_BUILD_SANITIZERS=OFF
+
+REM We need to build stage0 compiler-rt with clang-cl (MSVC lacks some builtins).
+cmake -GNinja %cmake_flags% ^
+    -DCMAKE_C_COMPILER=clang-cl.exe ^
+    -DCMAKE_CXX_COMPILER=clang-cl.exe ^
+    %llvm_src%\llvm || exit /b 1
+ninja || exit /b 1
+::ninja check-llvm || exit /b 1
+::ninja check-clang || exit /b 1
+::ninja check-lld || exit /b 1
+::ninja check-sanitizer || exit /b 1
+::ninja check-clang-tools || exit /b 1
+::ninja check-clangd || exit /b 1
+cd ..
+
+REM CMake expects the paths that specify the compiler and linker to be
+REM given with forward slashes.
+REM CPACK_SYSTEM_NAME is set so the generated installer gets the correct name.
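+REM (CPack's default package file name is <name>-<version>-<CPACK_SYSTEM_NAME>,
+REM so overriding CPACK_SYSTEM_NAME tags the arm64 installer as woa64.)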
+set all_cmake_flags=^ + %cmake_flags% ^ + -DCMAKE_C_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ + -DCMAKE_CXX_COMPILER=%stage0_bin_dir%/clang-cl.exe ^ + -DCMAKE_LINKER=%stage0_bin_dir%/lld-link.exe ^ + -DCMAKE_AR=%stage0_bin_dir%/llvm-lib.exe ^ + -DCMAKE_RC=%stage0_bin_dir%/llvm-windres.exe ^ + -DCPACK_SYSTEM_NAME=woa64 +set cmake_flags=%all_cmake_flags:\=/% + +mkdir build_arm64 +cd build_arm64 +cmake -GNinja %cmake_flags% %llvm_src%\llvm || exit /b 1 +ninja || exit /b 1 +REM Check but do not fail on errors. +ninja check-lldb +::ninja check-llvm || exit /b 1 +::ninja check-clang || exit /b 1 +::ninja check-lld || exit /b 1 +::ninja check-sanitizer || exit /b 1 +::ninja check-clang-tools || exit /b 1 +::ninja check-clangd || exit /b 1 +ninja package || exit /b 1 +cd .. + +exit /b 0 +::============================================================================== +:: +::============================================================================== +:: Set PATH and some environment variables. +::============================================================================== +:set_environment +REM Restore original path +set PATH=%OLDPATH% + +set python_dir=%1 + +REM Set Python environment +if "%local-python%" == "true" ( + FOR /F "delims=" %%i IN ('where python.exe ^| head -1') DO set python_exe=%%i + set PYTHONHOME=!python_exe:~0,-11! +) else ( + %python_dir%/python.exe --version || exit /b 1 + set PYTHONHOME=%python_dir% +) +set PATH=%PYTHONHOME%;%PATH% + +set "VSCMD_START_DIR=%build_dir%" + +exit /b 0 + +::============================================================================= + +::============================================================================== +:: Build libxml. +::============================================================================== +:do_build_libxml +mkdir libxmlbuild +cd libxmlbuild +cmake -GNinja -DCMAKE_BUILD_TYPE=Release -DCMAKE_INSTALL_PREFIX=install ^ + -DBUILD_SHARED_LIBS=OFF -DLIBXML2_WITH_C14N=OFF -DLIBXML2_WITH_CATALOG=OFF ^ + -DLIBXML2_WITH_DEBUG=OFF -DLIBXML2_WITH_DOCB=OFF -DLIBXML2_WITH_FTP=OFF ^ + -DLIBXML2_WITH_HTML=OFF -DLIBXML2_WITH_HTTP=OFF -DLIBXML2_WITH_ICONV=OFF ^ + -DLIBXML2_WITH_ICU=OFF -DLIBXML2_WITH_ISO8859X=OFF -DLIBXML2_WITH_LEGACY=OFF ^ + -DLIBXML2_WITH_LZMA=OFF -DLIBXML2_WITH_MEM_DEBUG=OFF -DLIBXML2_WITH_MODULES=OFF ^ + -DLIBXML2_WITH_OUTPUT=ON -DLIBXML2_WITH_PATTERN=OFF -DLIBXML2_WITH_PROGRAMS=OFF ^ + -DLIBXML2_WITH_PUSH=OFF -DLIBXML2_WITH_PYTHON=OFF -DLIBXML2_WITH_READER=OFF ^ + -DLIBXML2_WITH_REGEXPS=OFF -DLIBXML2_WITH_RUN_DEBUG=OFF -DLIBXML2_WITH_SAX1=OFF ^ + -DLIBXML2_WITH_SCHEMAS=OFF -DLIBXML2_WITH_SCHEMATRON=OFF -DLIBXML2_WITH_TESTS=OFF ^ + -DLIBXML2_WITH_THREADS=ON -DLIBXML2_WITH_THREAD_ALLOC=OFF -DLIBXML2_WITH_TREE=ON ^ + -DLIBXML2_WITH_VALID=OFF -DLIBXML2_WITH_WRITER=OFF -DLIBXML2_WITH_XINCLUDE=OFF ^ + -DLIBXML2_WITH_XPATH=OFF -DLIBXML2_WITH_XPTR=OFF -DLIBXML2_WITH_ZLIB=OFF ^ + -DCMAKE_MSVC_RUNTIME_LIBRARY=MultiThreaded ^ + ../../libxml2-v2.9.12 || exit /b 1 +ninja install || exit /b 1 +set libxmldir=%cd%\install +set "libxmldir=%libxmldir:\=/%" +cd .. +exit /b 0 + +::============================================================================== +:: Generate a PGO profile. +::============================================================================== +:do_generate_profile +REM Build Clang with instrumentation. 
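+REM (The profile is produced in three steps: build an instrumented clang-cl,
+REM use it to compile a training workload below, then merge the resulting
+REM .profraw files with llvm-profdata into the profile for the final build.)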
+mkdir instrument
+cd instrument
+cmake -GNinja %cmake_flags% -DLLVM_TARGETS_TO_BUILD=Native ^
+    -DLLVM_BUILD_INSTRUMENTED=IR %llvm_src%\llvm || exit /b 1
+ninja clang || ninja clang || ninja clang || exit /b 1
+set instrumented_clang=%cd:\=/%/bin/clang-cl.exe
+cd ..
+REM Use it to build part of LLVM to generate a profile.
+mkdir train
+cd train
+cmake -GNinja %cmake_flags% ^
+    -DCMAKE_C_COMPILER=%instrumented_clang% ^
+    -DCMAKE_CXX_COMPILER=%instrumented_clang% ^
+    -DLLVM_ENABLE_PROJECTS=clang ^
+    -DLLVM_TARGETS_TO_BUILD=Native ^
+    %llvm_src%\llvm || exit /b 1
+REM Drop profiles generated from running cmake; those are not representative.
+del ..\instrument\profiles\*.profraw
+ninja tools/clang/lib/Sema/CMakeFiles/obj.clangSema.dir/Sema.cpp.obj
+cd ..
+set profile=%cd:\=/%/profile.profdata
+%stage0_bin_dir%\llvm-profdata merge -output=%profile% instrument\profiles\*.profraw || exit /b 1
+set common_compiler_flags=%common_compiler_flags% -Wno-backend-plugin
+set cmake_profile_flags=-DLLVM_PROFDATA_FILE=%profile% ^
+    -DCMAKE_C_FLAGS="%common_compiler_flags%" ^
+    -DCMAKE_CXX_FLAGS="%common_compiler_flags%"
+exit /b 0
+
+::=============================================================================
+:: Parse command line arguments.
+:: The format for the arguments is:
+::   Boolean: --option
+::   Value: --option<sep>value
+::     with <sep> being: space, colon, semicolon or equal sign
+::
+:: Command line usage example:
+::   my-batch-file.bat --build --type=release --version 123
+:: It will create 3 variables:
+::   'build' with the value 'true'
+::   'type' with the value 'release'
+::   'version' with the value '123'
+::
+:: Usage:
+::   set "build="
+::   set "type="
+::   set "version="
+::
+::   REM Parse arguments.
+::   call :parse_args %*
+::
+::   if defined build (
+::     ...
+::   )
+::   if %type%=='release' (
+::     ...
+::   )
+::   if %version%=='123' (
+::     ...
+::   )
+::=============================================================================
+:parse_args
+  set "arg_name="
+  :parse_args_start
+  if "%1" == "" (
+    :: Set a seen boolean argument.
+    if "%arg_name%" neq "" (
+      set "%arg_name%=true"
+    )
+    goto :parse_args_done
+  )
+  set aux=%1
+  if "%aux:~0,2%" == "--" (
+    :: Set a seen boolean argument.
+    if "%arg_name%" neq "" (
+      set "%arg_name%=true"
+    )
+    set "arg_name=%aux:~2,250%"
+  ) else (
+    set "%arg_name%=%1"
+    set "arg_name="
+  )
+  shift
+  goto :parse_args_start
+
+:parse_args_done
+exit /b 0
diff --git a/openmp/runtime/doc/doxygen/config b/openmp/runtime/doc/doxygen/config
index 04c966766ba6..8d79dc143cc1 100644
--- a/openmp/runtime/doc/doxygen/config
+++ b/openmp/runtime/doc/doxygen/config
@@ -1,1822 +1,1822 @@
-# Doxyfile 1.o8.2
-
-# This file describes the settings to be used by the documentation system
-# doxygen (www.doxygen.org) for a project.
-#
-# All text after a hash (#) is considered a comment and will be ignored.
-# The format is:
-# TAG = value [value, ...]
-# For lists items can also be appended using:
-# TAG += value [value, ...]
-# Values that contain spaces should be placed between quotes (" ").
-
-#---------------------------------------------------------------------------
-# Project related configuration options
-#---------------------------------------------------------------------------
-
-# This tag specifies the encoding used for all characters in the config file
-# that follow. The default is UTF-8 which is also the encoding used for all
-# text before the first occurrence of this tag. Doxygen uses libiconv (or the
-# iconv built into libc) for the transcoding.
See -# http://www.gnu.org/software/libiconv for the list of possible encodings. - -DOXYFILE_ENCODING = UTF-8 - -# The PROJECT_NAME tag is a single word (or sequence of words) that should -# identify the project. Note that if you do not use Doxywizard you need -# to put quotes around the project name if it contains spaces. - -PROJECT_NAME = "LLVM OpenMP* Runtime Library" - -# The PROJECT_NUMBER tag can be used to enter a project or revision number. -# This could be handy for archiving the generated documentation or -# if some version control system is used. - -PROJECT_NUMBER = - -# Using the PROJECT_BRIEF tag one can provide an optional one line description -# for a project that appears at the top of each page and should give viewer -# a quick idea about the purpose of the project. Keep the description short. - -PROJECT_BRIEF = - -# With the PROJECT_LOGO tag one can specify an logo or icon that is -# included in the documentation. The maximum height of the logo should not -# exceed 55 pixels and the maximum width should not exceed 200 pixels. -# Doxygen will copy the logo to the output directory. - -PROJECT_LOGO = - -# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute) -# base path where the generated documentation will be put. -# If a relative path is entered, it will be relative to the location -# where doxygen was started. If left blank the current directory will be used. - -OUTPUT_DIRECTORY = doc/doxygen/generated - -# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create -# 4096 sub-directories (in 2 levels) under the output directory of each output -# format and will distribute the generated files over these directories. -# Enabling this option can be useful when feeding doxygen a huge amount of -# source files, where putting all generated files in the same directory would -# otherwise cause performance problems for the file system. - -CREATE_SUBDIRS = NO - -# The OUTPUT_LANGUAGE tag is used to specify the language in which all -# documentation generated by doxygen is written. Doxygen will use this -# information to generate all constant output in the proper language. -# The default language is English, other supported languages are: -# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional, -# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German, -# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English -# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian, -# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak, -# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese. - -OUTPUT_LANGUAGE = English - -# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will -# include brief member descriptions after the members that are listed in -# the file and class documentation (similar to JavaDoc). -# Set to NO to disable this. - -BRIEF_MEMBER_DESC = YES - -# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend -# the brief description of a member or function before the detailed description. -# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the -# brief descriptions will be completely suppressed. - -REPEAT_BRIEF = YES - -# This tag implements a quasi-intelligent brief description abbreviator -# that is used to form the text in various listings. 
Each string -# in this list, if found as the leading text of the brief description, will be -# stripped from the text and the result after processing the whole list, is -# used as the annotated text. Otherwise, the brief description is used as-is. -# If left blank, the following values are used ("$name" is automatically -# replaced with the name of the entity): "The $name class" "The $name widget" -# "The $name file" "is" "provides" "specifies" "contains" -# "represents" "a" "an" "the" - -ABBREVIATE_BRIEF = - -# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then -# Doxygen will generate a detailed section even if there is only a brief -# description. - -ALWAYS_DETAILED_SEC = NO - -# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all -# inherited members of a class in the documentation of that class as if those -# members were ordinary class members. Constructors, destructors and assignment -# operators of the base classes will not be shown. - -INLINE_INHERITED_MEMB = NO - -# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full -# path before files name in the file list and in the header files. If set -# to NO the shortest path that makes the file name unique will be used. - -FULL_PATH_NAMES = NO - -# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag -# can be used to strip a user-defined part of the path. Stripping is -# only done if one of the specified strings matches the left-hand part of -# the path. The tag can be used to show relative paths in the file list. -# If left blank the directory from which doxygen is run is used as the -# path to strip. Note that you specify absolute paths here, but also -# relative paths, which will be relative from the directory where doxygen is -# started. - -STRIP_FROM_PATH = - -# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of -# the path mentioned in the documentation of a class, which tells -# the reader which header file to include in order to use a class. -# If left blank only the name of the header file containing the class -# definition is used. Otherwise one should specify the include paths that -# are normally passed to the compiler using the -I flag. - -STRIP_FROM_INC_PATH = - -# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter -# (but less readable) file names. This can be useful if your file system -# doesn't support long names like on DOS, Mac, or CD-ROM. - -SHORT_NAMES = NO - -# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen -# will interpret the first line (until the first dot) of a JavaDoc-style -# comment as the brief description. If set to NO, the JavaDoc -# comments will behave just like regular Qt-style comments -# (thus requiring an explicit @brief command for a brief description.) - -JAVADOC_AUTOBRIEF = NO - -# If the QT_AUTOBRIEF tag is set to YES then Doxygen will -# interpret the first line (until the first dot) of a Qt-style -# comment as the brief description. If set to NO, the comments -# will behave just like regular Qt-style comments (thus requiring -# an explicit \brief command for a brief description.) - -QT_AUTOBRIEF = NO - -# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen -# treat a multi-line C++ special comment block (i.e. a block of //! or /// -# comments) as a brief description. This used to be the default behaviour. -# The new default is to treat a multi-line C++ comment block as a detailed -# description. 
Set this tag to YES if you prefer the old behaviour instead. - -MULTILINE_CPP_IS_BRIEF = NO - -# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented -# member inherits the documentation from any documented member that it -# re-implements. - -INHERIT_DOCS = YES - -# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce -# a new page for each member. If set to NO, the documentation of a member will -# be part of the file/class/namespace that contains it. - -SEPARATE_MEMBER_PAGES = NO - -# The TAB_SIZE tag can be used to set the number of spaces in a tab. -# Doxygen uses this value to replace tabs by spaces in code fragments. - -TAB_SIZE = 8 - -# This tag can be used to specify a number of aliases that acts -# as commands in the documentation. An alias has the form "name=value". -# For example adding "sideeffect=\par Side Effects:\n" will allow you to -# put the command \sideeffect (or @sideeffect) in the documentation, which -# will result in a user-defined paragraph with heading "Side Effects:". -# You can put \n's in the value part of an alias to insert newlines. - -ALIASES = "other=*" - -# This tag can be used to specify a number of word-keyword mappings (TCL only). -# A mapping has the form "name=value". For example adding -# "class=itcl::class" will allow you to use the command class in the -# itcl::class meaning. - -TCL_SUBST = - -# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C -# sources only. Doxygen will then generate output that is more tailored for C. -# For instance, some of the names that are used will be different. The list -# of all members will be omitted, etc. - -OPTIMIZE_OUTPUT_FOR_C = NO - -# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java -# sources only. Doxygen will then generate output that is more tailored for -# Java. For instance, namespaces will be presented as packages, qualified -# scopes will look different, etc. - -OPTIMIZE_OUTPUT_JAVA = NO - -# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran -# sources only. Doxygen will then generate output that is more tailored for -# Fortran. - -OPTIMIZE_FOR_FORTRAN = NO - -# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL -# sources. Doxygen will then generate output that is tailored for -# VHDL. - -OPTIMIZE_OUTPUT_VHDL = NO - -# Doxygen selects the parser to use depending on the extension of the files it -# parses. With this tag you can assign which parser to use for a given -# extension. Doxygen has a built-in mapping, but you can override or extend it -# using this tag. The format is ext=language, where ext is a file extension, -# and language is one of the parsers supported by doxygen: IDL, Java, -# Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL, C, -# C++. For instance to make doxygen treat .inc files as Fortran files (default -# is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. Note -# that for custom extensions you also need to set FILE_PATTERNS otherwise the -# files are not read by doxygen. - -EXTENSION_MAPPING = - -# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all -# comments according to the Markdown format, which allows for more readable -# documentation. See http://daringfireball.net/projects/markdown/ for details. -# The output of markdown processing is further processed by doxygen, so you -# can mix doxygen, HTML, and XML commands with Markdown formatting. 
-# Disable only in case of backward compatibilities issues. - -MARKDOWN_SUPPORT = YES - -# When enabled doxygen tries to link words that correspond to documented classes, -# or namespaces to their corresponding documentation. Such a link can be -# prevented in individual cases by by putting a % sign in front of the word or -# globally by setting AUTOLINK_SUPPORT to NO. - -AUTOLINK_SUPPORT = YES - -# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want -# to include (a tag file for) the STL sources as input, then you should -# set this tag to YES in order to let doxygen match functions declarations and -# definitions whose arguments contain STL classes (e.g. func(std::string); v.s. -# func(std::string) {}). This also makes the inheritance and collaboration -# diagrams that involve STL classes more complete and accurate. - -BUILTIN_STL_SUPPORT = NO - -# If you use Microsoft's C++/CLI language, you should set this option to YES to -# enable parsing support. - -CPP_CLI_SUPPORT = NO - -# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only. -# Doxygen will parse them like normal C++ but will assume all classes use public -# instead of private inheritance when no explicit protection keyword is present. - -SIP_SUPPORT = NO - -# For Microsoft's IDL there are propget and propput attributes to -# indicate getter and setter methods for a property. Setting this -# option to YES (the default) will make doxygen replace the get and -# set methods by a property in the documentation. This will only work -# if the methods are indeed getting or setting a simple type. If this -# is not the case, or you want to show the methods anyway, you should -# set this option to NO. - -IDL_PROPERTY_SUPPORT = YES - -# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC -# tag is set to YES, then doxygen will reuse the documentation of the first -# member in the group (if any) for the other members of the group. By default -# all members of a group must be documented explicitly. - -DISTRIBUTE_GROUP_DOC = NO - -# Set the SUBGROUPING tag to YES (the default) to allow class member groups of -# the same type (for instance a group of public functions) to be put as a -# subgroup of that type (e.g. under the Public Functions section). Set it to -# NO to prevent subgrouping. Alternatively, this can be done per class using -# the \nosubgrouping command. - -SUBGROUPING = YES - -# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and -# unions are shown inside the group in which they are included (e.g. using -# @ingroup) instead of on a separate page (for HTML and Man pages) or -# section (for LaTeX and RTF). - -INLINE_GROUPED_CLASSES = NO - -# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and -# unions with only public data fields will be shown inline in the documentation -# of the scope in which they are defined (i.e. file, namespace, or group -# documentation), provided this scope is documented. If set to NO (the default), -# structs, classes, and unions are shown on a separate page (for HTML and Man -# pages) or section (for LaTeX and RTF). - -INLINE_SIMPLE_STRUCTS = NO - -# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum -# is documented as struct, union, or enum with the name of the typedef. So -# typedef struct TypeS {} TypeT, will appear in the documentation as a struct -# with name TypeT. When disabled the typedef will appear as a member of a file, -# namespace, or class. 
And the struct will be named TypeS. This can typically -# be useful for C code in case the coding convention dictates that all compound -# types are typedef'ed and only the typedef is referenced, never the tag name. - -TYPEDEF_HIDES_STRUCT = NO - -# The SYMBOL_CACHE_SIZE determines the size of the internal cache use to -# determine which symbols to keep in memory and which to flush to disk. -# When the cache is full, less often used symbols will be written to disk. -# For small to medium size projects (<1000 input files) the default value is -# probably good enough. For larger projects a too small cache size can cause -# doxygen to be busy swapping symbols to and from disk most of the time -# causing a significant performance penalty. -# If the system has enough physical memory increasing the cache will improve the -# performance by keeping more symbols in memory. Note that the value works on -# a logarithmic scale so increasing the size by one will roughly double the -# memory usage. The cache size is given by this formula: -# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0, -# corresponding to a cache size of 2^16 = 65536 symbols. - -SYMBOL_CACHE_SIZE = 0 - -# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be -# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given -# their name and scope. Since this can be an expensive process and often the -# same symbol appear multiple times in the code, doxygen keeps a cache of -# pre-resolved symbols. If the cache is too small doxygen will become slower. -# If the cache is too large, memory is wasted. The cache size is given by this -# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0, -# corresponding to a cache size of 2^16 = 65536 symbols. - -LOOKUP_CACHE_SIZE = 0 - -#--------------------------------------------------------------------------- -# Build related configuration options -#--------------------------------------------------------------------------- - -# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in -# documentation are documented, even if no documentation was available. -# Private class members and static file members will be hidden unless -# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES - -EXTRACT_ALL = NO - -# If the EXTRACT_PRIVATE tag is set to YES all private members of a class -# will be included in the documentation. - -EXTRACT_PRIVATE = YES - -# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal -# scope will be included in the documentation. - -EXTRACT_PACKAGE = NO - -# If the EXTRACT_STATIC tag is set to YES all static members of a file -# will be included in the documentation. - -EXTRACT_STATIC = YES - -# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs) -# defined locally in source files will be included in the documentation. -# If set to NO only classes defined in header files are included. - -EXTRACT_LOCAL_CLASSES = YES - -# This flag is only useful for Objective-C code. When set to YES local -# methods, which are defined in the implementation section but not in -# the interface are included in the documentation. -# If set to NO (the default) only methods in the interface are included. 
- -EXTRACT_LOCAL_METHODS = NO - -# If this flag is set to YES, the members of anonymous namespaces will be -# extracted and appear in the documentation as a namespace called -# 'anonymous_namespace{file}', where file will be replaced with the base -# name of the file that contains the anonymous namespace. By default -# anonymous namespaces are hidden. - -EXTRACT_ANON_NSPACES = NO - -# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all -# undocumented members of documented classes, files or namespaces. -# If set to NO (the default) these members will be included in the -# various overviews, but no documentation section is generated. -# This option has no effect if EXTRACT_ALL is enabled. - -HIDE_UNDOC_MEMBERS = YES - -# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all -# undocumented classes that are normally visible in the class hierarchy. -# If set to NO (the default) these classes will be included in the various -# overviews. This option has no effect if EXTRACT_ALL is enabled. - -HIDE_UNDOC_CLASSES = YES - -# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all -# friend (class|struct|union) declarations. -# If set to NO (the default) these declarations will be included in the -# documentation. - -HIDE_FRIEND_COMPOUNDS = NO - -# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any -# documentation blocks found inside the body of a function. -# If set to NO (the default) these blocks will be appended to the -# function's detailed documentation block. - -HIDE_IN_BODY_DOCS = NO - -# The INTERNAL_DOCS tag determines if documentation -# that is typed after a \internal command is included. If the tag is set -# to NO (the default) then the documentation will be excluded. -# Set it to YES to include the internal documentation. - -INTERNAL_DOCS = NO - -# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate -# file names in lower-case letters. If set to YES upper-case letters are also -# allowed. This is useful if you have classes or files whose names only differ -# in case and if your file system supports case sensitive file names. Windows -# and Mac users are advised to set this option to NO. - -CASE_SENSE_NAMES = YES - -# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen -# will show members with their full class and namespace scopes in the -# documentation. If set to YES the scope will be hidden. - -HIDE_SCOPE_NAMES = NO - -# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen -# will put a list of the files that are included by a file in the documentation -# of that file. - -SHOW_INCLUDE_FILES = YES - -# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen -# will list include files with double quotes in the documentation -# rather than with sharp brackets. - -FORCE_LOCAL_INCLUDES = NO - -# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] -# is inserted in the documentation for inline members. - -INLINE_INFO = YES - -# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen -# will sort the (detailed) documentation of file and class members -# alphabetically by member name. If set to NO the members will appear in -# declaration order. - -SORT_MEMBER_DOCS = YES - -# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the -# brief documentation of file, namespace and class members alphabetically -# by member name. If set to NO (the default) the members will appear in -# declaration order. 
- -SORT_BRIEF_DOCS = NO - -# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen -# will sort the (brief and detailed) documentation of class members so that -# constructors and destructors are listed first. If set to NO (the default) -# the constructors will appear in the respective orders defined by -# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. -# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO -# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. - -SORT_MEMBERS_CTORS_1ST = NO - -# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the -# hierarchy of group names into alphabetical order. If set to NO (the default) -# the group names will appear in their defined order. - -SORT_GROUP_NAMES = NO - -# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be -# sorted by fully-qualified names, including namespaces. If set to -# NO (the default), the class list will be sorted only by class name, -# not including the namespace part. -# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. -# Note: This option applies only to the class list, not to the -# alphabetical list. - -SORT_BY_SCOPE_NAME = NO - -# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to -# do proper type resolution of all parameters of a function it will reject a -# match between the prototype and the implementation of a member function even -# if there is only one candidate or it is obvious which candidate to choose -# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen -# will still accept a match between prototype and implementation in such cases. - -STRICT_PROTO_MATCHING = NO - -# The GENERATE_TODOLIST tag can be used to enable (YES) or -# disable (NO) the todo list. This list is created by putting \todo -# commands in the documentation. - -GENERATE_TODOLIST = YES - -# The GENERATE_TESTLIST tag can be used to enable (YES) or -# disable (NO) the test list. This list is created by putting \test -# commands in the documentation. - -GENERATE_TESTLIST = YES - -# The GENERATE_BUGLIST tag can be used to enable (YES) or -# disable (NO) the bug list. This list is created by putting \bug -# commands in the documentation. - -GENERATE_BUGLIST = YES - -# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or -# disable (NO) the deprecated list. This list is created by putting -# \deprecated commands in the documentation. - -GENERATE_DEPRECATEDLIST= YES - -# The ENABLED_SECTIONS tag can be used to enable conditional -# documentation sections, marked by \if sectionname ... \endif. - -ENABLED_SECTIONS = - -# The MAX_INITIALIZER_LINES tag determines the maximum number of lines -# the initial value of a variable or macro consists of for it to appear in -# the documentation. If the initializer consists of more lines than specified -# here it will be hidden. Use a value of 0 to hide initializers completely. -# The appearance of the initializer of individual variables and macros in the -# documentation can be controlled using \showinitializer or \hideinitializer -# command in the documentation regardless of this setting. - -MAX_INITIALIZER_LINES = 30 - -# Set the SHOW_USED_FILES tag to NO to disable the list of files generated -# at the bottom of the documentation of classes and structs. If set to YES the -# list will mention the files that were used to generate the documentation. - -SHOW_USED_FILES = YES - -# Set the SHOW_FILES tag to NO to disable the generation of the Files page. 
-# This will remove the Files entry from the Quick Index and from the
-# Folder Tree View (if specified). The default is YES.
-
-# We probably will want this, but we have no file documentation yet so it's simpler to remove
-# it for now.
-SHOW_FILES = NO
-
-# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
-# Namespaces page.
-# This will remove the Namespaces entry from the Quick Index
-# and from the Folder Tree View (if specified). The default is YES.
-
-SHOW_NAMESPACES = YES
-
-# The FILE_VERSION_FILTER tag can be used to specify a program or script that
-# doxygen should invoke to get the current version for each file (typically from
-# the version control system). Doxygen will invoke the program by executing (via
-# popen()) the command <command> <input-file>, where <command> is the value of
-# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
-# provided by doxygen. Whatever the program writes to standard output
-# is used as the file version. See the manual for examples.
-
-FILE_VERSION_FILTER =
-
-# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
-# by doxygen. The layout file controls the global structure of the generated
-# output files in an output format independent way. To create the layout file
-# that represents doxygen's defaults, run doxygen with the -l option.
-# You can optionally specify a file name after the option, if omitted
-# DoxygenLayout.xml will be used as the name of the layout file.
-
-LAYOUT_FILE =
-
-# The CITE_BIB_FILES tag can be used to specify one or more bib files
-# containing the references data. This must be a list of .bib files. The
-# .bib extension is automatically appended if omitted. Using this command
-# requires the bibtex tool to be installed. See also
-# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
-# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
-# feature you need bibtex and perl available in the search path.
-
-CITE_BIB_FILES =
-
-#---------------------------------------------------------------------------
-# configuration options related to warning and progress messages
-#---------------------------------------------------------------------------
-
-# The QUIET tag can be used to turn on/off the messages that are generated
-# by doxygen. Possible values are YES and NO. If left blank NO is used.
-
-QUIET = NO
-
-# The WARNINGS tag can be used to turn on/off the warning messages that are
-# generated by doxygen. Possible values are YES and NO. If left blank
-# NO is used.
-
-WARNINGS = YES
-
-# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
-# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
-# automatically be disabled.
-
-WARN_IF_UNDOCUMENTED = YES
-
-# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
-# potential errors in the documentation, such as not documenting some
-# parameters in a documented function, or documenting parameters that
-# don't exist or using markup commands wrongly.
-
-WARN_IF_DOC_ERROR = YES
-
-# The WARN_NO_PARAMDOC option can be enabled to get warnings for
-# functions that are documented, but have no documentation for their parameters
-# or return value. If set to NO (the default) doxygen will only warn about
-# wrong or incomplete parameter documentation, but not about the absence of
-# documentation.
-
-WARN_NO_PARAMDOC = NO
-
-# The WARN_FORMAT tag determines the format of the warning messages that
-# doxygen can produce.
The string should contain the $file, $line, and $text -# tags, which will be replaced by the file and line number from which the -# warning originated and the warning text. Optionally the format may contain -# $version, which will be replaced by the version of the file (if it could -# be obtained via FILE_VERSION_FILTER) - -WARN_FORMAT = - -# The WARN_LOGFILE tag can be used to specify a file to which warning -# and error messages should be written. If left blank the output is written -# to stderr. - -WARN_LOGFILE = - -#--------------------------------------------------------------------------- -# configuration options related to the input files -#--------------------------------------------------------------------------- - -# The INPUT tag can be used to specify the files and/or directories that contain -# documented source files. You may enter file names like "myfile.cpp" or -# directories like "/usr/src/myproject". Separate the files or directories -# with spaces. - -INPUT = src doc/doxygen/libomp_interface.h -# The ittnotify code also has doxygen documentation, but if we include it here -# it takes over from us! -# src/thirdparty/ittnotify - -# This tag can be used to specify the character encoding of the source files -# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is -# also the default input encoding. Doxygen uses libiconv (or the iconv built -# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for -# the list of possible encodings. - -INPUT_ENCODING = UTF-8 - -# If the value of the INPUT tag contains directories, you can use the -# FILE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp -# and *.h) to filter out the source-files in the directories. If left -# blank the following patterns are tested: -# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh -# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py -# *.f90 *.f *.for *.vhd *.vhdl - -FILE_PATTERNS = *.c *.h *.cpp -# We may also want to include the asm files with appropriate ifdef to ensure -# doxygen doesn't see the content, just the documentation... - -# The RECURSIVE tag can be used to turn specify whether or not subdirectories -# should be searched for input files as well. Possible values are YES and NO. -# If left blank NO is used. - -# Only look in the one directory. -RECURSIVE = NO - -# The EXCLUDE tag can be used to specify files and/or directories that should be -# excluded from the INPUT source files. This way you can easily exclude a -# subdirectory from a directory tree whose root is specified with the INPUT tag. -# Note that relative paths are relative to the directory from which doxygen is -# run. - -EXCLUDE = src/test-touch.c - -# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or -# directories that are symbolic links (a Unix file system feature) are excluded -# from the input. - -EXCLUDE_SYMLINKS = NO - -# If the value of the INPUT tag contains directories, you can use the -# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude -# certain files from those directories. Note that the wildcards are matched -# against the file with absolute path, so to exclude all test directories -# for example use the pattern */test/* - -EXCLUDE_PATTERNS = - -# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names -# (namespaces, classes, functions, etc.) that should be excluded from the -# output. 
The symbol name can be a fully qualified name, a word, or if the
-# wildcard * is used, a substring. Examples: ANamespace, AClass,
-# AClass::ANamespace, ANamespace::*Test
-
-EXCLUDE_SYMBOLS =
-
-# The EXAMPLE_PATH tag can be used to specify one or more files or
-# directories that contain example code fragments that are included (see
-# the \include command).
-
-EXAMPLE_PATH =
-
-# If the value of the EXAMPLE_PATH tag contains directories, you can use the
-# EXAMPLE_PATTERNS tag to specify one or more wildcard pattern (like *.cpp
-# and *.h) to filter out the source-files in the directories. If left
-# blank all files are included.
-
-EXAMPLE_PATTERNS =
-
-# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
-# searched for input files to be used with the \include or \dontinclude
-# commands irrespective of the value of the RECURSIVE tag.
-# Possible values are YES and NO. If left blank NO is used.
-
-EXAMPLE_RECURSIVE = NO
-
-# The IMAGE_PATH tag can be used to specify one or more files or
-# directories that contain image that are included in the documentation (see
-# the \image command).
-
-IMAGE_PATH =
-
-# The INPUT_FILTER tag can be used to specify a program that doxygen should
-# invoke to filter for each input file. Doxygen will invoke the filter program
-# by executing (via popen()) the command <filter> <input-file>, where <filter>
-# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
-# input file. Doxygen will then use the output that the filter program writes
-# to standard output.
-# If FILTER_PATTERNS is specified, this tag will be
-# ignored.
-
-INPUT_FILTER =
-
-# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
-# basis.
-# Doxygen will compare the file name with each pattern and apply the
-# filter if there is a match.
-# The filters are a list of the form:
-# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
-# info on how filters are used. If FILTER_PATTERNS is empty or if
-# non of the patterns match the file name, INPUT_FILTER is applied.
-
-FILTER_PATTERNS =
-
-# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
-# INPUT_FILTER) will be used to filter the input files when producing source
-# files to browse (i.e. when SOURCE_BROWSER is set to YES).
-
-FILTER_SOURCE_FILES = NO
-
-# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
-# pattern. A pattern will override the setting for FILTER_PATTERN (if any)
-# and it is also possible to disable source filtering for a specific pattern
-# using *.ext= (so without naming a filter). This option only has effect when
-# FILTER_SOURCE_FILES is enabled.
-
-FILTER_SOURCE_PATTERNS =
-
-#---------------------------------------------------------------------------
-# configuration options related to source browsing
-#---------------------------------------------------------------------------
-
-# If the SOURCE_BROWSER tag is set to YES then a list of source files will
-# be generated. Documented entities will be cross-referenced with these sources.
-# Note: To get rid of all source code in the generated output, make sure also
-# VERBATIM_HEADERS is set to NO.
-
-SOURCE_BROWSER = YES
-
-# Setting the INLINE_SOURCES tag to YES will include the body
-# of functions and classes directly in the documentation.
-
-INLINE_SOURCES = NO
-
-# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
-# doxygen to hide any special comment blocks from generated source code
-# fragments.
Normal C, C++ and Fortran comments will always remain visible. - -STRIP_CODE_COMMENTS = YES - -# If the REFERENCED_BY_RELATION tag is set to YES -# then for each documented function all documented -# functions referencing it will be listed. - -REFERENCED_BY_RELATION = YES - -# If the REFERENCES_RELATION tag is set to YES -# then for each documented function all documented entities -# called/used by that function will be listed. - -REFERENCES_RELATION = NO - -# If the REFERENCES_LINK_SOURCE tag is set to YES (the default) -# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from -# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will -# link to the source code. -# Otherwise they will link to the documentation. - -REFERENCES_LINK_SOURCE = YES - -# If the USE_HTAGS tag is set to YES then the references to source code -# will point to the HTML generated by the htags(1) tool instead of doxygen -# built-in source browser. The htags tool is part of GNU's global source -# tagging system (see http://www.gnu.org/software/global/global.html). You -# will need version 4.8.6 or higher. - -USE_HTAGS = NO - -# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen -# will generate a verbatim copy of the header file for each class for -# which an include is specified. Set to NO to disable this. - -VERBATIM_HEADERS = YES - -#--------------------------------------------------------------------------- -# configuration options related to the alphabetical class index -#--------------------------------------------------------------------------- - -# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index -# of all compounds will be generated. Enable this if the project -# contains a lot of classes, structs, unions or interfaces. - -ALPHABETICAL_INDEX = YES - -# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then -# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns -# in which this list will be split (can be a number in the range [1..20]) - -COLS_IN_ALPHA_INDEX = 5 - -# In case all classes in a project start with a common prefix, all -# classes will be put under the same header in the alphabetical index. -# The IGNORE_PREFIX tag can be used to specify one or more prefixes that -# should be ignored while generating the index headers. - -IGNORE_PREFIX = - -#--------------------------------------------------------------------------- -# configuration options related to the HTML output -#--------------------------------------------------------------------------- - -# If the GENERATE_HTML tag is set to YES (the default) Doxygen will -# generate HTML output. - -GENERATE_HTML = YES - -# The HTML_OUTPUT tag is used to specify where the HTML docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `html' will be used as the default path. - -HTML_OUTPUT = - -# The HTML_FILE_EXTENSION tag can be used to specify the file extension for -# each generated HTML page (for example: .htm,.php,.asp). If it is left blank -# doxygen will generate files with .html extension. - -HTML_FILE_EXTENSION = .html - -# The HTML_HEADER tag can be used to specify a personal HTML header for -# each generated HTML page. If it is left blank doxygen will generate a -# standard header. Note that when using a custom header you are responsible -# for the proper inclusion of any scripts and style sheets that doxygen -# needs, which is dependent on the configuration options used. 
-# It is advised to generate a default header using "doxygen -w html -# header.html footer.html stylesheet.css YourConfigFile" and then modify -# that header. Note that the header is subject to change so you typically -# have to redo this when upgrading to a newer version of doxygen or when -# changing the value of configuration settings such as GENERATE_TREEVIEW! - -HTML_HEADER = - -# The HTML_FOOTER tag can be used to specify a personal HTML footer for -# each generated HTML page. If it is left blank doxygen will generate a -# standard footer. - -HTML_FOOTER = - -# The HTML_STYLESHEET tag can be used to specify a user-defined cascading -# style sheet that is used by each HTML page. It can be used to -# fine-tune the look of the HTML output. If left blank doxygen will -# generate a default style sheet. Note that it is recommended to use -# HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this -# tag will in the future become obsolete. - -HTML_STYLESHEET = - -# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional -# user-defined cascading style sheet that is included after the standard -# style sheets created by doxygen. Using this option one can overrule -# certain style aspects. This is preferred over using HTML_STYLESHEET -# since it does not replace the standard style sheet and is therefor more -# robust against future updates. Doxygen will copy the style sheet file to -# the output directory. - -HTML_EXTRA_STYLESHEET = - -# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or -# other source files which should be copied to the HTML output directory. Note -# that these files will be copied to the base HTML output directory. Use the -# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these -# files. In the HTML_STYLESHEET file, use the file name only. Also note that -# the files will be copied as-is; there are no commands or markers available. - -HTML_EXTRA_FILES = - -# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output. -# Doxygen will adjust the colors in the style sheet and background images -# according to this color. Hue is specified as an angle on a colorwheel, -# see http://en.wikipedia.org/wiki/Hue for more information. -# For instance the value 0 represents red, 60 is yellow, 120 is green, -# 180 is cyan, 240 is blue, 300 purple, and 360 is red again. -# The allowed range is 0 to 359. - -HTML_COLORSTYLE_HUE = 220 - -# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of -# the colors in the HTML output. For a value of 0 the output will use -# grayscales only. A value of 255 will produce the most vivid colors. - -HTML_COLORSTYLE_SAT = 100 - -# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to -# the luminance component of the colors in the HTML output. Values below -# 100 gradually make the output lighter, whereas values above 100 make -# the output darker. The value divided by 100 is the actual gamma applied, -# so 80 represents a gamma of 0.8, The value 220 represents a gamma of 2.2, -# and 100 does not change the gamma. - -HTML_COLORSTYLE_GAMMA = 80 - -# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML -# page will contain the date and time when the page was generated. Setting -# this to NO can help when comparing the output of multiple runs. 
- -HTML_TIMESTAMP = NO - -# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML -# documentation will contain sections that can be hidden and shown after the -# page has loaded. - -HTML_DYNAMIC_SECTIONS = NO - -# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of -# entries shown in the various tree structured indices initially; the user -# can expand and collapse entries dynamically later on. Doxygen will expand -# the tree to such a level that at most the specified number of entries are -# visible (unless a fully collapsed tree already exceeds this amount). -# So setting the number of entries 1 will produce a full collapsed tree by -# default. 0 is a special value representing an infinite number of entries -# and will result in a full expanded tree by default. - -HTML_INDEX_NUM_ENTRIES = 100 - -# If the GENERATE_DOCSET tag is set to YES, additional index files -# will be generated that can be used as input for Apple's Xcode 3 -# integrated development environment, introduced with OSX 10.5 (Leopard). -# To create a documentation set, doxygen will generate a Makefile in the -# HTML output directory. Running make will produce the docset in that -# directory and running "make install" will install the docset in -# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find -# it at startup. -# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html -# for more information. - -GENERATE_DOCSET = NO - -# When GENERATE_DOCSET tag is set to YES, this tag determines the name of the -# feed. A documentation feed provides an umbrella under which multiple -# documentation sets from a single provider (such as a company or product suite) -# can be grouped. - -DOCSET_FEEDNAME = "Doxygen generated docs" - -# When GENERATE_DOCSET tag is set to YES, this tag specifies a string that -# should uniquely identify the documentation set bundle. This should be a -# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen -# will append .docset to the name. - -DOCSET_BUNDLE_ID = org.doxygen.Project - -# When GENERATE_PUBLISHER_ID tag specifies a string that should uniquely -# identify the documentation publisher. This should be a reverse domain-name -# style string, e.g. com.mycompany.MyDocSet.documentation. - -DOCSET_PUBLISHER_ID = org.doxygen.Publisher - -# The GENERATE_PUBLISHER_NAME tag identifies the documentation publisher. - -DOCSET_PUBLISHER_NAME = Publisher - -# If the GENERATE_HTMLHELP tag is set to YES, additional index files -# will be generated that can be used as input for tools like the -# Microsoft HTML help workshop to generate a compiled HTML help file (.chm) -# of the generated HTML documentation. - -GENERATE_HTMLHELP = NO - -# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can -# be used to specify the file name of the resulting .chm file. You -# can add a path in front of the file if the result should not be -# written to the html output directory. - -CHM_FILE = - -# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can -# be used to specify the location (absolute path including file name) of -# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run -# the HTML help compiler on the generated index.hhp. - -HHC_LOCATION = - -# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag -# controls if a separate .chi index file is generated (YES) or that -# it should be included in the main .chm file (NO). 
- -GENERATE_CHI = NO - -# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING -# is used to encode HtmlHelp index (hhk), content (hhc) and project file -# content. - -CHM_INDEX_ENCODING = - -# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag -# controls whether a binary table of contents is generated (YES) or a -# normal table of contents (NO) in the .chm file. - -BINARY_TOC = NO - -# The TOC_EXPAND flag can be set to YES to add extra items for group members -# to the contents of the HTML help documentation and to the tree view. - -TOC_EXPAND = NO - -# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and -# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated -# that can be used as input for Qt's qhelpgenerator to generate a -# Qt Compressed Help (.qch) of the generated HTML documentation. - -GENERATE_QHP = NO - -# If the QHG_LOCATION tag is specified, the QCH_FILE tag can -# be used to specify the file name of the resulting .qch file. -# The path specified is relative to the HTML output folder. - -QCH_FILE = - -# The QHP_NAMESPACE tag specifies the namespace to use when generating -# Qt Help Project output. For more information please see -# http://doc.trolltech.com/qthelpproject.html#namespace - -QHP_NAMESPACE = org.doxygen.Project - -# The QHP_VIRTUAL_FOLDER tag specifies the namespace to use when generating -# Qt Help Project output. For more information please see -# http://doc.trolltech.com/qthelpproject.html#virtual-folders - -QHP_VIRTUAL_FOLDER = doc - -# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to -# add. For more information please see -# http://doc.trolltech.com/qthelpproject.html#custom-filters - -QHP_CUST_FILTER_NAME = - -# The QHP_CUST_FILT_ATTRS tag specifies the list of the attributes of the -# custom filter to add. For more information please see -# -# Qt Help Project / Custom Filters. - -QHP_CUST_FILTER_ATTRS = - -# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this -# project's -# filter section matches. -# -# Qt Help Project / Filter Attributes. - -QHP_SECT_FILTER_ATTRS = - -# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can -# be used to specify the location of Qt's qhelpgenerator. -# If non-empty doxygen will try to run qhelpgenerator on the generated -# .qhp file. - -QHG_LOCATION = - -# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files -# will be generated, which together with the HTML files, form an Eclipse help -# plugin. To install this plugin and make it available under the help contents -# menu in Eclipse, the contents of the directory containing the HTML and XML -# files needs to be copied into the plugins directory of eclipse. The name of -# the directory within the plugins directory should be the same as -# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before -# the help appears. - -GENERATE_ECLIPSEHELP = NO - -# A unique identifier for the eclipse help plugin. When installing the plugin -# the directory name containing the HTML and XML files should also have -# this name. - -ECLIPSE_DOC_ID = org.doxygen.Project - -# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs) -# at top of each HTML page. The value NO (the default) enables the index and -# the value YES disables it. Since the tabs have the same information as the -# navigation tree you can set this option to NO if you already set -# GENERATE_TREEVIEW to YES. 
- -DISABLE_INDEX = NO - -# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index -# structure should be generated to display hierarchical information. -# If the tag value is set to YES, a side panel will be generated -# containing a tree-like index structure (just like the one that -# is generated for HTML Help). For this to work a browser that supports -# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser). -# Windows users are probably better off using the HTML help feature. -# Since the tree basically has the same information as the tab index you -# could consider to set DISABLE_INDEX to NO when enabling this option. - -GENERATE_TREEVIEW = NO - -# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values -# (range [0,1..20]) that doxygen will group on one line in the generated HTML -# documentation. Note that a value of 0 will completely suppress the enum -# values from appearing in the overview section. - -ENUM_VALUES_PER_LINE = 4 - -# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be -# used to set the initial width (in pixels) of the frame in which the tree -# is shown. - -TREEVIEW_WIDTH = 250 - -# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open -# links to external symbols imported via tag files in a separate window. - -EXT_LINKS_IN_WINDOW = NO - -# Use this tag to change the font size of Latex formulas included -# as images in the HTML documentation. The default is 10. Note that -# when you change the font size after a successful doxygen run you need -# to manually remove any form_*.png images from the HTML output directory -# to force them to be regenerated. - -FORMULA_FONTSIZE = 10 - -# Use the FORMULA_TRANPARENT tag to determine whether or not the images -# generated for formulas are transparent PNGs. Transparent PNGs are -# not supported properly for IE 6.0, but are supported on all modern browsers. -# Note that when changing this option you need to delete any form_*.png files -# in the HTML output before the changes have effect. - -FORMULA_TRANSPARENT = YES - -# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax -# (see http://www.mathjax.org) which uses client side Javascript for the -# rendering instead of using prerendered bitmaps. Use this if you do not -# have LaTeX installed or if you want to formulas look prettier in the HTML -# output. When enabled you may also need to install MathJax separately and -# configure the path to it using the MATHJAX_RELPATH option. - -USE_MATHJAX = NO - -# When MathJax is enabled you need to specify the location relative to the -# HTML output directory using the MATHJAX_RELPATH option. The destination -# directory should contain the MathJax.js script. For instance, if the mathjax -# directory is located at the same level as the HTML output directory, then -# MATHJAX_RELPATH should be ../mathjax. The default value points to -# the MathJax Content Delivery Network so you can quickly see the result without -# installing MathJax. -# However, it is strongly recommended to install a local -# copy of MathJax from http://www.mathjax.org before deployment. - -MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest - -# The MATHJAX_EXTENSIONS tag can be used to specify one or MathJax extension -# names that should be enabled during MathJax rendering. - -MATHJAX_EXTENSIONS = - -# When the SEARCHENGINE tag is enabled doxygen will generate a search box -# for the HTML output. 
The underlying search engine uses javascript -# and DHTML and should work on any modern browser. Note that when using -# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets -# (GENERATE_DOCSET) there is already a search function so this one should -# typically be disabled. For large projects the javascript based search engine -# can be slow, then enabling SERVER_BASED_SEARCH may provide a better solution. - -SEARCHENGINE = YES - -# When the SERVER_BASED_SEARCH tag is enabled the search engine will be -# implemented using a PHP enabled web server instead of at the web client -# using Javascript. Doxygen will generate the search PHP script and index -# file to put on the web server. The advantage of the server -# based approach is that it scales better to large projects and allows -# full text search. The disadvantages are that it is more difficult to setup -# and does not have live searching capabilities. - -SERVER_BASED_SEARCH = NO - -#--------------------------------------------------------------------------- -# configuration options related to the LaTeX output -#--------------------------------------------------------------------------- - -# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will -# generate Latex output. - -GENERATE_LATEX = YES - -# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `latex' will be used as the default path. - -LATEX_OUTPUT = - -# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be -# invoked. If left blank `latex' will be used as the default command name. -# Note that when enabling USE_PDFLATEX this option is only used for -# generating bitmaps for formulas in the HTML output, but not in the -# Makefile that is written to the output directory. - -LATEX_CMD_NAME = latex - -# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to -# generate index for LaTeX. If left blank `makeindex' will be used as the -# default command name. - -MAKEINDEX_CMD_NAME = makeindex - -# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact -# LaTeX documents. This may be useful for small projects and may help to -# save some trees in general. - -COMPACT_LATEX = NO - -# The PAPER_TYPE tag can be used to set the paper type that is used -# by the printer. Possible values are: a4, letter, legal and -# executive. If left blank a4wide will be used. - -PAPER_TYPE = a4wide - -# The EXTRA_PACKAGES tag can be to specify one or more names of LaTeX -# packages that should be included in the LaTeX output. - -EXTRA_PACKAGES = - -# The LATEX_HEADER tag can be used to specify a personal LaTeX header for -# the generated latex document. The header should contain everything until -# the first chapter. If it is left blank doxygen will generate a -# standard header. Notice: only use this tag if you know what you are doing! - -LATEX_HEADER = doc/doxygen/header.tex - -# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for -# the generated latex document. The footer should contain everything after -# the last chapter. If it is left blank doxygen will generate a -# standard footer. Notice: only use this tag if you know what you are doing! - -LATEX_FOOTER = - -# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated -# is prepared for conversion to pdf (using ps2pdf). 
The pdf file will -# contain links (just like the HTML output) instead of page references -# This makes the output suitable for online browsing using a pdf viewer. - -PDF_HYPERLINKS = YES - -# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of -# plain latex in the generated Makefile. Set this option to YES to get a -# higher quality PDF documentation. - -USE_PDFLATEX = YES - -# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode. -# command to the generated LaTeX files. This will instruct LaTeX to keep -# running if errors occur, instead of asking the user for help. -# This option is also used when generating formulas in HTML. - -LATEX_BATCHMODE = NO - -# If LATEX_HIDE_INDICES is set to YES then doxygen will not -# include the index chapters (such as File Index, Compound Index, etc.) -# in the output. - -LATEX_HIDE_INDICES = NO - -# If LATEX_SOURCE_CODE is set to YES then doxygen will include -# source code with syntax highlighting in the LaTeX output. -# Note that which sources are shown also depends on other settings -# such as SOURCE_BROWSER. - -LATEX_SOURCE_CODE = NO - -# The LATEX_BIB_STYLE tag can be used to specify the style to use for the -# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See -# http://en.wikipedia.org/wiki/BibTeX for more info. - -LATEX_BIB_STYLE = plain - -#--------------------------------------------------------------------------- -# configuration options related to the RTF output -#--------------------------------------------------------------------------- - -# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output -# The RTF output is optimized for Word 97 and may not look very pretty with -# other RTF readers or editors. - -GENERATE_RTF = NO - -# The RTF_OUTPUT tag is used to specify where the RTF docs will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `rtf' will be used as the default path. - -RTF_OUTPUT = - -# If the COMPACT_RTF tag is set to YES Doxygen generates more compact -# RTF documents. This may be useful for small projects and may help to -# save some trees in general. - -COMPACT_RTF = NO - -# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated -# will contain hyperlink fields. The RTF file will -# contain links (just like the HTML output) instead of page references. -# This makes the output suitable for online browsing using WORD or other -# programs which support those fields. -# Note: wordpad (write) and others do not support links. - -RTF_HYPERLINKS = NO - -# Load style sheet definitions from file. Syntax is similar to doxygen's -# config file, i.e. a series of assignments. You only have to provide -# replacements, missing definitions are set to their default value. - -RTF_STYLESHEET_FILE = - -# Set optional variables used in the generation of an rtf document. -# Syntax is similar to doxygen's config file. - -RTF_EXTENSIONS_FILE = - -#--------------------------------------------------------------------------- -# configuration options related to the man page output -#--------------------------------------------------------------------------- - -# If the GENERATE_MAN tag is set to YES (the default) Doxygen will -# generate man pages - -GENERATE_MAN = NO - -# The MAN_OUTPUT tag is used to specify where the man pages will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `man' will be used as the default path. 
- -MAN_OUTPUT = - -# The MAN_EXTENSION tag determines the extension that is added to -# the generated man pages (default is the subroutine's section .3) - -MAN_EXTENSION = - -# If the MAN_LINKS tag is set to YES and Doxygen generates man output, -# then it will generate one additional man file for each entity -# documented in the real man page(s). These additional files -# only source the real man page, but without them the man command -# would be unable to find the correct page. The default is NO. - -MAN_LINKS = NO - -#--------------------------------------------------------------------------- -# configuration options related to the XML output -#--------------------------------------------------------------------------- - -# If the GENERATE_XML tag is set to YES Doxygen will -# generate an XML file that captures the structure of -# the code including all documentation. - -GENERATE_XML = NO - -# The XML_OUTPUT tag is used to specify where the XML pages will be put. -# If a relative path is entered the value of OUTPUT_DIRECTORY will be -# put in front of it. If left blank `xml' will be used as the default path. - -XML_OUTPUT = xml - -# The XML_SCHEMA tag can be used to specify an XML schema, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_SCHEMA = - -# The XML_DTD tag can be used to specify an XML DTD, -# which can be used by a validating XML parser to check the -# syntax of the XML files. - -XML_DTD = - -# If the XML_PROGRAMLISTING tag is set to YES Doxygen will -# dump the program listings (including syntax highlighting -# and cross-referencing information) to the XML output. Note that -# enabling this will significantly increase the size of the XML output. - -XML_PROGRAMLISTING = YES - -#--------------------------------------------------------------------------- -# configuration options for the AutoGen Definitions output -#--------------------------------------------------------------------------- - -# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will -# generate an AutoGen Definitions (see autogen.sf.net) file -# that captures the structure of the code including all -# documentation. Note that this feature is still experimental -# and incomplete at the moment. - -GENERATE_AUTOGEN_DEF = NO - -#--------------------------------------------------------------------------- -# configuration options related to the Perl module output -#--------------------------------------------------------------------------- - -# If the GENERATE_PERLMOD tag is set to YES Doxygen will -# generate a Perl module file that captures the structure of -# the code including all documentation. Note that this -# feature is still experimental and incomplete at the -# moment. - -GENERATE_PERLMOD = NO - -# If the PERLMOD_LATEX tag is set to YES Doxygen will generate -# the necessary Makefile rules, Perl scripts and LaTeX code to be able -# to generate PDF and DVI output from the Perl module output. - -PERLMOD_LATEX = NO - -# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be -# nicely formatted so it can be parsed by a human reader. -# This is useful -# if you want to understand what is going on. -# On the other hand, if this -# tag is set to NO the size of the Perl module output will be much smaller -# and Perl will parse it just the same. - -PERLMOD_PRETTY = YES - -# The names of the make variables in the generated doxyrules.make file -# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. 
-# This is useful so different doxyrules.make files included by the same -# Makefile don't overwrite each other's variables. - -PERLMOD_MAKEVAR_PREFIX = - -#--------------------------------------------------------------------------- -# Configuration options related to the preprocessor -#--------------------------------------------------------------------------- - -# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will -# evaluate all C-preprocessor directives found in the sources and include -# files. - -ENABLE_PREPROCESSING = YES - -# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro -# names in the source code. If set to NO (the default) only conditional -# compilation will be performed. Macro expansion can be done in a controlled -# way by setting EXPAND_ONLY_PREDEF to YES. - -MACRO_EXPANSION = YES - -# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES -# then the macro expansion is limited to the macros specified with the -# PREDEFINED and EXPAND_AS_DEFINED tags. - -EXPAND_ONLY_PREDEF = YES - -# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files -# pointed to by INCLUDE_PATH will be searched when a #include is found. - -SEARCH_INCLUDES = YES - -# The INCLUDE_PATH tag can be used to specify one or more directories that -# contain include files that are not input files but should be processed by -# the preprocessor. - -INCLUDE_PATH = - -# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard -# patterns (like *.h and *.hpp) to filter out the header-files in the -# directories. If left blank, the patterns specified with FILE_PATTERNS will -# be used. - -INCLUDE_FILE_PATTERNS = - -# The PREDEFINED tag can be used to specify one or more macro names that -# are defined before the preprocessor is started (similar to the -D option of -# gcc). The argument of the tag is a list of macros of the form: name -# or name=definition (no spaces). If the definition and the = are -# omitted =1 is assumed. To prevent a macro definition from being -# undefined via #undef or recursively expanded use the := operator -# instead of the = operator. - -PREDEFINED = OMP_30_ENABLED=1, OMP_40_ENABLED=1, KMP_STATS_ENABLED=1 - -# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then -# this tag can be used to specify a list of macro names that should be expanded. -# The macro definition that is found in the sources will be used. -# Use the PREDEFINED tag if you want to use a different macro definition that -# overrules the definition found in the source code. - -EXPAND_AS_DEFINED = - -# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then -# doxygen's preprocessor will remove all references to function-like macros -# that are alone on a line, have an all uppercase name, and do not end with a -# semicolon, because these will confuse the parser if not removed. - -SKIP_FUNCTION_MACROS = YES - -#--------------------------------------------------------------------------- -# Configuration::additions related to external references -#--------------------------------------------------------------------------- - -# The TAGFILES option can be used to specify one or more tagfiles. For each -# tag file the location of the external documentation should be added. The -# format of a tag file without this location is as follows: -# -# TAGFILES = file1 file2 ... -# Adding location for the tag files is done as follows: -# -# TAGFILES = file1=loc1 "file2 = loc2" ... 
-# where "loc1" and "loc2" can be relative or absolute paths -# or URLs. Note that each tag file must have a unique name (where the name does -# NOT include the path). If a tag file is not located in the directory in which -# doxygen is run, you must also specify the path to the tagfile here. - -TAGFILES = - -# When a file name is specified after GENERATE_TAGFILE, doxygen will create -# a tag file that is based on the input files it reads. - -GENERATE_TAGFILE = - -# If the ALLEXTERNALS tag is set to YES all external classes will be listed -# in the class index. If set to NO only the inherited external classes -# will be listed. - -ALLEXTERNALS = NO - -# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed -# in the modules index. If set to NO, only the current project's groups will -# be listed. - -EXTERNAL_GROUPS = YES - -# The PERL_PATH should be the absolute path and name of the perl script -# interpreter (i.e. the result of `which perl'). - -PERL_PATH = - -#--------------------------------------------------------------------------- -# Configuration options related to the dot tool -#--------------------------------------------------------------------------- - -# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will -# generate a inheritance diagram (in HTML, RTF and LaTeX) for classes with base -# or super classes. Setting the tag to NO turns the diagrams off. Note that -# this option also works with HAVE_DOT disabled, but it is recommended to -# install and use dot, since it yields more powerful graphs. - -CLASS_DIAGRAMS = YES - -# You can define message sequence charts within doxygen comments using the \msc -# command. Doxygen will then run the mscgen tool (see -# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the -# documentation. The MSCGEN_PATH tag allows you to specify the directory where -# the mscgen tool resides. If left empty the tool is assumed to be found in the -# default search path. - -MSCGEN_PATH = - -# If set to YES, the inheritance and collaboration graphs will hide -# inheritance and usage relations if the target is undocumented -# or is not a class. - -HIDE_UNDOC_RELATIONS = YES - -# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is -# available from the path. This tool is part of Graphviz, a graph visualization -# toolkit from AT&T and Lucent Bell Labs. The other options in this section -# have no effect if this option is set to NO (the default) - -HAVE_DOT = NO - -# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is -# allowed to run in parallel. When set to 0 (the default) doxygen will -# base this on the number of processors available in the system. You can set it -# explicitly to a value larger than 0 to get control over the balance -# between CPU load and processing speed. - -DOT_NUM_THREADS = 0 - -# By default doxygen will use the Helvetica font for all dot files that -# doxygen generates. When you want a differently looking font you can specify -# the font name using DOT_FONTNAME. You need to make sure dot is able to find -# the font, which can be done by putting it in a standard location or by setting -# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the -# directory containing the font. - -DOT_FONTNAME = Helvetica - -# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. -# The default size is 10pt. - -DOT_FONTSIZE = 10 - -# By default doxygen will tell dot to use the Helvetica font. 
-# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to -# set the path where dot can find it. - -DOT_FONTPATH = - -# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect inheritance relations. Setting this tag to YES will force the -# CLASS_DIAGRAMS tag to NO. - -CLASS_GRAPH = YES - -# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for each documented class showing the direct and -# indirect implementation dependencies (inheritance, containment, and -# class references variables) of the class with other documented classes. - -COLLABORATION_GRAPH = NO - -# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen -# will generate a graph for groups, showing the direct groups dependencies - -GROUP_GRAPHS = YES - -# If the UML_LOOK tag is set to YES doxygen will generate inheritance and -# collaboration diagrams in a style similar to the OMG's Unified Modeling -# Language. - -UML_LOOK = NO - -# If the UML_LOOK tag is enabled, the fields and methods are shown inside -# the class node. If there are many fields or methods and many nodes the -# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS -# threshold limits the number of items for each type to make the size more -# manageable. Set this to 0 for no limit. Note that the threshold may be -# exceeded by 50% before the limit is enforced. - -UML_LIMIT_NUM_FIELDS = 10 - -# If set to YES, the inheritance and collaboration graphs will show the -# relations between templates and their instances. - -TEMPLATE_RELATIONS = YES - -# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT -# tags are set to YES then doxygen will generate a graph for each documented -# file showing the direct and indirect include dependencies of the file with -# other documented files. - -INCLUDE_GRAPH = NO - -# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and -# HAVE_DOT tags are set to YES then doxygen will generate a graph for each -# documented header file showing the documented files that directly or -# indirectly include this file. - -INCLUDED_BY_GRAPH = NO - -# If the CALL_GRAPH and HAVE_DOT options are set to YES then -# doxygen will generate a call dependency graph for every global function -# or class method. Note that enabling this option will significantly increase -# the time of a run. So in most cases it will be better to enable call graphs -# for selected functions only using the \callgraph command. - -CALL_GRAPH = NO - -# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then -# doxygen will generate a caller dependency graph for every global function -# or class method. Note that enabling this option will significantly increase -# the time of a run. So in most cases it will be better to enable caller -# graphs for selected functions only using the \callergraph command. - -CALLER_GRAPH = NO - -# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen -# will generate a graphical hierarchy of all classes instead of a textual one. - -GRAPHICAL_HIERARCHY = YES - -# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES -# then doxygen will show the dependencies a directory has on other directories -# in a graphical way. The dependency relations are determined by the #include -# relations between the files in the directories. 
- -DIRECTORY_GRAPH = YES - -# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images -# generated by dot. Possible values are svg, png, jpg, or gif. -# If left blank png will be used. If you choose svg you need to set -# HTML_FILE_EXTENSION to xhtml in order to make the SVG files -# visible in IE 9+ (other browsers do not have this requirement). - -DOT_IMAGE_FORMAT = png - -# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to -# enable generation of interactive SVG images that allow zooming and panning. -# Note that this requires a modern browser other than Internet Explorer. -# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you -# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files -# visible. Older versions of IE do not have SVG support. - -INTERACTIVE_SVG = NO - -# The tag DOT_PATH can be used to specify the path where the dot tool can be -# found. If left blank, it is assumed the dot tool can be found in the path. - -DOT_PATH = - -# The DOTFILE_DIRS tag can be used to specify one or more directories that -# contain dot files that are included in the documentation (see the -# \dotfile command). - -DOTFILE_DIRS = - -# The MSCFILE_DIRS tag can be used to specify one or more directories that -# contain msc files that are included in the documentation (see the -# \mscfile command). - -MSCFILE_DIRS = - -# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of -# nodes that will be shown in the graph. If the number of nodes in a graph -# becomes larger than this value, doxygen will truncate the graph, which is -# visualized by representing a node as a red box. Note that doxygen if the -# number of direct children of the root node in a graph is already larger than -# DOT_GRAPH_MAX_NODES then the graph will not be shown at all. Also note -# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH. - -DOT_GRAPH_MAX_NODES = 50 - -# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the -# graphs generated by dot. A depth value of 3 means that only nodes reachable -# from the root by following a path via at most 3 edges will be shown. Nodes -# that lay further from the root node will be omitted. Note that setting this -# option to 1 or 2 may greatly reduce the computation time needed for large -# code bases. Also note that the size of a graph can be further restricted by -# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction. - -MAX_DOT_GRAPH_DEPTH = 0 - -# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent -# background. This is disabled by default, because dot on Windows does not -# seem to support this out of the box. Warning: Depending on the platform used, -# enabling this option may lead to badly anti-aliased labels on the edges of -# a graph (i.e. they become hard to read). - -DOT_TRANSPARENT = NO - -# Set the DOT_MULTI_TARGETS tag to YES allow dot to generate multiple output -# files in one run (i.e. multiple -o and -T options on the command line). This -# makes dot run faster, but since only newer versions of dot (>1.8.10) -# support this, this feature is disabled by default. - -DOT_MULTI_TARGETS = NO - -# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will -# generate a legend page explaining the meaning of the various boxes and -# arrows in the dot generated graphs. 
-
-GENERATE_LEGEND = YES
-
-# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
-# remove the intermediate dot files that are used to generate
-# the various graphs.
-
-DOT_CLEANUP = YES
+# Doxyfile 1.8.2
+
+# This file describes the settings to be used by the documentation system
+# doxygen (www.doxygen.org) for a project.
+#
+# All text after a hash (#) is considered a comment and will be ignored.
+# The format is:
+# TAG = value [value, ...]
+# For lists, items can also be appended using:
+# TAG += value [value, ...]
+# Values that contain spaces should be placed between quotes (" ").
+
+#---------------------------------------------------------------------------
+# Project related configuration options
+#---------------------------------------------------------------------------
+
+# This tag specifies the encoding used for all characters in the config file
+# that follow. The default is UTF-8 which is also the encoding used for all
+# text before the first occurrence of this tag. Doxygen uses libiconv (or the
+# iconv built into libc) for the transcoding. See
+# http://www.gnu.org/software/libiconv for the list of possible encodings.
+
+DOXYFILE_ENCODING = UTF-8
+
+# The PROJECT_NAME tag is a single word (or sequence of words) that should
+# identify the project. Note that if you do not use Doxywizard you need
+# to put quotes around the project name if it contains spaces.
+
+PROJECT_NAME = "LLVM OpenMP* Runtime Library"
+
+# The PROJECT_NUMBER tag can be used to enter a project or revision number.
+# This could be handy for archiving the generated documentation or
+# if some version control system is used.
+
+PROJECT_NUMBER =
+
+# Using the PROJECT_BRIEF tag one can provide an optional one line description
+# for a project that appears at the top of each page and should give the viewer
+# a quick idea about the purpose of the project. Keep the description short.
+
+PROJECT_BRIEF =
+
+# With the PROJECT_LOGO tag one can specify a logo or icon that is
+# included in the documentation. The maximum height of the logo should not
+# exceed 55 pixels and the maximum width should not exceed 200 pixels.
+# Doxygen will copy the logo to the output directory.
+
+PROJECT_LOGO =
+
+# The OUTPUT_DIRECTORY tag is used to specify the (relative or absolute)
+# base path where the generated documentation will be put.
+# If a relative path is entered, it will be relative to the location
+# where doxygen was started. If left blank the current directory will be used.
+
+OUTPUT_DIRECTORY = doc/doxygen/generated
+
+# If the CREATE_SUBDIRS tag is set to YES, then doxygen will create
+# 4096 sub-directories (in 2 levels) under the output directory of each output
+# format and will distribute the generated files over these directories.
+# Enabling this option can be useful when feeding doxygen a huge amount of
+# source files, where putting all generated files in the same directory would
+# otherwise cause performance problems for the file system.
+
+CREATE_SUBDIRS = NO
+
+# The OUTPUT_LANGUAGE tag is used to specify the language in which all
+# documentation generated by doxygen is written. Doxygen will use this
+# information to generate all constant output in the proper language.
+# The default language is English; other supported languages are:
+# Afrikaans, Arabic, Brazilian, Catalan, Chinese, Chinese-Traditional,
+# Croatian, Czech, Danish, Dutch, Esperanto, Farsi, Finnish, French, German,
+# Greek, Hungarian, Italian, Japanese, Japanese-en (Japanese with English
+# messages), Korean, Korean-en, Lithuanian, Norwegian, Macedonian, Persian,
+# Polish, Portuguese, Romanian, Russian, Serbian, Serbian-Cyrillic, Slovak,
+# Slovene, Spanish, Swedish, Ukrainian, and Vietnamese.
+
+OUTPUT_LANGUAGE = English
+
+# If the BRIEF_MEMBER_DESC tag is set to YES (the default) Doxygen will
+# include brief member descriptions after the members that are listed in
+# the file and class documentation (similar to JavaDoc).
+# Set to NO to disable this.
+
+BRIEF_MEMBER_DESC = YES
+
+# If the REPEAT_BRIEF tag is set to YES (the default) Doxygen will prepend
+# the brief description of a member or function before the detailed description.
+# Note: if both HIDE_UNDOC_MEMBERS and BRIEF_MEMBER_DESC are set to NO, the
+# brief descriptions will be completely suppressed.
+
+REPEAT_BRIEF = YES
+
+# This tag implements a quasi-intelligent brief description abbreviator
+# that is used to form the text in various listings. Each string
+# in this list, if found as the leading text of the brief description, will be
+# stripped from the text and the result after processing the whole list, is
+# used as the annotated text. Otherwise, the brief description is used as-is.
+# If left blank, the following values are used ("$name" is automatically
+# replaced with the name of the entity): "The $name class" "The $name widget"
+# "The $name file" "is" "provides" "specifies" "contains"
+# "represents" "a" "an" "the"
+
+ABBREVIATE_BRIEF =
+
+# If the ALWAYS_DETAILED_SEC and REPEAT_BRIEF tags are both set to YES then
+# Doxygen will generate a detailed section even if there is only a brief
+# description.
+
+ALWAYS_DETAILED_SEC = NO
+
+# If the INLINE_INHERITED_MEMB tag is set to YES, doxygen will show all
+# inherited members of a class in the documentation of that class as if those
+# members were ordinary class members. Constructors, destructors and assignment
+# operators of the base classes will not be shown.
+
+INLINE_INHERITED_MEMB = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then Doxygen will prepend the full
+# path before file names in the file list and in the header files. If set
+# to NO the shortest path that makes the file name unique will be used.
+
+FULL_PATH_NAMES = NO
+
+# If the FULL_PATH_NAMES tag is set to YES then the STRIP_FROM_PATH tag
+# can be used to strip a user-defined part of the path. Stripping is
+# only done if one of the specified strings matches the left-hand part of
+# the path. The tag can be used to show relative paths in the file list.
+# If left blank the directory from which doxygen is run is used as the
+# path to strip. Note that you specify absolute paths here, but also
+# relative paths, which will be relative to the directory where doxygen is
+# started.
+
+STRIP_FROM_PATH =
+
+# The STRIP_FROM_INC_PATH tag can be used to strip a user-defined part of
+# the path mentioned in the documentation of a class, which tells
+# the reader which header file to include in order to use a class.
+# If left blank only the name of the header file containing the class
+# definition is used. Otherwise one should specify the include paths that
+# are normally passed to the compiler using the -I flag.
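+# For instance (a hypothetical value, not one set by this project), with
+# "STRIP_FROM_INC_PATH = runtime/src" a header found at runtime/src/kmp.h
+# would be suggested as #include "kmp.h" rather than by its full path.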
+
+STRIP_FROM_INC_PATH =
+
+# If the SHORT_NAMES tag is set to YES, doxygen will generate much shorter
+# (but less readable) file names. This can be useful if your file system
+# doesn't support long names like on DOS, Mac, or CD-ROM.
+
+SHORT_NAMES = NO
+
+# If the JAVADOC_AUTOBRIEF tag is set to YES then Doxygen
+# will interpret the first line (until the first dot) of a JavaDoc-style
+# comment as the brief description. If set to NO, the JavaDoc
+# comments will behave just like regular Qt-style comments
+# (thus requiring an explicit @brief command for a brief description.)
+
+JAVADOC_AUTOBRIEF = NO
+
+# If the QT_AUTOBRIEF tag is set to YES then Doxygen will
+# interpret the first line (until the first dot) of a Qt-style
+# comment as the brief description. If set to NO, the comments
+# will behave just like regular Qt-style comments (thus requiring
+# an explicit \brief command for a brief description.)
+
+QT_AUTOBRIEF = NO
+
+# The MULTILINE_CPP_IS_BRIEF tag can be set to YES to make Doxygen
+# treat a multi-line C++ special comment block (i.e. a block of //! or ///
+# comments) as a brief description. This used to be the default behaviour.
+# The new default is to treat a multi-line C++ comment block as a detailed
+# description. Set this tag to YES if you prefer the old behaviour instead.
+
+MULTILINE_CPP_IS_BRIEF = NO
+
+# If the INHERIT_DOCS tag is set to YES (the default) then an undocumented
+# member inherits the documentation from any documented member that it
+# re-implements.
+
+INHERIT_DOCS = YES
+
+# If the SEPARATE_MEMBER_PAGES tag is set to YES, then doxygen will produce
+# a new page for each member. If set to NO, the documentation of a member will
+# be part of the file/class/namespace that contains it.
+
+SEPARATE_MEMBER_PAGES = NO
+
+# The TAB_SIZE tag can be used to set the number of spaces in a tab.
+# Doxygen uses this value to replace tabs by spaces in code fragments.
+
+TAB_SIZE = 8
+
+# This tag can be used to specify a number of aliases that act
+# as commands in the documentation. An alias has the form "name=value".
+# For example adding "sideeffect=\par Side Effects:\n" will allow you to
+# put the command \sideeffect (or @sideeffect) in the documentation, which
+# will result in a user-defined paragraph with heading "Side Effects:".
+# You can put \n's in the value part of an alias to insert newlines.
+
+ALIASES = "other=*"
+
+# This tag can be used to specify a number of word-keyword mappings (TCL only).
+# A mapping has the form "name=value". For example adding
+# "class=itcl::class" will allow you to use the command class in the
+# itcl::class meaning.
+
+TCL_SUBST =
+
+# Set the OPTIMIZE_OUTPUT_FOR_C tag to YES if your project consists of C
+# sources only. Doxygen will then generate output that is more tailored for C.
+# For instance, some of the names that are used will be different. The list
+# of all members will be omitted, etc.
+
+OPTIMIZE_OUTPUT_FOR_C = NO
+
+# Set the OPTIMIZE_OUTPUT_JAVA tag to YES if your project consists of Java
+# sources only. Doxygen will then generate output that is more tailored for
+# Java. For instance, namespaces will be presented as packages, qualified
+# scopes will look different, etc.
+
+OPTIMIZE_OUTPUT_JAVA = NO
+
+# Set the OPTIMIZE_FOR_FORTRAN tag to YES if your project consists of Fortran
+# sources only. Doxygen will then generate output that is more tailored for
+# Fortran.
+
+OPTIMIZE_FOR_FORTRAN = NO
+
+# Set the OPTIMIZE_OUTPUT_VHDL tag to YES if your project consists of VHDL
+# sources. Doxygen will then generate output that is tailored for
+# VHDL.
+
+OPTIMIZE_OUTPUT_VHDL = NO
+
+# Doxygen selects the parser to use depending on the extension of the files it
+# parses. With this tag you can assign which parser to use for a given
+# extension. Doxygen has a built-in mapping, but you can override or extend it
+# using this tag. The format is ext=language, where ext is a file extension,
+# and language is one of the parsers supported by doxygen: IDL, Java,
+# Javascript, CSharp, C, C++, D, PHP, Objective-C, Python, Fortran, VHDL, C,
+# C++. For instance to make doxygen treat .inc files as Fortran files (default
+# is PHP), and .f files as C (default is Fortran), use: inc=Fortran f=C. Note
+# that for custom extensions you also need to set FILE_PATTERNS otherwise the
+# files are not read by doxygen.
+
+EXTENSION_MAPPING =
+
+# If MARKDOWN_SUPPORT is enabled (the default) then doxygen pre-processes all
+# comments according to the Markdown format, which allows for more readable
+# documentation. See http://daringfireball.net/projects/markdown/ for details.
+# The output of markdown processing is further processed by doxygen, so you
+# can mix doxygen, HTML, and XML commands with Markdown formatting.
+# Disable only in case of backward compatibility issues.
+
+MARKDOWN_SUPPORT = YES
+
+# When enabled doxygen tries to link words that correspond to documented classes,
+# or namespaces to their corresponding documentation. Such a link can be
+# prevented in individual cases by putting a % sign in front of the word or
+# globally by setting AUTOLINK_SUPPORT to NO.
+
+AUTOLINK_SUPPORT = YES
+
+# If you use STL classes (i.e. std::string, std::vector, etc.) but do not want
+# to include (a tag file for) the STL sources as input, then you should
+# set this tag to YES in order to let doxygen match function declarations and
+# definitions whose arguments contain STL classes (e.g. func(std::string); vs.
+# func(std::string) {}). This also makes the inheritance and collaboration
+# diagrams that involve STL classes more complete and accurate.
+
+BUILTIN_STL_SUPPORT = NO
+
+# If you use Microsoft's C++/CLI language, you should set this option to YES to
+# enable parsing support.
+
+CPP_CLI_SUPPORT = NO
+
+# Set the SIP_SUPPORT tag to YES if your project consists of sip sources only.
+# Doxygen will parse them like normal C++ but will assume all classes use public
+# instead of private inheritance when no explicit protection keyword is present.
+
+SIP_SUPPORT = NO
+
+# For Microsoft's IDL there are propget and propput attributes to
+# indicate getter and setter methods for a property. Setting this
+# option to YES (the default) will make doxygen replace the get and
+# set methods by a property in the documentation. This will only work
+# if the methods are indeed getting or setting a simple type. If this
+# is not the case, or you want to show the methods anyway, you should
+# set this option to NO.
+
+IDL_PROPERTY_SUPPORT = YES
+
+# If member grouping is used in the documentation and the DISTRIBUTE_GROUP_DOC
+# tag is set to YES, then doxygen will reuse the documentation of the first
+# member in the group (if any) for the other members of the group. By default
+# all members of a group must be documented explicitly.
+
+DISTRIBUTE_GROUP_DOC = NO
+
+# Set the SUBGROUPING tag to YES (the default) to allow class member groups of
+# the same type (for instance a group of public functions) to be put as a
+# subgroup of that type (e.g. under the Public Functions section).
Set it to
+# NO to prevent subgrouping. Alternatively, this can be done per class using
+# the \nosubgrouping command.
+
+SUBGROUPING = YES
+
+# When the INLINE_GROUPED_CLASSES tag is set to YES, classes, structs and
+# unions are shown inside the group in which they are included (e.g. using
+# @ingroup) instead of on a separate page (for HTML and Man pages) or
+# section (for LaTeX and RTF).
+
+INLINE_GROUPED_CLASSES = NO
+
+# When the INLINE_SIMPLE_STRUCTS tag is set to YES, structs, classes, and
+# unions with only public data fields will be shown inline in the documentation
+# of the scope in which they are defined (i.e. file, namespace, or group
+# documentation), provided this scope is documented. If set to NO (the default),
+# structs, classes, and unions are shown on a separate page (for HTML and Man
+# pages) or section (for LaTeX and RTF).
+
+INLINE_SIMPLE_STRUCTS = NO
+
+# When TYPEDEF_HIDES_STRUCT is enabled, a typedef of a struct, union, or enum
+# is documented as struct, union, or enum with the name of the typedef. So
+# typedef struct TypeS {} TypeT, will appear in the documentation as a struct
+# with name TypeT. When disabled the typedef will appear as a member of a file,
+# namespace, or class. And the struct will be named TypeS. This can typically
+# be useful for C code in case the coding convention dictates that all compound
+# types are typedef'ed and only the typedef is referenced, never the tag name.
+
+TYPEDEF_HIDES_STRUCT = NO
+
+# The SYMBOL_CACHE_SIZE determines the size of the internal cache used to
+# determine which symbols to keep in memory and which to flush to disk.
+# When the cache is full, less often used symbols will be written to disk.
+# For small to medium size projects (<1000 input files) the default value is
+# probably good enough. For larger projects a too small cache size can cause
+# doxygen to be busy swapping symbols to and from disk most of the time
+# causing a significant performance penalty.
+# If the system has enough physical memory, increasing the cache will improve the
+# performance by keeping more symbols in memory. Note that the value works on
+# a logarithmic scale so increasing the size by one will roughly double the
+# memory usage. The cache size is given by this formula:
+# 2^(16+SYMBOL_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+SYMBOL_CACHE_SIZE = 0
+
+# Similar to the SYMBOL_CACHE_SIZE the size of the symbol lookup cache can be
+# set using LOOKUP_CACHE_SIZE. This cache is used to resolve symbols given
+# their name and scope. Since this can be an expensive process and often the
+# same symbol appears multiple times in the code, doxygen keeps a cache of
+# pre-resolved symbols. If the cache is too small doxygen will become slower.
+# If the cache is too large, memory is wasted. The cache size is given by this
+# formula: 2^(16+LOOKUP_CACHE_SIZE). The valid range is 0..9, the default is 0,
+# corresponding to a cache size of 2^16 = 65536 symbols.
+
+LOOKUP_CACHE_SIZE = 0
+
+#---------------------------------------------------------------------------
+# Build related configuration options
+#---------------------------------------------------------------------------
+
+# If the EXTRACT_ALL tag is set to YES doxygen will assume all entities in
+# documentation are documented, even if no documentation was available.
+# Private class members and static file members will be hidden unless
+# the EXTRACT_PRIVATE and EXTRACT_STATIC tags are set to YES
+
+EXTRACT_ALL = NO
+
+# If the EXTRACT_PRIVATE tag is set to YES all private members of a class
+# will be included in the documentation.
+
+EXTRACT_PRIVATE = YES
+
+# If the EXTRACT_PACKAGE tag is set to YES all members with package or internal
+# scope will be included in the documentation.
+
+EXTRACT_PACKAGE = NO
+
+# If the EXTRACT_STATIC tag is set to YES all static members of a file
+# will be included in the documentation.
+
+EXTRACT_STATIC = YES
+
+# If the EXTRACT_LOCAL_CLASSES tag is set to YES classes (and structs)
+# defined locally in source files will be included in the documentation.
+# If set to NO only classes defined in header files are included.
+
+EXTRACT_LOCAL_CLASSES = YES
+
+# This flag is only useful for Objective-C code. When set to YES, local
+# methods, which are defined in the implementation section but not in
+# the interface, are included in the documentation.
+# If set to NO (the default) only methods in the interface are included.
+
+EXTRACT_LOCAL_METHODS = NO
+
+# If this flag is set to YES, the members of anonymous namespaces will be
+# extracted and appear in the documentation as a namespace called
+# 'anonymous_namespace{file}', where file will be replaced with the base
+# name of the file that contains the anonymous namespace. By default
+# anonymous namespaces are hidden.
+
+EXTRACT_ANON_NSPACES = NO
+
+# If the HIDE_UNDOC_MEMBERS tag is set to YES, Doxygen will hide all
+# undocumented members of documented classes, files or namespaces.
+# If set to NO (the default) these members will be included in the
+# various overviews, but no documentation section is generated.
+# This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_MEMBERS = YES
+
+# If the HIDE_UNDOC_CLASSES tag is set to YES, Doxygen will hide all
+# undocumented classes that are normally visible in the class hierarchy.
+# If set to NO (the default) these classes will be included in the various
+# overviews. This option has no effect if EXTRACT_ALL is enabled.
+
+HIDE_UNDOC_CLASSES = YES
+
+# If the HIDE_FRIEND_COMPOUNDS tag is set to YES, Doxygen will hide all
+# friend (class|struct|union) declarations.
+# If set to NO (the default) these declarations will be included in the
+# documentation.
+
+HIDE_FRIEND_COMPOUNDS = NO
+
+# If the HIDE_IN_BODY_DOCS tag is set to YES, Doxygen will hide any
+# documentation blocks found inside the body of a function.
+# If set to NO (the default) these blocks will be appended to the
+# function's detailed documentation block.
+
+HIDE_IN_BODY_DOCS = NO
+
+# The INTERNAL_DOCS tag determines if documentation
+# that is typed after a \internal command is included. If the tag is set
+# to NO (the default) then the documentation will be excluded.
+# Set it to YES to include the internal documentation.
+
+INTERNAL_DOCS = NO
+
+# If the CASE_SENSE_NAMES tag is set to NO then Doxygen will only generate
+# file names in lower-case letters. If set to YES upper-case letters are also
+# allowed. This is useful if you have classes or files whose names only differ
+# in case and if your file system supports case sensitive file names. Windows
+# and Mac users are advised to set this option to NO.
+
+CASE_SENSE_NAMES = YES
+
+# If the HIDE_SCOPE_NAMES tag is set to NO (the default) then Doxygen
+# will show members with their full class and namespace scopes in the
+# documentation. If set to YES the scope will be hidden.
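+# For example (illustrative names only): with the default NO, a member is
+# listed as MyNamespace::MyClass::member; with YES it appears simply as
+# member.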
+ +HIDE_SCOPE_NAMES = NO + +# If the SHOW_INCLUDE_FILES tag is set to YES (the default) then Doxygen +# will put a list of the files that are included by a file in the documentation +# of that file. + +SHOW_INCLUDE_FILES = YES + +# If the FORCE_LOCAL_INCLUDES tag is set to YES then Doxygen +# will list include files with double quotes in the documentation +# rather than with sharp brackets. + +FORCE_LOCAL_INCLUDES = NO + +# If the INLINE_INFO tag is set to YES (the default) then a tag [inline] +# is inserted in the documentation for inline members. + +INLINE_INFO = YES + +# If the SORT_MEMBER_DOCS tag is set to YES (the default) then doxygen +# will sort the (detailed) documentation of file and class members +# alphabetically by member name. If set to NO the members will appear in +# declaration order. + +SORT_MEMBER_DOCS = YES + +# If the SORT_BRIEF_DOCS tag is set to YES then doxygen will sort the +# brief documentation of file, namespace and class members alphabetically +# by member name. If set to NO (the default) the members will appear in +# declaration order. + +SORT_BRIEF_DOCS = NO + +# If the SORT_MEMBERS_CTORS_1ST tag is set to YES then doxygen +# will sort the (brief and detailed) documentation of class members so that +# constructors and destructors are listed first. If set to NO (the default) +# the constructors will appear in the respective orders defined by +# SORT_MEMBER_DOCS and SORT_BRIEF_DOCS. +# This tag will be ignored for brief docs if SORT_BRIEF_DOCS is set to NO +# and ignored for detailed docs if SORT_MEMBER_DOCS is set to NO. + +SORT_MEMBERS_CTORS_1ST = NO + +# If the SORT_GROUP_NAMES tag is set to YES then doxygen will sort the +# hierarchy of group names into alphabetical order. If set to NO (the default) +# the group names will appear in their defined order. + +SORT_GROUP_NAMES = NO + +# If the SORT_BY_SCOPE_NAME tag is set to YES, the class list will be +# sorted by fully-qualified names, including namespaces. If set to +# NO (the default), the class list will be sorted only by class name, +# not including the namespace part. +# Note: This option is not very useful if HIDE_SCOPE_NAMES is set to YES. +# Note: This option applies only to the class list, not to the +# alphabetical list. + +SORT_BY_SCOPE_NAME = NO + +# If the STRICT_PROTO_MATCHING option is enabled and doxygen fails to +# do proper type resolution of all parameters of a function it will reject a +# match between the prototype and the implementation of a member function even +# if there is only one candidate or it is obvious which candidate to choose +# by doing a simple string match. By disabling STRICT_PROTO_MATCHING doxygen +# will still accept a match between prototype and implementation in such cases. + +STRICT_PROTO_MATCHING = NO + +# The GENERATE_TODOLIST tag can be used to enable (YES) or +# disable (NO) the todo list. This list is created by putting \todo +# commands in the documentation. + +GENERATE_TODOLIST = YES + +# The GENERATE_TESTLIST tag can be used to enable (YES) or +# disable (NO) the test list. This list is created by putting \test +# commands in the documentation. + +GENERATE_TESTLIST = YES + +# The GENERATE_BUGLIST tag can be used to enable (YES) or +# disable (NO) the bug list. This list is created by putting \bug +# commands in the documentation. + +GENERATE_BUGLIST = YES + +# The GENERATE_DEPRECATEDLIST tag can be used to enable (YES) or +# disable (NO) the deprecated list. This list is created by putting +# \deprecated commands in the documentation. 
+
+GENERATE_DEPRECATEDLIST= YES
+
+# The ENABLED_SECTIONS tag can be used to enable conditional
+# documentation sections, marked by \if sectionname ... \endif.
+
+ENABLED_SECTIONS =
+
+# The MAX_INITIALIZER_LINES tag determines the maximum number of lines
+# the initial value of a variable or macro consists of for it to appear in
+# the documentation. If the initializer consists of more lines than specified
+# here it will be hidden. Use a value of 0 to hide initializers completely.
+# The appearance of the initializer of individual variables and macros in the
+# documentation can be controlled using \showinitializer or \hideinitializer
+# commands in the documentation regardless of this setting.
+
+MAX_INITIALIZER_LINES = 30
+
+# Set the SHOW_USED_FILES tag to NO to disable the list of files generated
+# at the bottom of the documentation of classes and structs. If set to YES the
+# list will mention the files that were used to generate the documentation.
+
+SHOW_USED_FILES = YES
+
+# Set the SHOW_FILES tag to NO to disable the generation of the Files page.
+# This will remove the Files entry from the Quick Index and from the
+# Folder Tree View (if specified). The default is YES.
+
+# We probably will want this, but we have no file documentation yet so it's simpler to remove
+# it for now.
+SHOW_FILES = NO
+
+# Set the SHOW_NAMESPACES tag to NO to disable the generation of the
+# Namespaces page.
+# This will remove the Namespaces entry from the Quick Index
+# and from the Folder Tree View (if specified). The default is YES.
+
+SHOW_NAMESPACES = YES
+
+# The FILE_VERSION_FILTER tag can be used to specify a program or script that
+# doxygen should invoke to get the current version for each file (typically from
+# the version control system). Doxygen will invoke the program by executing (via
+# popen()) the command <command> <input-file>, where <command> is the value of
+# the FILE_VERSION_FILTER tag, and <input-file> is the name of an input file
+# provided by doxygen. Whatever the program writes to standard output
+# is used as the file version. See the manual for examples.
+
+FILE_VERSION_FILTER =
+
+# The LAYOUT_FILE tag can be used to specify a layout file which will be parsed
+# by doxygen. The layout file controls the global structure of the generated
+# output files in an output format independent way. To create the layout file
+# that represents doxygen's defaults, run doxygen with the -l option.
+# You can optionally specify a file name after the option, if omitted
+# DoxygenLayout.xml will be used as the name of the layout file.
+
+LAYOUT_FILE =
+
+# The CITE_BIB_FILES tag can be used to specify one or more bib files
+# containing the references data. This must be a list of .bib files. The
+# .bib extension is automatically appended if omitted. Using this command
+# requires the bibtex tool to be installed. See also
+# http://en.wikipedia.org/wiki/BibTeX for more info. For LaTeX the style
+# of the bibliography can be controlled using LATEX_BIB_STYLE. To use this
+# feature you need bibtex and perl available in the search path.
+
+CITE_BIB_FILES =
+
+#---------------------------------------------------------------------------
+# configuration options related to warning and progress messages
+#---------------------------------------------------------------------------
+
+# The QUIET tag can be used to turn on/off the messages that are generated
+# by doxygen. Possible values are YES and NO. If left blank NO is used.
+
+QUIET = NO
+
+# The WARNINGS tag can be used to turn on/off the warning messages that are
+# generated by doxygen. Possible values are YES and NO. If left blank
+# NO is used.
+
+WARNINGS = YES
+
+# If WARN_IF_UNDOCUMENTED is set to YES, then doxygen will generate warnings
+# for undocumented members. If EXTRACT_ALL is set to YES then this flag will
+# automatically be disabled.
+
+WARN_IF_UNDOCUMENTED = YES
+
+# If WARN_IF_DOC_ERROR is set to YES, doxygen will generate warnings for
+# potential errors in the documentation, such as not documenting some
+# parameters in a documented function, or documenting parameters that
+# don't exist or using markup commands wrongly.
+
+WARN_IF_DOC_ERROR = YES
+
+# The WARN_NO_PARAMDOC option can be enabled to get warnings for
+# functions that are documented, but have no documentation for their parameters
+# or return value. If set to NO (the default) doxygen will only warn about
+# wrong or incomplete parameter documentation, but not about the absence of
+# documentation.
+
+WARN_NO_PARAMDOC = NO
+
+# The WARN_FORMAT tag determines the format of the warning messages that
+# doxygen can produce. The string should contain the $file, $line, and $text
+# tags, which will be replaced by the file and line number from which the
+# warning originated and the warning text. Optionally the format may contain
+# $version, which will be replaced by the version of the file (if it could
+# be obtained via FILE_VERSION_FILTER)
+
+WARN_FORMAT =
+
+# The WARN_LOGFILE tag can be used to specify a file to which warning
+# and error messages should be written. If left blank the output is written
+# to stderr.
+
+WARN_LOGFILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the input files
+#---------------------------------------------------------------------------
+
+# The INPUT tag can be used to specify the files and/or directories that contain
+# documented source files. You may enter file names like "myfile.cpp" or
+# directories like "/usr/src/myproject". Separate the files or directories
+# with spaces.
+
+INPUT = src doc/doxygen/libomp_interface.h
+# The ittnotify code also has doxygen documentation, but if we include it here
+# it takes over from us!
+# src/thirdparty/ittnotify
+
+# This tag can be used to specify the character encoding of the source files
+# that doxygen parses. Internally doxygen uses the UTF-8 encoding, which is
+# also the default input encoding. Doxygen uses libiconv (or the iconv built
+# into libc) for the transcoding. See http://www.gnu.org/software/libiconv for
+# the list of possible encodings.
+
+INPUT_ENCODING = UTF-8
+
+# If the value of the INPUT tag contains directories, you can use the
+# FILE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank the following patterns are tested:
+# *.c *.cc *.cxx *.cpp *.c++ *.d *.java *.ii *.ixx *.ipp *.i++ *.inl *.h *.hh
+# *.hxx *.hpp *.h++ *.idl *.odl *.cs *.php *.php3 *.inc *.m *.mm *.dox *.py
+# *.f90 *.f *.for *.vhd *.vhdl
+
+FILE_PATTERNS = *.c *.h *.cpp
+# We may also want to include the asm files with appropriate ifdef to ensure
+# doxygen doesn't see the content, just the documentation...
+
+# The RECURSIVE tag can be used to specify whether or not subdirectories
+# should be searched for input files as well. Possible values are YES and NO.
+# If left blank NO is used.
+
+# Only look in the one directory.
+RECURSIVE = NO
+
+# The EXCLUDE tag can be used to specify files and/or directories that should be
+# excluded from the INPUT source files. This way you can easily exclude a
+# subdirectory from a directory tree whose root is specified with the INPUT tag.
+# Note that relative paths are relative to the directory from which doxygen is
+# run.
+
+EXCLUDE = src/test-touch.c
+
+# The EXCLUDE_SYMLINKS tag can be used to select whether or not files or
+# directories that are symbolic links (a Unix file system feature) are excluded
+# from the input.
+
+EXCLUDE_SYMLINKS = NO
+
+# If the value of the INPUT tag contains directories, you can use the
+# EXCLUDE_PATTERNS tag to specify one or more wildcard patterns to exclude
+# certain files from those directories. Note that the wildcards are matched
+# against the file with absolute path, so to exclude all test directories
+# for example use the pattern */test/*
+
+EXCLUDE_PATTERNS =
+
+# The EXCLUDE_SYMBOLS tag can be used to specify one or more symbol names
+# (namespaces, classes, functions, etc.) that should be excluded from the
+# output. The symbol name can be a fully qualified name, a word, or if the
+# wildcard * is used, a substring. Examples: ANamespace, AClass,
+# AClass::ANamespace, ANamespace::*Test
+
+EXCLUDE_SYMBOLS =
+
+# The EXAMPLE_PATH tag can be used to specify one or more files or
+# directories that contain example code fragments that are included (see
+# the \include command).
+
+EXAMPLE_PATH =
+
+# If the value of the EXAMPLE_PATH tag contains directories, you can use the
+# EXAMPLE_PATTERNS tag to specify one or more wildcard patterns (like *.cpp
+# and *.h) to filter out the source-files in the directories. If left
+# blank all files are included.
+
+EXAMPLE_PATTERNS =
+
+# If the EXAMPLE_RECURSIVE tag is set to YES then subdirectories will be
+# searched for input files to be used with the \include or \dontinclude
+# commands irrespective of the value of the RECURSIVE tag.
+# Possible values are YES and NO. If left blank NO is used.
+
+EXAMPLE_RECURSIVE = NO
+
+# The IMAGE_PATH tag can be used to specify one or more files or
+# directories that contain images that are included in the documentation (see
+# the \image command).
+
+IMAGE_PATH =
+
+# The INPUT_FILTER tag can be used to specify a program that doxygen should
+# invoke to filter for each input file. Doxygen will invoke the filter program
+# by executing (via popen()) the command <filter> <input-file>, where <filter>
+# is the value of the INPUT_FILTER tag, and <input-file> is the name of an
+# input file. Doxygen will then use the output that the filter program writes
+# to standard output.
+# If FILTER_PATTERNS is specified, this tag will be
+# ignored.
+
+INPUT_FILTER =
+
+# The FILTER_PATTERNS tag can be used to specify filters on a per file pattern
+# basis.
+# Doxygen will compare the file name with each pattern and apply the
+# filter if there is a match.
+# The filters are a list of the form:
+# pattern=filter (like *.cpp=my_cpp_filter). See INPUT_FILTER for further
+# info on how filters are used. If FILTER_PATTERNS is empty or if
+# none of the patterns match the file name, INPUT_FILTER is applied.
+
+FILTER_PATTERNS =
+
+# If the FILTER_SOURCE_FILES tag is set to YES, the input filter (if set using
+# INPUT_FILTER) will be used to filter the input files when producing source
+# files to browse (i.e. when SOURCE_BROWSER is set to YES).
+
+FILTER_SOURCE_FILES = NO
+
+# The FILTER_SOURCE_PATTERNS tag can be used to specify source filters per file
+# pattern. A pattern will override the setting for FILTER_PATTERNS (if any)
+# and it is also possible to disable source filtering for a specific pattern
+# using *.ext= (so without naming a filter). This option only has effect when
+# FILTER_SOURCE_FILES is enabled.
+
+FILTER_SOURCE_PATTERNS =
+
+#---------------------------------------------------------------------------
+# configuration options related to source browsing
+#---------------------------------------------------------------------------
+
+# If the SOURCE_BROWSER tag is set to YES then a list of source files will
+# be generated. Documented entities will be cross-referenced with these sources.
+# Note: To get rid of all source code in the generated output, make sure also
+# VERBATIM_HEADERS is set to NO.
+
+SOURCE_BROWSER = YES
+
+# Setting the INLINE_SOURCES tag to YES will include the body
+# of functions and classes directly in the documentation.
+
+INLINE_SOURCES = NO
+
+# Setting the STRIP_CODE_COMMENTS tag to YES (the default) will instruct
+# doxygen to hide any special comment blocks from generated source code
+# fragments. Normal C, C++ and Fortran comments will always remain visible.
+
+STRIP_CODE_COMMENTS = YES
+
+# If the REFERENCED_BY_RELATION tag is set to YES
+# then for each documented function all documented
+# functions referencing it will be listed.
+
+REFERENCED_BY_RELATION = YES
+
+# If the REFERENCES_RELATION tag is set to YES
+# then for each documented function all documented entities
+# called/used by that function will be listed.
+
+REFERENCES_RELATION = NO
+
+# If the REFERENCES_LINK_SOURCE tag is set to YES (the default)
+# and SOURCE_BROWSER tag is set to YES, then the hyperlinks from
+# functions in REFERENCES_RELATION and REFERENCED_BY_RELATION lists will
+# link to the source code.
+# Otherwise they will link to the documentation.
+
+REFERENCES_LINK_SOURCE = YES
+
+# If the USE_HTAGS tag is set to YES then the references to source code
+# will point to the HTML generated by the htags(1) tool instead of doxygen's
+# built-in source browser. The htags tool is part of GNU's global source
+# tagging system (see http://www.gnu.org/software/global/global.html). You
+# will need version 4.8.6 or higher.
+
+USE_HTAGS = NO
+
+# If the VERBATIM_HEADERS tag is set to YES (the default) then Doxygen
+# will generate a verbatim copy of the header file for each class for
+# which an include is specified. Set to NO to disable this.
+
+VERBATIM_HEADERS = YES
+
+#---------------------------------------------------------------------------
+# configuration options related to the alphabetical class index
+#---------------------------------------------------------------------------
+
+# If the ALPHABETICAL_INDEX tag is set to YES, an alphabetical index
+# of all compounds will be generated. Enable this if the project
+# contains a lot of classes, structs, unions or interfaces.
+
+ALPHABETICAL_INDEX = YES
+
+# If the alphabetical index is enabled (see ALPHABETICAL_INDEX) then
+# the COLS_IN_ALPHA_INDEX tag can be used to specify the number of columns
+# in which this list will be split (can be a number in the range [1..20])
+
+COLS_IN_ALPHA_INDEX = 5
+
+# In case all classes in a project start with a common prefix, all
+# classes will be put under the same header in the alphabetical index.
+# The IGNORE_PREFIX tag can be used to specify one or more prefixes that
+# should be ignored while generating the index headers.
+
+IGNORE_PREFIX =
+
+#---------------------------------------------------------------------------
+# configuration options related to the HTML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_HTML tag is set to YES (the default) Doxygen will
+# generate HTML output.
+
+GENERATE_HTML = YES
+
+# The HTML_OUTPUT tag is used to specify where the HTML docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `html' will be used as the default path.
+
+HTML_OUTPUT =
+
+# The HTML_FILE_EXTENSION tag can be used to specify the file extension for
+# each generated HTML page (for example: .htm,.php,.asp). If it is left blank
+# doxygen will generate files with .html extension.
+
+HTML_FILE_EXTENSION = .html
+
+# The HTML_HEADER tag can be used to specify a personal HTML header for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard header. Note that when using a custom header you are responsible
+# for the proper inclusion of any scripts and style sheets that doxygen
+# needs, which is dependent on the configuration options used.
+# It is advised to generate a default header using "doxygen -w html
+# header.html footer.html stylesheet.css YourConfigFile" and then modify
+# that header. Note that the header is subject to change so you typically
+# have to redo this when upgrading to a newer version of doxygen or when
+# changing the value of configuration settings such as GENERATE_TREEVIEW!
+
+HTML_HEADER =
+
+# The HTML_FOOTER tag can be used to specify a personal HTML footer for
+# each generated HTML page. If it is left blank doxygen will generate a
+# standard footer.
+
+HTML_FOOTER =
+
+# The HTML_STYLESHEET tag can be used to specify a user-defined cascading
+# style sheet that is used by each HTML page. It can be used to
+# fine-tune the look of the HTML output. If left blank doxygen will
+# generate a default style sheet. Note that it is recommended to use
+# HTML_EXTRA_STYLESHEET instead of this one, as it is more robust and this
+# tag will in the future become obsolete.
+
+HTML_STYLESHEET =
+
+# The HTML_EXTRA_STYLESHEET tag can be used to specify an additional
+# user-defined cascading style sheet that is included after the standard
+# style sheets created by doxygen. Using this option one can overrule
+# certain style aspects. This is preferred over using HTML_STYLESHEET
+# since it does not replace the standard style sheet and is therefore more
+# robust against future updates. Doxygen will copy the style sheet file to
+# the output directory.
+
+HTML_EXTRA_STYLESHEET =
+
+# The HTML_EXTRA_FILES tag can be used to specify one or more extra images or
+# other source files which should be copied to the HTML output directory. Note
+# that these files will be copied to the base HTML output directory. Use the
+# $relpath$ marker in the HTML_HEADER and/or HTML_FOOTER files to load these
+# files. In the HTML_STYLESHEET file, use the file name only. Also note that
+# the files will be copied as-is; there are no commands or markers available.
+
+HTML_EXTRA_FILES =
+
+# The HTML_COLORSTYLE_HUE tag controls the color of the HTML output.
+# Doxygen will adjust the colors in the style sheet and background images
+# according to this color. Hue is specified as an angle on a colorwheel,
+# see http://en.wikipedia.org/wiki/Hue for more information.
+# For instance the value 0 represents red, 60 is yellow, 120 is green,
+# 180 is cyan, 240 is blue, 300 purple, and 360 is red again.
+# The allowed range is 0 to 359.
+
+HTML_COLORSTYLE_HUE = 220
+
+# The HTML_COLORSTYLE_SAT tag controls the purity (or saturation) of
+# the colors in the HTML output. For a value of 0 the output will use
+# grayscales only. A value of 255 will produce the most vivid colors.
+
+HTML_COLORSTYLE_SAT = 100
+
+# The HTML_COLORSTYLE_GAMMA tag controls the gamma correction applied to
+# the luminance component of the colors in the HTML output. Values below
+# 100 gradually make the output lighter, whereas values above 100 make
+# the output darker. The value divided by 100 is the actual gamma applied,
+# so 80 represents a gamma of 0.8. The value 220 represents a gamma of 2.2,
+# and 100 does not change the gamma.
+
+HTML_COLORSTYLE_GAMMA = 80
+
+# If the HTML_TIMESTAMP tag is set to YES then the footer of each generated HTML
+# page will contain the date and time when the page was generated. Setting
+# this to NO can help when comparing the output of multiple runs.
+
+HTML_TIMESTAMP = NO
+
+# If the HTML_DYNAMIC_SECTIONS tag is set to YES then the generated HTML
+# documentation will contain sections that can be hidden and shown after the
+# page has loaded.
+
+HTML_DYNAMIC_SECTIONS = NO
+
+# With HTML_INDEX_NUM_ENTRIES one can control the preferred number of
+# entries shown in the various tree structured indices initially; the user
+# can expand and collapse entries dynamically later on. Doxygen will expand
+# the tree to such a level that at most the specified number of entries are
+# visible (unless a fully collapsed tree already exceeds this amount).
+# So setting the number of entries to 1 will produce a fully collapsed tree by
+# default. 0 is a special value representing an infinite number of entries
+# and will result in a fully expanded tree by default.
+
+HTML_INDEX_NUM_ENTRIES = 100
+
+# If the GENERATE_DOCSET tag is set to YES, additional index files
+# will be generated that can be used as input for Apple's Xcode 3
+# integrated development environment, introduced with OSX 10.5 (Leopard).
+# To create a documentation set, doxygen will generate a Makefile in the
+# HTML output directory. Running make will produce the docset in that
+# directory and running "make install" will install the docset in
+# ~/Library/Developer/Shared/Documentation/DocSets so that Xcode will find
+# it at startup.
+# See http://developer.apple.com/tools/creatingdocsetswithdoxygen.html
+# for more information.
+
+GENERATE_DOCSET = NO
+
+# When the GENERATE_DOCSET tag is set to YES, this tag determines the name of the
+# feed. A documentation feed provides an umbrella under which multiple
+# documentation sets from a single provider (such as a company or product suite)
+# can be grouped.
+
+DOCSET_FEEDNAME = "Doxygen generated docs"
+
+# When the GENERATE_DOCSET tag is set to YES, this tag specifies a string that
+# should uniquely identify the documentation set bundle. This should be a
+# reverse domain-name style string, e.g. com.mycompany.MyDocSet. Doxygen
+# will append .docset to the name.
+
+DOCSET_BUNDLE_ID = org.doxygen.Project
+
+# The DOCSET_PUBLISHER_ID tag specifies a string that should uniquely
+# identify the documentation publisher. This should be a reverse domain-name
+# style string, e.g. com.mycompany.MyDocSet.documentation.
+
+DOCSET_PUBLISHER_ID = org.doxygen.Publisher
+
+# The DOCSET_PUBLISHER_NAME tag identifies the documentation publisher.
+
+DOCSET_PUBLISHER_NAME = Publisher
+
+# If the GENERATE_HTMLHELP tag is set to YES, additional index files
+# will be generated that can be used as input for tools like the
+# Microsoft HTML help workshop to generate a compiled HTML help file (.chm)
+# of the generated HTML documentation.
+
+GENERATE_HTMLHELP = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_FILE tag can
+# be used to specify the file name of the resulting .chm file. You
+# can add a path in front of the file if the result should not be
+# written to the html output directory.
+
+CHM_FILE =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the HHC_LOCATION tag can
+# be used to specify the location (absolute path including file name) of
+# the HTML help compiler (hhc.exe). If non-empty doxygen will try to run
+# the HTML help compiler on the generated index.hhp.
+
+HHC_LOCATION =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the GENERATE_CHI flag
+# controls if a separate .chi index file is generated (YES) or that
+# it should be included in the main .chm file (NO).
+
+GENERATE_CHI = NO
+
+# If the GENERATE_HTMLHELP tag is set to YES, the CHM_INDEX_ENCODING
+# is used to encode HtmlHelp index (hhk), content (hhc) and project file
+# content.
+
+CHM_INDEX_ENCODING =
+
+# If the GENERATE_HTMLHELP tag is set to YES, the BINARY_TOC flag
+# controls whether a binary table of contents is generated (YES) or a
+# normal table of contents (NO) in the .chm file.
+
+BINARY_TOC = NO
+
+# The TOC_EXPAND flag can be set to YES to add extra items for group members
+# to the contents of the HTML help documentation and to the tree view.
+
+TOC_EXPAND = NO
+
+# If the GENERATE_QHP tag is set to YES and both QHP_NAMESPACE and
+# QHP_VIRTUAL_FOLDER are set, an additional index file will be generated
+# that can be used as input for Qt's qhelpgenerator to generate a
+# Qt Compressed Help (.qch) of the generated HTML documentation.
+
+GENERATE_QHP = NO
+
+# If the QHG_LOCATION tag is specified, the QCH_FILE tag can
+# be used to specify the file name of the resulting .qch file.
+# The path specified is relative to the HTML output folder.
+
+QCH_FILE =
+
+# The QHP_NAMESPACE tag specifies the namespace to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#namespace
+
+QHP_NAMESPACE = org.doxygen.Project
+
+# The QHP_VIRTUAL_FOLDER tag specifies the virtual folder to use when generating
+# Qt Help Project output. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#virtual-folders
+
+QHP_VIRTUAL_FOLDER = doc
+
+# If QHP_CUST_FILTER_NAME is set, it specifies the name of a custom filter to
+# add. For more information please see
+# http://doc.trolltech.com/qthelpproject.html#custom-filters
+
+QHP_CUST_FILTER_NAME =
+
+# The QHP_CUST_FILTER_ATTRS tag specifies the list of the attributes of the
+# custom filter to add. For more information please see
+# Qt Help Project / Custom Filters.
+
+QHP_CUST_FILTER_ATTRS =
+
+# The QHP_SECT_FILTER_ATTRS tag specifies the list of the attributes this
+# project's filter section matches. For more information please see
+# Qt Help Project / Filter Attributes.
+
+QHP_SECT_FILTER_ATTRS =
+
+# If the GENERATE_QHP tag is set to YES, the QHG_LOCATION tag can
+# be used to specify the location of Qt's qhelpgenerator.
+# If non-empty doxygen will try to run qhelpgenerator on the generated
+# .qhp file.
+
+QHG_LOCATION =
+
+# If the GENERATE_ECLIPSEHELP tag is set to YES, additional index files
+# will be generated, which together with the HTML files, form an Eclipse help
+# plugin. To install this plugin and make it available under the help contents
+# menu in Eclipse, the contents of the directory containing the HTML and XML
+# files needs to be copied into the plugins directory of eclipse. The name of
+# the directory within the plugins directory should be the same as
+# the ECLIPSE_DOC_ID value. After copying Eclipse needs to be restarted before
+# the help appears.
+
+GENERATE_ECLIPSEHELP = NO
+
+# A unique identifier for the eclipse help plugin. When installing the plugin
+# the directory name containing the HTML and XML files should also have
+# this name.
+
+ECLIPSE_DOC_ID = org.doxygen.Project
+
+# The DISABLE_INDEX tag can be used to turn on/off the condensed index (tabs)
+# at top of each HTML page. The value NO (the default) enables the index and
+# the value YES disables it. Since the tabs have the same information as the
+# navigation tree you can set this option to NO if you already set
+# GENERATE_TREEVIEW to YES.
+
+DISABLE_INDEX = NO
+
+# The GENERATE_TREEVIEW tag is used to specify whether a tree-like index
+# structure should be generated to display hierarchical information.
+# If the tag value is set to YES, a side panel will be generated
+# containing a tree-like index structure (just like the one that
+# is generated for HTML Help). For this to work a browser that supports
+# JavaScript, DHTML, CSS and frames is required (i.e. any modern browser).
+# Windows users are probably better off using the HTML help feature.
+# Since the tree basically has the same information as the tab index you
+# could consider setting DISABLE_INDEX to NO when enabling this option.
+
+GENERATE_TREEVIEW = NO
+
+# The ENUM_VALUES_PER_LINE tag can be used to set the number of enum values
+# (range [0,1..20]) that doxygen will group on one line in the generated HTML
+# documentation. Note that a value of 0 will completely suppress the enum
+# values from appearing in the overview section.
+
+ENUM_VALUES_PER_LINE = 4
+
+# If the treeview is enabled (see GENERATE_TREEVIEW) then this tag can be
+# used to set the initial width (in pixels) of the frame in which the tree
+# is shown.
+
+TREEVIEW_WIDTH = 250
+
+# When the EXT_LINKS_IN_WINDOW option is set to YES doxygen will open
+# links to external symbols imported via tag files in a separate window.
+
+EXT_LINKS_IN_WINDOW = NO
+
+# Use this tag to change the font size of Latex formulas included
+# as images in the HTML documentation. The default is 10. Note that
+# when you change the font size after a successful doxygen run you need
+# to manually remove any form_*.png images from the HTML output directory
+# to force them to be regenerated.
+
+FORMULA_FONTSIZE = 10
+
+# Use the FORMULA_TRANSPARENT tag to determine whether or not the images
+# generated for formulas are transparent PNGs. Transparent PNGs are
+# not supported properly for IE 6.0, but are supported on all modern browsers.
+# Note that when changing this option you need to delete any form_*.png files
+# in the HTML output before the changes have effect.
+
+FORMULA_TRANSPARENT = YES
+
+# Enable the USE_MATHJAX option to render LaTeX formulas using MathJax
+# (see http://www.mathjax.org) which uses client side Javascript for the
+# rendering instead of using prerendered bitmaps. Use this if you do not
+# have LaTeX installed or if you want the formulas to look prettier in the
+# HTML output. When enabled you may also need to install MathJax separately and
+# configure the path to it using the MATHJAX_RELPATH option.
+
+USE_MATHJAX = NO
+
+# When MathJax is enabled you need to specify the location relative to the
+# HTML output directory using the MATHJAX_RELPATH option. The destination
+# directory should contain the MathJax.js script. For instance, if the mathjax
+# directory is located at the same level as the HTML output directory, then
+# MATHJAX_RELPATH should be ../mathjax. The default value points to
+# the MathJax Content Delivery Network so you can quickly see the result without
+# installing MathJax.
+# However, it is strongly recommended to install a local
+# copy of MathJax from http://www.mathjax.org before deployment.
+
+MATHJAX_RELPATH = http://cdn.mathjax.org/mathjax/latest
+
+# The MATHJAX_EXTENSIONS tag can be used to specify one or more MathJax extension
+# names that should be enabled during MathJax rendering.
+
+MATHJAX_EXTENSIONS =
+
+# When the SEARCHENGINE tag is enabled doxygen will generate a search box
+# for the HTML output. The underlying search engine uses javascript
+# and DHTML and should work on any modern browser. Note that when using
+# HTML help (GENERATE_HTMLHELP), Qt help (GENERATE_QHP), or docsets
+# (GENERATE_DOCSET) there is already a search function so this one should
+# typically be disabled. For large projects the javascript based search engine
+# can be slow; in that case enabling SERVER_BASED_SEARCH may provide a better
+# solution.
+
+SEARCHENGINE = YES
+
+# When the SERVER_BASED_SEARCH tag is enabled the search engine will be
+# implemented using a PHP enabled web server instead of at the web client
+# using Javascript. Doxygen will generate the search PHP script and index
+# file to put on the web server. The advantage of the server
+# based approach is that it scales better to large projects and allows
+# full text search. The disadvantages are that it is more difficult to set up
+# and does not have live searching capabilities.
+
+SERVER_BASED_SEARCH = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the LaTeX output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_LATEX tag is set to YES (the default) Doxygen will
+# generate Latex output.
+
+GENERATE_LATEX = YES
+
+# The LATEX_OUTPUT tag is used to specify where the LaTeX docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `latex' will be used as the default path.
+
+LATEX_OUTPUT =
+
+# The LATEX_CMD_NAME tag can be used to specify the LaTeX command name to be
+# invoked. If left blank `latex' will be used as the default command name.
+# Note that when enabling USE_PDFLATEX this option is only used for
+# generating bitmaps for formulas in the HTML output, but not in the
+# Makefile that is written to the output directory.
+
+LATEX_CMD_NAME = latex
+
+# The MAKEINDEX_CMD_NAME tag can be used to specify the command name to
+# generate index for LaTeX. If left blank `makeindex' will be used as the
+# default command name.
+
+MAKEINDEX_CMD_NAME = makeindex
+
+# If the COMPACT_LATEX tag is set to YES Doxygen generates more compact
+# LaTeX documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_LATEX = NO
+
+# The PAPER_TYPE tag can be used to set the paper type that is used
+# by the printer. Possible values are: a4, letter, legal and
+# executive. If left blank a4wide will be used.
+
+PAPER_TYPE = a4wide
+
+# The EXTRA_PACKAGES tag can be used to specify one or more names of LaTeX
+# packages that should be included in the LaTeX output.
+
+EXTRA_PACKAGES =
+
+# The LATEX_HEADER tag can be used to specify a personal LaTeX header for
+# the generated latex document. The header should contain everything until
+# the first chapter. If it is left blank doxygen will generate a
+# standard header. Notice: only use this tag if you know what you are doing!
+
+LATEX_HEADER = doc/doxygen/header.tex
+
+# The LATEX_FOOTER tag can be used to specify a personal LaTeX footer for
+# the generated latex document. The footer should contain everything after
+# the last chapter. If it is left blank doxygen will generate a
+# standard footer. Notice: only use this tag if you know what you are doing!
+
+LATEX_FOOTER =
+
+# If the PDF_HYPERLINKS tag is set to YES, the LaTeX that is generated
+# is prepared for conversion to pdf (using ps2pdf). The pdf file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using a pdf viewer.
+
+PDF_HYPERLINKS = YES
+
+# If the USE_PDFLATEX tag is set to YES, pdflatex will be used instead of
+# plain latex in the generated Makefile. Set this option to YES to get a
+# higher quality PDF documentation.
+
+USE_PDFLATEX = YES
+
+# If the LATEX_BATCHMODE tag is set to YES, doxygen will add the \\batchmode
+# command to the generated LaTeX files. This will instruct LaTeX to keep
+# running if errors occur, instead of asking the user for help.
+# This option is also used when generating formulas in HTML.
+
+LATEX_BATCHMODE = NO
+
+# If LATEX_HIDE_INDICES is set to YES then doxygen will not
+# include the index chapters (such as File Index, Compound Index, etc.)
+# in the output.
+
+LATEX_HIDE_INDICES = NO
+
+# If LATEX_SOURCE_CODE is set to YES then doxygen will include
+# source code with syntax highlighting in the LaTeX output.
+# Note that which sources are shown also depends on other settings
+# such as SOURCE_BROWSER.
+
+LATEX_SOURCE_CODE = NO
+
+# The LATEX_BIB_STYLE tag can be used to specify the style to use for the
+# bibliography, e.g. plainnat, or ieeetr. The default style is "plain". See
+# http://en.wikipedia.org/wiki/BibTeX for more info.
+
+LATEX_BIB_STYLE = plain
+
+#---------------------------------------------------------------------------
+# configuration options related to the RTF output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_RTF tag is set to YES Doxygen will generate RTF output.
+# The RTF output is optimized for Word 97 and may not look very pretty with
+# other RTF readers or editors.
+
+GENERATE_RTF = NO
+
+# The RTF_OUTPUT tag is used to specify where the RTF docs will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `rtf' will be used as the default path.
+
+RTF_OUTPUT =
+
+# If the COMPACT_RTF tag is set to YES Doxygen generates more compact
+# RTF documents. This may be useful for small projects and may help to
+# save some trees in general.
+
+COMPACT_RTF = NO
+
+# If the RTF_HYPERLINKS tag is set to YES, the RTF that is generated
+# will contain hyperlink fields. The RTF file will
+# contain links (just like the HTML output) instead of page references.
+# This makes the output suitable for online browsing using WORD or other
+# programs which support those fields.
+# Note: wordpad (write) and others do not support links.
+
+RTF_HYPERLINKS = NO
+
+# Load style sheet definitions from file. Syntax is similar to doxygen's
+# config file, i.e. a series of assignments. You only have to provide
+# replacements, missing definitions are set to their default value.
+
+RTF_STYLESHEET_FILE =
+
+# Set optional variables used in the generation of an rtf document.
+# Syntax is similar to doxygen's config file.
+
+RTF_EXTENSIONS_FILE =
+
+#---------------------------------------------------------------------------
+# configuration options related to the man page output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_MAN tag is set to YES (the default) Doxygen will
+# generate man pages.
+
+GENERATE_MAN = NO
+
+# The MAN_OUTPUT tag is used to specify where the man pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `man' will be used as the default path.
+
+MAN_OUTPUT =
+
+# The MAN_EXTENSION tag determines the extension that is added to
+# the generated man pages (default is the subroutine's section .3).
+
+MAN_EXTENSION =
+
+# If the MAN_LINKS tag is set to YES and Doxygen generates man output,
+# then it will generate one additional man file for each entity
+# documented in the real man page(s). These additional files
+# only source the real man page, but without them the man command
+# would be unable to find the correct page. The default is NO.
+
+MAN_LINKS = NO
+
+#---------------------------------------------------------------------------
+# configuration options related to the XML output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_XML tag is set to YES Doxygen will
+# generate an XML file that captures the structure of
+# the code including all documentation.
+
+GENERATE_XML = NO
+
+# The XML_OUTPUT tag is used to specify where the XML pages will be put.
+# If a relative path is entered the value of OUTPUT_DIRECTORY will be
+# put in front of it. If left blank `xml' will be used as the default path.
+
+XML_OUTPUT = xml
+
+# The XML_SCHEMA tag can be used to specify an XML schema,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_SCHEMA =
+
+# The XML_DTD tag can be used to specify an XML DTD,
+# which can be used by a validating XML parser to check the
+# syntax of the XML files.
+
+XML_DTD =
+
+# If the XML_PROGRAMLISTING tag is set to YES Doxygen will
+# dump the program listings (including syntax highlighting
+# and cross-referencing information) to the XML output. Note that
+# enabling this will significantly increase the size of the XML output.
+
+XML_PROGRAMLISTING = YES
+
+#---------------------------------------------------------------------------
+# configuration options for the AutoGen Definitions output
+#---------------------------------------------------------------------------
+
+# If the GENERATE_AUTOGEN_DEF tag is set to YES Doxygen will
+# generate an AutoGen Definitions (see autogen.sf.net) file
+# that captures the structure of the code including all
+# documentation. Note that this feature is still experimental
+# and incomplete at the moment.
+ +GENERATE_AUTOGEN_DEF = NO + +#--------------------------------------------------------------------------- +# configuration options related to the Perl module output +#--------------------------------------------------------------------------- + +# If the GENERATE_PERLMOD tag is set to YES Doxygen will +# generate a Perl module file that captures the structure of +# the code including all documentation. Note that this +# feature is still experimental and incomplete at the +# moment. + +GENERATE_PERLMOD = NO + +# If the PERLMOD_LATEX tag is set to YES Doxygen will generate +# the necessary Makefile rules, Perl scripts and LaTeX code to be able +# to generate PDF and DVI output from the Perl module output. + +PERLMOD_LATEX = NO + +# If the PERLMOD_PRETTY tag is set to YES the Perl module output will be +# nicely formatted so it can be parsed by a human reader. +# This is useful +# if you want to understand what is going on. +# On the other hand, if this +# tag is set to NO the size of the Perl module output will be much smaller +# and Perl will parse it just the same. + +PERLMOD_PRETTY = YES + +# The names of the make variables in the generated doxyrules.make file +# are prefixed with the string contained in PERLMOD_MAKEVAR_PREFIX. +# This is useful so different doxyrules.make files included by the same +# Makefile don't overwrite each other's variables. + +PERLMOD_MAKEVAR_PREFIX = + +#--------------------------------------------------------------------------- +# Configuration options related to the preprocessor +#--------------------------------------------------------------------------- + +# If the ENABLE_PREPROCESSING tag is set to YES (the default) Doxygen will +# evaluate all C-preprocessor directives found in the sources and include +# files. + +ENABLE_PREPROCESSING = YES + +# If the MACRO_EXPANSION tag is set to YES Doxygen will expand all macro +# names in the source code. If set to NO (the default) only conditional +# compilation will be performed. Macro expansion can be done in a controlled +# way by setting EXPAND_ONLY_PREDEF to YES. + +MACRO_EXPANSION = YES + +# If the EXPAND_ONLY_PREDEF and MACRO_EXPANSION tags are both set to YES +# then the macro expansion is limited to the macros specified with the +# PREDEFINED and EXPAND_AS_DEFINED tags. + +EXPAND_ONLY_PREDEF = YES + +# If the SEARCH_INCLUDES tag is set to YES (the default) the includes files +# pointed to by INCLUDE_PATH will be searched when a #include is found. + +SEARCH_INCLUDES = YES + +# The INCLUDE_PATH tag can be used to specify one or more directories that +# contain include files that are not input files but should be processed by +# the preprocessor. + +INCLUDE_PATH = + +# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard +# patterns (like *.h and *.hpp) to filter out the header-files in the +# directories. If left blank, the patterns specified with FILE_PATTERNS will +# be used. + +INCLUDE_FILE_PATTERNS = + +# The PREDEFINED tag can be used to specify one or more macro names that +# are defined before the preprocessor is started (similar to the -D option of +# gcc). The argument of the tag is a list of macros of the form: name +# or name=definition (no spaces). If the definition and the = are +# omitted =1 is assumed. To prevent a macro definition from being +# undefined via #undef or recursively expanded use the := operator +# instead of the = operator. 
+
+PREDEFINED = OMP_30_ENABLED=1, OMP_40_ENABLED=1, KMP_STATS_ENABLED=1
+
+# If the MACRO_EXPANSION and EXPAND_ONLY_PREDEF tags are set to YES then
+# this tag can be used to specify a list of macro names that should be expanded.
+# The macro definition that is found in the sources will be used.
+# Use the PREDEFINED tag if you want to use a different macro definition that
+# overrules the definition found in the source code.
+
+EXPAND_AS_DEFINED =
+
+# If the SKIP_FUNCTION_MACROS tag is set to YES (the default) then
+# doxygen's preprocessor will remove all references to function-like macros
+# that are alone on a line, have an all uppercase name, and do not end with a
+# semicolon, because these will confuse the parser if not removed.
+
+SKIP_FUNCTION_MACROS = YES
+
+#---------------------------------------------------------------------------
+# Configuration::additions related to external references
+#---------------------------------------------------------------------------
+
+# The TAGFILES option can be used to specify one or more tagfiles. For each
+# tag file the location of the external documentation should be added. The
+# format of a tag file without this location is as follows:
+#
+# TAGFILES = file1 file2 ...
+# Adding locations for the tag files is done as follows:
+#
+# TAGFILES = file1=loc1 "file2 = loc2" ...
+# where "loc1" and "loc2" can be relative or absolute paths
+# or URLs. Note that each tag file must have a unique name (where the name does
+# NOT include the path). If a tag file is not located in the directory in which
+# doxygen is run, you must also specify the path to the tagfile here.
+
+TAGFILES =
+
+# When a file name is specified after GENERATE_TAGFILE, doxygen will create
+# a tag file that is based on the input files it reads.
+
+GENERATE_TAGFILE =
+
+# If the ALLEXTERNALS tag is set to YES all external classes will be listed
+# in the class index. If set to NO only the inherited external classes
+# will be listed.
+
+ALLEXTERNALS = NO
+
+# If the EXTERNAL_GROUPS tag is set to YES all external groups will be listed
+# in the modules index. If set to NO, only the current project's groups will
+# be listed.
+
+EXTERNAL_GROUPS = YES
+
+# The PERL_PATH should be the absolute path and name of the perl script
+# interpreter (i.e. the result of `which perl').
+
+PERL_PATH =
+
+#---------------------------------------------------------------------------
+# Configuration options related to the dot tool
+#---------------------------------------------------------------------------
+
+# If the CLASS_DIAGRAMS tag is set to YES (the default) Doxygen will
+# generate an inheritance diagram (in HTML, RTF and LaTeX) for classes with base
+# or super classes. Setting the tag to NO turns the diagrams off. Note that
+# this option also works with HAVE_DOT disabled, but it is recommended to
+# install and use dot, since it yields more powerful graphs.
+
+CLASS_DIAGRAMS = YES
+
+# You can define message sequence charts within doxygen comments using the \msc
+# command. Doxygen will then run the mscgen tool (see
+# http://www.mcternan.me.uk/mscgen/) to produce the chart and insert it in the
+# documentation. The MSCGEN_PATH tag allows you to specify the directory where
+# the mscgen tool resides. If left empty the tool is assumed to be found in the
+# default search path.
+
+MSCGEN_PATH =
+
+# If set to YES, the inheritance and collaboration graphs will hide
+# inheritance and usage relations if the target is undocumented
+# or is not a class.
+ +HIDE_UNDOC_RELATIONS = YES + +# If you set the HAVE_DOT tag to YES then doxygen will assume the dot tool is +# available from the path. This tool is part of Graphviz, a graph visualization +# toolkit from AT&T and Lucent Bell Labs. The other options in this section +# have no effect if this option is set to NO (the default) + +HAVE_DOT = NO + +# The DOT_NUM_THREADS specifies the number of dot invocations doxygen is +# allowed to run in parallel. When set to 0 (the default) doxygen will +# base this on the number of processors available in the system. You can set it +# explicitly to a value larger than 0 to get control over the balance +# between CPU load and processing speed. + +DOT_NUM_THREADS = 0 + +# By default doxygen will use the Helvetica font for all dot files that +# doxygen generates. When you want a differently looking font you can specify +# the font name using DOT_FONTNAME. You need to make sure dot is able to find +# the font, which can be done by putting it in a standard location or by setting +# the DOTFONTPATH environment variable or by setting DOT_FONTPATH to the +# directory containing the font. + +DOT_FONTNAME = Helvetica + +# The DOT_FONTSIZE tag can be used to set the size of the font of dot graphs. +# The default size is 10pt. + +DOT_FONTSIZE = 10 + +# By default doxygen will tell dot to use the Helvetica font. +# If you specify a different font using DOT_FONTNAME you can use DOT_FONTPATH to +# set the path where dot can find it. + +DOT_FONTPATH = + +# If the CLASS_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect inheritance relations. Setting this tag to YES will force the +# CLASS_DIAGRAMS tag to NO. + +CLASS_GRAPH = YES + +# If the COLLABORATION_GRAPH and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for each documented class showing the direct and +# indirect implementation dependencies (inheritance, containment, and +# class references variables) of the class with other documented classes. + +COLLABORATION_GRAPH = NO + +# If the GROUP_GRAPHS and HAVE_DOT tags are set to YES then doxygen +# will generate a graph for groups, showing the direct groups dependencies + +GROUP_GRAPHS = YES + +# If the UML_LOOK tag is set to YES doxygen will generate inheritance and +# collaboration diagrams in a style similar to the OMG's Unified Modeling +# Language. + +UML_LOOK = NO + +# If the UML_LOOK tag is enabled, the fields and methods are shown inside +# the class node. If there are many fields or methods and many nodes the +# graph may become too big to be useful. The UML_LIMIT_NUM_FIELDS +# threshold limits the number of items for each type to make the size more +# manageable. Set this to 0 for no limit. Note that the threshold may be +# exceeded by 50% before the limit is enforced. + +UML_LIMIT_NUM_FIELDS = 10 + +# If set to YES, the inheritance and collaboration graphs will show the +# relations between templates and their instances. + +TEMPLATE_RELATIONS = YES + +# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDE_GRAPH, and HAVE_DOT +# tags are set to YES then doxygen will generate a graph for each documented +# file showing the direct and indirect include dependencies of the file with +# other documented files. 
+
+INCLUDE_GRAPH = NO
+
+# If the ENABLE_PREPROCESSING, SEARCH_INCLUDES, INCLUDED_BY_GRAPH, and
+# HAVE_DOT tags are set to YES then doxygen will generate a graph for each
+# documented header file showing the documented files that directly or
+# indirectly include this file.
+
+INCLUDED_BY_GRAPH = NO
+
+# If the CALL_GRAPH and HAVE_DOT options are set to YES then
+# doxygen will generate a call dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable call graphs
+# for selected functions only using the \callgraph command.
+
+CALL_GRAPH = NO
+
+# If the CALLER_GRAPH and HAVE_DOT tags are set to YES then
+# doxygen will generate a caller dependency graph for every global function
+# or class method. Note that enabling this option will significantly increase
+# the time of a run. So in most cases it will be better to enable caller
+# graphs for selected functions only using the \callergraph command.
+
+CALLER_GRAPH = NO
+
+# If the GRAPHICAL_HIERARCHY and HAVE_DOT tags are set to YES then doxygen
+# will generate a graphical hierarchy of all classes instead of a textual one.
+
+GRAPHICAL_HIERARCHY = YES
+
+# If the DIRECTORY_GRAPH and HAVE_DOT tags are set to YES
+# then doxygen will show the dependencies a directory has on other directories
+# in a graphical way. The dependency relations are determined by the #include
+# relations between the files in the directories.
+
+DIRECTORY_GRAPH = YES
+
+# The DOT_IMAGE_FORMAT tag can be used to set the image format of the images
+# generated by dot. Possible values are svg, png, jpg, or gif.
+# If left blank png will be used. If you choose svg you need to set
+# HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible in IE 9+ (other browsers do not have this requirement).
+
+DOT_IMAGE_FORMAT = png
+
+# If DOT_IMAGE_FORMAT is set to svg, then this option can be set to YES to
+# enable generation of interactive SVG images that allow zooming and panning.
+# Note that this requires a modern browser other than Internet Explorer.
+# Tested and working are Firefox, Chrome, Safari, and Opera. For IE 9+ you
+# need to set HTML_FILE_EXTENSION to xhtml in order to make the SVG files
+# visible. Older versions of IE do not have SVG support.
+
+INTERACTIVE_SVG = NO
+
+# The tag DOT_PATH can be used to specify the path where the dot tool can be
+# found. If left blank, it is assumed the dot tool can be found in the path.
+
+DOT_PATH =
+
+# The DOTFILE_DIRS tag can be used to specify one or more directories that
+# contain dot files that are included in the documentation (see the
+# \dotfile command).
+
+DOTFILE_DIRS =
+
+# The MSCFILE_DIRS tag can be used to specify one or more directories that
+# contain msc files that are included in the documentation (see the
+# \mscfile command).
+
+MSCFILE_DIRS =
+
+# The DOT_GRAPH_MAX_NODES tag can be used to set the maximum number of
+# nodes that will be shown in the graph. If the number of nodes in a graph
+# becomes larger than this value, doxygen will truncate the graph, which is
+# visualized by representing a node as a red box. Note that if the
+# number of direct children of the root node in a graph is already larger than
+# DOT_GRAPH_MAX_NODES, the graph will not be shown at all. Also note
+# that the size of a graph can be further restricted by MAX_DOT_GRAPH_DEPTH.
+
+DOT_GRAPH_MAX_NODES = 50
+
+# The MAX_DOT_GRAPH_DEPTH tag can be used to set the maximum depth of the
+# graphs generated by dot. A depth value of 3 means that only nodes reachable
+# from the root by following a path via at most 3 edges will be shown. Nodes
+# that lie further from the root node will be omitted. Note that setting this
+# option to 1 or 2 may greatly reduce the computation time needed for large
+# code bases. Also note that the size of a graph can be further restricted by
+# DOT_GRAPH_MAX_NODES. Using a depth of 0 means no depth restriction.
+
+MAX_DOT_GRAPH_DEPTH = 0
+
+# Set the DOT_TRANSPARENT tag to YES to generate images with a transparent
+# background. This is disabled by default, because dot on Windows does not
+# seem to support this out of the box. Warning: Depending on the platform used,
+# enabling this option may lead to badly anti-aliased labels on the edges of
+# a graph (i.e. they become hard to read).
+
+DOT_TRANSPARENT = NO
+
+# Set the DOT_MULTI_TARGETS tag to YES to allow dot to generate multiple output
+# files in one run (i.e. multiple -o and -T options on the command line). This
+# makes dot run faster, but since only newer versions of dot (>1.8.10)
+# support this, this feature is disabled by default.
+
+DOT_MULTI_TARGETS = NO
+
+# If the GENERATE_LEGEND tag is set to YES (the default) Doxygen will
+# generate a legend page explaining the meaning of the various boxes and
+# arrows in the dot generated graphs.
+
+GENERATE_LEGEND = YES
+
+# If the DOT_CLEANUP tag is set to YES (the default) Doxygen will
+# remove the intermediate dot files that are used to generate
+# the various graphs.
+
+DOT_CLEANUP = YES
diff --git a/pstl/CREDITS.txt b/pstl/CREDITS.txt
index 4945fd5ad308..174722510fde 100644
--- a/pstl/CREDITS.txt
+++ b/pstl/CREDITS.txt
@@ -1,21 +1,21 @@
-This file is a partial list of people who have contributed to the LLVM/pstl
-(Parallel STL) project. If you have contributed a patch or made some other
-contribution to LLVM/pstl, please submit a patch to this file to add yourself,
-and it will be done!
-
-The list is sorted by surname and formatted to allow easy grepping and
-beautification by scripts. The fields are: name (N), email (E), web-address
-(W), PGP key ID and fingerprint (P), description (D), and snail-mail address
-(S).
-
-N: Intel Corporation
-W: http://www.intel.com
-D: Created the initial implementation.
-
-N: Thomas Rodgers
-E: trodgers@redhat.com
-D: Identifier name transformation for inclusion in a Standard C++ library.
-
-N: Christopher Nelson
-E: nadiasvertex@gmail.com
-D: Add support for an OpenMP backend.
+This file is a partial list of people who have contributed to the LLVM/pstl
+(Parallel STL) project. If you have contributed a patch or made some other
+contribution to LLVM/pstl, please submit a patch to this file to add yourself,
+and it will be done!
+
+The list is sorted by surname and formatted to allow easy grepping and
+beautification by scripts. The fields are: name (N), email (E), web-address
+(W), PGP key ID and fingerprint (P), description (D), and snail-mail address
+(S).
+
+N: Intel Corporation
+W: http://www.intel.com
+D: Created the initial implementation.
+
+N: Thomas Rodgers
+E: trodgers@redhat.com
+D: Identifier name transformation for inclusion in a Standard C++ library.
+
+N: Christopher Nelson
+E: nadiasvertex@gmail.com
+D: Add support for an OpenMP backend.
-- GitLab From af1e9c81f4ab06ab46db87e273ec6eef5a24ef27 Mon Sep 17 00:00:00 2001 From: Josep Pinot Date: Thu, 17 Oct 2024 16:01:28 +0200 Subject: [PATCH 257/329] [OpenMP] Fix missing gtid argument in __kmp_print_tdg_dot function (#111986) This patch modifies the signature of the `__kmp_print_tdg_dot` function in `kmp_tasking.cpp` to include the global thread ID (gtid) as an argument. The gtid is now correctly passed to the function. - Updated the function declaration to accept the gtid parameter. - Modified all calls to `__kmp_print_tdg_dot` to pass the correct gtid value. This change addresses issues encountered when compiling with `OMPX_TASKGRAPH` enabled. No functional changes are expected beyond successful compilation. --- openmp/runtime/src/kmp_tasking.cpp | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/openmp/runtime/src/kmp_tasking.cpp b/openmp/runtime/src/kmp_tasking.cpp index 7edaa8e127e5..932799e133b4 100644 --- a/openmp/runtime/src/kmp_tasking.cpp +++ b/openmp/runtime/src/kmp_tasking.cpp @@ -5491,7 +5491,8 @@ static kmp_tdg_info_t *__kmp_find_tdg(kmp_int32 tdg_id) { // __kmp_print_tdg_dot: prints the TDG to a dot file // tdg: ID of the TDG -void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg) { +// gtid: Global Thread ID +void __kmp_print_tdg_dot(kmp_tdg_info_t *tdg, kmp_int32 gtid) { kmp_int32 tdg_id = tdg->tdg_id; KA_TRACE(10, ("__kmp_print_tdg_dot(enter): T#%d tdg_id=%d \n", gtid, tdg_id)); @@ -5693,7 +5694,7 @@ void __kmp_end_record(kmp_int32 gtid, kmp_tdg_info_t *tdg) { KMP_ATOMIC_ST_RLX(&__kmp_tdg_task_id, 0); if (__kmp_tdg_dot) - __kmp_print_tdg_dot(tdg); + __kmp_print_tdg_dot(tdg, gtid); } // __kmpc_end_record_task: wrapper around __kmp_end_record to mark -- GitLab From 3764d0ff15ef281974879002e27857a041bd5b9c Mon Sep 17 00:00:00 2001 From: Nick Desaulniers Date: Thu, 17 Oct 2024 07:08:54 -0700 Subject: [PATCH 258/329] [libc][setjmp] make x86_64 longjmp naked (#112581) The generated asm for x86_64's longjmp has a function prolog and epilog. The epilog in particular is unreachable. Convert longjmp to a naked function to avoid these spurious instructions in longjmp. Link: https://github.com/llvm/llvm-project/pull/112437/files#r1802085511 --- libc/src/setjmp/longjmp.h | 13 +++++++++ libc/src/setjmp/x86_64/longjmp.cpp | 43 ++++++++++++++---------------- 2 files changed, 33 insertions(+), 23 deletions(-) diff --git a/libc/src/setjmp/longjmp.h b/libc/src/setjmp/longjmp.h index 7cb12b3392ae..9b7db2971721 100644 --- a/libc/src/setjmp/longjmp.h +++ b/libc/src/setjmp/longjmp.h @@ -11,9 +11,22 @@ #include "hdr/types/jmp_buf.h" #include "src/__support/macros/config.h" +#include "src/__support/macros/properties/compiler.h" namespace LIBC_NAMESPACE_DECL { +// TODO(https://github.com/llvm/llvm-project/issues/112427) +// Some of the architecture-specific definitions are marked `naked`, which in +// GCC implies `nothrow`. +// +// Right now, our aliases aren't marked `nothrow`, so we wind up in a situation +// where clang will emit -Wmissing-exception-spec if we add `nothrow` here, but +// GCC will emit -Wmissing-attributes here without `nothrow`. We need to update +// LLVM_LIBC_FUNCTION to denote when a function throws or not. 
+ +#ifdef LIBC_COMPILER_IS_GCC +[[gnu::nothrow]] +#endif void longjmp(jmp_buf buf, int val); } // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/setjmp/x86_64/longjmp.cpp b/libc/src/setjmp/x86_64/longjmp.cpp index d4b55565cb21..c293c55a6f9f 100644 --- a/libc/src/setjmp/x86_64/longjmp.cpp +++ b/libc/src/setjmp/x86_64/longjmp.cpp @@ -7,6 +7,7 @@ //===----------------------------------------------------------------------===// #include "src/setjmp/longjmp.h" +#include "include/llvm-libc-macros/offsetof-macro.h" #include "src/__support/common.h" #include "src/__support/macros/config.h" @@ -16,30 +17,26 @@ namespace LIBC_NAMESPACE_DECL { -LLVM_LIBC_FUNCTION(void, longjmp, (jmp_buf buf, int val)) { - register __UINT64_TYPE__ rbx __asm__("rbx"); - register __UINT64_TYPE__ rbp __asm__("rbp"); - register __UINT64_TYPE__ r12 __asm__("r12"); - register __UINT64_TYPE__ r13 __asm__("r13"); - register __UINT64_TYPE__ r14 __asm__("r14"); - register __UINT64_TYPE__ r15 __asm__("r15"); - register __UINT64_TYPE__ rsp __asm__("rsp"); - register __UINT64_TYPE__ rax __asm__("rax"); +[[gnu::naked]] +LLVM_LIBC_FUNCTION(void, longjmp, (jmp_buf, int)) { + asm(R"( + cmpl $0x1, %%esi + adcl $0x0, %%esi + movq %%rsi, %%rax - // ABI requires that the return value should be stored in rax. So, we store - // |val| in rax. Note that this has to happen before we restore the registers - // from values in |buf|. Otherwise, once rsp and rbp are updated, we cannot - // read |val|. - val = val == 0 ? 1 : val; - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(rax) : "m"(val) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(rbx) : "m"(buf->rbx) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(rbp) : "m"(buf->rbp) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(r12) : "m"(buf->r12) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(r13) : "m"(buf->r13) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(r14) : "m"(buf->r14) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(r15) : "m"(buf->r15) :); - LIBC_INLINE_ASM("mov %1, %0\n\t" : "=r"(rsp) : "m"(buf->rsp) :); - LIBC_INLINE_ASM("jmp *%0\n\t" : : "m"(buf->rip)); + movq %c[rbx](%%rdi), %%rbx + movq %c[rbp](%%rdi), %%rbp + movq %c[r12](%%rdi), %%r12 + movq %c[r13](%%rdi), %%r13 + movq %c[r14](%%rdi), %%r14 + movq %c[r15](%%rdi), %%r15 + movq %c[rsp](%%rdi), %%rsp + jmpq *%c[rip](%%rdi) + )" ::[rbx] "i"(offsetof(__jmp_buf, rbx)), + [rbp] "i"(offsetof(__jmp_buf, rbp)), [r12] "i"(offsetof(__jmp_buf, r12)), + [r13] "i"(offsetof(__jmp_buf, r13)), [r14] "i"(offsetof(__jmp_buf, r14)), + [r15] "i"(offsetof(__jmp_buf, r15)), [rsp] "i"(offsetof(__jmp_buf, rsp)), + [rip] "i"(offsetof(__jmp_buf, rip))); } } // namespace LIBC_NAMESPACE_DECL -- GitLab From a1ac5a57ae13d22d20c6ac71fbbccbd9f87b0a72 Mon Sep 17 00:00:00 2001 From: Leandro Lupori Date: Thu, 17 Oct 2024 11:32:21 -0300 Subject: [PATCH 259/329] [flang] Allow OpenMP declarations before type declarations (#112414) Skip resolving implicit types for OpenMP declarative directives, to allow them to appear before type declarations, which is supported by several compilers. This was discussed in https://discourse.llvm.org/t/rfc-openmp-should-type-declaration-be-allowed-after-threadprivate/81345. This fixes the semantic errors of https://github.com/llvm/llvm-project/issues/106021. 
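As an illustration, a minimal sketch of the declaration order this change
accepts (the variable name is hypothetical; the pattern mirrors the new
declarative-directive02.f90 test below). The declarative directive may now
precede the explicit type declaration without the name being implicitly typed:

  subroutine sketch
    save :: x
    !$omp threadprivate(x)  ! declarative directive precedes the declaration
    integer :: x            ! explicit type is declared only afterwards
  end subroutine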
--- flang/lib/Semantics/resolve-names.cpp | 30 +++++++++- ...ective.f90 => declarative-directive01.f90} | 0 .../OpenMP/declarative-directive02.f90 | 56 +++++++++++++++++++ .../Semantics/OpenMP/declare-target06.f90 | 5 -- 4 files changed, 84 insertions(+), 7 deletions(-) rename flang/test/Semantics/OpenMP/{declarative-directive.f90 => declarative-directive01.f90} (100%) create mode 100644 flang/test/Semantics/OpenMP/declarative-directive02.f90 diff --git a/flang/lib/Semantics/resolve-names.cpp b/flang/lib/Semantics/resolve-names.cpp index f1ce0b415ebe..2fa5b75e073b 100644 --- a/flang/lib/Semantics/resolve-names.cpp +++ b/flang/lib/Semantics/resolve-names.cpp @@ -720,6 +720,7 @@ protected: bool inSpecificationPart_{false}; bool deferImplicitTyping_{false}; + bool skipImplicitTyping_{false}; bool inEquivalenceStmt_{false}; // Some information is collected from a specification part for deferred @@ -758,6 +759,10 @@ protected: } } + void SkipImplicitTyping(bool skip) { + deferImplicitTyping_ = skipImplicitTyping_ = skip; + } + private: Scope *currScope_{nullptr}; FuncResultStack funcResultStack_{*this}; @@ -1506,6 +1511,25 @@ public: void Post(const parser::OmpEndCriticalDirective &) { messageHandler().set_currStmtSource(std::nullopt); } + bool Pre(const parser::OpenMPThreadprivate &) { + SkipImplicitTyping(true); + return true; + } + void Post(const parser::OpenMPThreadprivate &) { SkipImplicitTyping(false); } + bool Pre(const parser::OpenMPDeclareTargetConstruct &) { + SkipImplicitTyping(true); + return true; + } + void Post(const parser::OpenMPDeclareTargetConstruct &) { + SkipImplicitTyping(false); + } + bool Pre(const parser::OpenMPDeclarativeAllocate &) { + SkipImplicitTyping(true); + return true; + } + void Post(const parser::OpenMPDeclarativeAllocate &) { + SkipImplicitTyping(false); + } }; bool OmpVisitor::NeedsScope(const parser::OpenMPBlockConstruct &x) { @@ -2557,8 +2581,10 @@ void ScopeHandler::ApplyImplicitRules( return; } if (const DeclTypeSpec * type{GetImplicitType(symbol)}) { - symbol.set(Symbol::Flag::Implicit); - symbol.SetType(*type); + if (!skipImplicitTyping_) { + symbol.set(Symbol::Flag::Implicit); + symbol.SetType(*type); + } return; } if (symbol.has() && !symbol.attrs().test(Attr::EXTERNAL)) { diff --git a/flang/test/Semantics/OpenMP/declarative-directive.f90 b/flang/test/Semantics/OpenMP/declarative-directive01.f90 similarity index 100% rename from flang/test/Semantics/OpenMP/declarative-directive.f90 rename to flang/test/Semantics/OpenMP/declarative-directive01.f90 diff --git a/flang/test/Semantics/OpenMP/declarative-directive02.f90 b/flang/test/Semantics/OpenMP/declarative-directive02.f90 new file mode 100644 index 000000000000..dcde963689eb --- /dev/null +++ b/flang/test/Semantics/OpenMP/declarative-directive02.f90 @@ -0,0 +1,56 @@ +! RUN: %flang -fsyntax-only -fopenmp %s 2>&1 + +! Check that OpenMP declarative directives can be used with objects that have +! an incomplete type. + +subroutine test_decl + ! OMPv5.2 5.2 threadprivate + ! OMPv5.2 6.5 allocate + implicit none + save :: x1, y1 + !$omp threadprivate(x1) + !$omp allocate(y1) + integer :: x1, y1 + + ! OMPv5.2 7.7 declare-simd + external :: simd_func + !$omp declare simd(simd_func) + logical :: simd_func + + ! OMPv5.2 7.8.1 declare-target + allocatable :: j + !$omp declare target(j) + save :: j + real(kind=8) :: j(:) + + ! 
OMPv5.2 5.5.11 declare-reduction - crashes + !external :: my_add_red + !!$omp declare reduction(my_add_red : integer : my_add_red(omp_out, omp_in)) & + !!$omp& initializer(omp_priv=0) + !integer :: my_add_red +end subroutine + +subroutine test_decl2 + save x1, y1 + !$omp threadprivate(x1) + !$omp allocate(y1) + integer :: x1, y1 + + ! implicit decl + !$omp threadprivate(x2) + !$omp allocate(y2) + save x2, y2 +end subroutine + +module m1 + ! implicit decl + !$omp threadprivate(x, y, z) + integer :: y + real :: z + +contains + subroutine sub + !$omp parallel copyin(x, y, z) + !$omp end parallel + end subroutine +end module diff --git a/flang/test/Semantics/OpenMP/declare-target06.f90 b/flang/test/Semantics/OpenMP/declare-target06.f90 index 9abcfcecb681..7df0a7312309 100644 --- a/flang/test/Semantics/OpenMP/declare-target06.f90 +++ b/flang/test/Semantics/OpenMP/declare-target06.f90 @@ -6,21 +6,16 @@ module test_0 implicit none -!ERROR: The given DECLARE TARGET directive clause has an invalid argument !ERROR: No explicit type declared for 'no_implicit_materialization_1' !$omp declare target(no_implicit_materialization_1) -!ERROR: The given DECLARE TARGET directive clause has an invalid argument !ERROR: No explicit type declared for 'no_implicit_materialization_2' !$omp declare target link(no_implicit_materialization_2) -!ERROR: The given DECLARE TARGET directive clause has an invalid argument !WARNING: The usage of TO clause on DECLARE TARGET directive has been deprecated. Use ENTER clause instead. !ERROR: No explicit type declared for 'no_implicit_materialization_3' !$omp declare target to(no_implicit_materialization_3) -!ERROR: The given DECLARE TARGET directive clause has an invalid argument -!ERROR: No explicit type declared for 'no_implicit_materialization_3' !$omp declare target enter(no_implicit_materialization_3) INTEGER :: data_int = 10 -- GitLab From 954836634abb446f18719b14120c386a929a42d1 Mon Sep 17 00:00:00 2001 From: Doug Wyatt Date: Thu, 17 Oct 2024 10:38:29 -0400 Subject: [PATCH 260/329] [libc++] Make __libcpp_verbose_abort() noexcept like std::terminate() (#109151) Make __libcpp_verbose_abort() noexcept (it is already noreturn), to match std::terminate(). Clang's function effect analysis can use this to ignore such functions as being beyond its scope. (See https://github.com/llvm/llvm-project/pull/99656). --- libcxx/docs/ReleaseNotes/20.rst | 3 +++ libcxx/include/__verbose_abort | 2 +- libcxx/src/verbose_abort.cpp | 2 +- .../assertions/customize_verbose_abort.link-time.pass.cpp | 4 +--- libcxx/test/support/check_assertion.h | 2 +- 5 files changed, 7 insertions(+), 6 deletions(-) diff --git a/libcxx/docs/ReleaseNotes/20.rst b/libcxx/docs/ReleaseNotes/20.rst index 3a66aecaf57c..abd6764579e5 100644 --- a/libcxx/docs/ReleaseNotes/20.rst +++ b/libcxx/docs/ReleaseNotes/20.rst @@ -82,6 +82,9 @@ Deprecations and Removals were private but could cause ambiguity in name lookup. Code that expects such ambiguity will possibly not compile in LLVM 20. +- The function ``__libcpp_verbose_abort()`` is now ``noexcept``, to match ``std::terminate()``. (The combination of + ``noexcept`` and ``[[noreturn]]`` has special significance for function effects analysis.) 
+ Upcoming Deprecations and Removals ---------------------------------- diff --git a/libcxx/include/__verbose_abort b/libcxx/include/__verbose_abort index 244278aec652..73295cae4261 100644 --- a/libcxx/include/__verbose_abort +++ b/libcxx/include/__verbose_abort @@ -21,7 +21,7 @@ _LIBCPP_BEGIN_NAMESPACE_STD // This function should never be called directly from the code -- it should only be called through // the _LIBCPP_VERBOSE_ABORT macro. [[__noreturn__]] _LIBCPP_AVAILABILITY_VERBOSE_ABORT _LIBCPP_OVERRIDABLE_FUNC_VIS -_LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) void __libcpp_verbose_abort(const char* __format, ...); +_LIBCPP_ATTRIBUTE_FORMAT(__printf__, 1, 2) void __libcpp_verbose_abort(const char* __format, ...) _NOEXCEPT; // _LIBCPP_VERBOSE_ABORT(format, args...) // diff --git a/libcxx/src/verbose_abort.cpp b/libcxx/src/verbose_abort.cpp index 719134e2ae55..0019063405a8 100644 --- a/libcxx/src/verbose_abort.cpp +++ b/libcxx/src/verbose_abort.cpp @@ -28,7 +28,7 @@ extern "C" void android_set_abort_message(const char* msg); _LIBCPP_BEGIN_NAMESPACE_STD -_LIBCPP_WEAK void __libcpp_verbose_abort(char const* format, ...) { +_LIBCPP_WEAK void __libcpp_verbose_abort(char const* format, ...) noexcept { // Write message to stderr. We do this before formatting into a // buffer so that we still get some information out if that fails. { diff --git a/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp b/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp index 9298a1e365fc..21e9003c30b7 100644 --- a/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp +++ b/libcxx/test/libcxx/assertions/customize_verbose_abort.link-time.pass.cpp @@ -15,9 +15,7 @@ #include <__verbose_abort> #include -void std::__libcpp_verbose_abort(char const*, ...) { - std::exit(EXIT_SUCCESS); -} +void std::__libcpp_verbose_abort(char const*, ...) _NOEXCEPT { std::exit(EXIT_SUCCESS); } int main(int, char**) { std::__libcpp_verbose_abort("%s", "message"); diff --git a/libcxx/test/support/check_assertion.h b/libcxx/test/support/check_assertion.h index 47ebfeeeefc0..a279400d651b 100644 --- a/libcxx/test/support/check_assertion.h +++ b/libcxx/test/support/check_assertion.h @@ -334,7 +334,7 @@ private: }; #ifdef _LIBCPP_VERSION -void std::__libcpp_verbose_abort(char const* format, ...) { +void std::__libcpp_verbose_abort(char const* format, ...) 
noexcept { va_list args; va_start(args, format); -- GitLab From db3292402565042dff9a2d5a147e023de6d82263 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Oct 2024 07:44:27 -0700 Subject: [PATCH 261/329] [HipStdPar] Avoid repeated hash lookups (NFC) (#112653) --- llvm/lib/Transforms/HipStdPar/HipStdPar.cpp | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp index d740500ef1f8..b909bf5b2d7b 100644 --- a/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp +++ b/llvm/lib/Transforms/HipStdPar/HipStdPar.cpp @@ -86,15 +86,13 @@ static inline bool checkIfSupported(GlobalVariable &G) { auto U = std::move(Tmp.back()); Tmp.pop_back(); - if (Visited.contains(U)) + if (!Visited.insert(U).second) continue; if (isa(U)) I = cast(U); else Tmp.insert(Tmp.end(), U->user_begin(), U->user_end()); - - Visited.insert(U); } while (!I && !Tmp.empty()); assert(I && "thread_local global should have at least one non-constant use."); -- GitLab From 91b2ac640e9b4e8369c7d09c0a914b815ae6daa9 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Oct 2024 07:45:02 -0700 Subject: [PATCH 262/329] [Transforms] Avoid repeated hash lookups (NFC) (#112654) --- llvm/lib/Transforms/Utils/LoopPeel.cpp | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/llvm/lib/Transforms/Utils/LoopPeel.cpp b/llvm/lib/Transforms/Utils/LoopPeel.cpp index 760f1619e030..3cbde39b30b4 100644 --- a/llvm/lib/Transforms/Utils/LoopPeel.cpp +++ b/llvm/lib/Transforms/Utils/LoopPeel.cpp @@ -206,13 +206,11 @@ PhiAnalyzer::PhiAnalyzer(const Loop &L, unsigned MaxIterations) // G(%y) = Unknown otherwise (including phi not in header block) PhiAnalyzer::PeelCounter PhiAnalyzer::calculate(const Value &V) { // If we already know the answer, take it from the map. - auto I = IterationsToInvariance.find(&V); - if (I != IterationsToInvariance.end()) - return I->second; - - // Place Unknown to map to avoid infinite recursion. Such + // Otherwise, place Unknown to map to avoid infinite recursion. Such // cycles can never stop on an invariant. - IterationsToInvariance[&V] = Unknown; + auto [I, Inserted] = IterationsToInvariance.try_emplace(&V, Unknown); + if (!Inserted) + return I->second; if (L.isLoopInvariant(&V)) // Loop invariant so known at start. 
-- GitLab From 9173fd77394aa9617b235e1b146114f76c6d77d6 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Oct 2024 07:45:50 -0700 Subject: [PATCH 263/329] [lldb] Avoid repeated map lookups (NFC) (#112655) --- lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp index 3e09c316d74f..538c86801400 100644 --- a/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp +++ b/lldb/source/Plugins/Process/gdb-remote/ProcessGDBRemote.cpp @@ -5323,9 +5323,8 @@ std::string ProcessGDBRemote::HarmonizeThreadIdsForProfileData( uint32_t prev_used_usec = 0; std::map::iterator iterator = m_thread_id_to_used_usec_map.find(thread_id); - if (iterator != m_thread_id_to_used_usec_map.end()) { - prev_used_usec = m_thread_id_to_used_usec_map[thread_id]; - } + if (iterator != m_thread_id_to_used_usec_map.end()) + prev_used_usec = iterator->second; uint32_t real_used_usec = curr_used_usec - prev_used_usec; // A good first time record is one that runs for at least 0.25 sec -- GitLab From b47849b4cb01a88371536ed660ff4f8aa01512b2 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Oct 2024 07:46:32 -0700 Subject: [PATCH 264/329] [SCEV] Avoid repeated hash lookups (NFC) (#112656) --- llvm/lib/Analysis/ScalarEvolution.cpp | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/llvm/lib/Analysis/ScalarEvolution.cpp b/llvm/lib/Analysis/ScalarEvolution.cpp index 3d028ab752f2..58e23e9556f1 100644 --- a/llvm/lib/Analysis/ScalarEvolution.cpp +++ b/llvm/lib/Analysis/ScalarEvolution.cpp @@ -9662,14 +9662,14 @@ Constant * ScalarEvolution::getConstantEvolutionLoopExitValue(PHINode *PN, const APInt &BEs, const Loop *L) { - auto I = ConstantEvolutionLoopExitValue.find(PN); - if (I != ConstantEvolutionLoopExitValue.end()) + auto [I, Inserted] = ConstantEvolutionLoopExitValue.try_emplace(PN); + if (!Inserted) return I->second; if (BEs.ugt(MaxBruteForceIterations)) - return ConstantEvolutionLoopExitValue[PN] = nullptr; // Not going to evaluate it. + return nullptr; // Not going to evaluate it. - Constant *&RetVal = ConstantEvolutionLoopExitValue[PN]; + Constant *&RetVal = I->second; DenseMap CurrentIterVals; BasicBlock *Header = L->getHeader(); -- GitLab From 8b6764fdc0c9550e3d8033006a4acfb466f74840 Mon Sep 17 00:00:00 2001 From: Kazu Hirata Date: Thu, 17 Oct 2024 07:47:06 -0700 Subject: [PATCH 265/329] [DebugInfo] Simplify code with std::unordered_map::operator[] (NFC) (#112658) --- llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp index e85356b5eab0..1c523c013149 100644 --- a/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp +++ b/llvm/lib/DebugInfo/LogicalView/Readers/LVDWARFReader.cpp @@ -1139,9 +1139,8 @@ void LVDWARFReader::updateReference(dwarf::Attribute Attr, // Get an element given the DIE offset. LVElement *LVDWARFReader::getElementForOffset(LVOffset Offset, LVElement *Element, bool IsType) { - auto Iter = ElementTable.try_emplace(Offset).first; // Update the element and all the references pointing to this element. 
-  LVElementEntry &Entry = Iter->second;
+  LVElementEntry &Entry = ElementTable[Offset];
   if (!Entry.Element) {
     if (IsType)
       Entry.Types.insert(Element);
-- 
GitLab


From caa32e6d6fec4c77d47f85d866e23b4c0e2501a0 Mon Sep 17 00:00:00 2001
From: Youngsuk Kim 
Date: Thu, 17 Oct 2024 10:47:44 -0400
Subject: [PATCH 266/329] [llvm][LSR] Fix where invariant on ScaledReg & Scale
 is violated (#112576)

Comments attached to the `ScaledReg` field of `struct Formula` explain
that `ScaledReg` must be non-null when `Scale` is non-zero.

This fixes up a code path where this invariant is violated. Also, add
an assert to ensure this invariant holds true.

Without this patch, the compiler aborts with the attached test case.

Fixes #76504

---
 .../Transforms/Scalar/LoopStrengthReduce.cpp  |  8 +++--
 .../LoopStrengthReduce/X86/pr76504.ll         | 30 +++++++++++++++++++
 2 files changed, 36 insertions(+), 2 deletions(-)
 create mode 100644 llvm/test/Transforms/LoopStrengthReduce/X86/pr76504.ll

diff --git a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
index 575395eda1c5..e55b8f6652e3 100644
--- a/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopStrengthReduce.cpp
@@ -622,6 +622,9 @@ static bool containsAddRecDependentOnLoop(const SCEV *S, const Loop &L) {
 /// representation.
 /// \see Formula::BaseRegs.
 bool Formula::isCanonical(const Loop &L) const {
+  assert((Scale == 0 || ScaledReg) &&
+         "ScaledReg must be non-null if Scale is non-zero");
+
   if (!ScaledReg)
     return BaseRegs.size() <= 1;
 
@@ -3973,9 +3976,10 @@ void LSRInstance::GenerateReassociationsImpl(LSRUse &LU, unsigned LUIdx,
         F.UnfoldedOffset =
             Immediate::getFixed((uint64_t)F.UnfoldedOffset.getFixedValue() +
                                 InnerSumSC->getValue()->getZExtValue());
-        if (IsScaledReg)
+        if (IsScaledReg) {
           F.ScaledReg = nullptr;
-        else
+          F.Scale = 0;
+        } else
           F.BaseRegs.erase(F.BaseRegs.begin() + Idx);
       } else if (IsScaledReg)
         F.ScaledReg = InnerSum;
diff --git a/llvm/test/Transforms/LoopStrengthReduce/X86/pr76504.ll b/llvm/test/Transforms/LoopStrengthReduce/X86/pr76504.ll
new file mode 100644
index 000000000000..94b9f7badb0f
--- /dev/null
+++ b/llvm/test/Transforms/LoopStrengthReduce/X86/pr76504.ll
@@ -0,0 +1,30 @@
+; Reduced from https://bugs.chromium.org/p/oss-fuzz/issues/detail?id=65323 by @RKSimon
+;
+; RUN: opt -S -passes=loop-reduce %s | FileCheck %s
+;
+; Make sure we don't trigger an assertion.
+
+target datalayout = "e-m:e-p270:32:32-p271:32:32-p272:64:64-i64:64-i128:128-f80:128-n8:16:32:64-S128"
+target triple = "x86_64-unknown-linux-gnu"
+
+@G = external global i32
+
+define void @foo() {
+; CHECK-LABEL: foo
+bb8:
+  br label %bb30
+
+bb30:                                             ; preds = %bb30, %bb8
+  %l0 = phi i64 [ -2222, %bb8 ], [ %r23, %bb30 ]
+  %A22 = alloca i16, align 2
+  %r23 = add nuw i64 1, %l0
+  %G7 = getelementptr i16, ptr %A22, i64 %r23
+  %B15 = urem i64 %r23, %r23
+  %G6 = getelementptr i16, ptr %G7, i64 %B15
+  %B1 = urem i64 %r23, %r23
+  %B8 = sub i64 -1, %r23
+  %B18 = sub i64 %B8, %B1
+  %G5 = getelementptr i16, ptr %G6, i64 %B18
+  store ptr %G5, ptr undef, align 8
+  br label %bb30
+}
-- 
GitLab


From 8f25c0bc7d59a65f27faa88d7debc47275a3a3da Mon Sep 17 00:00:00 2001
From: Boaz Brickner 
Date: Thu, 17 Oct 2024 16:50:47 +0200
Subject: [PATCH 267/329] [clang] Fix covariant cv-qualification check to
 require the override function return type to have the same or less
 cv-qualification (#112713)

This prevents changing cv-qualification from const to volatile or vice
versa, for example.
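As a minimal sketch (class names are illustrative) of an override the
stricter check now rejects, since const and volatile are unordered in the
partial order of cv-qualifications:

  struct Base {
    virtual const Base *f();
  };
  struct Derived : Base {
    // error: 'volatile Derived *' is neither the same as nor less
    // cv-qualified than 'const Base *'
    volatile Derived *f() override;
  };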
https://eel.is/c++draft/class.virtual#8.3

Previously, we checked whether the new return type was more qualified than
the old one and reported an error if so. But the standard requires the new
type to be the same as or less qualified than the old one, and since
cv-qualification is only partially ordered, we cannot rely on a "more
qualified" check to catch every violation.
Now, we reversed the condition: we check whether the old type is at least
as qualified as the new one, and return an error if it is not.

Also, adjusted the error name and message to clarify the requirement and
added a missing closing parenthesis.

Added tests to cover different use cases for classes with different
qualifications and also refactored them to make them easier to follow:
1. Use override to make sure the function names actually match.
2. Named the functions in a more descriptive way to clarify what each use
   case is checking.

Fixes: #111742
---
 clang/docs/ReleaseNotes.rst                   |  9 +++-
 .../clang/Basic/DiagnosticSemaKinds.td        |  6 +--
 clang/lib/Sema/SemaDeclCXX.cpp                |  4 +-
 clang/test/SemaCXX/virtual-override.cpp       | 52 ++++++++++++++++---
 4 files changed, 57 insertions(+), 14 deletions(-)

diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst
index 9977e8bd3ca6..1da8c82d52e6 100644
--- a/clang/docs/ReleaseNotes.rst
+++ b/clang/docs/ReleaseNotes.rst
@@ -99,17 +99,24 @@ C++ Specific Potentially Breaking Changes
       // Was error, now evaluates to false.
       constexpr bool b = f() == g();
 
-- Clang will now correctly not consider pointers to non classes for covariance.
+- Clang will now correctly not consider pointers to non classes for covariance
+  and disallow changing return type to a type that doesn't have the same or less cv-qualifications.
 
   .. code-block:: c++
 
     struct A {
       virtual const int *f() const;
+      virtual const std::string *g() const;
     };
     struct B : A {
       // Return type has less cv-qualification but doesn't point to a class.
       // Error will be generated.
       int *f() const override;
+
+      // Return type doesn't have more cv-qualification also not the same or
+      // less cv-qualification.
+      // Error will be generated.
+ volatile std::string *g() const override; }; - The warning ``-Wdeprecated-literal-operator`` is now on by default, as this is diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index c458a62d9be4..487dd8990d88 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -2182,10 +2182,10 @@ def err_covariant_return_incomplete : Error< def err_covariant_return_type_different_qualifications : Error< "return type of virtual function %0 is not covariant with the return type of " "the function it overrides (%1 has different qualifiers than %2)">; -def err_covariant_return_type_class_type_more_qualified : Error< +def err_covariant_return_type_class_type_not_same_or_less_qualified : Error< "return type of virtual function %0 is not covariant with the return type of " - "the function it overrides (class type %1 is more qualified than class " - "type %2">; + "the function it overrides (class type %1 does not have the same " + "cv-qualification as or less cv-qualification than class type %2)">; // C++ implicit special member functions def note_in_declaration_of_implicit_special_member : Note< diff --git a/clang/lib/Sema/SemaDeclCXX.cpp b/clang/lib/Sema/SemaDeclCXX.cpp index 38f808a470aa..43ec25b23d97 100644 --- a/clang/lib/Sema/SemaDeclCXX.cpp +++ b/clang/lib/Sema/SemaDeclCXX.cpp @@ -18338,9 +18338,9 @@ bool Sema::CheckOverridingFunctionReturnType(const CXXMethodDecl *New, // The new class type must have the same or less qualifiers as the old type. - if (NewClassTy.isMoreQualifiedThan(OldClassTy)) { + if (!OldClassTy.isAtLeastAsQualifiedAs(NewClassTy)) { Diag(New->getLocation(), - diag::err_covariant_return_type_class_type_more_qualified) + diag::err_covariant_return_type_class_type_not_same_or_less_qualified) << New->getDeclName() << NewTy << OldTy << New->getReturnTypeSourceRange(); Diag(Old->getLocation(), diag::note_overridden_virtual_function) diff --git a/clang/test/SemaCXX/virtual-override.cpp b/clang/test/SemaCXX/virtual-override.cpp index d37c275d46ba..ce6dd35e0b56 100644 --- a/clang/test/SemaCXX/virtual-override.cpp +++ b/clang/test/SemaCXX/virtual-override.cpp @@ -83,17 +83,53 @@ namespace T6 { struct a { }; class A { - virtual const a* f(); - virtual a* g(); // expected-note{{overridden virtual function is here}} - virtual const int* h(); // expected-note{{overridden virtual function is here}} - virtual int* i(); // expected-note{{overridden virtual function is here}} + // Classes. 
+ virtual const a* const_vs_unqualified_class(); + virtual a* unqualified_vs_const_class(); // expected-note{{overridden virtual function is here}} + + virtual volatile a* volatile_vs_unqualified_class(); + virtual a* unqualified_vs_volatile_class(); // expected-note{{overridden virtual function is here}} + + virtual const a* const_vs_volatile_class(); // expected-note{{overridden virtual function is here}} + virtual volatile a* volatile_vs_const_class(); // expected-note{{overridden virtual function is here}} + + virtual const volatile a* const_volatile_vs_const_class(); + virtual const a* const_vs_const_volatile_class(); // expected-note{{overridden virtual function is here}} + + virtual const volatile a* const_volatile_vs_volatile_class(); + virtual volatile a* volatile_vs_const_volatile_class(); // expected-note{{overridden virtual function is here}} + + virtual const volatile a* const_volatile_vs_unualified_class(); + virtual a* unqualified_vs_const_volatile_class(); // expected-note{{overridden virtual function is here}} + + // Non Classes. + virtual const int* const_vs_unqualified_non_class(); // expected-note{{overridden virtual function is here}} + virtual int* unqualified_vs_const_non_class(); // expected-note{{overridden virtual function is here}} }; class B : A { - virtual a* f(); - virtual const a* g(); // expected-error{{return type of virtual function 'g' is not covariant with the return type of the function it overrides (class type 'const a *' is more qualified than class type 'a *'}} - virtual int* h(); // expected-error{{virtual function 'h' has a different return type ('int *') than the function it overrides (which has return type 'const int *')}} - virtual const int* i(); // expected-error{{virtual function 'i' has a different return type ('const int *') than the function it overrides (which has return type 'int *')}} + // Classes. 
+ a* const_vs_unqualified_class() override; + const a* unqualified_vs_const_class() override; // expected-error{{return type of virtual function 'unqualified_vs_const_class' is not covariant with the return type of the function it overrides (class type 'const a *' does not have the same cv-qualification as or less cv-qualification than class type 'a *')}} + + a* volatile_vs_unqualified_class() override; + volatile a* unqualified_vs_volatile_class() override; // expected-error{{return type of virtual function 'unqualified_vs_volatile_class' is not covariant with the return type of the function it overrides (class type 'volatile a *' does not have the same cv-qualification as or less cv-qualification than class type 'a *')}} + + volatile a* const_vs_volatile_class() override; // expected-error{{return type of virtual function 'const_vs_volatile_class' is not covariant with the return type of the function it overrides (class type 'volatile a *' does not have the same cv-qualification as or less cv-qualification than class type 'const a *')}} + const a* volatile_vs_const_class() override; // expected-error{{return type of virtual function 'volatile_vs_const_class' is not covariant with the return type of the function it overrides (class type 'const a *' does not have the same cv-qualification as or less cv-qualification than class type 'volatile a *')}} + + const a* const_volatile_vs_const_class() override; + const volatile a* const_vs_const_volatile_class() override; // expected-error{{return type of virtual function 'const_vs_const_volatile_class' is not covariant with the return type of the function it overrides (class type 'const volatile a *' does not have the same cv-qualification as or less cv-qualification than class type 'const a *')}} + + volatile a* const_volatile_vs_volatile_class() override; + const volatile a* volatile_vs_const_volatile_class() override; // expected-error{{return type of virtual function 'volatile_vs_const_volatile_class' is not covariant with the return type of the function it overrides (class type 'const volatile a *' does not have the same cv-qualification as or less cv-qualification than class type 'volatile a *')}} + + a* const_volatile_vs_unualified_class() override; + const volatile a* unqualified_vs_const_volatile_class() override; // expected-error{{return type of virtual function 'unqualified_vs_const_volatile_class' is not covariant with the return type of the function it overrides (class type 'const volatile a *' does not have the same cv-qualification as or less cv-qualification than class type 'a *')}} + + // Non Classes. 
+ int* const_vs_unqualified_non_class() override; // expected-error{{virtual function 'const_vs_unqualified_non_class' has a different return type ('int *') than the function it overrides (which has return type 'const int *')}} + const int* unqualified_vs_const_non_class() override; // expected-error{{virtual function 'unqualified_vs_const_non_class' has a different return type ('const int *') than the function it overrides (which has return type 'int *')}} }; } -- GitLab From 92663defb1c27d809f644752d65d8ccff93a7054 Mon Sep 17 00:00:00 2001 From: Shilei Tian Date: Thu, 17 Oct 2024 10:55:29 -0400 Subject: [PATCH 268/329] [NFC][AMDGPU] Auto-generate check lines for some test cases (#112426) - `llvm/test/CodeGen/AMDGPU/andorbitset.ll` - `llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll` - `llvm/test/CodeGen/AMDGPU/fabs.f64.ll` - `llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll` - `llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll` --- llvm/test/CodeGen/AMDGPU/andorbitset.ll | 102 +++- llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll | 79 ++- llvm/test/CodeGen/AMDGPU/fabs.f64.ll | 155 ++++-- .../llvm.amdgcn.raw.ptr.buffer.store.ll | 471 +++++++++++++----- llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll | 137 ++++- 5 files changed, 752 insertions(+), 192 deletions(-) diff --git a/llvm/test/CodeGen/AMDGPU/andorbitset.ll b/llvm/test/CodeGen/AMDGPU/andorbitset.ll index a189ba9b1034..0fa58f3c444a 100644 --- a/llvm/test/CodeGen/AMDGPU/andorbitset.ll +++ b/llvm/test/CodeGen/AMDGPU/andorbitset.ll @@ -1,48 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; SI-LABEL: {{^}}s_clear_msb: -; SI: s_bitset0_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_clear_msb(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_clear_msb: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 %in, 2147483647 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_set_msb: -; SI: s_bitset1_b32 s{{[0-9]+}}, 31 define amdgpu_kernel void @s_set_msb(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_set_msb: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s4, 31 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 %in, 2147483648 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_clear_lsb: -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, -2 define amdgpu_kernel void @s_clear_lsb(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_clear_lsb: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s4, s4, -2 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 %in, 4294967294 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_set_lsb: -; SI: s_or_b32 s{{[0-9]+}}, s{{[0-9]+}}, 1 define amdgpu_kernel void 
@s_set_lsb(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_set_lsb: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_or_b32 s4, s4, 1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 %in, 1 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_clear_midbit: -; SI: s_bitset0_b32 s{{[0-9]+}}, 8 define amdgpu_kernel void @s_clear_midbit(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_clear_midbit: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s4, 8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 %in, 4294967039 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_set_midbit: -; SI: s_bitset1_b32 s{{[0-9]+}}, 8 define amdgpu_kernel void @s_set_midbit(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_set_midbit: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset1_b32 s4, 8 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 %in, 256 store i32 %x, ptr addrspace(1) %out ret void @@ -51,10 +106,27 @@ define amdgpu_kernel void @s_set_midbit(ptr addrspace(1) %out, i32 %in) { @gv = external addrspace(1) global i32 ; Make sure there's no verifier error with an undef source. 
-; SI-LABEL: {{^}}bitset_verifier_error: -; SI-NOT: %bb.1: -; SI: s_and_b32 s{{[0-9]+}}, s{{[0-9]+}}, 0x7fffffff define void @bitset_verifier_error() local_unnamed_addr #0 { +; SI-LABEL: bitset_verifier_error: +; SI: ; %bb.0: ; %bb +; SI-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; SI-NEXT: s_getpc_b64 s[4:5] +; SI-NEXT: s_add_u32 s4, s4, gv@gotpcrel32@lo+4 +; SI-NEXT: s_addc_u32 s5, s5, gv@gotpcrel32@hi+12 +; SI-NEXT: s_load_dwordx2 s[4:5], s[4:5], 0x0 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_and_b32 s8, s4, 0x7fffffff +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: buffer_store_dword v0, off, s[4:7], 0 +; SI-NEXT: s_waitcnt expcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, 0x3f7fbe77 +; SI-NEXT: v_cmp_ge_f32_e64 s[4:5], |s4|, v0 +; SI-NEXT: s_and_b64 vcc, exec, s[4:5] +; SI-NEXT: s_cbranch_vccnz .LBB6_2 +; SI-NEXT: ; %bb.1: ; %bb5 +; SI-NEXT: .LBB6_2: ; %bb6 bb: %i = call float @llvm.fabs.f32(float undef) #0 %i1 = bitcast float %i to i32 diff --git a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll index dc158028bd7b..4b56b5e9d24f 100644 --- a/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll +++ b/llvm/test/CodeGen/AMDGPU/andorxorinvimm.ll @@ -1,48 +1,103 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ; RUN: llc -mtriple=amdgcn -mcpu=tahiti -mattr=-flat-for-global -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; SI-LABEL: {{^}}s_or_to_orn2: -; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_or_to_orn2(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_or_to_orn2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_orn2_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 %in, -51 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_or_to_orn2_imm0: -; SI: s_orn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_or_to_orn2_imm0(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_or_to_orn2_imm0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_orn2_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = or i32 -51, %in store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_and_to_andn2: -; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_and_to_andn2(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_and_to_andn2: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_andn2_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 %in, -51 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_and_to_andn2_imm0: -; SI: s_andn2_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_and_to_andn2_imm0(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_and_to_andn2_imm0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: 
s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_andn2_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = and i32 -51, %in store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_xor_to_xnor: -; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_xor_to_xnor(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_xor_to_xnor: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xnor_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = xor i32 %in, -51 store i32 %x, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_xor_to_xnor_imm0: -; SI: s_xnor_b32 s{{[0-9]+}}, s{{[0-9]+}}, 50 define amdgpu_kernel void @s_xor_to_xnor_imm0(ptr addrspace(1) %out, i32 %in) { +; SI-LABEL: s_xor_to_xnor_imm0: +; SI: ; %bb.0: +; SI-NEXT: s_load_dword s4, s[2:3], 0xb +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_xnor_b32 s4, s4, 50 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; SI-NEXT: s_endpgm %x = xor i32 -51, %in store i32 %x, ptr addrspace(1) %out ret void diff --git a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll index 32d5fa6e72d7..f98124fe2ed7 100644 --- a/llvm/test/CodeGen/AMDGPU/fabs.f64.ll +++ b/llvm/test/CodeGen/AMDGPU/fabs.f64.ll @@ -1,4 +1,5 @@ -; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI -check-prefix=FUNC %s +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -march=amdgcn -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s declare i32 @llvm.amdgcn.workitem.id.x() nounwind readnone @@ -7,10 +8,25 @@ declare double @llvm.fabs.f64(double) readnone declare <2 x double> @llvm.fabs.v2f64(<2 x double>) readnone declare <4 x double> @llvm.fabs.v4f64(<4 x double>) readnone -; FUNC-LABEL: {{^}}v_fabs_f64: -; SI: v_and_b32 -; SI: s_endpgm define amdgpu_kernel void @v_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %in) { +; SI-LABEL: v_fabs_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_mov_b32 s10, 0 +; SI-NEXT: v_lshlrev_b32_e32 v0, 3, v0 +; SI-NEXT: v_mov_b32_e32 v1, 0 +; SI-NEXT: s_mov_b32 s11, s7 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_mov_b64 s[8:9], s[2:3] +; SI-NEXT: buffer_load_dwordx2 v[0:1], v[0:1], s[8:11], 0 addr64 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: s_waitcnt vmcnt(0) +; SI-NEXT: v_and_b32_e32 v1, 0x7fffffff, v1 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm %tid = call i32 @llvm.amdgcn.workitem.id.x() nounwind readnone %tidext = sext i32 %tid to i64 %gep = getelementptr double, ptr addrspace(1) %in, i64 %tidext @@ -20,75 +36,148 @@ define amdgpu_kernel void @v_fabs_f64(ptr addrspace(1) %out, ptr addrspace(1) %i ret void } -; FUNC-LABEL: {{^}}fabs_f64: -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_f64(ptr addrspace(1) %out, double %in) { +; SI-LABEL: fabs_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 
s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %in) store double %fabs, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabs_v2f64: -; SI: s_bitset0_b32 -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_v2f64(ptr addrspace(1) %out, <2 x double> %in) { +; SI-LABEL: fabs_v2f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[4:7], s[2:3], 0xd +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s7, 31 +; SI-NEXT: s_bitset0_b32 s5, 31 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v2, s6 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mov_b32_e32 v3, s7 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; SI-NEXT: s_endpgm %fabs = call <2 x double> @llvm.fabs.v2f64(<2 x double> %in) store <2 x double> %fabs, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabs_v4f64: -; SI: s_bitset0_b32 -; SI: s_bitset0_b32 -; SI: s_bitset0_b32 -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_v4f64(ptr addrspace(1) %out, <4 x double> %in) { +; SI-LABEL: fabs_v4f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx8 s[4:11], s[2:3], 0x11 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s7, 31 +; SI-NEXT: s_bitset0_b32 s11, 31 +; SI-NEXT: s_bitset0_b32 s9, 31 +; SI-NEXT: s_bitset0_b32 s5, 31 +; SI-NEXT: v_mov_b32_e32 v0, s8 +; SI-NEXT: v_mov_b32_e32 v2, s10 +; SI-NEXT: v_mov_b32_e32 v4, s4 +; SI-NEXT: v_mov_b32_e32 v6, s6 +; SI-NEXT: v_mov_b32_e32 v1, s9 +; SI-NEXT: v_mov_b32_e32 v3, s11 +; SI-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:16 +; SI-NEXT: v_mov_b32_e32 v5, s5 +; SI-NEXT: v_mov_b32_e32 v7, s7 +; SI-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 +; SI-NEXT: s_endpgm %fabs = call <4 x double> @llvm.fabs.v4f64(<4 x double> %in) store <4 x double> %fabs, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}fabs_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} -; SI: s_endpgm define amdgpu_kernel void @fabs_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) { +; SI-LABEL: fabs_fold_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mul_f64 v[0:1], |s[6:7]|, v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %fabs = call double @llvm.fabs.f64(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}fabs_fn_fold_f64: -; SI: s_load_dwordx2 [[ABS_VALUE:s\[[0-9]+:[0-9]+\]]], {{s\[[0-9]+:[0-9]+\]}}, 0x13 -; SI-NOT: and -; SI: v_mul_f64 {{v\[[0-9]+:[0-9]+\]}}, |[[ABS_VALUE]]|, {{v\[[0-9]+:[0-9]+\]}} -; SI: s_endpgm define amdgpu_kernel 
void @fabs_fn_fold_f64(ptr addrspace(1) %out, [8 x i32], double %in0, [8 x i32], double %in1) { +; SI-LABEL: fabs_fn_fold_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx2 s[4:5], s[2:3], 0x1d +; SI-NEXT: s_load_dwordx2 s[6:7], s[2:3], 0x13 +; SI-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s3, 0xf000 +; SI-NEXT: s_mov_b32 s2, -1 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: v_mov_b32_e32 v0, s4 +; SI-NEXT: v_mov_b32_e32 v1, s5 +; SI-NEXT: v_mul_f64 v[0:1], |s[6:7]|, v[0:1] +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[0:3], 0 +; SI-NEXT: s_endpgm %fabs = call double @fabs(double %in0) %fmul = fmul double %fabs, %in1 store double %fmul, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabs_free_f64: -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_free_f64(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: fabs_free_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm %bc= bitcast i64 %in to double %fabs = call double @llvm.fabs.f64(double %bc) store double %fabs, ptr addrspace(1) %out ret void } -; FUNC-LABEL: {{^}}fabs_fn_free_f64: -; SI: s_bitset0_b32 -; SI: s_endpgm define amdgpu_kernel void @fabs_fn_free_f64(ptr addrspace(1) %out, i64 %in) { +; SI-LABEL: fabs_fn_free_f64: +; SI: ; %bb.0: +; SI-NEXT: s_load_dwordx4 s[0:3], s[2:3], 0x9 +; SI-NEXT: s_mov_b32 s7, 0xf000 +; SI-NEXT: s_waitcnt lgkmcnt(0) +; SI-NEXT: s_bitset0_b32 s3, 31 +; SI-NEXT: s_mov_b32 s6, -1 +; SI-NEXT: s_mov_b32 s4, s0 +; SI-NEXT: s_mov_b32 s5, s1 +; SI-NEXT: v_mov_b32_e32 v0, s2 +; SI-NEXT: v_mov_b32_e32 v1, s3 +; SI-NEXT: buffer_store_dwordx2 v[0:1], off, s[4:7], 0 +; SI-NEXT: s_endpgm %bc= bitcast i64 %in to double %fabs = call double @fabs(double %bc) store double %fabs, ptr addrspace(1) %out diff --git a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll index d9227724c22a..855ca390aabd 100644 --- a/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll +++ b/llvm/test/CodeGen/AMDGPU/llvm.amdgcn.raw.ptr.buffer.store.ll @@ -1,12 +1,21 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 ;RUN: llc < %s -mtriple=amdgcn -mcpu=verde -verify-machineinstrs | FileCheck -check-prefix=VERDE %s ;RUN: llc < %s -mtriple=amdgcn -mcpu=tonga -verify-machineinstrs | FileCheck %s -;CHECK-LABEL: {{^}}buffer_store: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -;CHECK: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc -;CHECK: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc define amdgpu_ps void @buffer_store(ptr addrspace(8) inreg, <4 x float>, <4 x float>, <4 x float>) { +; VERDE-LABEL: buffer_store: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VERDE-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc +; VERDE-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CHECK-NEXT: buffer_store_dwordx4 v[4:7], off, s[0:3], 0 glc +; CHECK-NEXT: buffer_store_dwordx4 v[8:11], off, s[0:3], 0 slc +; CHECK-NEXT: 
s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 1) @@ -14,34 +23,54 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_immoffs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 define amdgpu_ps void @buffer_store_immoffs(ptr addrspace(8) inreg, <4 x float>) { +; VERDE-LABEL: buffer_store_immoffs: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_immoffs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:42 +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 42, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_ofs: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen define amdgpu_ps void @buffer_store_ofs(ptr addrspace(8) inreg, <4 x float>, i32) { +; VERDE-LABEL: buffer_store_ofs: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_ofs: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0) ret void } ; Ideally, the register allocator would avoid the wait here -; -;CHECK-LABEL: {{^}}buffer_store_wait: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen -;VERDE: s_waitcnt expcnt(0) -;CHECK: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen -;CHECK: s_waitcnt vmcnt(0) -;CHECK: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen define amdgpu_ps void @buffer_store_wait(ptr addrspace(8) inreg, <4 x float>, i32, i32, i32) { +; VERDE-LABEL: buffer_store_wait: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; VERDE-NEXT: s_waitcnt expcnt(0) +; VERDE-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen +; VERDE-NEXT: s_waitcnt vmcnt(0) +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_wait: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; CHECK-NEXT: buffer_load_dwordx4 v[0:3], v5, s[0:3], 0 offen +; CHECK-NEXT: s_waitcnt vmcnt(0) +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v6, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f32(<4 x float> %1, ptr addrspace(8) %0, i32 %2, i32 0, i32 0) %data = call <4 x float> @llvm.amdgcn.raw.ptr.buffer.load.v4f32(ptr addrspace(8) %0, i32 %3, i32 0, i32 0) @@ -49,29 +78,48 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_x1: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_x1(ptr addrspace(8) inreg %rsrc, float %data, i32 %offset) { +; VERDE-LABEL: buffer_store_x1: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float 
%data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x2: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_x2(ptr addrspace(8) inreg %rsrc, <2 x float> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_x2: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x2: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_and: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 define amdgpu_ps void @buffer_store_x1_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: buffer_store_x1_offen_merged_and: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1_offen_merged_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -87,11 +135,20 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_and(ptr addrspace(8) inreg % ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_or: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:28 define amdgpu_ps void @buffer_store_x1_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: buffer_store_x1_offen_merged_or: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; VERDE-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1_offen_merged_or: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 6, v0 +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 +; CHECK-NEXT: s_endpgm %a = shl i32 %inp, 6 %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 @@ -109,12 +166,20 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_or(ptr addrspace(8) inreg %r } -;CHECK-LABEL: {{^}}buffer_store_x1_offen_merged_glc_slc: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4{{$}} -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:12 glc{{$}} -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:28 glc slc{{$}} define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(ptr addrspace(8) inreg %rsrc, i32 %a, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: 
buffer_store_x1_offen_merged_glc_slc: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc +; VERDE-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1_offen_merged_glc_slc: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx2 v[1:2], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[3:4], v0, s[0:3], 0 offen offset:12 glc +; CHECK-NEXT: buffer_store_dwordx2 v[5:6], v0, s[0:3], 0 offen offset:28 glc slc +; CHECK-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 8 %a3 = add i32 %a, 12 @@ -130,10 +195,16 @@ define amdgpu_ps void @buffer_store_x1_offen_merged_glc_slc(ptr addrspace(8) inr ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_and: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v0, s[0:3], 0 offen offset:4 define amdgpu_ps void @buffer_store_x2_offen_merged_and(ptr addrspace(8) inreg %rsrc, i32 %a, <2 x float> %v1, <2 x float> %v2) { +; VERDE-LABEL: buffer_store_x2_offen_merged_and: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x2_offen_merged_and: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_endpgm %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 %a1, i32 0, i32 0) @@ -141,10 +212,18 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_and(ptr addrspace(8) inreg % ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offen_merged_or: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], v{{[0-9]}}, s[0:3], 0 offen offset:4 define amdgpu_ps void @buffer_store_x2_offen_merged_or(ptr addrspace(8) inreg %rsrc, i32 %inp, <2 x float> %v1, <2 x float> %v2) { +; VERDE-LABEL: buffer_store_x2_offen_merged_or: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; VERDE-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x2_offen_merged_or: +; CHECK: ; %bb.0: +; CHECK-NEXT: v_lshlrev_b32_e32 v0, 4, v0 +; CHECK-NEXT: buffer_store_dwordx4 v[1:4], v0, s[0:3], 0 offen offset:4 +; CHECK-NEXT: s_endpgm %a = shl i32 %inp, 4 %a1 = add i32 %a, 4 %a2 = add i32 %a, 12 @@ -153,11 +232,18 @@ define amdgpu_ps void @buffer_store_x2_offen_merged_or(ptr addrspace(8) inreg %r ret void } -;CHECK-LABEL: {{^}}buffer_store_x1_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 define amdgpu_ps void @buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: buffer_store_x1_offset_merged: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x1_offset_merged: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, 
i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) @@ -167,21 +253,35 @@ define amdgpu_ps void @buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsr ret void } -;CHECK-LABEL: {{^}}buffer_store_x2_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 define amdgpu_ps void @buffer_store_x2_offset_merged(ptr addrspace(8) inreg %rsrc, <2 x float> %v1,<2 x float> %v2) { +; VERDE-LABEL: buffer_store_x2_offset_merged: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_x2_offset_merged: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.v2f32(<2 x float> %v2, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_int: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 -;CHECK: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc -;CHECK: buffer_store_dword v6, off, s[0:3], 0 slc define amdgpu_ps void @buffer_store_int(ptr addrspace(8) inreg, <4 x i32>, <2 x i32>, i32) { +; VERDE-LABEL: buffer_store_int: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; VERDE-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc +; VERDE-NEXT: buffer_store_dword v6, off, s[0:3], 0 slc +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_int: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 +; CHECK-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 glc +; CHECK-NEXT: buffer_store_dword v6, off, s[0:3], 0 slc +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i32(<4 x i32> %1, ptr addrspace(8) %0, i32 0, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.v2i32(<2 x i32> %2, ptr addrspace(8) %0, i32 0, i32 0, i32 1) @@ -189,12 +289,18 @@ main_body: ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_byte: -;CHECK-NEXT: %bb. -;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_byte v{{[0-9]}}, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_ptr_buffer_store_byte(ptr addrspace(8) inreg %rsrc, float %v1) { +; VERDE-LABEL: raw_ptr_buffer_store_byte: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VERDE-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_byte: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_byte v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i8 @@ -202,12 +308,18 @@ main_body: ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_short: -;CHECK-NEXT: %bb. 
-;CHECK-NEXT: v_cvt_u32_f32_e32 v{{[0-9]}}, v{{[0-9]}} -;CHECK-NEXT: buffer_store_short v{{[0-9]}}, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_ptr_buffer_store_short(ptr addrspace(8) inreg %rsrc, float %v1) { +; VERDE-LABEL: raw_ptr_buffer_store_short: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_u32_f32_e32 v0, v0 +; VERDE-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_short: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: v_cvt_u32_f32_e32 v0, v0 +; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm main_body: %v2 = fptoui float %v1 to i32 %v3 = trunc i32 %v2 to i16 @@ -215,12 +327,16 @@ main_body: ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_f16: -;CHECK-NEXT: %bb. -;CHECK-NOT: v0 -;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_ptr_buffer_store_f16(ptr addrspace(8) inreg %rsrc, i32 %v1) { +; VERDE-LABEL: raw_ptr_buffer_store_f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 %cast = bitcast i16 %trunc to half @@ -228,74 +344,169 @@ main_body: ret void } -;CHECK-LABEL: {{^}}buffer_store_v2f16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v2f16(ptr addrspace(8) inreg %rsrc, <2 x half> %data, i32 %offset) { +; VERDE-LABEL: buffer_store_v2f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v2f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v2f16(<2 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v4f16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v4f16(ptr addrspace(8) inreg %rsrc, <4 x half> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v4f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VERDE-NEXT: v_cvt_f16_f32_e32 v5, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v2, v1 +; VERDE-NEXT: v_lshlrev_b32_e32 v2, 16, v5 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v2 +; VERDE-NEXT: buffer_store_dwordx2 v[0:1], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v4f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4f16(<4 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v8f16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v8f16(ptr addrspace(8) inreg %rsrc, <8 x half> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v8f16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: 
v_cvt_f16_f32_e32 v7, v7 +; VERDE-NEXT: v_cvt_f16_f32_e32 v6, v6 +; VERDE-NEXT: v_cvt_f16_f32_e32 v9, v5 +; VERDE-NEXT: v_cvt_f16_f32_e32 v3, v3 +; VERDE-NEXT: v_cvt_f16_f32_e32 v1, v1 +; VERDE-NEXT: v_cvt_f16_f32_e32 v4, v4 +; VERDE-NEXT: v_cvt_f16_f32_e32 v2, v2 +; VERDE-NEXT: v_cvt_f16_f32_e32 v0, v0 +; VERDE-NEXT: v_lshlrev_b32_e32 v5, 16, v7 +; VERDE-NEXT: v_or_b32_e32 v5, v6, v5 +; VERDE-NEXT: v_lshlrev_b32_e32 v6, 16, v9 +; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_or_b32_e32 v4, v4, v6 +; VERDE-NEXT: v_or_b32_e32 v3, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v2, v0, v1 +; VERDE-NEXT: buffer_store_dwordx4 v[2:5], v8, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v8f16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v8f16(<8 x half> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v2bf16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v2bf16(ptr addrspace(8) inreg %rsrc, <2 x bfloat> %data, i32 %offset) { +; VERDE-LABEL: buffer_store_v2bf16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; VERDE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VERDE-NEXT: v_alignbit_b32 v0, v1, v0, 16 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v2bf16: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v2bf16(<2 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v4bf16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v4bf16(ptr addrspace(8) inreg %rsrc, <4 x bfloat> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v4bf16: +; VERDE: ; %bb.0: +; VERDE-NEXT: v_mul_f32_e32 v3, 1.0, v3 +; VERDE-NEXT: v_mul_f32_e32 v1, 1.0, v1 +; VERDE-NEXT: v_lshrrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_mul_f32_e32 v2, 1.0, v2 +; VERDE-NEXT: v_lshrrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_mul_f32_e32 v0, 1.0, v0 +; VERDE-NEXT: v_alignbit_b32 v2, v3, v2, 16 +; VERDE-NEXT: v_alignbit_b32 v1, v1, v0, 16 +; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v4bf16: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.v4bf16(<4 x bfloat> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_i16: -;CHECK-NEXT: %bb. 
-;CHECK-NOT: v0 -;CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 -;CHECK-NEXT: s_endpgm define amdgpu_ps void @raw_ptr_buffer_store_i16(ptr addrspace(8) inreg %rsrc, i32 %v1) { +; VERDE-LABEL: raw_ptr_buffer_store_i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: buffer_store_short v0, off, s[0:3], 0 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_short v0, off, s[0:3], 0 +; CHECK-NEXT: s_endpgm main_body: %trunc = trunc i32 %v1 to i16 call void @llvm.amdgcn.raw.ptr.buffer.store.i16(i16 %trunc, ptr addrspace(8) %rsrc, i32 0, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v2i16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dword v0, v1, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v2i16(ptr addrspace(8) inreg %rsrc, <2 x i16> %data, i32 %offset) { +; VERDE-LABEL: buffer_store_v2i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v0, v0, v1 +; VERDE-NEXT: buffer_store_dword v0, v2, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v2i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dword v0, v1, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v2i16(<2 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}buffer_store_v4i16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v4i16(ptr addrspace(8) inreg %rsrc, <4 x i16> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v4i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v2, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v1, v0, v1 +; VERDE-NEXT: buffer_store_dwordx2 v[1:2], v4, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v4i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx2 v[0:1], v2, s[0:3], 0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v4i16(<4 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void @@ -307,21 +518,45 @@ main_body: ; call void @llvm.amdgcn.raw.ptr.buffer.store.v6i16(<6 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ; ret void ; } - -;CHECK-LABEL: {{^}}buffer_store_v8i16: -;CHECK-NOT: s_waitcnt -;CHECK: buffer_store_dwordx4 v[0:3], v4, s[0:3], 0 offen define amdgpu_ps void @buffer_store_v8i16(ptr addrspace(8) inreg %rsrc, <8 x i16> %data, i32 %offset) #0 { +; VERDE-LABEL: buffer_store_v8i16: +; VERDE: ; %bb.0: ; %main_body +; VERDE-NEXT: v_lshlrev_b32_e32 v7, 16, v7 +; VERDE-NEXT: v_and_b32_e32 v6, 0xffff, v6 +; VERDE-NEXT: v_lshlrev_b32_e32 v5, 16, v5 +; VERDE-NEXT: v_and_b32_e32 v4, 0xffff, v4 +; VERDE-NEXT: v_lshlrev_b32_e32 v3, 16, v3 +; VERDE-NEXT: v_and_b32_e32 v2, 0xffff, v2 +; VERDE-NEXT: v_lshlrev_b32_e32 v1, 16, v1 +; VERDE-NEXT: v_and_b32_e32 v0, 0xffff, v0 +; VERDE-NEXT: v_or_b32_e32 v6, v6, v7 +; VERDE-NEXT: v_or_b32_e32 v5, v4, v5 +; VERDE-NEXT: v_or_b32_e32 v4, v2, v3 +; VERDE-NEXT: v_or_b32_e32 v3, v0, v1 +; VERDE-NEXT: buffer_store_dwordx4 v[3:6], v8, s[0:3], 0 offen +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: buffer_store_v8i16: +; CHECK: ; %bb.0: ; %main_body +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], v4, s[0:3], 
0 offen +; CHECK-NEXT: s_endpgm main_body: call void @llvm.amdgcn.raw.ptr.buffer.store.v8i16(<8 x i16> %data, ptr addrspace(8) %rsrc, i32 %offset, i32 0, i32 0) ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_x1_offset_merged: -;CHECK-NOT: s_waitcnt -;CHECK-DAG: buffer_store_dwordx4 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dwordx2 v[{{[0-9]}}:{{[0-9]}}], off, s[0:3], 0 offset:28 define amdgpu_ps void @raw_ptr_buffer_store_x1_offset_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: raw_ptr_buffer_store_x1_offset_merged: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; VERDE-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_x1_offset_merged: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dwordx4 v[0:3], off, s[0:3], 0 offset:4 +; CHECK-NEXT: buffer_store_dwordx2 v[4:5], off, s[0:3], 0 offset:28 +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 0) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 0) @@ -331,14 +566,26 @@ define amdgpu_ps void @raw_ptr_buffer_store_x1_offset_merged(ptr addrspace(8) in ret void } -;CHECK-LABEL: {{^}}raw_ptr_buffer_store_x1_offset_swizzled_not_merged: -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:4 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:8 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:12 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:16 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:28 -;CHECK-DAG: buffer_store_dword v{{[0-9]}}, off, s[0:3], 0 offset:32 define amdgpu_ps void @raw_ptr_buffer_store_x1_offset_swizzled_not_merged(ptr addrspace(8) inreg %rsrc, float %v1, float %v2, float %v3, float %v4, float %v5, float %v6) { +; VERDE-LABEL: raw_ptr_buffer_store_x1_offset_swizzled_not_merged: +; VERDE: ; %bb.0: +; VERDE-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; VERDE-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 +; VERDE-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12 +; VERDE-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:16 +; VERDE-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:28 +; VERDE-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:32 +; VERDE-NEXT: s_endpgm +; +; CHECK-LABEL: raw_ptr_buffer_store_x1_offset_swizzled_not_merged: +; CHECK: ; %bb.0: +; CHECK-NEXT: buffer_store_dword v0, off, s[0:3], 0 offset:4 +; CHECK-NEXT: buffer_store_dword v1, off, s[0:3], 0 offset:8 +; CHECK-NEXT: buffer_store_dword v2, off, s[0:3], 0 offset:12 +; CHECK-NEXT: buffer_store_dword v3, off, s[0:3], 0 offset:16 +; CHECK-NEXT: buffer_store_dword v4, off, s[0:3], 0 offset:28 +; CHECK-NEXT: buffer_store_dword v5, off, s[0:3], 0 offset:32 +; CHECK-NEXT: s_endpgm call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v1, ptr addrspace(8) %rsrc, i32 4, i32 0, i32 8) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v2, ptr addrspace(8) %rsrc, i32 8, i32 0, i32 8) call void @llvm.amdgcn.raw.ptr.buffer.store.f32(float %v3, ptr addrspace(8) %rsrc, i32 12, i32 0, i32 8) diff --git a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll index c7987d3d0091..02641f5b6ae8 
100644 --- a/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll +++ b/llvm/test/CodeGen/AMDGPU/s_mulk_i32.ll @@ -1,40 +1,118 @@ -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s -; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=SI %s - -; SI-LABEL: {{^}}s_mulk_i32_k0: -; SI: s_load_dword [[VAL:s[0-9]+]] -; SI: s_mulk_i32 [[VAL]], 0x41 -; SI: v_mov_b32_e32 [[VRESULT:v[0-9]+]], [[VAL]] -; SI: buffer_store_dword [[VRESULT]] -; SI: s_endpgm +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tahiti -verify-machineinstrs < %s | FileCheck -check-prefix=GFX6 %s +; RUN: llc -mtriple=amdgcn--amdpal -mcpu=tonga -mattr=-flat-for-global,-xnack -verify-machineinstrs < %s | FileCheck -check-prefix=GFX8 %s + define amdgpu_kernel void @s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) { +; GFX6-LABEL: s_mulk_i32_k0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mulk_i32 s4, 0x41 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_mulk_i32_k0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mulk_i32 s4, 0x41 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %mul = mul i32 %b, 65 store i32 %mul, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_mulk_i32_k1: -; SI: s_mulk_i32 {{s[0-9]+}}, 0x7fff{{$}} -; SI: s_endpgm define amdgpu_kernel void @s_mulk_i32_k1(ptr addrspace(1) %out, i32 %b) { +; GFX6-LABEL: s_mulk_i32_k1: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mulk_i32 s4, 0x7fff +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_mulk_i32_k1: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mulk_i32 s4, 0x7fff +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %mul = mul i32 %b, 32767 ; (1 << 15) - 1 store i32 %mul, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}s_mulk_i32_k2: -; SI: s_mulk_i32 {{s[0-9]+}}, 0xffef{{$}} -; SI: s_endpgm define amdgpu_kernel void @s_mulk_i32_k2(ptr addrspace(1) %out, i32 %b) { +; GFX6-LABEL: s_mulk_i32_k2: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mulk_i32 s4, 0xffef +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: s_mulk_i32_k2: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 
s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mulk_i32 s4, 0xffef +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %mul = mul i32 %b, -17 store i32 %mul, ptr addrspace(1) %out ret void } -; SI-LABEL: {{^}}no_s_mulk_i32_k0: -; SI: s_mul_i32 {{s[0-9]+}}, {{s[0-9]+}}, 0x8001{{$}} -; SI: s_endpgm define amdgpu_kernel void @no_s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) { +; GFX6-LABEL: no_s_mulk_i32_k0: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s4, s[2:3], 0x2 +; GFX6-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX6-NEXT: s_mov_b32 s3, 0xf000 +; GFX6-NEXT: s_mov_b32 s2, -1 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mul_i32 s4, s4, 0x8001 +; GFX6-NEXT: v_mov_b32_e32 v0, s4 +; GFX6-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: no_s_mulk_i32_k0: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s4, s[2:3], 0x8 +; GFX8-NEXT: s_load_dwordx2 s[0:1], s[2:3], 0x0 +; GFX8-NEXT: s_mov_b32 s3, 0xf000 +; GFX8-NEXT: s_mov_b32 s2, -1 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mul_i32 s4, s4, 0x8001 +; GFX8-NEXT: v_mov_b32_e32 v0, s4 +; GFX8-NEXT: buffer_store_dword v0, off, s[0:3], 0 +; GFX8-NEXT: s_endpgm %mul = mul i32 %b, 32769 ; 1 << 15 + 1 store i32 %mul, ptr addrspace(1) %out ret void @@ -42,9 +120,28 @@ define amdgpu_kernel void @no_s_mulk_i32_k0(ptr addrspace(1) %out, i32 %b) { @lds = addrspace(3) global [512 x i32] undef, align 4 -; SI-LABEL: {{^}}commute_s_mulk_i32: -; SI: s_mulk_i32 s{{[0-9]+}}, 0x800{{$}} define amdgpu_kernel void @commute_s_mulk_i32(ptr addrspace(1) %out, i32 %b) #0 { +; GFX6-LABEL: commute_s_mulk_i32: +; GFX6: ; %bb.0: +; GFX6-NEXT: s_load_dword s0, s[2:3], 0x2 +; GFX6-NEXT: v_mov_b32_e32 v0, 0 +; GFX6-NEXT: s_waitcnt lgkmcnt(0) +; GFX6-NEXT: s_mulk_i32 s0, 0x800 +; GFX6-NEXT: ;;#ASMSTART +; GFX6-NEXT: ; foo v0, s0 +; GFX6-NEXT: ;;#ASMEND +; GFX6-NEXT: s_endpgm +; +; GFX8-LABEL: commute_s_mulk_i32: +; GFX8: ; %bb.0: +; GFX8-NEXT: s_load_dword s0, s[2:3], 0x8 +; GFX8-NEXT: v_mov_b32_e32 v0, 0 +; GFX8-NEXT: s_waitcnt lgkmcnt(0) +; GFX8-NEXT: s_mulk_i32 s0, 0x800 +; GFX8-NEXT: ;;#ASMSTART +; GFX8-NEXT: ; foo v0, s0 +; GFX8-NEXT: ;;#ASMEND +; GFX8-NEXT: s_endpgm %size = call i32 @llvm.amdgcn.groupstaticsize() %add = mul i32 %size, %b call void asm sideeffect "; foo $0, $1", "v,s"(ptr addrspace(3) @lds, i32 %add) -- GitLab From 8c7f80f77505b7ff275d67a49f4f2dd07d604403 Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 17 Oct 2024 08:19:58 -0700 Subject: [PATCH 269/329] [lldb] Disable warning about codecvt_utf8 deprecation (NFC) (#112446) Disable -Wdeprecated-declarations for codecvt_utf8 in Editline. This is in preparation for #112276 which narrows the scope of -Wno-deprecated-declarations for building LLDB. 
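As a usage sketch (mirroring the macros and the guarded declaration in the diff below; the exact expansion depends on the compiler):

    // Narrow the suppression to just the deprecated declaration.
    LLDB_DEPRECATED_WARNING_DISABLE   // push diagnostics, ignore -Wdeprecated-declarations
    std::codecvt_utf8<wchar_t> cvt;   // std::codecvt_utf8 is deprecated since C++17
    LLDB_DEPRECATED_WARNING_RESTORE   // pop back to the previous diagnostic state

On Clang (and GCC newer than 6) the macros expand to _Pragma-based push/ignored/pop directives; on other compilers they expand to nothing, so the declaration still compiles and the warning stays enabled everywhere else.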
--- lldb/include/lldb/Host/Editline.h | 19 +++++++++++++++++++ lldb/source/Host/common/Editline.cpp | 2 ++ 2 files changed, 21 insertions(+) diff --git a/lldb/include/lldb/Host/Editline.h b/lldb/include/lldb/Host/Editline.h index 9049b106f02a..a02f90891599 100644 --- a/lldb/include/lldb/Host/Editline.h +++ b/lldb/include/lldb/Host/Editline.h @@ -57,6 +57,23 @@ #include "llvm/ADT/FunctionExtras.h" +#if defined(__clang__) && defined(__has_warning) +#if __has_warning("-Wdeprecated-declarations") +#define LLDB_DEPRECATED_WARNING_DISABLE \ + _Pragma("clang diagnostic push") \ + _Pragma("clang diagnostic ignored \"-Wdeprecated-declarations\"") +#define LLDB_DEPRECATED_WARNING_RESTORE _Pragma("clang diagnostic pop") +#endif +#elif defined(__GNUC__) && __GNUC__ > 6 +#define LLDB_DEPRECATED_WARNING_DISABLE \ + _Pragma("GCC diagnostic push") \ + _Pragma("GCC diagnostic ignored \"-Wdeprecated-declarations\"") +#define LLDB_DEPRECATED_WARNING_RESTORE _Pragma("GCC diagnostic pop") +#else +#define LLDB_DEPRECATED_WARNING_DISABLE +#define LLDB_DEPRECATED_WARNING_RESTORE +#endif + namespace lldb_private { namespace line_editor { @@ -367,7 +384,9 @@ private: void SetGetCharacterFunction(EditlineGetCharCallbackType callbackFn); #if LLDB_EDITLINE_USE_WCHAR + LLDB_DEPRECATED_WARNING_DISABLE std::wstring_convert<std::codecvt_utf8<wchar_t>> m_utf8conv; + LLDB_DEPRECATED_WARNING_RESTORE #endif ::EditLine *m_editline = nullptr; EditlineHistorySP m_history_sp; diff --git a/lldb/source/Host/common/Editline.cpp b/lldb/source/Host/common/Editline.cpp index 561ec228cdb2..60117cb5f0e6 100644 --- a/lldb/source/Host/common/Editline.cpp +++ b/lldb/source/Host/common/Editline.cpp @@ -1574,7 +1574,9 @@ bool Editline::CompleteCharacter(char ch, EditLineGetCharType &out) { out = (unsigned char)ch; return true; #else + LLDB_DEPRECATED_WARNING_DISABLE std::codecvt_utf8<wchar_t> cvt; + LLDB_DEPRECATED_WARNING_RESTORE llvm::SmallString<4> input; for (;;) { const char *from_next; -- GitLab From 85c17e40926132575d1b98ca1a36b8394fe511cd Mon Sep 17 00:00:00 2001 From: Jay Foad Date: Thu, 17 Oct 2024 16:20:43 +0100 Subject: [PATCH 270/329] [LLVM] Make more use of IRBuilder::CreateIntrinsic. NFC. (#112706) Convert many instances of: Fn = Intrinsic::getOrInsertDeclaration(...); CreateCall(Fn, ...) to the equivalent CreateIntrinsic call.
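As an illustration of the rewrite (a schematic sketch rather than a hunk from this patch; the intrinsic, types, and operands are placeholders):

    // Before: two steps, declare the intrinsic and then call it.
    Function *Fn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::fshl, Ty);
    Value *Res = Builder.CreateCall(Fn, {Src, Src, Amt});

    // After: one step, CreateIntrinsic looks up or inserts the declaration
    // itself and emits the call.
    Value *Res = Builder.CreateIntrinsic(Intrinsic::fshl, Ty, {Src, Src, Amt});

Both forms emit identical IR, which is why the change is NFC.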
--- llvm/lib/CodeGen/ExpandVectorPredication.cpp | 37 +++-- llvm/lib/CodeGen/HardwareLoops.cpp | 13 +- llvm/lib/IR/AutoUpgrade.cpp | 68 ++++----- llvm/lib/IR/IRBuilder.cpp | 133 +++++------------- .../Target/AArch64/AArch64ISelLowering.cpp | 15 +- .../Target/AArch64/AArch64StackTagging.cpp | 12 +- .../Target/AMDGPU/AMDGPUAtomicOptimizer.cpp | 8 +- .../Target/AMDGPU/AMDGPUCodeGenPrepare.cpp | 14 +- .../AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp | 4 +- .../AMDGPU/AMDGPUInstCombineIntrinsic.cpp | 20 +-- .../AMDGPU/AMDGPULowerModuleLDSPass.cpp | 6 +- .../lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp | 26 ++-- llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp | 4 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 37 ++--- llvm/lib/Target/ARM/MVETailPredication.cpp | 3 +- llvm/lib/Target/DirectX/DXILOpLowering.cpp | 5 +- llvm/lib/Target/Hexagon/HexagonGenExtract.cpp | 5 +- .../Target/Hexagon/HexagonISelLowering.cpp | 10 +- .../Target/Hexagon/HexagonVectorCombine.cpp | 11 +- .../LoongArch/LoongArchISelLowering.cpp | 6 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 10 +- llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 56 +++----- llvm/lib/Target/SystemZ/SystemZTDC.cpp | 5 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 19 +-- .../Target/X86/X86InstCombineIntrinsic.cpp | 10 +- .../Target/XCore/XCoreLowerThreadLocal.cpp | 4 +- .../AggressiveInstCombine.cpp | 22 ++- llvm/lib/Transforms/IPO/CrossDSOCFI.cpp | 9 +- .../InstCombine/InstCombineAndOrXor.cpp | 3 +- .../InstCombine/InstCombineCalls.cpp | 9 +- .../InstCombine/InstCombineCompares.cpp | 12 +- .../Instrumentation/AddressSanitizer.cpp | 7 +- .../Instrumentation/BoundsChecking.cpp | 7 +- .../Instrumentation/MemorySanitizer.cpp | 18 +-- .../Instrumentation/SanitizerCoverage.cpp | 9 +- .../Transforms/Scalar/LoopDataPrefetch.cpp | 12 +- llvm/lib/Transforms/Scalar/LoopFlatten.cpp | 8 +- .../Transforms/Scalar/LoopIdiomRecognize.cpp | 8 +- .../Scalar/LowerMatrixIntrinsics.cpp | 5 +- llvm/lib/Transforms/Utils/InlineFunction.cpp | 9 +- .../Transforms/Utils/MemoryTaggingSupport.cpp | 13 +- .../Utils/ScalarEvolutionExpander.cpp | 7 +- .../lib/Transforms/Utils/SimplifyLibCalls.cpp | 5 +- 43 files changed, 251 insertions(+), 453 deletions(-) diff --git a/llvm/lib/CodeGen/ExpandVectorPredication.cpp b/llvm/lib/CodeGen/ExpandVectorPredication.cpp index 32ba3e91822d..dd18b524e3f9 100644 --- a/llvm/lib/CodeGen/ExpandVectorPredication.cpp +++ b/llvm/lib/CodeGen/ExpandVectorPredication.cpp @@ -235,13 +235,12 @@ Value *CachingVPExpander::convertEVLToMask(IRBuilder<> &Builder, // TODO add caching // Scalable vector %evl conversion. if (ElemCount.isScalable()) { - auto *M = Builder.GetInsertBlock()->getModule(); Type *BoolVecTy = VectorType::get(Builder.getInt1Ty(), ElemCount); - Function *ActiveMaskFunc = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::get_active_lane_mask, {BoolVecTy, EVLParam->getType()}); // `get_active_lane_mask` performs an implicit less-than comparison. Value *ConstZero = Builder.getInt32(0); - return Builder.CreateCall(ActiveMaskFunc, {ConstZero, EVLParam}); + return Builder.CreateIntrinsic(Intrinsic::get_active_lane_mask, + {BoolVecTy, EVLParam->getType()}, + {ConstZero, EVLParam}); } // Fixed vector %evl conversion. 
@@ -299,18 +298,18 @@ Value *CachingVPExpander::expandPredicationToIntCall( case Intrinsic::umin: { Value *Op0 = VPI.getOperand(0); Value *Op1 = VPI.getOperand(1); - Function *Fn = Intrinsic::getOrInsertDeclaration( - VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); - Value *NewOp = Builder.CreateCall(Fn, {Op0, Op1}, VPI.getName()); + Value *NewOp = Builder.CreateIntrinsic( + UnpredicatedIntrinsicID, {VPI.getType()}, {Op0, Op1}, + /*FMFSource=*/nullptr, VPI.getName()); replaceOperation(*NewOp, VPI); return NewOp; } case Intrinsic::bswap: case Intrinsic::bitreverse: { Value *Op = VPI.getOperand(0); - Function *Fn = Intrinsic::getOrInsertDeclaration( - VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); - Value *NewOp = Builder.CreateCall(Fn, {Op}, VPI.getName()); + Value *NewOp = + Builder.CreateIntrinsic(UnpredicatedIntrinsicID, {VPI.getType()}, {Op}, + /*FMFSource=*/nullptr, VPI.getName()); replaceOperation(*NewOp, VPI); return NewOp; } @@ -327,9 +326,9 @@ Value *CachingVPExpander::expandPredicationToFPCall( case Intrinsic::fabs: case Intrinsic::sqrt: { Value *Op0 = VPI.getOperand(0); - Function *Fn = Intrinsic::getOrInsertDeclaration( - VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); - Value *NewOp = Builder.CreateCall(Fn, {Op0}, VPI.getName()); + Value *NewOp = + Builder.CreateIntrinsic(UnpredicatedIntrinsicID, {VPI.getType()}, {Op0}, + /*FMFSource=*/nullptr, VPI.getName()); replaceOperation(*NewOp, VPI); return NewOp; } @@ -337,9 +336,9 @@ Value *CachingVPExpander::expandPredicationToFPCall( case Intrinsic::minnum: { Value *Op0 = VPI.getOperand(0); Value *Op1 = VPI.getOperand(1); - Function *Fn = Intrinsic::getOrInsertDeclaration( - VPI.getModule(), UnpredicatedIntrinsicID, {VPI.getType()}); - Value *NewOp = Builder.CreateCall(Fn, {Op0, Op1}, VPI.getName()); + Value *NewOp = Builder.CreateIntrinsic( + UnpredicatedIntrinsicID, {VPI.getType()}, {Op0, Op1}, + /*FMFSource=*/nullptr, VPI.getName()); replaceOperation(*NewOp, VPI); return NewOp; } @@ -592,12 +591,10 @@ bool CachingVPExpander::discardEVLParameter(VPIntrinsic &VPI) { Type *Int32Ty = Type::getInt32Ty(VPI.getContext()); if (StaticElemCount.isScalable()) { // TODO add caching - auto *M = VPI.getModule(); - Function *VScaleFunc = - Intrinsic::getOrInsertDeclaration(M, Intrinsic::vscale, Int32Ty); IRBuilder<> Builder(VPI.getParent(), VPI.getIterator()); Value *FactorConst = Builder.getInt32(StaticElemCount.getKnownMinValue()); - Value *VScale = Builder.CreateCall(VScaleFunc, {}, "vscale"); + Value *VScale = Builder.CreateIntrinsic(Intrinsic::vscale, Int32Ty, {}, + /*FMFSource=*/nullptr, "vscale"); MaxEVL = Builder.CreateMul(VScale, FactorConst, "scalable_size", /*NUW*/ true, /*NSW*/ false); } else { diff --git a/llvm/lib/CodeGen/HardwareLoops.cpp b/llvm/lib/CodeGen/HardwareLoops.cpp index c8a63304a3b6..86fec239c3ed 100644 --- a/llvm/lib/CodeGen/HardwareLoops.cpp +++ b/llvm/lib/CodeGen/HardwareLoops.cpp @@ -512,8 +512,7 @@ Value* HardwareLoop::InsertIterationSetup(Value *LoopCountInit) { : Intrinsic::test_set_loop_iterations) : (UsePhi ? Intrinsic::start_loop_iterations : Intrinsic::set_loop_iterations); - Function *LoopIter = Intrinsic::getOrInsertDeclaration(M, ID, Ty); - Value *LoopSetup = Builder.CreateCall(LoopIter, LoopCountInit); + Value *LoopSetup = Builder.CreateIntrinsic(ID, Ty, LoopCountInit); // Use the return value of the intrinsic to control the entry of the loop. 
if (UseLoopGuard) { @@ -541,10 +540,9 @@ void HardwareLoop::InsertLoopDec() { Attribute::StrictFP)) CondBuilder.setIsFPConstrained(true); - Function *DecFunc = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::loop_decrement, LoopDecrement->getType()); Value *Ops[] = { LoopDecrement }; - Value *NewCond = CondBuilder.CreateCall(DecFunc, Ops); + Value *NewCond = CondBuilder.CreateIntrinsic(Intrinsic::loop_decrement, + LoopDecrement->getType(), Ops); Value *OldCond = ExitBranch->getCondition(); ExitBranch->setCondition(NewCond); @@ -565,10 +563,9 @@ Instruction* HardwareLoop::InsertLoopRegDec(Value *EltsRem) { Attribute::StrictFP)) CondBuilder.setIsFPConstrained(true); - Function *DecFunc = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::loop_decrement_reg, {EltsRem->getType()}); Value *Ops[] = { EltsRem, LoopDecrement }; - Value *Call = CondBuilder.CreateCall(DecFunc, Ops); + Value *Call = CondBuilder.CreateIntrinsic(Intrinsic::loop_decrement_reg, + {EltsRem->getType()}, Ops); LLVM_DEBUG(dbgs() << "HWLoops: Inserted loop dec: " << *Call << "\n"); return cast(Call); diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 519ff8d74c5a..3aceb5227bb3 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -1757,8 +1757,7 @@ static Value *upgradeX86BinaryIntrinsics(IRBuilder<> &Builder, CallBase &CI, Type *Ty = CI.getType(); Value *Op0 = CI.getOperand(0); Value *Op1 = CI.getOperand(1); - Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); - Value *Res = Builder.CreateCall(Intrin, {Op0, Op1}); + Value *Res = Builder.CreateIntrinsic(IID, Ty, {Op0, Op1}); if (CI.arg_size() == 4) { // For masked intrinsics. Value *VecSrc = CI.getOperand(2); @@ -1784,8 +1783,7 @@ static Value *upgradeX86Rotate(IRBuilder<> &Builder, CallBase &CI, } Intrinsic::ID IID = IsRotateRight ? Intrinsic::fshr : Intrinsic::fshl; - Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); - Value *Res = Builder.CreateCall(Intrin, {Src, Src, Amt}); + Value *Res = Builder.CreateIntrinsic(IID, Ty, {Src, Src, Amt}); if (CI.arg_size() == 4) { // For masked intrinsics. Value *VecSrc = CI.getOperand(2); @@ -1854,8 +1852,7 @@ static Value *upgradeX86ConcatShift(IRBuilder<> &Builder, CallBase &CI, } Intrinsic::ID IID = IsShiftRight ? Intrinsic::fshr : Intrinsic::fshl; - Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID, Ty); - Value *Res = Builder.CreateCall(Intrin, {Op0, Op1, Amt}); + Value *Res = Builder.CreateIntrinsic(IID, Ty, {Op0, Op1, Amt}); unsigned NumArgs = CI.arg_size(); if (NumArgs >= 4) { // For masked intrinsics. @@ -1915,9 +1912,8 @@ static Value *upgradeMaskedLoad(IRBuilder<> &Builder, Value *Ptr, static Value *upgradeAbs(IRBuilder<> &Builder, CallBase &CI) { Type *Ty = CI.getType(); Value *Op0 = CI.getArgOperand(0); - Function *F = - Intrinsic::getOrInsertDeclaration(CI.getModule(), Intrinsic::abs, Ty); - Value *Res = Builder.CreateCall(F, {Op0, Builder.getInt1(false)}); + Value *Res = Builder.CreateIntrinsic(Intrinsic::abs, Ty, + {Op0, Builder.getInt1(false)}); if (CI.arg_size() == 3) Res = emitX86Select(Builder, CI.getArgOperand(2), Res, CI.getArgOperand(1)); return Res; @@ -2009,9 +2005,8 @@ static Value *upgradeMaskedCompare(IRBuilder<> &Builder, CallBase &CI, // Replace a masked intrinsic with an older unmasked intrinsic. 
static Value *upgradeX86MaskedShift(IRBuilder<> &Builder, CallBase &CI, Intrinsic::ID IID) { - Function *Intrin = Intrinsic::getOrInsertDeclaration(CI.getModule(), IID); - Value *Rep = Builder.CreateCall(Intrin, - { CI.getArgOperand(0), CI.getArgOperand(1) }); + Value *Rep = Builder.CreateIntrinsic( + IID, {}, {CI.getArgOperand(0), CI.getArgOperand(1)}); return emitX86Select(Builder, CI.getArgOperand(3), Rep, CI.getArgOperand(2)); } @@ -2480,9 +2475,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, } else if (Name == "sse.sqrt.ss" || Name == "sse2.sqrt.sd") { Value *Vec = CI->getArgOperand(0); Value *Elt0 = Builder.CreateExtractElement(Vec, (uint64_t)0); - Function *Intr = Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::sqrt, Elt0->getType()); - Elt0 = Builder.CreateCall(Intr, Elt0); + Elt0 = Builder.CreateIntrinsic(Intrinsic::sqrt, Elt0->getType(), Elt0); Rep = Builder.CreateInsertElement(Vec, Elt0, (uint64_t)0); } else if (Name.starts_with("avx.sqrt.p") || Name.starts_with("sse2.sqrt.p") || @@ -2770,9 +2763,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, cast(CI->getArgOperand(3))->getZExtValue() != 4)) { Intrinsic::ID IID = IsUnsigned ? Intrinsic::x86_avx512_uitofp_round : Intrinsic::x86_avx512_sitofp_round; - Function *F = Intrinsic::getOrInsertDeclaration(CI->getModule(), IID, - {DstTy, SrcTy}); - Rep = Builder.CreateCall(F, {Rep, CI->getArgOperand(3)}); + Rep = Builder.CreateIntrinsic(IID, {DstTy, SrcTy}, + {Rep, CI->getArgOperand(3)}); } else { Rep = IsUnsigned ? Builder.CreateUIToFP(Rep, DstTy, "cvt") : Builder.CreateSIToFP(Rep, DstTy, "cvt"); @@ -2813,9 +2805,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Value *MaskVec = getX86MaskVec(Builder, CI->getArgOperand(2), ResultTy->getNumElements()); - Function *ELd = Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::masked_expandload, ResultTy); - Rep = Builder.CreateCall(ELd, {Ptr, MaskVec, CI->getOperand(1)}); + Rep = Builder.CreateIntrinsic(Intrinsic::masked_expandload, ResultTy, + {Ptr, MaskVec, CI->getOperand(1)}); } else if (Name.starts_with("avx512.mask.compress.store.")) { auto *ResultTy = cast(CI->getArgOperand(1)->getType()); Type *PtrTy = ResultTy->getElementType(); @@ -2828,9 +2819,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, getX86MaskVec(Builder, CI->getArgOperand(2), cast(ResultTy)->getNumElements()); - Function *CSt = Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::masked_compressstore, ResultTy); - Rep = Builder.CreateCall(CSt, {CI->getArgOperand(1), Ptr, MaskVec}); + Rep = Builder.CreateIntrinsic(Intrinsic::masked_compressstore, ResultTy, + {CI->getArgOperand(1), Ptr, MaskVec}); } else if (Name.starts_with("avx512.mask.compress.") || Name.starts_with("avx512.mask.expand.")) { auto *ResultTy = cast(CI->getType()); @@ -2841,10 +2831,8 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, bool IsCompress = Name[12] == 'c'; Intrinsic::ID IID = IsCompress ? 
Intrinsic::x86_avx512_mask_compress : Intrinsic::x86_avx512_mask_expand; - Function *Intr = - Intrinsic::getOrInsertDeclaration(F->getParent(), IID, ResultTy); - Rep = Builder.CreateCall(Intr, - {CI->getOperand(0), CI->getOperand(1), MaskVec}); + Rep = Builder.CreateIntrinsic( + IID, ResultTy, {CI->getOperand(0), CI->getOperand(1), MaskVec}); } else if (Name.starts_with("xop.vpcom")) { bool IsSigned; if (Name.ends_with("ub") || Name.ends_with("uw") || Name.ends_with("ud") || @@ -2905,11 +2893,10 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, bool ZeroMask = Name[11] == 'z'; Rep = upgradeX86ConcatShift(Builder, *CI, true, ZeroMask); } else if (Name == "sse42.crc32.64.8") { - Function *CRC32 = Intrinsic::getOrInsertDeclaration( - F->getParent(), Intrinsic::x86_sse42_crc32_32_8); Value *Trunc0 = Builder.CreateTrunc(CI->getArgOperand(0), Type::getInt32Ty(C)); - Rep = Builder.CreateCall(CRC32, {Trunc0, CI->getArgOperand(1)}); + Rep = Builder.CreateIntrinsic(Intrinsic::x86_sse42_crc32_32_8, {}, + {Trunc0, CI->getArgOperand(1)}); Rep = Builder.CreateZExt(Rep, CI->getType(), ""); } else if (Name.starts_with("avx.vbroadcast.s") || Name.starts_with("avx512.vbroadcast.s")) { @@ -3769,12 +3756,9 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, IID = Intrinsic::x86_avx512_vfmadd_f64; else IID = Intrinsic::x86_avx512_vfmadd_f32; - Function *FMA = Intrinsic::getOrInsertDeclaration(CI->getModule(), IID); - Rep = Builder.CreateCall(FMA, Ops); + Rep = Builder.CreateIntrinsic(IID, {}, Ops); } else { - Function *FMA = Intrinsic::getOrInsertDeclaration( - CI->getModule(), Intrinsic::fma, A->getType()); - Rep = Builder.CreateCall(FMA, {A, B, C}); + Rep = Builder.CreateIntrinsic(Intrinsic::fma, A->getType(), {A, B, C}); } Value *PassThru = IsMaskZ ? Constant::getNullValue(Rep->getType()) @@ -3827,9 +3811,7 @@ static Value *upgradeX86IntrinsicCall(StringRef Name, CallBase *CI, Function *F, Rep = Builder.CreateIntrinsic(IID, {}, {A, B, C, CI->getArgOperand(4)}); } else { - Function *FMA = Intrinsic::getOrInsertDeclaration( - CI->getModule(), Intrinsic::fma, A->getType()); - Rep = Builder.CreateCall(FMA, {A, B, C}); + Rep = Builder.CreateIntrinsic(Intrinsic::fma, A->getType(), {A, B, C}); } Value *PassThru = IsMaskZ ? 
llvm::Constant::getNullValue(CI->getType()) @@ -4088,8 +4070,8 @@ static Value *upgradeAArch64IntrinsicCall(StringRef Name, CallBase *CI, Args[1] = Builder.CreateIntrinsic(Intrinsic::aarch64_sve_convert_from_svbool, GoodPredTy, Args[1]); - Function *NewF = Intrinsic::getOrInsertDeclaration(CI->getModule(), NewID); - return Builder.CreateCall(NewF, Args, CI->getName()); + return Builder.CreateIntrinsic(NewID, {}, Args, /*FMFSource=*/nullptr, + CI->getName()); } static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, @@ -4171,8 +4153,8 @@ static Value *upgradeARMIntrinsicCall(StringRef Name, CallBase *CI, Function *F, Ops.push_back(Op); } - Function *Fn = Intrinsic::getOrInsertDeclaration(F->getParent(), ID, Tys); - return Builder.CreateCall(Fn, Ops, CI->getName()); + return Builder.CreateIntrinsic(ID, Tys, Ops, /*FMFSource=*/nullptr, + CI->getName()); } llvm_unreachable("Unknown function for ARM CallBase upgrade."); } diff --git a/llvm/lib/IR/IRBuilder.cpp b/llvm/lib/IR/IRBuilder.cpp index 3654bf9a9e70..f340f7aafdc7 100644 --- a/llvm/lib/IR/IRBuilder.cpp +++ b/llvm/lib/IR/IRBuilder.cpp @@ -90,10 +90,8 @@ Value *IRBuilderBase::CreateVScale(Constant *Scaling, const Twine &Name) { assert(isa(Scaling) && "Expected constant integer"); if (cast(Scaling)->isZero()) return Scaling; - Module *M = GetInsertBlock()->getParent()->getParent(); - Function *TheFn = Intrinsic::getOrInsertDeclaration(M, Intrinsic::vscale, - {Scaling->getType()}); - CallInst *CI = CreateCall(TheFn, {}, {}, Name); + CallInst *CI = + CreateIntrinsic(Intrinsic::vscale, {Scaling->getType()}, {}, {}, Name); return cast(Scaling)->isOne() ? CI : CreateMul(CI, Scaling); } @@ -140,12 +138,9 @@ CallInst *IRBuilderBase::CreateMemSet(Value *Ptr, Value *Val, Value *Size, MDNode *TBAATag, MDNode *ScopeTag, MDNode *NoAliasTag) { Value *Ops[] = {Ptr, Val, Size, getInt1(isVolatile)}; - Type *Tys[] = { Ptr->getType(), Size->getType() }; - Module *M = BB->getParent()->getParent(); - Function *TheFn = - Intrinsic::getOrInsertDeclaration(M, Intrinsic::memset, Tys); + Type *Tys[] = {Ptr->getType(), Size->getType()}; - CallInst *CI = CreateCall(TheFn, Ops); + CallInst *CI = CreateIntrinsic(Intrinsic::memset, Tys, Ops); if (Align) cast(CI)->setDestAlignment(*Align); @@ -170,11 +165,8 @@ CallInst *IRBuilderBase::CreateMemSetInline(Value *Dst, MaybeAlign DstAlign, MDNode *NoAliasTag) { Value *Ops[] = {Dst, Val, Size, getInt1(IsVolatile)}; Type *Tys[] = {Dst->getType(), Size->getType()}; - Module *M = BB->getParent()->getParent(); - Function *TheFn = - Intrinsic::getOrInsertDeclaration(M, Intrinsic::memset_inline, Tys); - CallInst *CI = CreateCall(TheFn, Ops); + CallInst *CI = CreateIntrinsic(Intrinsic::memset_inline, Tys, Ops); if (DstAlign) cast(CI)->setDestAlignment(*DstAlign); @@ -198,11 +190,9 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemSet( Value *Ops[] = {Ptr, Val, Size, getInt32(ElementSize)}; Type *Tys[] = {Ptr->getType(), Size->getType()}; - Module *M = BB->getParent()->getParent(); - Function *TheFn = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::memset_element_unordered_atomic, Tys); - CallInst *CI = CreateCall(TheFn, Ops); + CallInst *CI = + CreateIntrinsic(Intrinsic::memset_element_unordered_atomic, Tys, Ops); cast(CI)->setDestAlignment(Alignment); @@ -227,11 +217,9 @@ CallInst *IRBuilderBase::CreateMemTransferInst( IntrID == Intrinsic::memmove) && "Unexpected intrinsic ID"); Value *Ops[] = {Dst, Src, Size, getInt1(isVolatile)}; - Type *Tys[] = { Dst->getType(), Src->getType(), Size->getType() 
 };
-  Module *M = BB->getParent()->getParent();
-  Function *TheFn = Intrinsic::getOrInsertDeclaration(M, IntrID, Tys);
+  Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()};
-  CallInst *CI = CreateCall(TheFn, Ops);
+  CallInst *CI = CreateIntrinsic(IntrID, Tys, Ops);
   auto *MCI = cast<MemTransferInst>(CI);
   if (DstAlign)
@@ -266,11 +254,9 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemCpy(
          "Pointer alignment must be at least element size");
   Value *Ops[] = {Dst, Src, Size, getInt32(ElementSize)};
   Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()};
-  Module *M = BB->getParent()->getParent();
-  Function *TheFn = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::memcpy_element_unordered_atomic, Tys);
-  CallInst *CI = CreateCall(TheFn, Ops);
+  CallInst *CI =
+      CreateIntrinsic(Intrinsic::memcpy_element_unordered_atomic, Tys, Ops);
 
   // Set the alignment of the pointer args.
   auto *AMCI = cast<AtomicMemCpyInst>(CI);
@@ -382,11 +368,9 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove(
          "Pointer alignment must be at least element size");
   Value *Ops[] = {Dst, Src, Size, getInt32(ElementSize)};
   Type *Tys[] = {Dst->getType(), Src->getType(), Size->getType()};
-  Module *M = BB->getParent()->getParent();
-  Function *TheFn = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::memmove_element_unordered_atomic, Tys);
-  CallInst *CI = CreateCall(TheFn, Ops);
+  CallInst *CI =
+      CreateIntrinsic(Intrinsic::memmove_element_unordered_atomic, Tys, Ops);
 
   // Set the alignment of the pointer args.
   CI->addParamAttr(0, Attribute::getWithAlignment(CI->getContext(), DstAlign));
@@ -410,27 +394,19 @@ CallInst *IRBuilderBase::CreateElementUnorderedAtomicMemMove(
 }
 
 CallInst *IRBuilderBase::getReductionIntrinsic(Intrinsic::ID ID, Value *Src) {
-  Module *M = GetInsertBlock()->getParent()->getParent();
   Value *Ops[] = {Src};
   Type *Tys[] = { Src->getType() };
-  auto Decl = Intrinsic::getOrInsertDeclaration(M, ID, Tys);
-  return CreateCall(Decl, Ops);
+  return CreateIntrinsic(ID, Tys, Ops);
 }
 
 CallInst *IRBuilderBase::CreateFAddReduce(Value *Acc, Value *Src) {
-  Module *M = GetInsertBlock()->getParent()->getParent();
   Value *Ops[] = {Acc, Src};
-  auto Decl = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::vector_reduce_fadd, {Src->getType()});
-  return CreateCall(Decl, Ops);
+  return CreateIntrinsic(Intrinsic::vector_reduce_fadd, {Src->getType()}, Ops);
 }
 
 CallInst *IRBuilderBase::CreateFMulReduce(Value *Acc, Value *Src) {
-  Module *M = GetInsertBlock()->getParent()->getParent();
   Value *Ops[] = {Acc, Src};
-  auto Decl = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::vector_reduce_fmul, {Src->getType()});
-  return CreateCall(Decl, Ops);
+  return CreateIntrinsic(Intrinsic::vector_reduce_fmul, {Src->getType()}, Ops);
 }
 
 CallInst *IRBuilderBase::CreateAddReduce(Value *Src) {
@@ -490,10 +466,7 @@ CallInst *IRBuilderBase::CreateLifetimeStart(Value *Ptr, ConstantInt *Size) {
   assert(Size->getType() == getInt64Ty() &&
          "lifetime.start requires the size to be an i64");
   Value *Ops[] = { Size, Ptr };
-  Module *M = BB->getParent()->getParent();
-  Function *TheFn = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::lifetime_start, {Ptr->getType()});
-  return CreateCall(TheFn, Ops);
+  return CreateIntrinsic(Intrinsic::lifetime_start, {Ptr->getType()}, Ops);
 }
 
 CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) {
@@ -505,10 +478,7 @@ CallInst *IRBuilderBase::CreateLifetimeEnd(Value *Ptr, ConstantInt *Size) {
   assert(Size->getType() == getInt64Ty() &&
          "lifetime.end requires the size to be an i64");
   Value *Ops[] = { Size, Ptr };
-  Module *M = BB->getParent()->getParent();
-  Function *TheFn = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::lifetime_end, {Ptr->getType()});
-  return CreateCall(TheFn, Ops);
+  return CreateIntrinsic(Intrinsic::lifetime_end, {Ptr->getType()}, Ops);
 }
 
 CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) {
@@ -524,10 +494,7 @@ CallInst *IRBuilderBase::CreateInvariantStart(Value *Ptr, ConstantInt *Size) {
   Value *Ops[] = {Size, Ptr};
   // Fill in the single overloaded type: memory object type.
   Type *ObjectPtr[1] = {Ptr->getType()};
-  Module *M = BB->getParent()->getParent();
-  Function *TheFn = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::invariant_start, ObjectPtr);
-  return CreateCall(TheFn, Ops);
+  return CreateIntrinsic(Intrinsic::invariant_start, ObjectPtr, Ops);
 }
 
 static MaybeAlign getAlign(Value *Ptr) {
@@ -563,10 +530,8 @@ IRBuilderBase::CreateAssumption(Value *Cond,
 }
 
 Instruction *IRBuilderBase::CreateNoAliasScopeDeclaration(Value *Scope) {
-  Module *M = BB->getModule();
-  auto *FnIntrinsic = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::experimental_noalias_scope_decl, {});
-  return CreateCall(FnIntrinsic, {Scope});
+  return CreateIntrinsic(Intrinsic::experimental_noalias_scope_decl, {},
+                         {Scope});
 }
 
 /// Create a call to a Masked Load intrinsic.
@@ -616,9 +581,7 @@ CallInst *IRBuilderBase::CreateMaskedIntrinsic(Intrinsic::ID Id,
                                                ArrayRef<Value *> Ops,
                                                ArrayRef<Type *> OverloadedTypes,
                                                const Twine &Name) {
-  Module *M = BB->getParent()->getParent();
-  Function *TheFn = Intrinsic::getOrInsertDeclaration(M, Id, OverloadedTypes);
-  return CreateCall(TheFn, Ops, {}, Name);
+  return CreateIntrinsic(Id, OverloadedTypes, Ops, {}, Name);
 }
 
 /// Create a call to a Masked Gather intrinsic.
@@ -875,42 +838,34 @@ InvokeInst *IRBuilderBase::CreateGCStatepointInvoke(
 CallInst *IRBuilderBase::CreateGCResult(Instruction *Statepoint,
                                         Type *ResultType, const Twine &Name) {
   Intrinsic::ID ID = Intrinsic::experimental_gc_result;
-  Module *M = BB->getParent()->getParent();
   Type *Types[] = {ResultType};
-  Function *FnGCResult = Intrinsic::getOrInsertDeclaration(M, ID, Types);
 
   Value *Args[] = {Statepoint};
-  return CreateCall(FnGCResult, Args, {}, Name);
+  return CreateIntrinsic(ID, Types, Args, {}, Name);
 }
 
 CallInst *IRBuilderBase::CreateGCRelocate(Instruction *Statepoint,
                                           int BaseOffset, int DerivedOffset,
                                           Type *ResultType, const Twine &Name) {
-  Module *M = BB->getParent()->getParent();
   Type *Types[] = {ResultType};
-  Function *FnGCRelocate = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::experimental_gc_relocate, Types);
 
   Value *Args[] = {Statepoint, getInt32(BaseOffset), getInt32(DerivedOffset)};
-  return CreateCall(FnGCRelocate, Args, {}, Name);
+  return CreateIntrinsic(Intrinsic::experimental_gc_relocate, Types, Args, {},
+                         Name);
 }
 
 CallInst *IRBuilderBase::CreateGCGetPointerBase(Value *DerivedPtr,
                                                 const Twine &Name) {
-  Module *M = BB->getParent()->getParent();
   Type *PtrTy = DerivedPtr->getType();
-  Function *FnGCFindBase = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::experimental_gc_get_pointer_base, {PtrTy, PtrTy});
-  return CreateCall(FnGCFindBase, {DerivedPtr}, {}, Name);
+  return CreateIntrinsic(Intrinsic::experimental_gc_get_pointer_base,
+                         {PtrTy, PtrTy}, {DerivedPtr}, {}, Name);
 }
 
 CallInst *IRBuilderBase::CreateGCGetPointerOffset(Value *DerivedPtr,
                                                   const Twine &Name) {
-  Module *M = BB->getParent()->getParent();
   Type *PtrTy = DerivedPtr->getType();
-  Function *FnGCGetOffset = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::experimental_gc_get_pointer_offset, {PtrTy});
-  return CreateCall(FnGCGetOffset, {DerivedPtr}, {}, Name);
+  return CreateIntrinsic(Intrinsic::experimental_gc_get_pointer_offset, {PtrTy},
+                         {DerivedPtr}, {}, Name);
 }
 
 CallInst *IRBuilderBase::CreateUnaryIntrinsic(Intrinsic::ID ID, Value *V,
@@ -1228,13 +1183,10 @@ Value *IRBuilderBase::CreatePreserveArrayAccessIndex(
 
   Type *ResultType = GetElementPtrInst::getGEPReturnType(Base, IdxList);
 
-  Module *M = BB->getParent()->getParent();
-  Function *FnPreserveArrayAccessIndex = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::preserve_array_access_index, {ResultType, BaseType});
-
   Value *DimV = getInt32(Dimension);
   CallInst *Fn =
-      CreateCall(FnPreserveArrayAccessIndex, {Base, DimV, LastIndexV});
+      CreateIntrinsic(Intrinsic::preserve_array_access_index,
+                      {ResultType, BaseType}, {Base, DimV, LastIndexV});
   Fn->addParamAttr(
       0, Attribute::get(Fn->getContext(), Attribute::ElementType, ElTy));
   if (DbgInfo)
@@ -1249,13 +1201,9 @@ Value *IRBuilderBase::CreatePreserveUnionAccessIndex(
          "Invalid Base ptr type for preserve.union.access.index.");
   auto *BaseType = Base->getType();
 
-  Module *M = BB->getParent()->getParent();
-  Function *FnPreserveUnionAccessIndex = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::preserve_union_access_index, {BaseType, BaseType});
-
   Value *DIIndex = getInt32(FieldIndex);
-  CallInst *Fn =
-      CreateCall(FnPreserveUnionAccessIndex, {Base, DIIndex});
+  CallInst *Fn = CreateIntrinsic(Intrinsic::preserve_union_access_index,
+                                 {BaseType, BaseType}, {Base, DIIndex});
   if (DbgInfo)
     Fn->setMetadata(LLVMContext::MD_preserve_access_index, DbgInfo);
 
@@ -1274,13 +1222,10 @@ Value *IRBuilderBase::CreatePreserveStructAccessIndex(
   Type *ResultType =
       GetElementPtrInst::getGEPReturnType(Base, {Zero, GEPIndex});
 
-  Module *M = BB->getParent()->getParent();
-  Function *FnPreserveStructAccessIndex = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::preserve_struct_access_index, {ResultType, BaseType});
-
   Value *DIIndex = getInt32(FieldIndex);
-  CallInst *Fn = CreateCall(FnPreserveStructAccessIndex,
-                            {Base, GEPIndex, DIIndex});
+  CallInst *Fn =
+      CreateIntrinsic(Intrinsic::preserve_struct_access_index,
+                      {ResultType, BaseType}, {Base, GEPIndex, DIIndex});
   Fn->addParamAttr(
       0, Attribute::get(Fn->getContext(), Attribute::ElementType, ElTy));
   if (DbgInfo)
@@ -1291,10 +1236,8 @@ Value *IRBuilderBase::CreatePreserveStructAccessIndex(
 
 Value *IRBuilderBase::createIsFPClass(Value *FPNum, unsigned Test) {
   ConstantInt *TestV = getInt32(Test);
-  Module *M = BB->getParent()->getParent();
-  Function *FnIsFPClass = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::is_fpclass, {FPNum->getType()});
-  return CreateCall(FnIsFPClass, {FPNum, TestV});
+  return CreateIntrinsic(Intrinsic::is_fpclass, {FPNum->getType()},
+                         {FPNum, TestV});
 }
 
 CallInst *IRBuilderBase::CreateAlignmentAssumptionHelper(const DataLayout &DL,
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
index 5e5afdb7fa0a..b5657584016e 100644
--- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
+++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp
@@ -16457,10 +16457,9 @@ static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
             Builder.CreateShuffleVector(TI->getOperand(0), ShuffleLanes),
             VecTy));
 
       if (Parts.size() == 4) {
-        auto *F = Intrinsic::getOrInsertDeclaration(
-            TI->getModule(), Intrinsic::aarch64_neon_tbl4, VecTy);
         Parts.push_back(ConstantVector::get(MaskConst));
-        Results.push_back(Builder.CreateCall(F, Parts));
+        Results.push_back(
+            Builder.CreateIntrinsic(Intrinsic::aarch64_neon_tbl4, VecTy, Parts));
         Parts.clear();
       }
 
@@ -16487,9 +16486,8 @@ static void createTblForTrunc(TruncInst *TI, bool IsLittleEndian) {
       break;
     }
 
-    auto *F = Intrinsic::getOrInsertDeclaration(TI->getModule(), TblID, VecTy);
     Parts.push_back(ConstantVector::get(MaskConst));
-    Results.push_back(Builder.CreateCall(F, Parts));
+    Results.push_back(Builder.CreateIntrinsic(TblID, VecTy, Parts));
   }
 
   // Extract the destination vector from TBL result(s) after combining them
@@ -27252,9 +27250,9 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
   if (ValueTy->getPrimitiveSizeInBits() == 128) {
     Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxp : Intrinsic::aarch64_ldxp;
-    Function *Ldxr = Intrinsic::getOrInsertDeclaration(M, Int);
-    Value *LoHi = Builder.CreateCall(Ldxr, Addr, "lohi");
+    Value *LoHi =
+        Builder.CreateIntrinsic(Int, {}, Addr, /*FMFSource=*/nullptr, "lohi");
 
     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
@@ -27271,11 +27269,10 @@ Value *AArch64TargetLowering::emitLoadLinked(IRBuilderBase &Builder,
   Type *Tys[] = { Addr->getType() };
   Intrinsic::ID Int = IsAcquire ? Intrinsic::aarch64_ldaxr : Intrinsic::aarch64_ldxr;
-  Function *Ldxr = Intrinsic::getOrInsertDeclaration(M, Int, Tys);
 
   const DataLayout &DL = M->getDataLayout();
   IntegerType *IntEltTy = Builder.getIntNTy(DL.getTypeSizeInBits(ValueTy));
-  CallInst *CI = Builder.CreateCall(Ldxr, Addr);
+  CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
   CI->addParamAttr(0, Attribute::get(Builder.getContext(),
                                      Attribute::ElementType, IntEltTy));
   Value *Trunc = Builder.CreateTrunc(CI, IntEltTy);
diff --git a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
index fe96fedcfb82..a6535a532fff 100644
--- a/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
+++ b/llvm/lib/Target/AArch64/AArch64StackTagging.cpp
@@ -481,10 +481,9 @@ Instruction *AArch64StackTagging::insertBaseTaggedPointer(
   assert(PrologueBB);
 
   IRBuilder<> IRB(&PrologueBB->front());
-  Function *IRG_SP = Intrinsic::getOrInsertDeclaration(
-      F->getParent(), Intrinsic::aarch64_irg_sp);
   Instruction *Base =
-      IRB.CreateCall(IRG_SP, {Constant::getNullValue(IRB.getInt64Ty())});
+      IRB.CreateIntrinsic(Intrinsic::aarch64_irg_sp, {},
+                          {Constant::getNullValue(IRB.getInt64Ty())});
   Base->setName("basetag");
   auto TargetTriple = Triple(M.getTargetTriple());
   // This ABI will make it into Android API level 35.
@@ -580,11 +579,10 @@ bool AArch64StackTagging::runOnFunction(Function &Fn) {
     NextTag = (NextTag + 1) % 16;
     // Replace alloca with tagp(alloca).
     IRBuilder<> IRB(Info.AI->getNextNode());
-    Function *TagP = Intrinsic::getOrInsertDeclaration(
-        F->getParent(), Intrinsic::aarch64_tagp, {Info.AI->getType()});
     Instruction *TagPCall =
-        IRB.CreateCall(TagP, {Constant::getNullValue(Info.AI->getType()), Base,
-                              ConstantInt::get(IRB.getInt64Ty(), Tag)});
+        IRB.CreateIntrinsic(Intrinsic::aarch64_tagp, {Info.AI->getType()},
+                            {Constant::getNullValue(Info.AI->getType()), Base,
+                             ConstantInt::get(IRB.getInt64Ty(), Tag)});
     if (Info.AI->hasName())
       TagPCall->setName(Info.AI->getName() + ".tag");
     // Does not replace metadata, so we don't have to handle DbgVariableRecords.
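Every hunk in this patch applies the same two-step-to-one-step rewrite, so a
single reference sketch may help while reviewing the remaining files. The code
below is illustrative only; the variable names (Builder, V, Cnt) and the choice
of llvm.ctpop are hypothetical and taken from no file touched by this patch:

    // Before: materialize the declaration, then emit the call.
    Module *M = Builder.GetInsertBlock()->getModule();
    Function *Decl = Intrinsic::getOrInsertDeclaration(
        M, Intrinsic::ctpop, {V->getType()});
    Value *Cnt = Builder.CreateCall(Decl, {V});

    // After: CreateIntrinsic resolves (or inserts) the declaration itself.
    Value *Cnt = Builder.CreateIntrinsic(Intrinsic::ctpop, {V->getType()}, {V});

The emitted IR is identical in both forms; only the C++ call sites get shorter.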
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
index ea88ed424dc5..ee5e75955cd4 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUAtomicOptimizer.cpp
@@ -407,16 +407,14 @@ Value *AMDGPUAtomicOptimizerImpl::buildReduction(IRBuilder<> &B,
                                                  Value *const Identity) const {
   Type *AtomicTy = V->getType();
   Module *M = B.GetInsertBlock()->getModule();
-  Function *UpdateDPP = Intrinsic::getOrInsertDeclaration(
-      M, Intrinsic::amdgcn_update_dpp, AtomicTy);
 
   // Reduce within each row of 16 lanes.
   for (unsigned Idx = 0; Idx < 4; Idx++) {
     V = buildNonAtomicBinOp(
         B, Op, V,
-        B.CreateCall(UpdateDPP,
-                     {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
-                      B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
+        B.CreateIntrinsic(Intrinsic::amdgcn_update_dpp, AtomicTy,
+                          {Identity, V, B.getInt32(DPP::ROW_XMASK0 | 1 << Idx),
+                           B.getInt32(0xf), B.getInt32(0xf), B.getFalse()}));
   }
 
   // Reduce within each pair of rows (i.e. 32 lanes).
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
index 7d3164c79089..c49aab823b44 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUCodeGenPrepare.cpp
@@ -576,10 +576,9 @@ bool AMDGPUCodeGenPrepareImpl::promoteUniformBitreverseToI32(
   Builder.SetCurrentDebugLocation(I.getDebugLoc());
 
   Type *I32Ty = getI32Ty(Builder, I.getType());
-  Function *I32 =
-      Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::bitreverse, {I32Ty});
   Value *ExtOp = Builder.CreateZExt(I.getOperand(0), I32Ty);
-  Value *ExtRes = Builder.CreateCall(I32, { ExtOp });
+  Value *ExtRes =
+      Builder.CreateIntrinsic(Intrinsic::bitreverse, {I32Ty}, {ExtOp});
   Value *LShrOp =
       Builder.CreateLShr(ExtRes, 32 - getBaseElementBitWidth(I.getType()));
   Value *TruncRes =
@@ -1260,9 +1259,8 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem24Impl(
   Value *FB = IsSigned ? Builder.CreateSIToFP(IB,F32Ty)
                        : Builder.CreateUIToFP(IB,F32Ty);
 
-  Function *RcpDecl = Intrinsic::getOrInsertDeclaration(
-      Mod, Intrinsic::amdgcn_rcp, Builder.getFloatTy());
-  Value *RCP = Builder.CreateCall(RcpDecl, { FB });
+  Value *RCP = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp,
+                                       Builder.getFloatTy(), {FB});
   Value *FQM = Builder.CreateFMul(FA, RCP);
 
   // fq = trunc(fqm);
@@ -1455,9 +1453,7 @@ Value *AMDGPUCodeGenPrepareImpl::expandDivRem32(IRBuilder<> &Builder,
 
   // Initial estimate of inv(y).
   Value *FloatY = Builder.CreateUIToFP(Y, F32Ty);
-  Function *Rcp =
-      Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_rcp, F32Ty);
-  Value *RcpY = Builder.CreateCall(Rcp, {FloatY});
+  Value *RcpY = Builder.CreateIntrinsic(Intrinsic::amdgcn_rcp, F32Ty, {FloatY});
   Constant *Scale = ConstantFP::get(F32Ty, llvm::bit_cast<float>(0x4F7FFFFE));
   Value *ScaledY = Builder.CreateFMul(RcpY, Scale);
   Value *Z = Builder.CreateFPToUI(ScaledY, I32Ty);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
index e48fed025857..179d8aa46f80 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUImageIntrinsicOptimizer.cpp
@@ -237,12 +237,10 @@ bool optimizeSection(ArrayRef<SmallVector<IntrinsicInst *, 4>> MergeableInsts) {
     else
       NewIntrinID = Intrinsic::amdgcn_image_msaa_load_2darraymsaa;
 
-    Function *NewIntrin = Intrinsic::getOrInsertDeclaration(
-        IIList.front()->getModule(), NewIntrinID, OverloadTys);
     Args[ImageDimIntr->DMaskIndex] =
         ConstantInt::get(DMask->getType(), NewMaskVal);
     Args[FragIdIndex] = ConstantInt::get(FragId->getType(), NewFragIdVal);
-    CallInst *NewCall = B.CreateCall(NewIntrin, Args);
+    CallInst *NewCall = B.CreateIntrinsic(NewIntrinID, OverloadTys, Args);
     LLVM_DEBUG(dbgs() << "Optimize: " << *NewCall << "\n");
 
     NewCalls.push_back(NewCall);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
index 6a5a48778197..8beb9defee66 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUInstCombineIntrinsic.cpp
@@ -130,10 +130,7 @@ static std::optional<Instruction *> modifyIntrinsicCall(
   // Modify arguments and types
   Func(Args, ArgTys);
 
-  Function *I =
-      Intrinsic::getOrInsertDeclaration(OldIntr.getModule(), NewIntr, ArgTys);
-
-  CallInst *NewCall = IC.Builder.CreateCall(I, Args);
+  CallInst *NewCall = IC.Builder.CreateIntrinsic(NewIntr, ArgTys, Args);
   NewCall->takeName(&OldIntr);
   NewCall->copyMetadata(OldIntr);
   if (isa<FPMathOperator>(NewCall))
@@ -891,12 +888,11 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
       // register (which contains the bitmask of live threads). So a
       // comparison that always returns true is the same as a read of the
      // EXEC register.
-      Function *NewF = Intrinsic::getOrInsertDeclaration(
-          II.getModule(), Intrinsic::read_register, II.getType());
       Metadata *MDArgs[] = {MDString::get(II.getContext(), "exec")};
       MDNode *MD = MDNode::get(II.getContext(), MDArgs);
       Value *Args[] = {MetadataAsValue::get(II.getContext(), MD)};
-      CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
+      CallInst *NewCall = IC.Builder.CreateIntrinsic(Intrinsic::read_register,
+                                                     II.getType(), Args);
       NewCall->addFnAttr(Attribute::Convergent);
       NewCall->takeName(&II);
       return IC.replaceInstUsesWith(II, NewCall);
@@ -990,11 +986,10 @@ GCNTTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const {
     } else if (!Ty->isFloatTy() && !Ty->isDoubleTy() && !Ty->isHalfTy())
       break;
 
-    Function *NewF = Intrinsic::getOrInsertDeclaration(
-        II.getModule(), NewIID, {II.getType(), SrcLHS->getType()});
     Value *Args[] = {SrcLHS, SrcRHS,
                      ConstantInt::get(CC->getType(), SrcPred)};
-    CallInst *NewCall = IC.Builder.CreateCall(NewF, Args);
+    CallInst *NewCall = IC.Builder.CreateIntrinsic(
+        NewIID, {II.getType(), SrcLHS->getType()}, Args);
     NewCall->takeName(&II);
     return IC.replaceInstUsesWith(II, NewCall);
   }
@@ -1402,9 +1397,8 @@ static Value *simplifyAMDGCNMemoryIntrinsicDemanded(InstCombiner &IC,
     Args[0] = IC.Builder.CreateShuffleVector(II.getOperand(0), EltMask);
   }
 
-  Function *NewIntrin = Intrinsic::getOrInsertDeclaration(
-      II.getModule(), II.getIntrinsicID(), OverloadTys);
-  CallInst *NewCall = IC.Builder.CreateCall(NewIntrin, Args);
+  CallInst *NewCall =
+      IC.Builder.CreateIntrinsic(II.getIntrinsicID(), OverloadTys, Args);
   NewCall->takeName(&II);
   NewCall->copyMetadata(II);
diff --git a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
index ff5eb8149010..5791daed0065 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPULowerModuleLDSPass.cpp
@@ -529,13 +529,11 @@ public:
     // block to spare deduplicating it later.
     auto [It, Inserted] = tableKernelIndexCache.try_emplace(F);
     if (Inserted) {
-      Function *Decl = Intrinsic::getOrInsertDeclaration(
-          &M, Intrinsic::amdgcn_lds_kernel_id, {});
-
       auto InsertAt = F->getEntryBlock().getFirstNonPHIOrDbgOrAlloca();
       IRBuilder<> Builder(&*InsertAt);
-      It->second = Builder.CreateCall(Decl, {});
+      It->second =
+          Builder.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}, {});
     }
 
     return It->second;
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
index 63da3443479b..f8744d6a483c 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUPromoteAlloca.cpp
@@ -973,13 +973,10 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
   const AMDGPUSubtarget &ST = AMDGPUSubtarget::get(TM, F);
 
   if (!IsAMDHSA) {
-    Function *LocalSizeYFn = Intrinsic::getOrInsertDeclaration(
-        Mod, Intrinsic::r600_read_local_size_y);
-    Function *LocalSizeZFn = Intrinsic::getOrInsertDeclaration(
-        Mod, Intrinsic::r600_read_local_size_z);
-
-    CallInst *LocalSizeY = Builder.CreateCall(LocalSizeYFn, {});
-    CallInst *LocalSizeZ = Builder.CreateCall(LocalSizeZFn, {});
+    CallInst *LocalSizeY =
+        Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_y, {}, {});
+    CallInst *LocalSizeZ =
+        Builder.CreateIntrinsic(Intrinsic::r600_read_local_size_z, {}, {});
 
     ST.makeLIDRangeMetadata(LocalSizeY);
     ST.makeLIDRangeMetadata(LocalSizeZ);
@@ -1021,10 +1018,8 @@ AMDGPUPromoteAllocaImpl::getLocalSizeYZ(IRBuilder<> &Builder) {
   //     hsa_signal_t completion_signal; // uint64_t wrapper
   //   } hsa_kernel_dispatch_packet_t
   //
-  Function *DispatchPtrFn =
-      Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::amdgcn_dispatch_ptr);
-
-  CallInst *DispatchPtr = Builder.CreateCall(DispatchPtrFn, {});
+  CallInst *DispatchPtr =
+      Builder.CreateIntrinsic(Intrinsic::amdgcn_dispatch_ptr, {}, {});
  DispatchPtr->addRetAttr(Attribute::NoAlias);
   DispatchPtr->addRetAttr(Attribute::NonNull);
   F.removeFnAttr("amdgpu-no-dispatch-ptr");
@@ -1564,13 +1559,10 @@ bool AMDGPUPromoteAllocaImpl::tryPromoteAllocaToLDS(AllocaInst &I,
       continue;
     case Intrinsic::objectsize: {
       Value *Src = Intr->getOperand(0);
-      Function *ObjectSize = Intrinsic::getOrInsertDeclaration(
-          Mod, Intrinsic::objectsize,
-          {Intr->getType(),
-           PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)});
-
-      CallInst *NewCall = Builder.CreateCall(
-          ObjectSize,
+      CallInst *NewCall = Builder.CreateIntrinsic(
+          Intrinsic::objectsize,
+          {Intr->getType(), PointerType::get(Context, AMDGPUAS::LOCAL_ADDRESS)},
           {Src, Intr->getOperand(1), Intr->getOperand(2), Intr->getOperand(3)});
       Intr->replaceAllUsesWith(NewCall);
       Intr->eraseFromParent();
diff --git a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
index 51af16c48f70..dfa91904a734 100644
--- a/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
+++ b/llvm/lib/Target/AMDGPU/AMDGPUSwLowerLDS.cpp
@@ -1055,9 +1055,7 @@ void AMDGPUSwLowerLDS::lowerNonKernelLDSAccesses(
   SetVector<Instruction *> LDSInstructions;
   getLDSMemoryInstructions(Func, LDSInstructions);
 
-  Function *Decl = Intrinsic::getOrInsertDeclaration(
-      &M, Intrinsic::amdgcn_lds_kernel_id, {});
-  auto *KernelId = IRB.CreateCall(Decl, {});
+  auto *KernelId = IRB.CreateIntrinsic(Intrinsic::amdgcn_lds_kernel_id, {}, {});
   GlobalVariable *LDSBaseTable = NKLDSParams.LDSBaseTable;
   GlobalVariable *LDSOffsetTable = NKLDSParams.LDSOffsetTable;
   auto &OrdereLDSGlobals = NKLDSParams.OrdereLDSGlobals;
diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp
index a49dda871dc3..64c0500191e4 100644
--- a/llvm/lib/Target/ARM/ARMISelLowering.cpp
+++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp
@@ -21141,30 +21141,26 @@ bool ARMTargetLowering::isExtractSubvectorCheap(EVT ResVT, EVT SrcVT,
 Instruction *ARMTargetLowering::makeDMB(IRBuilderBase &Builder,
                                         ARM_MB::MemBOpt Domain) const {
-  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-
   // First, if the target has no DMB, see what fallback we can use.
   if (!Subtarget->hasDataBarrier()) {
     // Some ARMv6 cpus can support data barriers with an mcr instruction.
     // Thumb1 and pre-v6 ARM mode use a libcall instead and should never get
     // here.
     if (Subtarget->hasV6Ops() && !Subtarget->isThumb()) {
-      Function *MCR = Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_mcr);
       Value* args[6] = {Builder.getInt32(15), Builder.getInt32(0),
                         Builder.getInt32(0), Builder.getInt32(7),
                         Builder.getInt32(10), Builder.getInt32(5)};
-      return Builder.CreateCall(MCR, args);
+      return Builder.CreateIntrinsic(Intrinsic::arm_mcr, {}, args);
    } else {
       // Instead of using barriers, atomic accesses on these subtargets use
       // libcalls.
       llvm_unreachable("makeDMB on a target so old that it has no barriers");
     }
   } else {
-    Function *DMB = Intrinsic::getOrInsertDeclaration(M, Intrinsic::arm_dmb);
     // Only a full system barrier exists in the M-class architectures.
     Domain = Subtarget->isMClass() ? ARM_MB::SY : Domain;
     Constant *CDomain = Builder.getInt32(Domain);
-    return Builder.CreateCall(DMB, CDomain);
+    return Builder.CreateIntrinsic(Intrinsic::arm_dmb, {}, CDomain);
   }
 }
 
@@ -21417,9 +21413,9 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
   if (ValueTy->getPrimitiveSizeInBits() == 64) {
     Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaexd : Intrinsic::arm_ldrexd;
-    Function *Ldrex = Intrinsic::getOrInsertDeclaration(M, Int);
-    Value *LoHi = Builder.CreateCall(Ldrex, Addr, "lohi");
+    Value *LoHi =
+        Builder.CreateIntrinsic(Int, {}, Addr, /*FMFSource=*/nullptr, "lohi");
 
     Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
     Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
@@ -21433,8 +21429,7 @@ Value *ARMTargetLowering::emitLoadLinked(IRBuilderBase &Builder, Type *ValueTy,
   Type *Tys[] = { Addr->getType() };
   Intrinsic::ID Int = IsAcquire ? Intrinsic::arm_ldaex : Intrinsic::arm_ldrex;
-  Function *Ldrex = Intrinsic::getOrInsertDeclaration(M, Int, Tys);
-  CallInst *CI = Builder.CreateCall(Ldrex, Addr);
+  CallInst *CI = Builder.CreateIntrinsic(Int, Tys, Addr);
 
   CI->addParamAttr(
       0, Attribute::get(M->getContext(), Attribute::ElementType, ValueTy));
@@ -21460,14 +21455,13 @@ Value *ARMTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
   if (Val->getType()->getPrimitiveSizeInBits() == 64) {
     Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlexd : Intrinsic::arm_strexd;
-    Function *Strex = Intrinsic::getOrInsertDeclaration(M, Int);
     Type *Int32Ty = Type::getInt32Ty(M->getContext());
 
     Value *Lo = Builder.CreateTrunc(Val, Int32Ty, "lo");
     Value *Hi = Builder.CreateTrunc(Builder.CreateLShr(Val, 32), Int32Ty, "hi");
     if (!Subtarget->isLittle())
       std::swap(Lo, Hi);
-    return Builder.CreateCall(Strex, {Lo, Hi, Addr});
+    return Builder.CreateIntrinsic(Int, {}, {Lo, Hi, Addr});
   }
 
   Intrinsic::ID Int = IsRelease ? Intrinsic::arm_stlex : Intrinsic::arm_strex;
@@ -21600,14 +21594,13 @@ bool ARMTargetLowering::lowerInterleavedLoad(
     static const Intrinsic::ID LoadInts[3] = {Intrinsic::arm_neon_vld2,
                                               Intrinsic::arm_neon_vld3,
                                               Intrinsic::arm_neon_vld4};
-    Function *VldnFunc = Intrinsic::getOrInsertDeclaration(
-        LI->getModule(), LoadInts[Factor - 2], Tys);
 
     SmallVector Ops;
     Ops.push_back(BaseAddr);
     Ops.push_back(Builder.getInt32(LI->getAlign().value()));
-    return Builder.CreateCall(VldnFunc, Ops, "vldN");
+    return Builder.CreateIntrinsic(LoadInts[Factor - 2], Tys, Ops,
+                                   /*FMFSource=*/nullptr, "vldN");
   } else {
     assert((Factor == 2 || Factor == 4) &&
            "expected interleave factor of 2 or 4 for MVE");
@@ -21615,12 +21608,11 @@ bool ARMTargetLowering::lowerInterleavedLoad(
         Factor == 2 ? Intrinsic::arm_mve_vld2q : Intrinsic::arm_mve_vld4q;
     Type *PtrTy = Builder.getPtrTy(LI->getPointerAddressSpace());
     Type *Tys[] = {VecTy, PtrTy};
-    Function *VldnFunc =
-        Intrinsic::getOrInsertDeclaration(LI->getModule(), LoadInts, Tys);
 
     SmallVector Ops;
     Ops.push_back(BaseAddr);
-    return Builder.CreateCall(VldnFunc, Ops, "vldN");
+    return Builder.CreateIntrinsic(LoadInts, Tys, Ops, /*FMFSource=*/nullptr,
+                                   "vldN");
   }
 };
 
@@ -21761,14 +21753,11 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
     Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
     Type *Tys[] = {PtrTy, SubVecTy};
 
-    Function *VstNFunc = Intrinsic::getOrInsertDeclaration(
-        SI->getModule(), StoreInts[Factor - 2], Tys);
-
     SmallVector Ops;
     Ops.push_back(BaseAddr);
     append_range(Ops, Shuffles);
     Ops.push_back(Builder.getInt32(SI->getAlign().value()));
-    Builder.CreateCall(VstNFunc, Ops);
+    Builder.CreateIntrinsic(StoreInts[Factor - 2], Tys, Ops);
   } else {
     assert((Factor == 2 || Factor == 4) &&
            "expected interleave factor of 2 or 4 for MVE");
@@ -21776,15 +21765,13 @@ bool ARMTargetLowering::lowerInterleavedStore(StoreInst *SI,
         Factor == 2 ? Intrinsic::arm_mve_vst2q : Intrinsic::arm_mve_vst4q;
     Type *PtrTy = Builder.getPtrTy(SI->getPointerAddressSpace());
     Type *Tys[] = {PtrTy, SubVecTy};
-    Function *VstNFunc =
-        Intrinsic::getOrInsertDeclaration(SI->getModule(), StoreInts, Tys);
 
     SmallVector Ops;
     Ops.push_back(BaseAddr);
     append_range(Ops, Shuffles);
     for (unsigned F = 0; F < Factor; F++) {
       Ops.push_back(Builder.getInt32(F));
-      Builder.CreateCall(VstNFunc, Ops);
+      Builder.CreateIntrinsic(StoreInts, Tys, Ops);
       Ops.pop_back();
     }
   }
diff --git a/llvm/lib/Target/ARM/MVETailPredication.cpp b/llvm/lib/Target/ARM/MVETailPredication.cpp
index 60211db8a61a..695eafff1270 100644
--- a/llvm/lib/Target/ARM/MVETailPredication.cpp
+++ b/llvm/lib/Target/ARM/MVETailPredication.cpp
@@ -401,8 +401,7 @@ void MVETailPredication::InsertVCTPIntrinsic(IntrinsicInst *ActiveLaneMask,
   case 8:  VCTPID = Intrinsic::arm_mve_vctp16; break;
   case 16: VCTPID = Intrinsic::arm_mve_vctp8; break;
   }
-  Function *VCTP = Intrinsic::getOrInsertDeclaration(M, VCTPID);
-  Value *VCTPCall = Builder.CreateCall(VCTP, Processed);
+  Value *VCTPCall = Builder.CreateIntrinsic(VCTPID, {}, Processed);
   ActiveLaneMask->replaceAllUsesWith(VCTPCall);
 
   // Add the incoming value to the new phi.
diff --git a/llvm/lib/Target/DirectX/DXILOpLowering.cpp b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
index 99df48508720..c62ba8c21d67 100644
--- a/llvm/lib/Target/DirectX/DXILOpLowering.cpp
+++ b/llvm/lib/Target/DirectX/DXILOpLowering.cpp
@@ -134,9 +134,8 @@ public:
   /// piecemeal way - we can add the casts in to avoid updating all of the uses
   /// or defs, and by the end all of the casts will be redundant.
   Value *createTmpHandleCast(Value *V, Type *Ty) {
-    Function *CastFn = Intrinsic::getOrInsertDeclaration(
-        &M, Intrinsic::dx_cast_handle, {Ty, V->getType()});
-    CallInst *Cast = OpBuilder.getIRB().CreateCall(CastFn, {V});
+    CallInst *Cast = OpBuilder.getIRB().CreateIntrinsic(
+        Intrinsic::dx_cast_handle, {Ty, V->getType()}, {V});
     CleanupCasts.push_back(Cast);
     return Cast;
   }
diff --git a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
index b16ab3931b28..b44519a1286d 100644
--- a/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonGenExtract.cpp
@@ -211,9 +211,8 @@ bool HexagonGenExtract::convert(Instruction *In) {
   IRBuilder<> IRB(In);
   Intrinsic::ID IntId = (BW == 32) ? Intrinsic::hexagon_S2_extractu
                                    : Intrinsic::hexagon_S2_extractup;
-  Module *Mod = BB->getParent()->getParent();
-  Function *ExtF = Intrinsic::getOrInsertDeclaration(Mod, IntId);
-  Value *NewIn = IRB.CreateCall(ExtF, {BF, IRB.getInt32(W), IRB.getInt32(SR)});
+  Value *NewIn =
+      IRB.CreateIntrinsic(IntId, {}, {BF, IRB.getInt32(W), IRB.getInt32(SR)});
   if (SL != 0)
     NewIn = IRB.CreateShl(NewIn, SL, CSL->getName());
   In->replaceAllUsesWith(NewIn);
diff --git a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
index 03c12f5ce447..ab9bc5593677 100644
--- a/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonISelLowering.cpp
@@ -3859,15 +3859,13 @@ void HexagonTargetLowering::AdjustInstrPostInstrSelection(MachineInstr &MI,
 Value *HexagonTargetLowering::emitLoadLinked(IRBuilderBase &Builder,
                                              Type *ValueTy, Value *Addr,
                                              AtomicOrdering Ord) const {
-  BasicBlock *BB = Builder.GetInsertBlock();
-  Module *M = BB->getParent()->getParent();
   unsigned SZ = ValueTy->getPrimitiveSizeInBits();
   assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic loads supported");
   Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_L2_loadw_locked
                                    : Intrinsic::hexagon_L4_loadd_locked;
-  Function *Fn = Intrinsic::getOrInsertDeclaration(M, IntID);
-  Value *Call = Builder.CreateCall(Fn, Addr, "larx");
+  Value *Call =
+      Builder.CreateIntrinsic(IntID, {}, Addr, /*FMFSource=*/nullptr, "larx");
   return Builder.CreateBitCast(Call, ValueTy);
 }
 
@@ -3886,11 +3884,11 @@ Value *HexagonTargetLowering::emitStoreConditional(IRBuilderBase &Builder,
   assert((SZ == 32 || SZ == 64) && "Only 32/64-bit atomic stores supported");
   Intrinsic::ID IntID = (SZ == 32) ? Intrinsic::hexagon_S2_storew_locked
                                    : Intrinsic::hexagon_S4_stored_locked;
-  Function *Fn = Intrinsic::getOrInsertDeclaration(M, IntID);
 
   Val = Builder.CreateBitCast(Val, CastTy);
 
-  Value *Call = Builder.CreateCall(Fn, {Addr, Val}, "stcx");
+  Value *Call = Builder.CreateIntrinsic(IntID, {}, {Addr, Val},
+                                        /*FMFSource=*/nullptr, "stcx");
   Value *Cmp = Builder.CreateICmpEQ(Call, Builder.getInt32(0), "");
   Value *Ext = Builder.CreateZExt(Cmp, Type::getInt32Ty(M->getContext()));
   return Ext;
diff --git a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
index d2cfd3851e71..ce933108b83b 100644
--- a/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
+++ b/llvm/lib/Target/Hexagon/HexagonVectorCombine.cpp
@@ -2390,9 +2390,9 @@ auto HexagonVectorCombine::vralignb(IRBuilderBase &Builder, Value *Lo,
     Type *Int64Ty = Type::getInt64Ty(F.getContext());
     Value *Lo64 = Builder.CreateBitCast(Lo, Int64Ty, "cst");
     Value *Hi64 = Builder.CreateBitCast(Hi, Int64Ty, "cst");
-    Function *FI = Intrinsic::getOrInsertDeclaration(
-        F.getParent(), Intrinsic::hexagon_S2_valignrb);
-    Value *Call = Builder.CreateCall(FI, {Hi64, Lo64, Amt}, "cup");
+    Value *Call = Builder.CreateIntrinsic(Intrinsic::hexagon_S2_valignrb, {},
+                                          {Hi64, Lo64, Amt},
+                                          /*FMFSource=*/nullptr, "cup");
     return Builder.CreateBitCast(Call, Lo->getType(), "cst");
   }
   llvm_unreachable("Unexpected vector length");
@@ -2587,9 +2587,8 @@ auto HexagonVectorCombine::createHvxIntrinsic(IRBuilderBase &Builder,
     unsigned HwLen = HST.getVectorLength();
     Intrinsic::ID TC = HwLen == 64 ? Intrinsic::hexagon_V6_pred_typecast
                                    : Intrinsic::hexagon_V6_pred_typecast_128B;
-    Function *FI = Intrinsic::getOrInsertDeclaration(F.getParent(), TC,
-                                                     {DestTy, Val->getType()});
-    return Builder.CreateCall(FI, {Val}, "cup");
+    return Builder.CreateIntrinsic(TC, {DestTy, Val->getType()}, {Val},
+                                   /*FMFSource=*/nullptr, "cup");
   };
 
   Function *IntrFn =
diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
index 8edca34624e9..fc5f0fc1bf0d 100644
--- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
+++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp
@@ -5807,10 +5807,8 @@ Value *LoongArchTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
   NewVal = Builder.CreateSExt(NewVal, Builder.getInt64Ty());
   Mask = Builder.CreateSExt(Mask, Builder.getInt64Ty());
   Type *Tys[] = {AlignedAddr->getType()};
-  Function *MaskedCmpXchg =
-      Intrinsic::getOrInsertDeclaration(CI->getModule(), CmpXchgIntrID, Tys);
-  Value *Result = Builder.CreateCall(
-      MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, FailureOrdering});
+  Value *Result = Builder.CreateIntrinsic(
+      CmpXchgIntrID, Tys, {AlignedAddr, CmpVal, NewVal, Mask, FailureOrdering});
   Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
   return Result;
 }
diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
index cec1e507f08f..7b07f6b6d151 100644
--- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
+++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp
@@ -12180,9 +12180,7 @@ void PPCTargetLowering::ReplaceNodeResults(SDNode *N,
 //===----------------------------------------------------------------------===//
 
 static Instruction *callIntrinsic(IRBuilderBase &Builder, Intrinsic::ID Id) {
-  Module *M = Builder.GetInsertBlock()->getParent()->getParent();
-  Function *Func = Intrinsic::getOrInsertDeclaration(M, Id);
-  return Builder.CreateCall(Func, {});
+  return Builder.CreateIntrinsic(Id, {}, {});
 }
 
 // The mappings for emitLeading/TrailingFence is taken from
@@ -19002,13 +19000,13 @@ Value *PPCTargetLowering::emitMaskedAtomicRMWIntrinsic(
   Module *M = Builder.GetInsertBlock()->getParent()->getParent();
   Type *ValTy = Incr->getType();
   assert(ValTy->getPrimitiveSizeInBits() == 128);
-  Function *RMW = Intrinsic::getOrInsertDeclaration(
-      M, getIntrinsicForAtomicRMWBinOp128(AI->getOperation()));
   Type *Int64Ty = Type::getInt64Ty(M->getContext());
   Value *IncrLo = Builder.CreateTrunc(Incr, Int64Ty, "incr_lo");
   Value *IncrHi =
       Builder.CreateTrunc(Builder.CreateLShr(Incr, 64), Int64Ty, "incr_hi");
-  Value *LoHi = Builder.CreateCall(RMW, {AlignedAddr, IncrLo, IncrHi});
+  Value *LoHi = Builder.CreateIntrinsic(
+      getIntrinsicForAtomicRMWBinOp128(AI->getOperation()), {},
+      {AlignedAddr, IncrLo, IncrHi});
   Value *Lo = Builder.CreateExtractValue(LoHi, 0, "lo");
   Value *Hi = Builder.CreateExtractValue(LoHi, 1, "hi");
   Lo = Builder.CreateZExt(Lo, ValTy, "lo64");
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index fa14a203913e..952072c26739 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -20719,10 +20719,8 @@ Value *RISCVTargetLowering::emitMaskedAtomicCmpXchgIntrinsic(
     CmpXchgIntrID = Intrinsic::riscv_masked_cmpxchg_i64;
   }
   Type *Tys[] = {AlignedAddr->getType()};
-  Function *MaskedCmpXchg =
-      Intrinsic::getOrInsertDeclaration(CI->getModule(), CmpXchgIntrID, Tys);
-  Value *Result = Builder.CreateCall(
-      MaskedCmpXchg, {AlignedAddr, CmpVal, NewVal, Mask, Ordering});
+  Value *Result = Builder.CreateIntrinsic(
+      CmpXchgIntrID, Tys, {AlignedAddr, CmpVal, NewVal, Mask, Ordering});
   if (XLen == 64)
     Result = Builder.CreateTrunc(Result, Builder.getInt32Ty());
   return Result;
@@ -21335,14 +21333,11 @@ bool RISCVTargetLowering::lowerInterleavedLoad(
   auto *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
 
-  Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
-      LI->getModule(), FixedVlsegIntrIds[Factor - 2],
-      {VTy, LI->getPointerOperandType(), XLenTy});
-
   Value *VL = ConstantInt::get(XLenTy, VTy->getNumElements());
 
-  CallInst *VlsegN =
-      Builder.CreateCall(VlsegNFunc, {LI->getPointerOperand(), VL});
+  CallInst *VlsegN = Builder.CreateIntrinsic(
+      FixedVlsegIntrIds[Factor - 2], {VTy, LI->getPointerOperandType(), XLenTy},
+      {LI->getPointerOperand(), VL});
 
   for (unsigned i = 0; i < Shuffles.size(); i++) {
     Value *SubVec = Builder.CreateExtractValue(VlsegN, Indices[i]);
@@ -21436,11 +21431,11 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
   Type *XLenTy = Type::getIntNTy(LI->getContext(), Subtarget.getXLen());
 
   if (auto *FVTy = dyn_cast<FixedVectorType>(ResVTy)) {
-    Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
-        LI->getModule(), FixedVlsegIntrIds[Factor - 2],
-        {ResVTy, LI->getPointerOperandType(), XLenTy});
     Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements());
-    Return = Builder.CreateCall(VlsegNFunc, {LI->getPointerOperand(), VL});
+    Return =
+        Builder.CreateIntrinsic(FixedVlsegIntrIds[Factor - 2],
+                                {ResVTy, LI->getPointerOperandType(), XLenTy},
+                                {LI->getPointerOperand(), VL});
   } else {
     static const Intrinsic::ID IntrIds[] = {
         Intrinsic::riscv_vlseg2, Intrinsic::riscv_vlseg3,
@@ -21456,21 +21451,19 @@ bool RISCVTargetLowering::lowerDeinterleaveIntrinsicToLoad(
                                          NumElts * SEW / 8),
         Factor);
 
-    Function *VlsegNFunc = Intrinsic::getOrInsertDeclaration(
-        LI->getModule(), IntrIds[Factor - 2], {VecTupTy, XLenTy});
     Value *VL = Constant::getAllOnesValue(XLenTy);
 
-    Value *Vlseg = Builder.CreateCall(
-        VlsegNFunc, {PoisonValue::get(VecTupTy), LI->getPointerOperand(), VL,
-                     ConstantInt::get(XLenTy, Log2_64(SEW))});
+    Value *Vlseg = Builder.CreateIntrinsic(
+        IntrIds[Factor - 2], {VecTupTy, XLenTy},
+        {PoisonValue::get(VecTupTy), LI->getPointerOperand(), VL,
+         ConstantInt::get(XLenTy, Log2_64(SEW))});
 
     SmallVector AggrTypes{Factor, ResVTy};
     Return = PoisonValue::get(StructType::get(LI->getContext(), AggrTypes));
-    Function *VecExtractFunc = Intrinsic::getOrInsertDeclaration(
-        LI->getModule(), Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy});
     for (unsigned i = 0; i < Factor; ++i) {
-      Value *VecExtract =
-          Builder.CreateCall(VecExtractFunc, {Vlseg, Builder.getInt32(i)});
+      Value *VecExtract = Builder.CreateIntrinsic(
+          Intrinsic::riscv_tuple_extract, {ResVTy, VecTupTy},
+          {Vlseg, Builder.getInt32(i)});
       Return = Builder.CreateInsertValue(Return, VecExtract, i);
    }
   }
@@ -21502,12 +21495,11 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
   Type *XLenTy = Type::getIntNTy(SI->getContext(), Subtarget.getXLen());
 
   if (auto *FVTy = dyn_cast<FixedVectorType>(InVTy)) {
-    Function *VssegNFunc = Intrinsic::getOrInsertDeclaration(
-        SI->getModule(), FixedVssegIntrIds[Factor - 2],
-        {InVTy, SI->getPointerOperandType(), XLenTy});
     Value *VL = ConstantInt::get(XLenTy, FVTy->getNumElements());
-    Builder.CreateCall(VssegNFunc, {II->getArgOperand(0), II->getArgOperand(1),
-                                    SI->getPointerOperand(), VL});
+    Builder.CreateIntrinsic(FixedVssegIntrIds[Factor - 2],
+                            {InVTy, SI->getPointerOperandType(), XLenTy},
+                            {II->getArgOperand(0), II->getArgOperand(1),
+                             SI->getPointerOperand(), VL});
   } else {
     static const Intrinsic::ID IntrIds[] = {
         Intrinsic::riscv_vsseg2, Intrinsic::riscv_vsseg3,
@@ -21528,13 +21520,11 @@ bool RISCVTargetLowering::lowerInterleaveIntrinsicToStore(
 
     Value *VL = Constant::getAllOnesValue(XLenTy);
 
-    Function *VecInsertFunc = Intrinsic::getOrInsertDeclaration(
-        SI->getModule(), Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy});
     Value *StoredVal = PoisonValue::get(VecTupTy);
     for (unsigned i = 0; i < Factor; ++i)
-      StoredVal =
-          Builder.CreateCall(VecInsertFunc, {StoredVal, II->getArgOperand(i),
-                                             Builder.getInt32(i)});
+      StoredVal = Builder.CreateIntrinsic(
+          Intrinsic::riscv_tuple_insert, {VecTupTy, InVTy},
+          {StoredVal, II->getArgOperand(i), Builder.getInt32(i)});
 
     Builder.CreateCall(VssegNFunc, {StoredVal, SI->getPointerOperand(), VL,
                                     ConstantInt::get(XLenTy, Log2_64(SEW))});
diff --git a/llvm/lib/Target/SystemZ/SystemZTDC.cpp b/llvm/lib/Target/SystemZ/SystemZTDC.cpp
index 345327e880ec..c351c31b0a79 100644
--- a/llvm/lib/Target/SystemZ/SystemZTDC.cpp
+++ b/llvm/lib/Target/SystemZ/SystemZTDC.cpp
@@ -366,11 +366,10 @@ bool SystemZTDCPass::runOnFunction(Function &F) {
     if (!Worthy)
       continue;
     // Call the intrinsic, compare result with 0.
-    Function *TDCFunc = Intrinsic::getOrInsertDeclaration(
-        &M, Intrinsic::s390_tdc, V->getType());
     IRBuilder<> IRB(I);
     Value *MaskVal = ConstantInt::get(Type::getInt64Ty(Ctx), Mask);
-    Instruction *TDC = IRB.CreateCall(TDCFunc, {V, MaskVal});
+    Instruction *TDC =
+        IRB.CreateIntrinsic(Intrinsic::s390_tdc, V->getType(), {V, MaskVal});
     Value *ICmp = IRB.CreateICmp(CmpInst::ICMP_NE, TDC, Zero32);
     I->replaceAllUsesWith(ICmp);
   }
diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp
index aa6e75cbf410..db633d10edc4 100644
--- a/llvm/lib/Target/X86/X86ISelLowering.cpp
+++ b/llvm/lib/Target/X86/X86ISelLowering.cpp
@@ -31190,7 +31190,6 @@ void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
   LLVMContext &Ctx = AI->getContext();
   Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
                                           PointerType::getUnqual(Ctx));
-  Function *BitTest = nullptr;
   Value *Result = nullptr;
   auto BitTested = FindSingleBitChange(AI->getValOperand());
   assert(BitTested.first != nullptr);
@@ -31198,15 +31197,10 @@ void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
   if (BitTested.second == ConstantBit || BitTested.second == NotConstantBit) {
     auto *C = cast<ConstantInt>(I->getOperand(I->getOperand(0) == AI ? 1 : 0));
 
-    BitTest = Intrinsic::getOrInsertDeclaration(AI->getModule(), IID_C,
-                                                AI->getType());
-
     unsigned Imm = llvm::countr_zero(C->getZExtValue());
-    Result = Builder.CreateCall(BitTest, {Addr, Builder.getInt8(Imm)});
+    Result = Builder.CreateIntrinsic(IID_C, AI->getType(),
+                                     {Addr, Builder.getInt8(Imm)});
   } else {
-    BitTest = Intrinsic::getOrInsertDeclaration(AI->getModule(), IID_I,
-                                                AI->getType());
-
     assert(BitTested.second == ShiftBit || BitTested.second == NotShiftBit);
 
     Value *SI = BitTested.first;
@@ -31223,7 +31217,7 @@ void X86TargetLowering::emitBitTestAtomicRMWIntrinsic(AtomicRMWInst *AI) const {
     // << (X % sizeof_bits(X)) we can drop the shift mask and AGEN in
     // favor of just a raw BT{S|R|C}.
-    Result = Builder.CreateCall(BitTest, {Addr, BitPos});
+    Result = Builder.CreateIntrinsic(IID_I, AI->getType(), {Addr, BitPos});
     Result = Builder.CreateZExtOrTrunc(Result, AI->getType());
 
   // If the result is only used for zero/non-zero status then we don't need to
@@ -31364,12 +31358,11 @@ void X86TargetLowering::emitCmpArithAtomicRMWIntrinsic(
     IID = Intrinsic::x86_atomic_xor_cc;
     break;
   }
-  Function *CmpArith =
-      Intrinsic::getOrInsertDeclaration(AI->getModule(), IID, AI->getType());
   Value *Addr = Builder.CreatePointerCast(AI->getPointerOperand(),
                                           PointerType::getUnqual(Ctx));
-  Value *Call = Builder.CreateCall(
-      CmpArith, {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
+  Value *Call = Builder.CreateIntrinsic(
+      IID, AI->getType(),
+      {Addr, AI->getValOperand(), Builder.getInt32((unsigned)CC)});
   Value *Result = Builder.CreateTrunc(Call, Type::getInt1Ty(Ctx));
   ICI->replaceAllUsesWith(Result);
   ICI->eraseFromParent();
diff --git a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
index c4374984da4b..7c9738bf0821 100644
--- a/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
+++ b/llvm/lib/Target/X86/X86InstCombineIntrinsic.cpp
@@ -1875,10 +1875,7 @@ static Value *simplifyX86extrq(IntrinsicInst &II, Value *Op0,
   // If we were an EXTRQ call, we'll save registers if we convert to EXTRQI.
   if (II.getIntrinsicID() == Intrinsic::x86_sse4a_extrq) {
     Value *Args[] = {Op0, CILength, CIIndex};
-    Module *M = II.getModule();
-    Function *F =
-        Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse4a_extrqi);
-    return Builder.CreateCall(F, Args);
+    return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_extrqi, {}, Args);
   }
 }
 
@@ -1975,10 +1972,7 @@ static Value *simplifyX86insertq(IntrinsicInst &II, Value *Op0, Value *Op1,
     Constant *CIIndex = ConstantInt::get(IntTy8, Index, false);
 
     Value *Args[] = {Op0, Op1, CILength, CIIndex};
-    Module *M = II.getModule();
-    Function *F =
-        Intrinsic::getOrInsertDeclaration(M, Intrinsic::x86_sse4a_insertqi);
-    return Builder.CreateCall(F, Args);
+    return Builder.CreateIntrinsic(Intrinsic::x86_sse4a_insertqi, {}, Args);
   }
 
   return nullptr;
diff --git a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
index 3604774ddf35..62461d68ca15 100644
--- a/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
+++ b/llvm/lib/Target/XCore/XCoreLowerThreadLocal.cpp
@@ -157,9 +157,7 @@ bool XCoreLowerThreadLocal::lowerGlobal(GlobalVariable *GV) {
   for (User *U : Users) {
     Instruction *Inst = cast<Instruction>(U);
     IRBuilder<> Builder(Inst);
-    Function *GetID = Intrinsic::getOrInsertDeclaration(GV->getParent(),
-                                                        Intrinsic::xcore_getid);
-    Value *ThreadID = Builder.CreateCall(GetID, {});
+    Value *ThreadID = Builder.CreateIntrinsic(Intrinsic::xcore_getid, {}, {});
     Value *Addr = Builder.CreateInBoundsGEP(NewGV->getValueType(), NewGV,
                                             {Builder.getInt64(0), ThreadID});
     U->replaceUsesOfWith(GV, Addr);
diff --git a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
index 898d55fab2b0..b5b561797f75 100644
--- a/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
+++ b/llvm/lib/Transforms/AggressiveInstCombine/AggressiveInstCombine.cpp
@@ -172,9 +172,8 @@ static bool foldGuardedFunnelShift(Instruction &I, const DominatorTree &DT) {
   // %cond = phi i32 [ %fsh, %FunnelBB ], [ %ShVal0, %GuardBB ]
   // -->
   // llvm.fshl.i32(i32 %ShVal0, i32 %ShVal1, i32 %ShAmt)
-  Function *F =
-      Intrinsic::getOrInsertDeclaration(Phi.getModule(), IID, Phi.getType());
-  Phi.replaceAllUsesWith(Builder.CreateCall(F, {ShVal0, ShVal1, ShAmt}));
+  Phi.replaceAllUsesWith(
+      Builder.CreateIntrinsic(IID, Phi.getType(), {ShVal0, ShVal1, ShAmt}));
   return true;
 }
 
@@ -332,9 +331,8 @@ static bool tryToRecognizePopCount(Instruction &I) {
                       m_SpecificInt(Mask55)))) {
       LLVM_DEBUG(dbgs() << "Recognized popcount intrinsic\n");
       IRBuilder<> Builder(&I);
-      Function *Func = Intrinsic::getOrInsertDeclaration(
-          I.getModule(), Intrinsic::ctpop, I.getType());
-      I.replaceAllUsesWith(Builder.CreateCall(Func, {Root}));
+      I.replaceAllUsesWith(
+          Builder.CreateIntrinsic(Intrinsic::ctpop, I.getType(), {Root}));
       ++NumPopCountRecognized;
       return true;
     }
@@ -399,9 +397,8 @@ static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) {
     return false;
 
   IRBuilder<> Builder(&I);
-  Function *Fn = Intrinsic::getOrInsertDeclaration(
-      I.getModule(), Intrinsic::fptosi_sat, {SatTy, FpTy});
-  Value *Sat = Builder.CreateCall(Fn, In);
+  Value *Sat =
+      Builder.CreateIntrinsic(Intrinsic::fptosi_sat, {SatTy, FpTy}, In);
   I.replaceAllUsesWith(Builder.CreateSExt(Sat, IntTy));
   return true;
 }
@@ -412,9 +409,6 @@ static bool tryToFPToSat(Instruction &I, TargetTransformInfo &TTI) {
 static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
                      TargetLibraryInfo &TLI, AssumptionCache &AC,
                      DominatorTree &DT) {
-
-  Module *M = Call->getModule();
-
   // If (1) this is a sqrt libcall, (2) we can assume that NAN is not created
   // (because NNAN or the operand arg must not be less than -0.0) and (2) we
   // would not end up lowering to a libcall anyway (which could change the value
@@ -432,8 +426,8 @@ static bool foldSqrt(CallInst *Call, LibFunc Func, TargetTransformInfo &TTI,
     IRBuilderBase::FastMathFlagGuard Guard(Builder);
     Builder.setFastMathFlags(Call->getFastMathFlags());
 
-    Function *Sqrt = Intrinsic::getOrInsertDeclaration(M, Intrinsic::sqrt, Ty);
-    Value *NewSqrt = Builder.CreateCall(Sqrt, Arg, "sqrt");
+    Value *NewSqrt = Builder.CreateIntrinsic(Intrinsic::sqrt, Ty, Arg,
+                                             /*FMFSource=*/nullptr, "sqrt");
     Call->replaceAllUsesWith(NewSqrt);
 
     // Explicitly erase the old call because a call with side effects is not
diff --git a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
index 9e5d9ea31af6..e8c18435bfc6 100644
--- a/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
+++ b/llvm/lib/Transforms/IPO/CrossDSOCFI.cpp
@@ -125,12 +125,11 @@ void CrossDSOCFI::buildCFICheck(Module &M) {
     ConstantInt *CaseTypeId = ConstantInt::get(Type::getInt64Ty(Ctx), TypeId);
     BasicBlock *TestBB = BasicBlock::Create(Ctx, "test", F);
     IRBuilder<> IRBTest(TestBB);
-    Function *BitsetTestFn =
-        Intrinsic::getOrInsertDeclaration(&M, Intrinsic::type_test);
 
-    Value *Test = IRBTest.CreateCall(
-        BitsetTestFn, {&Addr, MetadataAsValue::get(
-                                  Ctx, ConstantAsMetadata::get(CaseTypeId))});
+    Value *Test = IRBTest.CreateIntrinsic(
+        Intrinsic::type_test, {},
+        {&Addr,
+         MetadataAsValue::get(Ctx, ConstantAsMetadata::get(CaseTypeId))});
     BranchInst *BI = IRBTest.CreateCondBr(Test, ExitBB, TrapBB);
     BI->setMetadata(LLVMContext::MD_prof, VeryLikelyWeights);
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
index 8112255a0b6c..d72013ba223d 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineAndOrXor.cpp
@@ -3105,8 +3105,7 @@ static Instruction *matchOrConcat(Instruction &Or,
     Value *NewUpper = Builder.CreateZExt(Hi, Ty);
     NewUpper = Builder.CreateShl(NewUpper, HalfWidth);
     Value *BinOp = Builder.CreateOr(NewLower, NewUpper);
-    Function *F = Intrinsic::getOrInsertDeclaration(Or.getModule(), id, Ty);
-    return Builder.CreateCall(F, BinOp);
+    return Builder.CreateIntrinsic(id, Ty, BinOp);
   };
 
   // BSWAP: Push the concat down, swapping the lower/upper sources.
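One recurring detail in the hunks above and below: several call sites pass
/*FMFSource=*/nullptr purely to reach the trailing name parameter. The
IRBuilderBase overload involved has roughly this shape (paraphrased here, not
quoted from this patch; see llvm/include/llvm/IR/IRBuilder.h for the
authoritative declaration):

    CallInst *CreateIntrinsic(Intrinsic::ID ID, ArrayRef<Type *> Types,
                              ArrayRef<Value *> Args,
                              Instruction *FMFSource = nullptr,
                              const Twine &Name = "");

When FMFSource is non-null, fast-math flags are copied from that instruction
onto the new call; passing nullptr keeps the default behavior and simply lets
the Name argument be supplied positionally.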
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
index 3cc50ee6e233..37841e91821c 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp
@@ -647,9 +647,8 @@ static Instruction *foldCtpop(IntrinsicInst &II, InstCombinerImpl &IC) {
   // ctpop(x | -x) -> bitwidth - cttz(x, false)
   if (Op0->hasOneUse() &&
      match(Op0, m_c_Or(m_Value(X), m_Neg(m_Deferred(X))))) {
-    Function *F =
-        Intrinsic::getOrInsertDeclaration(II.getModule(), Intrinsic::cttz, Ty);
-    auto *Cttz = IC.Builder.CreateCall(F, {X, IC.Builder.getFalse()});
+    auto *Cttz = IC.Builder.CreateIntrinsic(Intrinsic::cttz, Ty,
+                                            {X, IC.Builder.getFalse()});
     auto *Bw = ConstantInt::get(Ty, APInt(BitWidth, BitWidth));
     return IC.replaceInstUsesWith(II, IC.Builder.CreateSub(Bw, Cttz));
   }
@@ -1182,11 +1181,9 @@ Instruction *InstCombinerImpl::matchSAddSubSat(IntrinsicInst &MinMax1) {
     return nullptr;
 
   // Finally create and return the sat intrinsic, truncated to the new type
-  Function *F = Intrinsic::getOrInsertDeclaration(MinMax1.getModule(),
-                                                  IntrinsicID, NewTy);
   Value *AT = Builder.CreateTrunc(AddSub->getOperand(0), NewTy);
   Value *BT = Builder.CreateTrunc(AddSub->getOperand(1), NewTy);
-  Value *Sat = Builder.CreateCall(F, {AT, BT});
+  Value *Sat = Builder.CreateIntrinsic(IntrinsicID, NewTy, {AT, BT});
   return CastInst::Create(Instruction::SExt, Sat, Ty);
 }
 
diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
index fb6d7a72f2f6..72ebd9fbb6d9 100644
--- a/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
+++ b/llvm/lib/Transforms/InstCombine/InstCombineCompares.cpp
@@ -4790,12 +4790,10 @@ Value *InstCombinerImpl::foldMultiplicationOverflowCheck(ICmpInst &I) {
   if (MulHadOtherUses)
     Builder.SetInsertPoint(Mul);
 
-  Function *F = Intrinsic::getOrInsertDeclaration(
-      I.getModule(),
+  CallInst *Call = Builder.CreateIntrinsic(
       Div->getOpcode() == Instruction::UDiv ? Intrinsic::umul_with_overflow
                                             : Intrinsic::smul_with_overflow,
-      X->getType());
-  CallInst *Call = Builder.CreateCall(F, {X, Y}, "mul");
+      X->getType(), {X, Y}, /*FMFSource=*/nullptr, "mul");
 
   // If the multiplication was used elsewhere, to ensure that we don't leave
   // "duplicate" instructions, replace uses of that original multiplication
@@ -6334,9 +6332,9 @@ static Instruction *processUMulZExtIdiom(ICmpInst &I, Value *MulVal,
     MulA = Builder.CreateZExt(A, MulType);
   if (WidthB < MulWidth)
     MulB = Builder.CreateZExt(B, MulType);
-  Function *F = Intrinsic::getOrInsertDeclaration(
-      I.getModule(), Intrinsic::umul_with_overflow, MulType);
-  CallInst *Call = Builder.CreateCall(F, {MulA, MulB}, "umul");
+  CallInst *Call =
+      Builder.CreateIntrinsic(Intrinsic::umul_with_overflow, MulType,
+                              {MulA, MulB}, /*FMFSource=*/nullptr, "umul");
   IC.addToWorklist(MulInstr);
 
   // If there are uses of mul result other than the comparison, we know that
diff --git a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
index 55e9903876b1..cb8458831849 100644
--- a/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/AddressSanitizer.cpp
@@ -1109,11 +1109,8 @@ struct FunctionStackPoisoner : public InstVisitor<FunctionStackPoisoner> {
     // alloca. We have a special @llvm.get.dynamic.area.offset intrinsic for
     // this purpose.
    if (!isa<ReturnInst>(InstBefore)) {
-      Function *DynamicAreaOffsetFunc = Intrinsic::getOrInsertDeclaration(
-          InstBefore->getModule(), Intrinsic::get_dynamic_area_offset,
-          {IntptrTy});
-
-      Value *DynamicAreaOffset = IRB.CreateCall(DynamicAreaOffsetFunc, {});
+      Value *DynamicAreaOffset = IRB.CreateIntrinsic(
+          Intrinsic::get_dynamic_area_offset, {IntptrTy}, {});
 
       DynamicAreaPtr = IRB.CreateAdd(IRB.CreatePtrToInt(SavedStack, IntptrTy),
                                      DynamicAreaOffset);
diff --git a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
index 63d580d2b9d5..8b857d421f29 100644
--- a/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
+++ b/llvm/lib/Transforms/Instrumentation/BoundsChecking.cpp
@@ -194,14 +194,13 @@ static bool addBoundsChecking(Function &F, TargetLibraryInfo &TLI,
     IRB.SetInsertPoint(TrapBB);
 
     Intrinsic::ID IntrID = DebugTrapBB ? Intrinsic::ubsantrap : Intrinsic::trap;
-    auto *F = Intrinsic::getOrInsertDeclaration(Fn->getParent(), IntrID);
 
     CallInst *TrapCall;
     if (DebugTrapBB) {
-      TrapCall =
-          IRB.CreateCall(F, ConstantInt::get(IRB.getInt8Ty(), Fn->size()));
+      TrapCall = IRB.CreateIntrinsic(
+          IntrID, {}, ConstantInt::get(IRB.getInt8Ty(), Fn->size()));
     } else {
-      TrapCall = IRB.CreateCall(F, {});
+      TrapCall = IRB.CreateIntrinsic(IntrID, {}, {});
     }
 
     TrapCall->setDoesNotReturn();
diff --git a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
index 19ec97c17f31..9e174e2415e7 100644
--- a/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
+++ b/llvm/lib/Transforms/Instrumentation/MemorySanitizer.cpp
@@ -2853,9 +2853,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     Value *S2Conv =
         IRB.CreateSExt(IRB.CreateICmpNE(S2, getCleanShadow(S2)), S2->getType());
     Value *V2 = I.getOperand(2);
-    Function *Intrin = Intrinsic::getOrInsertDeclaration(
-        I.getModule(), I.getIntrinsicID(), S2Conv->getType());
-    Value *Shift = IRB.CreateCall(Intrin, {S0, S1, V2});
+    Value *Shift = IRB.CreateIntrinsic(I.getIntrinsicID(), S2Conv->getType(),
+                                       {S0, S1, V2});
     setShadow(&I, IRB.CreateOr(Shift, S2Conv));
     setOriginForNaryOp(I);
   }
@@ -3057,9 +3056,8 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
     IRBuilder<> IRB(&I);
     Value *Op = I.getArgOperand(0);
     Type *OpType = Op->getType();
-    Function *BswapFunc = Intrinsic::getOrInsertDeclaration(
-        F.getParent(), Intrinsic::bswap, ArrayRef(&OpType, 1));
-    setShadow(&I, IRB.CreateCall(BswapFunc, getShadow(Op)));
+    setShadow(&I, IRB.CreateIntrinsic(Intrinsic::bswap, ArrayRef(&OpType, 1),
+                                      getShadow(Op)));
     setOrigin(&I, getOrigin(Op));
   }
 
@@ -3287,11 +3285,9 @@ struct MemorySanitizerVisitor : public InstVisitor<MemorySanitizerVisitor> {
       S2_ext = IRB.CreateBitCast(S2_ext, getMMXVectorTy(64));
     }
 
-    Function *ShadowFn = Intrinsic::getOrInsertDeclaration(
-        F.getParent(), getSignedPackIntrinsic(I.getIntrinsicID()));
-
-    Value *S =
-        IRB.CreateCall(ShadowFn, {S1_ext, S2_ext}, "_msprop_vector_pack");
+    Value *S = IRB.CreateIntrinsic(getSignedPackIntrinsic(I.getIntrinsicID()),
+                                   {}, {S1_ext, S2_ext}, /*FMFSource=*/nullptr,
+                                   "_msprop_vector_pack");
     if (MMXEltSizeInBits)
       S = IRB.CreateBitCast(S, getShadowTy(&I));
     setShadow(&I, S);
diff --git a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
index 8130a719691b..f7461127ec51 100644
--- a/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
+++ b/llvm/lib/Transforms/Instrumentation/SanitizerCoverage.cpp
@@ -1056,11 +1056,10 @@ void ModuleSanitizerCoverage::InjectCoverageAtBlock(Function &F, BasicBlock &BB,
   if (Options.StackDepth && IsEntryBB && !IsLeafFunc) {
     // Check stack depth. If it's the deepest so far, record it.
     Module *M = F.getParent();
-    Function *GetFrameAddr = Intrinsic::getOrInsertDeclaration(
-        M, Intrinsic::frameaddress,
-        IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace()));
-    auto FrameAddrPtr =
-        IRB.CreateCall(GetFrameAddr, {Constant::getNullValue(Int32Ty)});
+    auto FrameAddrPtr = IRB.CreateIntrinsic(
+        Intrinsic::frameaddress,
+        IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace()),
+        {Constant::getNullValue(Int32Ty)});
     auto FrameAddrInt = IRB.CreatePtrToInt(FrameAddrPtr, IntptrTy);
     auto LowestStack = IRB.CreateLoad(IntptrTy, SanCovLowestStack);
     auto IsStackLower = IRB.CreateICmpULT(FrameAddrInt, LowestStack);
diff --git a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
index 4043c0e9a7dd..1050cbaa07b8 100644
--- a/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopDataPrefetch.cpp
@@ -403,15 +403,11 @@ bool LoopDataPrefetch::runOnLoop(Loop *L) {
       Value *PrefPtrValue = SCEVE.expandCodeFor(NextLSCEV, I8Ptr, P.InsertPt);
 
       IRBuilder<> Builder(P.InsertPt);
-      Module *M = BB->getParent()->getParent();
      Type *I32 = Type::getInt32Ty(BB->getContext());
-      Function *PrefetchFunc = Intrinsic::getOrInsertDeclaration(
-          M, Intrinsic::prefetch, PrefPtrValue->getType());
-      Builder.CreateCall(
-          PrefetchFunc,
-          {PrefPtrValue,
-           ConstantInt::get(I32, P.Writes),
-           ConstantInt::get(I32, 3), ConstantInt::get(I32, 1)});
+      Builder.CreateIntrinsic(Intrinsic::prefetch, PrefPtrValue->getType(),
+                              {PrefPtrValue, ConstantInt::get(I32, P.Writes),
+                               ConstantInt::get(I32, 3),
+                               ConstantInt::get(I32, 1)});
       ++NumPrefetches;
       LLVM_DEBUG(dbgs() << "  Access: "
                         << *P.MemI->getOperand(isa<LoadInst>(P.MemI) ? 0 : 1)
diff --git a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
index 30369ed7c245..f3e992c03917 100644
--- a/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopFlatten.cpp
@@ -978,10 +978,10 @@ static bool FlattenLoopPair(FlattenInfo &FI, DominatorTree *DT, LoopInfo *LI,
     assert(match(Br->getCondition(), m_Zero()) &&
           "Expected branch condition to be false");
     IRBuilder<> Builder(Br);
-    Function *F = Intrinsic::getOrInsertDeclaration(
-        M, Intrinsic::umul_with_overflow, FI.OuterTripCount->getType());
-    Value *Call = Builder.CreateCall(F, {FI.OuterTripCount, FI.InnerTripCount},
-                                     "flatten.mul");
+    Value *Call = Builder.CreateIntrinsic(
+        Intrinsic::umul_with_overflow, FI.OuterTripCount->getType(),
+        {FI.OuterTripCount, FI.InnerTripCount},
+        /*FMFSource=*/nullptr, "flatten.mul");
     FI.NewTripCount = Builder.CreateExtractValue(Call, 0, "flatten.tripcount");
     Value *Overflow = Builder.CreateExtractValue(Call, 1, "flatten.overflow");
     Br->setCondition(Overflow);
diff --git a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
index 56006d9ae692..2052fc6dadd0 100644
--- a/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
+++ b/llvm/lib/Transforms/Scalar/LoopIdiomRecognize.cpp
@@ -2121,9 +2121,7 @@ static CallInst *createPopcntIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
   Value *Ops[] = {Val};
   Type *Tys[] = {Val->getType()};
 
-  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
-  Function *Func = Intrinsic::getOrInsertDeclaration(M, Intrinsic::ctpop, Tys);
-  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+  CallInst *CI = IRBuilder.CreateIntrinsic(Intrinsic::ctpop, Tys, Ops);
   CI->setDebugLoc(DL);
 
   return CI;
@@ -2135,9 +2133,7 @@ static CallInst *createFFSIntrinsic(IRBuilder<> &IRBuilder, Value *Val,
   Value *Ops[] = {Val, IRBuilder.getInt1(ZeroCheck)};
   Type *Tys[] = {Val->getType()};
 
-  Module *M = IRBuilder.GetInsertBlock()->getParent()->getParent();
-  Function *Func = Intrinsic::getOrInsertDeclaration(M, IID, Tys);
-  CallInst *CI = IRBuilder.CreateCall(Func, Ops);
+  CallInst *CI = IRBuilder.CreateIntrinsic(IID, Tys, Ops);
   CI->setDebugLoc(DL);
 
   return CI;
diff --git a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
index f2ea9f8faf84..e323b391179e 100644
--- a/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
+++ b/llvm/lib/Transforms/Scalar/LowerMatrixIntrinsics.cpp
@@ -1290,9 +1290,8 @@ public:
     if (AllowContraction) {
       // Use fmuladd for floating point operations and let the backend decide
      // if that's profitable.
- Function *FMulAdd = Intrinsic::getOrInsertDeclaration( - Func.getParent(), Intrinsic::fmuladd, A->getType()); - return Builder.CreateCall(FMulAdd, {A, B, Sum}); + return Builder.CreateIntrinsic(Intrinsic::fmuladd, A->getType(), + {A, B, Sum}); } NumComputeOps += getNumOps(A->getType()); Value *Mul = Builder.CreateFMul(A, B); diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 55ad2b6d6200..13eb588e46de 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -2057,7 +2057,6 @@ void llvm::updateProfileCallee( static void inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, const SmallVectorImpl<ReturnInst *> &Returns) { - Module *Mod = CB.getModule(); assert(objcarc::isRetainOrClaimRV(RVCallKind) && "unexpected ARC function"); bool IsRetainRV = RVCallKind == objcarc::ARCInstKind::RetainRV, IsUnsafeClaimRV = !IsRetainRV; @@ -2089,9 +2088,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, // call. if (IsUnsafeClaimRV) { Builder.SetInsertPoint(II); - Function *IFn = - Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::objc_release); - Builder.CreateCall(IFn, RetOpnd, ""); + Builder.CreateIntrinsic(Intrinsic::objc_release, {}, RetOpnd); } II->eraseFromParent(); InsertRetainCall = false; @@ -2125,9 +2122,7 @@ inlineRetainOrClaimRVCalls(CallBase &CB, objcarc::ARCInstKind RVCallKind, // matching autoreleaseRV or an annotated call in the callee. Emit a call // to objc_retain. Builder.SetInsertPoint(RI); - Function *IFn = - Intrinsic::getOrInsertDeclaration(Mod, Intrinsic::objc_retain); - Builder.CreateCall(IFn, RetOpnd, ""); + Builder.CreateIntrinsic(Intrinsic::objc_retain, {}, RetOpnd); } } } diff --git a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp index 77abf160dc70..cccb9dae17df 100644 --- a/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp +++ b/llvm/lib/Transforms/Utils/MemoryTaggingSupport.cpp @@ -268,12 +268,11 @@ bool isLifetimeIntrinsic(Value *V) { Value *readRegister(IRBuilder<> &IRB, StringRef Name) { Module *M = IRB.GetInsertBlock()->getParent()->getParent(); - Function *ReadRegister = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::read_register, IRB.getIntPtrTy(M->getDataLayout())); MDNode *MD = MDNode::get(M->getContext(), {MDString::get(M->getContext(), Name)}); Value *Args[] = {MetadataAsValue::get(M->getContext(), MD)}; - return IRB.CreateCall(ReadRegister, Args); + return IRB.CreateIntrinsic(Intrinsic::read_register, + IRB.getIntPtrTy(M->getDataLayout()), Args); } Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB) { @@ -287,12 +286,10 @@ Value *getPC(const Triple &TargetTriple, IRBuilder<> &IRB) { Value *getFP(IRBuilder<> &IRB) { Function *F = IRB.GetInsertBlock()->getParent(); Module *M = F->getParent(); - auto *GetStackPointerFn = Intrinsic::getOrInsertDeclaration( - M, Intrinsic::frameaddress, - IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace())); return IRB.CreatePtrToInt( - IRB.CreateCall(GetStackPointerFn, - {Constant::getNullValue(IRB.getInt32Ty())}), + IRB.CreateIntrinsic(Intrinsic::frameaddress, + IRB.getPtrTy(M->getDataLayout().getAllocaAddrSpace()), + {Constant::getNullValue(IRB.getInt32Ty())}), IRB.getIntPtrTy(M->getDataLayout())); } diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index bc619e5098e0..c412d0398b95 100644 ---
a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -2139,10 +2139,9 @@ Value *SCEVExpander::generateOverflowCheck(const SCEVAddRecExpr *AR, MulV = TruncTripCount; OfMul = ConstantInt::getFalse(MulV->getContext()); } else { - auto *MulF = Intrinsic::getOrInsertDeclaration( - Loc->getModule(), Intrinsic::umul_with_overflow, Ty); - CallInst *Mul = - Builder.CreateCall(MulF, {AbsStep, TruncTripCount}, "mul"); + CallInst *Mul = Builder.CreateIntrinsic(Intrinsic::umul_with_overflow, Ty, + {AbsStep, TruncTripCount}, + /*FMFSource=*/nullptr, "mul"); MulV = Builder.CreateExtractValue(Mul, 0, "mul.result"); OfMul = Builder.CreateExtractValue(Mul, 1, "mul.overflow"); } diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index db2acb9eed09..cb4ef87de1c5 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -1958,10 +1958,9 @@ static Value *optimizeDoubleFP(CallInst *CI, IRBuilderBase &B, // g((double) float) -> (double) gf(float) Value *R; if (IsIntrinsic) { - Module *M = CI->getModule(); Intrinsic::ID IID = CalleeFn->getIntrinsicID(); - Function *Fn = Intrinsic::getOrInsertDeclaration(M, IID, B.getFloatTy()); - R = isBinary ? B.CreateCall(Fn, V) : B.CreateCall(Fn, V[0]); + R = isBinary ? B.CreateIntrinsic(IID, B.getFloatTy(), V) + : B.CreateIntrinsic(IID, B.getFloatTy(), V[0]); } else { AttributeList CalleeAttrs = CalleeFn->getAttributes(); R = isBinary ? emitBinaryFloatFnCall(V[0], V[1], TLI, CalleeName, B, -- GitLab From 4897fc44a918b8da886d48082b6cf004cf3ffe0b Mon Sep 17 00:00:00 2001 From: Jonas Devlieghere Date: Thu, 17 Oct 2024 08:22:56 -0700 Subject: [PATCH 271/329] [lldb] Narrow scope of -Wno-deprecated-declarations (NFC) (#112276) Currently all of LLDB is being compiled with -Wno-deprecated-declarations. That's not desirable, especially as part of the LLVM monorepo, as we miss deprecation warnings from LLVM and clang. According to the git history, this was first introduced to suppress warnings related to auto_ptr. Since then, other things have been deprecated and gone unnoticed. This patch limits the flag to Host.mm which uses a handful of LSApplication headers that have no replacement. 
rdar://112040718 --- lldb/cmake/modules/LLDBConfig.cmake | 2 -- lldb/source/Host/macosx/objcxx/CMakeLists.txt | 4 +++- 2 files changed, 3 insertions(+), 3 deletions(-) diff --git a/lldb/cmake/modules/LLDBConfig.cmake b/lldb/cmake/modules/LLDBConfig.cmake index a60921990cf7..93ccd9c479c2 100644 --- a/lldb/cmake/modules/LLDBConfig.cmake +++ b/lldb/cmake/modules/LLDBConfig.cmake @@ -188,7 +188,6 @@ include_directories("${CMAKE_CURRENT_BINARY_DIR}/../clang/include") if (LLVM_COMPILER_IS_GCC_COMPATIBLE) # Disable GCC warnings - append("-Wno-deprecated-declarations" CMAKE_CXX_FLAGS) append("-Wno-unknown-pragmas" CMAKE_CXX_FLAGS) append("-Wno-strict-aliasing" CMAKE_CXX_FLAGS) @@ -198,7 +197,6 @@ endif() # Disable Clang warnings if (CMAKE_CXX_COMPILER_ID MATCHES "Clang") - append("-Wno-deprecated-register" CMAKE_CXX_FLAGS) append("-Wno-vla-extension" CMAKE_CXX_FLAGS) endif() diff --git a/lldb/source/Host/macosx/objcxx/CMakeLists.txt b/lldb/source/Host/macosx/objcxx/CMakeLists.txt index 273999f24380..1e693bed12ce 100644 --- a/lldb/source/Host/macosx/objcxx/CMakeLists.txt +++ b/lldb/source/Host/macosx/objcxx/CMakeLists.txt @@ -16,4 +16,6 @@ add_lldb_library(lldbHostMacOSXObjCXX NO_PLUGIN_DEPENDENCIES TargetParser ) -target_compile_options(lldbHostMacOSXObjCXX PRIVATE -fno-objc-exceptions) +target_compile_options(lldbHostMacOSXObjCXX PRIVATE + -fno-objc-exceptions + -Wno-deprecated-declarations) -- GitLab From 98b419ca7688aa2823df1e87f58051aaa8d9c37f Mon Sep 17 00:00:00 2001 From: Pavel Labath Date: Thu, 17 Oct 2024 17:29:38 +0200 Subject: [PATCH 272/329] [lldb] Don't exit the main loop when it runs out of things to listen on (#112565) This behavior made sense in the beginning as the class was completely single threaded, so if the source count ever reached zero, there was no way to add new ones. In https://reviews.llvm.org/D131160, the class gained the ability to add events (callbacks) from other threads, which means that is no longer the case (and indeed, one possible use case for this class -- acting as a sort of arbiter for multiple threads wanting to run code while making sure it runs serially -- has this class sit in an empty Run call most of the time). I'm not aware of us having a use for such a thing right now, but one of my tests in another patch turned into something similar by accident. Another problem with the current approach is that, in a distributed/dynamic setup (multiple things using the main loop without a clear coordinator), one can never be sure whether unregistering a specific event will terminate the loop (it depends on whether there are other listeners). We had this problem in lldb-platform.cpp, where we had to add an additional layer of synchronization to avoid premature termination. We can remove this if we can rely on the loop terminating only when we tell it to.
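As a rough sketch of the usage pattern this enables (the function name and callback bodies here are invented for illustration; the MainLoop, MainLoopBase, AddPendingCallback, RequestTermination and Run calls are the ones exercised by the updated unit test below):

    #include "lldb/Host/MainLoop.h"
    #include "lldb/Utility/Status.h"
    #include <thread>

    using namespace lldb_private;

    // With this change, a loop with no read objects registered idles in Run()
    // and executes callbacks handed to it from other threads, until one of
    // them explicitly requests termination.
    static void ServeSerializedWork() {
      MainLoop loop;
      std::thread producer([&loop] {
        loop.AddPendingCallback([](MainLoopBase &loop) {
          // ... work that must run serialized on the loop thread ...
        });
        // Run() no longer returns just because nothing is registered; it has
        // to be told to stop.
        loop.AddPendingCallback(
            [](MainLoopBase &loop) { loop.RequestTermination(); });
      });
      Status error = loop.Run(); // Blocks until RequestTermination().
      producer.join();
    }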
--- lldb/source/Host/posix/MainLoopPosix.cpp | 5 +--- lldb/source/Host/windows/MainLoopWindows.cpp | 4 +-- lldb/tools/lldb-server/lldb-platform.cpp | 27 ++++++++------------ lldb/unittests/Host/MainLoopTest.cpp | 18 ++++++------- 4 files changed, 21 insertions(+), 33 deletions(-) diff --git a/lldb/source/Host/posix/MainLoopPosix.cpp b/lldb/source/Host/posix/MainLoopPosix.cpp index 816581e70294..6f8eaa55cfdf 100644 --- a/lldb/source/Host/posix/MainLoopPosix.cpp +++ b/lldb/source/Host/posix/MainLoopPosix.cpp @@ -365,10 +365,7 @@ Status MainLoopPosix::Run() { Status error; RunImpl impl(*this); - // run until termination or until we run out of things to listen to - // (m_read_fds will always contain m_trigger_pipe fd, so check for > 1) - while (!m_terminate_request && - (m_read_fds.size() > 1 || !m_signals.empty())) { + while (!m_terminate_request) { error = impl.Poll(); if (error.Fail()) return error; diff --git a/lldb/source/Host/windows/MainLoopWindows.cpp b/lldb/source/Host/windows/MainLoopWindows.cpp index 88d929535ab6..c9aa6d339d8f 100644 --- a/lldb/source/Host/windows/MainLoopWindows.cpp +++ b/lldb/source/Host/windows/MainLoopWindows.cpp @@ -116,9 +116,7 @@ Status MainLoopWindows::Run() { Status error; - // run until termination or until we run out of things to listen to - while (!m_terminate_request && !m_read_fds.empty()) { - + while (!m_terminate_request) { llvm::Expected<size_t> signaled_event = Poll(); if (!signaled_event) return Status::FromError(signaled_event.takeError()); diff --git a/lldb/tools/lldb-server/lldb-platform.cpp b/lldb/tools/lldb-server/lldb-platform.cpp index 2ef780578d0a..735a558810da 100644 --- a/lldb/tools/lldb-server/lldb-platform.cpp +++ b/lldb/tools/lldb-server/lldb-platform.cpp @@ -260,8 +260,7 @@ static void client_handle(GDBRemoteCommunicationServerPlatform &platform, static Status spawn_process(const char *progname, const Socket *conn_socket, uint16_t gdb_port, const lldb_private::Args &args, const std::string &log_file, - const StringRef log_channels, MainLoop &main_loop, - std::promise<void> &child_exited) { + const StringRef log_channels, MainLoop &main_loop) { Status error; SharedSocket shared_socket(conn_socket, error); if (error.Fail()) @@ -301,12 +300,10 @@ static Status spawn_process(const char *progname, const Socket *conn_socket, if (g_server) launch_info.SetMonitorProcessCallback([](lldb::pid_t, int, int) {}); else - launch_info.SetMonitorProcessCallback( - [&child_exited, &main_loop](lldb::pid_t, int, int) { - main_loop.AddPendingCallback( - [](MainLoopBase &loop) { loop.RequestTermination(); }); - child_exited.set_value(); - }); + launch_info.SetMonitorProcessCallback([&main_loop](lldb::pid_t, int, int) { + main_loop.AddPendingCallback( + [](MainLoopBase &loop) { loop.RequestTermination(); }); + }); // Copy the current environment.
launch_info.GetEnvironment() = Host::GetEnvironment(); @@ -550,27 +547,24 @@ int main_platform(int argc, char *argv[]) { return socket_error; } - std::promise<void> child_exited; MainLoop main_loop; { llvm::Expected<std::vector<MainLoopBase::ReadHandleUP>> platform_handles = platform_sock->Accept( main_loop, [progname, gdbserver_port, &inferior_arguments, log_file, - log_channels, &main_loop, &child_exited, + log_channels, &main_loop, &platform_handles](std::unique_ptr<Socket> sock_up) { printf("Connection established.\n"); - Status error = spawn_process( - progname, sock_up.get(), gdbserver_port, inferior_arguments, - log_file, log_channels, main_loop, child_exited); + Status error = spawn_process(progname, sock_up.get(), + gdbserver_port, inferior_arguments, + log_file, log_channels, main_loop); if (error.Fail()) { Log *log = GetLog(LLDBLog::Platform); LLDB_LOGF(log, "spawn_process failed: %s", error.AsCString()); WithColor::error() << "spawn_process failed: " << error.AsCString() << "\n"; - if (!g_server) { + if (!g_server) main_loop.RequestTermination(); - child_exited.set_value(); - } } if (!g_server) platform_handles->clear(); @@ -592,7 +586,6 @@ int main_platform(int argc, char *argv[]) { main_loop.Run(); } - child_exited.get_future().get(); fprintf(stderr, "lldb-server exiting...\n"); diff --git a/lldb/unittests/Host/MainLoopTest.cpp b/lldb/unittests/Host/MainLoopTest.cpp index b8417c9f00aa..4688d4fed475 100644 --- a/lldb/unittests/Host/MainLoopTest.cpp +++ b/lldb/unittests/Host/MainLoopTest.cpp @@ -194,9 +194,6 @@ TEST_F(MainLoopTest, PendingCallbackTrigger) { add_callback2.set_value(); }); Status error; - auto socket_handle = loop.RegisterReadObject( - socketpair[1], [](MainLoopBase &) {}, error); - ASSERT_TRUE(socket_handle); ASSERT_THAT_ERROR(error.ToError(), llvm::Succeeded()); bool callback2_called = false; std::thread callback2_adder([&]() { @@ -212,15 +209,18 @@ TEST_F(MainLoopTest, PendingCallbackTrigger) { ASSERT_TRUE(callback2_called); } -// Regression test for assertion failure if a lot of callbacks end up -// being queued after loop exits. -TEST_F(MainLoopTest, PendingCallbackAfterLoopExited) { +TEST_F(MainLoopTest, ManyPendingCallbacks) { MainLoop loop; Status error; - ASSERT_TRUE(loop.Run().Success()); - // Try to fill the pipe buffer in. + // Try to fill up the pipe buffer and make sure bad things don't happen. This + // is a regression test for the case where writing to the interrupt pipe + // caused a deadlock when the pipe filled up (either because the main loop was + // not running, because it was slow, or because it was busy/blocked doing + // something else). for (int i = 0; i < 65536; ++i) - loop.AddPendingCallback([&](MainLoopBase &loop) {}); + loop.AddPendingCallback( + [&](MainLoopBase &loop) { loop.RequestTermination(); }); + ASSERT_TRUE(loop.Run().Success()); } #ifdef LLVM_ON_UNIX -- GitLab From 834d001e10912c815fa7af14422f60c28162f8d8 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Valentin=20Clement=20=28=E3=83=90=E3=83=AC=E3=83=B3?= =?UTF-8?q?=E3=82=BF=E3=82=A4=E3=83=B3=20=E3=82=AF=E3=83=AC=E3=83=A1?= =?UTF-8?q?=E3=83=B3=29?= Date: Thu, 17 Oct 2024 08:30:13 -0700 Subject: [PATCH 273/329] [flang][cuda] Relax the verifier for cuf.register_kernel op (#112585) Relax the verifier since the `gpu.func` might be converted to `llvm.func` before `cuf.register_kernel` is converted.
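For illustration, a hand-written MLIR sketch (not part of the patch) of the kind of input the relaxed verifier is intended to accept, assuming the lowered function keeps the gpu.kernel unit attribute (the name returned by mlir::gpu::GPUDialect::getKernelFuncAttrName()); compare the negative test added below, which lacks that attribute:

    module attributes {gpu.container_module} {
      gpu.module @cuda_device_mod {
        // Already lowered from gpu.func, but still marked as a kernel.
        llvm.func @_QPsub_device1() attributes {gpu.kernel} {
          llvm.return
        }
      }
      llvm.func internal @__cudaFortranConstructor() {
        cuf.register_kernel @cuda_device_mod::@_QPsub_device1
        llvm.return
      }
    }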
--- flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp | 27 ++++++++++++++-------- flang/test/Fir/cuf-invalid.fir | 15 ++++++++++++ 2 files changed, 33 insertions(+), 9 deletions(-) diff --git a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp index 9e3bbd1f9cbe..0b03e070a007 100644 --- a/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp +++ b/flang/lib/Optimizer/Dialect/CUF/CUFOps.cpp @@ -16,6 +16,7 @@ #include "flang/Optimizer/Dialect/FIRAttr.h" #include "flang/Optimizer/Dialect/FIRType.h" #include "mlir/Dialect/GPU/IR/GPUDialect.h" +#include "mlir/Dialect/LLVMIR/LLVMDialect.h" #include "mlir/IR/Attributes.h" #include "mlir/IR/BuiltinAttributes.h" #include "mlir/IR/BuiltinOps.h" @@ -276,18 +277,26 @@ mlir::LogicalResult cuf::RegisterKernelOp::verify() { mlir::SymbolTable symTab(mod); auto gpuMod = symTab.lookup<mlir::gpu::GPUModuleOp>(getKernelModuleName()); - if (!gpuMod) + if (!gpuMod) { + // If already a gpu.binary then stop the check here. + if (symTab.lookup<mlir::gpu::BinaryOp>(getKernelModuleName())) + return mlir::success(); return emitOpError("gpu module not found"); + } mlir::SymbolTable gpuSymTab(gpuMod); - auto func = gpuSymTab.lookup<mlir::gpu::GPUFuncOp>(getKernelName()); - if (!func) - return emitOpError("device function not found"); - - if (!func.isKernel()) - return emitOpError("only kernel gpu.func can be registered"); - - return mlir::success(); + if (auto func = gpuSymTab.lookup<mlir::gpu::GPUFuncOp>(getKernelName())) { + if (!func.isKernel()) + return emitOpError("only kernel gpu.func can be registered"); + return mlir::success(); + } else if (auto func = + gpuSymTab.lookup<mlir::LLVM::LLVMFuncOp>(getKernelName())) { + if (!func->getAttrOfType<mlir::UnitAttr>( + mlir::gpu::GPUDialect::getKernelFuncAttrName())) + return emitOpError("only gpu.kernel llvm.func can be registered"); + return mlir::success(); + } + return emitOpError("device function not found"); } // Tablegen operators diff --git a/flang/test/Fir/cuf-invalid.fir b/flang/test/Fir/cuf-invalid.fir index a5747b8ee4a3..8a1eb4857683 100644 --- a/flang/test/Fir/cuf-invalid.fir +++ b/flang/test/Fir/cuf-invalid.fir @@ -175,3 +175,18 @@ module attributes {gpu.container_module} { llvm.return } } + +// ----- + +module attributes {gpu.container_module} { + gpu.module @cuda_device_mod { + llvm.func @_QPsub_device1() { + llvm.return + } + } + llvm.func internal @__cudaFortranConstructor() { + // expected-error@+1{{'cuf.register_kernel' op only gpu.kernel llvm.func can be registered}} + cuf.register_kernel @cuda_device_mod::@_QPsub_device1 + llvm.return + } +} -- GitLab From ab208de34efbde4fea03732eaa353a701e72f626 Mon Sep 17 00:00:00 2001 From: David Spickett Date: Thu, 17 Oct 2024 16:30:28 +0100 Subject: [PATCH 274/329] [flang][docs] Update description of how to contribute (#112369) It's my understanding that all pre-commit code review takes place on GitHub Pull Requests and that post-commit review is done either on the closed PR or the commit on GitHub. --- flang/docs/GettingInvolved.md | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/flang/docs/GettingInvolved.md b/flang/docs/GettingInvolved.md index a8bd93517709..e2220f369466 100644 --- a/flang/docs/GettingInvolved.md +++ b/flang/docs/GettingInvolved.md @@ -17,6 +17,11 @@ The Flang Project welcomes contributions of all kinds. Please feel free to join the mailing list or the slack channel for discussions related to development of Flang. To understand the status of various developments in Flang please join the respective call.
+## Contributing + +Contributions to Flang are done using GitHub Pull Requests and follow the +[LLVM contribution process](https://llvm.org/docs/Contributing.html). + ## Forum and Mailing Lists [Forum](https://discourse.llvm.org/c/subprojects/flang) @@ -27,8 +32,7 @@ To understand the status of various developments in Flang please join the respec [Commits Archive (flang-commits)](http://lists.llvm.org/pipermail/flang-commits) This list contains all commit messages that are made when Flang developers - commit code changes to the repository. It also serves as a forum for - patch review (i.e. send patches here). It is useful for those who want to + commit code changes to the repository. It is useful for those who want to stay on the bleeding edge of Flang development. This list is high volume. -- GitLab From c85611e8583e6392d56075ebdfa60893b6284813 Mon Sep 17 00:00:00 2001 From: goldsteinn <35538541+goldsteinn@users.noreply.github.com> Date: Thu, 17 Oct 2024 11:32:55 -0400 Subject: [PATCH 275/329] [SimplifyLibCall][Attribute] Fix bug where we may keep `range` attr with incompatible type (#112649) In a variety of places we change the bitwidth of a parameter but don't update the attributes. The issue in this case is from the `range` attribute when inlining `__memset_chk`. `optimizeMemSetChk` will replace an `i32` with an `i8`, and if the `i32` had a `range` attr associated, it will cause an error. Fixes #112633 --- llvm/include/llvm/IR/Argument.h | 2 ++ llvm/include/llvm/IR/Attributes.h | 16 +++++++++++----- llvm/include/llvm/IR/InstrTypes.h | 16 ++++++++++++---- llvm/lib/Bitcode/Reader/BitcodeReader.cpp | 5 +++-- llvm/lib/IR/Attributes.cpp | 7 ++++++- llvm/lib/IR/AutoUpgrade.cpp | 6 ++++-- llvm/lib/IR/Function.cpp | 4 ++++ llvm/lib/IR/Verifier.cpp | 2 +- llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp | 3 ++- .../Transforms/IPO/DeadArgumentElimination.cpp | 8 +++++--- .../Transforms/InstCombine/InstCombineCalls.cpp | 11 +++++++---- .../Instrumentation/DataFlowSanitizer.cpp | 4 ++-- llvm/lib/Transforms/Utils/CallPromotionUtils.cpp | 6 ++++-- llvm/lib/Transforms/Utils/CloneFunction.cpp | 4 ++-- llvm/lib/Transforms/Utils/InlineFunction.cpp | 4 ++-- llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp | 8 +++++++- .../Transforms/InstCombine/simplify-libcalls.ll | 12 ++++++++++++ llvm/test/Verifier/range-attr.ll | 4 ++-- 18 files changed, 88 insertions(+), 34 deletions(-) diff --git a/llvm/include/llvm/IR/Argument.h b/llvm/include/llvm/IR/Argument.h index 0ffcb05519d4..5be58d7eca06 100644 --- a/llvm/include/llvm/IR/Argument.h +++ b/llvm/include/llvm/IR/Argument.h @@ -182,6 +182,8 @@ public: Attribute getAttribute(Attribute::AttrKind Kind) const; + AttributeSet getAttributes() const; + /// Method for support type inquiry through isa, cast, and dyn_cast. static bool classof(const Value *V) { return V->getValueID() == ArgumentVal; diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index 57db52e4879b..feeb3a9ddba9 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -1288,11 +1288,17 @@ enum AttributeSafetyKind : uint8_t { /// follows the same type rules as FPMathOperator. bool isNoFPClassCompatibleType(Type *Ty); -/// Which attributes cannot be applied to a type. The argument \p ASK indicates, -/// if only attributes that are known to be safely droppable are contained in -/// the mask; only attributes that might be unsafe to drop (e.g., ABI-related -/// attributes) are in the mask; or both.
-AttributeMask typeIncompatible(Type *Ty, AttributeSafetyKind ASK = ASK_ALL); +/// Which attributes cannot be applied to a type. The argument \p AS +/// is used as a hint for the attributes whose compatibility is being +/// checked against \p Ty. This does not mean the return will be a +/// subset of \p AS, just that attributes that have specific dynamic +/// type compatibilities (i.e `range`) will be checked against what is +/// contained in \p AS. The argument \p ASK indicates, if only +/// attributes that are known to be safely droppable are contained in +/// the mask; only attributes that might be unsafe to drop (e.g., +/// ABI-related attributes) are in the mask; or both. +AttributeMask typeIncompatible(Type *Ty, AttributeSet AS, + AttributeSafetyKind ASK = ASK_ALL); /// Get param/return attributes which imply immediate undefined behavior if an /// invalid value is passed. For example, this includes noundef (where undef diff --git a/llvm/include/llvm/IR/InstrTypes.h b/llvm/include/llvm/IR/InstrTypes.h index 86d88da3d946..99f72792ce40 100644 --- a/llvm/include/llvm/IR/InstrTypes.h +++ b/llvm/include/llvm/IR/InstrTypes.h @@ -1453,14 +1453,22 @@ public: /// looking through to the attributes on the called function when necessary). ///@{ - /// Return the parameter attributes for this call. - /// + /// Return the attributes for this call. AttributeList getAttributes() const { return Attrs; } - /// Set the parameter attributes for this call. - /// + /// Set the attributes for this call. void setAttributes(AttributeList A) { Attrs = A; } + /// Return the return attributes for this call. + AttributeSet getRetAttributes() const { + return getAttributes().getRetAttrs(); + } + + /// Return the param attributes for this call. + AttributeSet getParamAttributes(unsigned ArgNo) const { + return getAttributes().getParamAttrs(ArgNo); + } + /// Try to intersect the attributes from 'this' CallBase and the /// 'Other' CallBase. Sets the intersected attributes to 'this' and /// return true if successful. Doesn't modify 'this' and returns diff --git a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp index 182c5808f8ca..5a6fb5064b31 100644 --- a/llvm/lib/Bitcode/Reader/BitcodeReader.cpp +++ b/llvm/lib/Bitcode/Reader/BitcodeReader.cpp @@ -7040,11 +7040,12 @@ Error BitcodeReader::materialize(GlobalValue *GV) { // Remove incompatible attributes on function calls. if (auto *CI = dyn_cast<CallBase>(&I)) { CI->removeRetAttrs(AttributeFuncs::typeIncompatible( - CI->getFunctionType()->getReturnType())); + CI->getFunctionType()->getReturnType(), CI->getRetAttributes())); for (unsigned ArgNo = 0; ArgNo < CI->arg_size(); ++ArgNo) CI->removeParamAttrs(ArgNo, AttributeFuncs::typeIncompatible( - CI->getArgOperand(ArgNo)->getType())); + CI->getArgOperand(ArgNo)->getType(), + CI->getParamAttributes(ArgNo))); } } diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index c2fba49692c7..223c917766a4 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -2300,7 +2300,7 @@ bool AttributeFuncs::isNoFPClassCompatibleType(Type *Ty) { } /// Which attributes cannot be applied to a type. -AttributeMask AttributeFuncs::typeIncompatible(Type *Ty, +AttributeMask AttributeFuncs::typeIncompatible(Type *Ty, AttributeSet AS, AttributeSafetyKind ASK) { AttributeMask Incompatible; @@ -2316,6 +2316,11 @@ AttributeMask AttributeFuncs::typeIncompatible(Type *Ty, // Attributes that only apply to integers or vector of integers.
if (ASK & ASK_SAFE_TO_DROP) Incompatible.addAttribute(Attribute::Range); + } else { + Attribute RangeAttr = AS.getAttribute(Attribute::Range); + if (RangeAttr.isValid() && + RangeAttr.getRange().getBitWidth() != Ty->getScalarSizeInBits()) + Incompatible.addAttribute(Attribute::Range); } if (!Ty->isPointerTy()) { diff --git a/llvm/lib/IR/AutoUpgrade.cpp b/llvm/lib/IR/AutoUpgrade.cpp index 3aceb5227bb3..bb03c9290e4c 100644 --- a/llvm/lib/IR/AutoUpgrade.cpp +++ b/llvm/lib/IR/AutoUpgrade.cpp @@ -5378,9 +5378,11 @@ void llvm::UpgradeFunctionAttributes(Function &F) { } // Remove all incompatibile attributes from function. - F.removeRetAttrs(AttributeFuncs::typeIncompatible(F.getReturnType())); + F.removeRetAttrs(AttributeFuncs::typeIncompatible( + F.getReturnType(), F.getAttributes().getRetAttrs())); for (auto &Arg : F.args()) - Arg.removeAttrs(AttributeFuncs::typeIncompatible(Arg.getType())); + Arg.removeAttrs( + AttributeFuncs::typeIncompatible(Arg.getType(), Arg.getAttributes())); // Older versions of LLVM treated an "implicit-section-name" attribute // similarly to directly setting the section on a Function. diff --git a/llvm/lib/IR/Function.cpp b/llvm/lib/IR/Function.cpp index 09b90713b9c7..889295956dbf 100644 --- a/llvm/lib/IR/Function.cpp +++ b/llvm/lib/IR/Function.cpp @@ -359,6 +359,10 @@ Attribute Argument::getAttribute(Attribute::AttrKind Kind) const { return getParent()->getParamAttribute(getArgNo(), Kind); } +AttributeSet Argument::getAttributes() const { + return getParent()->getAttributes().getParamAttrs(getArgNo()); +} + //===----------------------------------------------------------------------===// // Helper Methods in Function //===----------------------------------------------------------------------===// diff --git a/llvm/lib/IR/Verifier.cpp b/llvm/lib/IR/Verifier.cpp index 0412b93798b9..f34fe7594c86 100644 --- a/llvm/lib/IR/Verifier.cpp +++ b/llvm/lib/IR/Verifier.cpp @@ -2012,7 +2012,7 @@ void Verifier::verifyParameterAttrs(AttributeSet Attrs, Type *Ty, Attrs.hasAttribute(Attribute::ReadOnly)), "Attributes writable and readonly are incompatible!", V); - AttributeMask IncompatibleAttrs = AttributeFuncs::typeIncompatible(Ty); + AttributeMask IncompatibleAttrs = AttributeFuncs::typeIncompatible(Ty, Attrs); for (Attribute Attr : Attrs) { if (!Attr.isStringAttribute() && IncompatibleAttrs.contains(Attr.getKindAsEnum())) { diff --git a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp index eb553ae4eb80..26d8ce77d9a9 100644 --- a/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp +++ b/llvm/lib/Target/AMDGPU/AMDGPULibCalls.cpp @@ -788,7 +788,8 @@ bool AMDGPULibCalls::fold(CallInst *CI) { B.CreateFPToSI(FPOp->getOperand(1), PownType->getParamType(1)); // Have to drop any nofpclass attributes on the original call site. Call->removeParamAttrs( - 1, AttributeFuncs::typeIncompatible(CastedArg->getType())); + 1, AttributeFuncs::typeIncompatible(CastedArg->getType(), + Call->getParamAttributes(1))); Call->setCalledFunction(PownFunc); Call->setArgOperand(1, CastedArg); return fold_pow(FPOp, B, PownInfo) || true; diff --git a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp index d1548592b1ce..ed93b4491c50 100644 --- a/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp +++ b/llvm/lib/Transforms/IPO/DeadArgumentElimination.cpp @@ -857,9 +857,10 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { // here. 
Currently, this should not be possible, but special handling might be // required when new return value attributes are added. if (NRetTy->isVoidTy()) - RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy)); + RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy, PAL.getRetAttrs())); else - assert(!RAttrs.overlaps(AttributeFuncs::typeIncompatible(NRetTy)) && + assert(!RAttrs.overlaps( + AttributeFuncs::typeIncompatible(NRetTy, PAL.getRetAttrs())) && "Return attributes no longer compatible?"); AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); @@ -903,7 +904,8 @@ bool DeadArgumentEliminationPass::removeDeadStuffFromFunction(Function *F) { // Adjust the call return attributes in case the function was changed to // return void. AttrBuilder RAttrs(F->getContext(), CallPAL.getRetAttrs()); - RAttrs.remove(AttributeFuncs::typeIncompatible(NRetTy)); + RAttrs.remove( + AttributeFuncs::typeIncompatible(NRetTy, CallPAL.getRetAttrs())); AttributeSet RetAttrs = AttributeSet::get(F->getContext(), RAttrs); // Declare these outside of the loops, so we can reuse them for the second diff --git a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp index 37841e91821c..07b9405b941d 100644 --- a/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp +++ b/llvm/lib/Transforms/InstCombine/InstCombineCalls.cpp @@ -4151,7 +4151,8 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { if (!CallerPAL.isEmpty() && !Caller->use_empty()) { AttrBuilder RAttrs(FT->getContext(), CallerPAL.getRetAttrs()); - if (RAttrs.overlaps(AttributeFuncs::typeIncompatible(NewRetTy))) + if (RAttrs.overlaps(AttributeFuncs::typeIncompatible( + NewRetTy, CallerPAL.getRetAttrs()))) return false; // Attribute not compatible with transformed value. } @@ -4197,7 +4198,8 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { // Check if there are any incompatible attributes we cannot drop safely. if (AttrBuilder(FT->getContext(), CallerPAL.getParamAttrs(i)) .overlaps(AttributeFuncs::typeIncompatible( - ParamTy, AttributeFuncs::ASK_UNSAFE_TO_DROP))) + ParamTy, CallerPAL.getParamAttrs(i), + AttributeFuncs::ASK_UNSAFE_TO_DROP))) return false; // Attribute not compatible with transformed value. if (Call.isInAllocaArgument(i) || @@ -4235,7 +4237,8 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { // If the return value is not being used, the type may not be compatible // with the existing attributes. Wipe out any problematic attributes. - RAttrs.remove(AttributeFuncs::typeIncompatible(NewRetTy)); + RAttrs.remove( + AttributeFuncs::typeIncompatible(NewRetTy, CallerPAL.getRetAttrs())); LLVMContext &Ctx = Call.getContext(); AI = Call.arg_begin(); @@ -4250,7 +4253,7 @@ bool InstCombinerImpl::transformConstExprCastCall(CallBase &Call) { // Add any parameter attributes except the ones incompatible with the new // type. Note that we made sure all incompatible ones are safe to drop. 
AttributeMask IncompatibleAttrs = AttributeFuncs::typeIncompatible( - ParamTy, AttributeFuncs::ASK_SAFE_TO_DROP); + ParamTy, CallerPAL.getParamAttrs(i), AttributeFuncs::ASK_SAFE_TO_DROP); ArgAttrs.push_back( CallerPAL.getParamAttrs(i).removeAttributes(Ctx, IncompatibleAttrs)); } diff --git a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp index 577647cac3f5..e226727e64d3 100644 --- a/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp +++ b/llvm/lib/Transforms/Instrumentation/DataFlowSanitizer.cpp @@ -1305,8 +1305,8 @@ DataFlowSanitizer::buildWrapperFunction(Function *F, StringRef NewFName, Function *NewF = Function::Create(NewFT, NewFLink, F->getAddressSpace(), NewFName, F->getParent()); NewF->copyAttributesFrom(F); - NewF->removeRetAttrs( - AttributeFuncs::typeIncompatible(NewFT->getReturnType())); + NewF->removeRetAttrs(AttributeFuncs::typeIncompatible( + NewFT->getReturnType(), NewF->getAttributes().getRetAttrs())); BasicBlock *BB = BasicBlock::Create(*Ctx, "entry", NewF); if (F->isVarArg()) { diff --git a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp index 4ead5cdcf29c..17cba2e642a1 100644 --- a/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp +++ b/llvm/lib/Transforms/Utils/CallPromotionUtils.cpp @@ -529,7 +529,8 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee, // Remove any incompatible attributes for the argument. AttrBuilder ArgAttrs(Ctx, CallerPAL.getParamAttrs(ArgNo)); - ArgAttrs.remove(AttributeFuncs::typeIncompatible(FormalTy)); + ArgAttrs.remove(AttributeFuncs::typeIncompatible( + FormalTy, CallerPAL.getParamAttrs(ArgNo))); // We may have a different byval/inalloca type. if (ArgAttrs.getByValType()) @@ -549,7 +550,8 @@ CallBase &llvm::promoteCall(CallBase &CB, Function *Callee, AttrBuilder RAttrs(Ctx, CallerPAL.getRetAttrs()); if (!CallSiteRetTy->isVoidTy() && CallSiteRetTy != CalleeRetTy) { createRetBitCast(CB, CallSiteRetTy, RetBitCast); - RAttrs.remove(AttributeFuncs::typeIncompatible(CalleeRetTy)); + RAttrs.remove( + AttributeFuncs::typeIncompatible(CalleeRetTy, CallerPAL.getRetAttrs())); AttributeChanged = true; } diff --git a/llvm/lib/Transforms/Utils/CloneFunction.cpp b/llvm/lib/Transforms/Utils/CloneFunction.cpp index c6ba85bd9e57..5dc82a8dfb2d 100644 --- a/llvm/lib/Transforms/Utils/CloneFunction.cpp +++ b/llvm/lib/Transforms/Utils/CloneFunction.cpp @@ -819,9 +819,9 @@ void llvm::CloneAndPruneIntoFromInst(Function *NewFunc, const Function *OldFunc, // Drop all incompatible return attributes that cannot be applied to NewFunc // during cloning, so as to allow instruction simplification to reason on the // old state of the function. The original attributes are restored later. 
- AttributeMask IncompatibleAttrs = - AttributeFuncs::typeIncompatible(OldFunc->getReturnType()); AttributeList Attrs = NewFunc->getAttributes(); + AttributeMask IncompatibleAttrs = AttributeFuncs::typeIncompatible( + OldFunc->getReturnType(), Attrs.getRetAttrs()); NewFunc->removeRetAttrs(IncompatibleAttrs); // As phi-nodes have been now remapped, allow incremental simplification of diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index 13eb588e46de..a0a93dc0dab5 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -3057,8 +3057,8 @@ llvm::InlineResult llvm::InlineFunction(CallBase &CB, InlineFunctionInfo &IFI, else Builder.CreateRet(NewDeoptCall); // Since the ret type is changed, remove the incompatible attributes. - NewDeoptCall->removeRetAttrs( - AttributeFuncs::typeIncompatible(NewDeoptCall->getType())); + NewDeoptCall->removeRetAttrs(AttributeFuncs::typeIncompatible( + NewDeoptCall->getType(), NewDeoptCall->getRetAttributes())); } // Leave behind the normal returns so we can merge control flow. diff --git a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp index cb4ef87de1c5..79e91ad097cf 100644 --- a/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp +++ b/llvm/lib/Transforms/Utils/SimplifyLibCalls.cpp @@ -342,7 +342,13 @@ static Value *copyFlags(const CallInst &Old, Value *New) { static Value *mergeAttributesAndFlags(CallInst *NewCI, const CallInst &Old) { NewCI->setAttributes(AttributeList::get( NewCI->getContext(), {NewCI->getAttributes(), Old.getAttributes()})); - NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible(NewCI->getType())); + NewCI->removeRetAttrs(AttributeFuncs::typeIncompatible( + NewCI->getType(), NewCI->getRetAttributes())); + for (unsigned I = 0; I < NewCI->arg_size(); ++I) + NewCI->removeParamAttrs( + I, AttributeFuncs::typeIncompatible(NewCI->getArgOperand(I)->getType(), + NewCI->getParamAttributes(I))); + return copyFlags(Old, NewCI); } diff --git a/llvm/test/Transforms/InstCombine/simplify-libcalls.ll b/llvm/test/Transforms/InstCombine/simplify-libcalls.ll index bb2728a103ec..c4ae7e7e16bc 100644 --- a/llvm/test/Transforms/InstCombine/simplify-libcalls.ll +++ b/llvm/test/Transforms/InstCombine/simplify-libcalls.ll @@ -342,5 +342,17 @@ define signext i32 @emit_stpncpy() { ret i32 0 } +define void @simplify_memset_chk_pr112633(ptr %p, i32 %conv) { +; CHECK-LABEL: @simplify_memset_chk_pr112633( +; CHECK-NEXT: [[CALL_I:%.*]] = tail call ptr @__memset_chk(ptr [[P:%.*]], i32 range(i32 0, 123) [[CONV:%.*]], i64 1, i64 1) +; CHECK-NEXT: ret void +; + %call.i = tail call ptr @__memset_chk(ptr %p, i32 range(i32 0, 123) %conv, i64 1, i64 1) + ret void +} + +declare ptr @__memset_chk(ptr, i32, i64, i64) + + attributes #0 = { nobuiltin } attributes #1 = { builtin } diff --git a/llvm/test/Verifier/range-attr.ll b/llvm/test/Verifier/range-attr.ll index f985ab696eac..91412369b0b2 100644 --- a/llvm/test/Verifier/range-attr.ll +++ b/llvm/test/Verifier/range-attr.ll @@ -1,12 +1,12 @@ ; RUN: not llvm-as %s -o /dev/null 2>&1 | FileCheck %s -; CHECK: Range bit width must match type bit width! +; CHECK: Attribute 'range(i8 1, 0)' applied to incompatible type! ; CHECK-NEXT: ptr @bit_widths_do_not_match define void @bit_widths_do_not_match(i32 range(i8 1, 0) %a) { ret void } -; CHECK: Range bit width must match type bit width! +; CHECK: Attribute 'range(i8 1, 0)' applied to incompatible type! 
; CHECK-NEXT: ptr @bit_widths_do_not_match_vector define void @bit_widths_do_not_match_vector(<4 x i32> range(i8 1, 0) %a) { ret void -- GitLab From 76f377618532fe486d1fff1250598a73c55f4310 Mon Sep 17 00:00:00 2001 From: David Sherwood Date: Thu, 17 Oct 2024 16:50:59 +0100 Subject: [PATCH 276/329] [NFC][LoopVectorize] Restructure simple early exit tests (#112721) The previous simple_early_exit.ll was growing too large and difficult to manage. Instead I've decided to refactor the tests by splitting out into notional groups: 1. single_early_exit.ll: loops with a single uncountable exit that do not have live-outs from the loop. 2. single_early_exit_live_outs.ll: loops with a single uncountable exit with live-outs. 3. multi_early_exit.ll: loops with multiple early exits, i.e. a mixture of countable and uncountable exits, but with no live-outs from the loop. 4. multi_early_exit_live_outs.ll: as above, but with live-outs. 5. single_early_exit_unsafe_ptrs.ll: loops with a single uncountable exit, but with pointers that are not unconditionally dereferenceable. 6. unsupported_early_exit.ll: loops with uncountable exits that we cannot yet vectorise. 7. early_exit_legality.ll: tests the debug output from LoopVectorizationLegality to make sure we handle different scenarios correctly. Only the last test now requires asserts. Over time some of these tests should start vectorising as more support is added. I also tried to rename the multi early exit tests to make it clear what mixture of countable and uncountable exits is present. --- .../LoopVectorize/early_exit_legality.ll | 542 +++++ .../LoopVectorize/multi_early_exit.ll | 122 + .../multi_early_exit_live_outs.ll | 128 + .../LoopVectorize/simple_early_exit.ll | 2118 ----------------- .../LoopVectorize/single_early_exit.ll | 220 ++ .../single_early_exit_live_outs.ll | 997 ++++++++ .../single_early_exit_unsafe_ptrs.ll | 143 ++ .../LoopVectorize/unsupported_early_exit.ll | 494 ++++ 8 files changed, 2646 insertions(+), 2118 deletions(-) create mode 100644 llvm/test/Transforms/LoopVectorize/early_exit_legality.ll create mode 100644 llvm/test/Transforms/LoopVectorize/multi_early_exit.ll create mode 100644 llvm/test/Transforms/LoopVectorize/multi_early_exit_live_outs.ll delete mode 100644 llvm/test/Transforms/LoopVectorize/simple_early_exit.ll create mode 100644 llvm/test/Transforms/LoopVectorize/single_early_exit.ll create mode 100644 llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll create mode 100644 llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll create mode 100644 llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll diff --git a/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll new file mode 100644 index 000000000000..21433477c1d7 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/early_exit_legality.ll @@ -0,0 +1,542 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; REQUIRES: asserts +; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize -disable-output 2>&1 | FileCheck %s + +declare void @init_mem(ptr, i64); + +; == SOME LEGAL EXAMPLES == + +; The form of the induction variables requires SCEV predicates.
+define i32 @diff_exit_block_needs_scev_check(i32 %end) { +; CHECK-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check' +; CHECK: Found an early exit loop with symbolic max backedge taken count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32))) +; CHECK-NEXT: LV: We can vectorize this loop! +; CHECK-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. +entry: + %p1 = alloca [1024 x i32] + %p2 = alloca [1024 x i32] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + %end.clamped = and i32 %end, 1023 + br label %for.body + +for.body: + %ind = phi i8 [ %ind.next, %for.inc ], [ 0, %entry ] + %gep.ind = phi i64 [ %gep.ind.next, %for.inc ], [ 0, %entry ] + %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %gep.ind + %0 = load i32, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %gep.ind + %1 = load i32, ptr %arrayidx2, align 4 + %cmp.early = icmp eq i32 %0, %1 + br i1 %cmp.early, label %found, label %for.inc + +for.inc: + %ind.next = add i8 %ind, 1 + %conv = zext i8 %ind.next to i32 + %gep.ind.next = add i64 %gep.ind, 1 + %cmp = icmp ult i32 %conv, %end.clamped + br i1 %cmp, label %for.body, label %exit + +found: + ret i32 1 + +exit: + ret i32 0 +} + + +define i64 @same_exit_block_pre_inc_use1() { +; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1' +; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 +; CHECK-NEXT: LV: We can vectorize this loop! +; CHECK-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_safe_call() { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_safe_call' +; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 +; CHECK-NEXT: LV: We can vectorize this loop! 
+entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds float, ptr %p1, i64 %index + %ld1 = load float, ptr %arrayidx, align 1 + %sqrt = tail call fast float @llvm.sqrt.f32(float %ld1) + %cmp = fcmp fast ult float %sqrt, 3.0e+00 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_safe_div() { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_safe_div' +; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 +; CHECK-NEXT: LV: We can vectorize this loop! +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %div = udiv i32 %ld1, 20000 + %cmp = icmp eq i32 %div, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(8) %p2) { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit' +; CHECK: LV: Found an early exit loop with symbolic max backedge taken count: 63 +; CHECK-NEXT: LV: We can vectorize this loop! +; CHECK-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %cmp = icmp eq i32 %ld1, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %arrayidx2 = getelementptr inbounds i64, ptr %p2, i64 %index + %ld2 = load i64, ptr %arrayidx2, align 8 + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ %ld2, %loop.inc ] + ret i64 %retval +} + + +; == SOME ILLEGAL EXAMPLES == + + +define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { +; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_allocas' +; CHECK: LV: Not vectorizing: Loop may fault. 
+entry: + %p1 = alloca [42 x i8] + %p2 = alloca [42 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) { +; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_deref_ptrs' +; CHECK: LV: Not vectorizing: Loop may fault. +entry: + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) { +; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_unknown_ptrs' +; CHECK: LV: Not vectorizing: Loop may fault. +entry: + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +; The early exit (i.e. unknown exit-not-taken count) is the latch - we don't +; support this yet. +define i64 @uncountable_exit_on_last_block() { +; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_on_last_block' +; CHECK: LV: Not vectorizing: Early exit is not the latch predecessor. +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %search ], [ 3, %entry ] + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end, label %loop + +loop.end: + %retval = phi i64 [ 64, %loop ], [ %index, %search ] + ret i64 %retval +} + + +; We don't currently support multiple uncountable early exits. 
+define i64 @multiple_uncountable_exits() { +; CHECK-LABEL: LV: Checking a loop in 'multiple_uncountable_exits' +; CHECK: LV: Not vectorizing: Loop has too many uncountable exits. +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %search1 + +search1: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp1 = icmp eq i8 %ld1, %ld2 + br i1 %cmp1, label %loop.end, label %search2 + +search2: + %cmp2 = icmp ult i8 %ld1, 34 + br i1 %cmp2, label %loop.end, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %search1, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %search1 ], [ 100, %search2 ], [ 43, %loop.inc ] + ret i64 %retval +} + + +define i64 @uncountable_exit_infinite_loop() { +; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_infinite_loop' +; CHECK: LV: Not vectorizing: Cannot determine exact exit count for latch block. +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br label %loop + +loop.end: + %retval = phi i64 [ %index, %loop ] + ret i64 %retval +} + + +define i64 @loop_contains_unsafe_call() { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_unsafe_call' +; CHECK: LV: Not vectorizing: Early exit loop contains operations that cannot be speculatively executed. +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %bad_call = call i32 @foo(i32 %ld1) #0 + %cmp = icmp eq i32 %bad_call, 34 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_unsafe_div() { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_unsafe_div' +; CHECK: LV: Not vectorizing: Early exit loop contains operations that cannot be speculatively executed. 
+entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %div = udiv i32 20000, %ld1 + %cmp = icmp eq i32 %div, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_store(ptr %dest) { +; CHECK-LABEL: LV: Checking a loop in 'loop_contains_store' +; CHECK: LV: Not vectorizing: Writes to memory unsupported in early exit loops +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i32, ptr %dest, i64 %index + store i32 %ld1, ptr %arrayidx2, align 4 + %cmp = icmp eq i32 %ld1, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @uncountable_exit_in_conditional_block(ptr %mask) { +; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_in_conditional_block' +; CHECK: LV: Not vectorizing: Early exit is not the latch predecessor. +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx1 = getelementptr inbounds i8, ptr %mask, i64 %index + %ld1 = load i8, ptr %arrayidx1, align 1 + %cmp1 = icmp ne i8 %ld1, 0 + br i1 %cmp1, label %loop.search, label %loop.inc + +loop.search: + %arrayidx2 = getelementptr inbounds i8, ptr %p1, i64 %index + %ld2 = load i8, ptr %arrayidx2, align 1 + %arrayidx3 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld3 = load i8, ptr %arrayidx3, align 1 + %cmp2 = icmp eq i8 %ld2, %ld3 + br i1 %cmp2, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop.search ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_with_reduction() { +; CHECK-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_with_reduction' +; CHECK: LV: Not vectorizing: Found reductions or recurrences in early-exit loop. 
+entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %red = phi i64 [ %red.next, %loop.inc ], [ 0, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %ld2.zext = zext i8 %ld2 to i64 + %red.next = add i64 %red, %ld2.zext + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %final.ind = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + %retval = add i64 %red.next, %final.ind + ret i64 %retval +} + + +define i64 @uncountable_exit_has_multiple_outside_successors() { +; CHECK-LABEL: LV: Checking a loop in 'uncountable_exit_has_multiple_outside_successors' +; CHECK: LV: Not vectorizing: Loop contains an unsupported switch +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + switch i8 %ld1, label %loop.inc [ + i8 2, label %loop.end + i8 3, label %loop.surprise + ] + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.surprise: + ret i64 3 + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +declare i32 @foo(i32) readonly +declare <vscale x 4 x i32> @foo_vec(<vscale x 4 x i32>) + +attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" } diff --git a/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll b/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll new file mode 100644 index 000000000000..94af5b7c7607 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/multi_early_exit.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S < %s -p loop-vectorize | FileCheck %s + +declare void @init_mem(ptr, i64); + +define i64 @one_uncountable_two_countable_same_exit_phi_of_consts() { +; CHECK-LABEL: define i64 @one_uncountable_two_countable_same_exit_phi_of_consts() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add
i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[SEARCH]] ], [ 0, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %cmp1 = icmp ne i64 %index, 64 + br i1 %cmp1, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 128 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ 0, %loop ], [ 1, %search ], [ 0, %loop.inc ] + ret i64 %retval +} + + +define i64 @one_uncountable_two_countable_diff_exit_no_phis() { +; CHECK-LABEL: define i64 @one_uncountable_two_countable_diff_exit_no_phis() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END_EARLY:%.*]], label [[LOOP_INC]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end.early: +; CHECK-NEXT: ret i64 1 +; CHECK: loop.end: +; CHECK-NEXT: ret i64 0 +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %cmp1 = icmp ne i64 %index, 64 + br i1 %cmp1, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end.early, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 128 + br i1 %exitcond, label %loop, label %loop.end + +loop.end.early: + ret i64 1 + +loop.end: + ret i64 0 +} diff --git a/llvm/test/Transforms/LoopVectorize/multi_early_exit_live_outs.ll 
b/llvm/test/Transforms/LoopVectorize/multi_early_exit_live_outs.ll
new file mode 100644
index 000000000000..7759c10032e9
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/multi_early_exit_live_outs.ll
@@ -0,0 +1,128 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S < %s -p loop-vectorize | FileCheck %s
+
+declare void @init_mem(ptr, i64);
+
+; There are multiple exiting blocks - two of them have an exact representation for the
+; exit-not-taken counts and the other is unknown, i.e. the "early exit".
+define i64 @one_uncountable_two_countable_same_exit() {
+; CHECK-LABEL: define i64 @one_uncountable_two_countable_same_exit() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64
+; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]]
+; CHECK: search:
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC]]
+; CHECK: loop.inc:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK: loop.end:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP]] ], [ [[INDEX]], [[SEARCH]] ], [ 128, [[LOOP_INC]] ]
+; CHECK-NEXT: ret i64 [[RETVAL]]
+;
+entry:
+  %p1 = alloca [1024 x i8]
+  %p2 = alloca [1024 x i8]
+  call void @init_mem(ptr %p1, i64 1024)
+  call void @init_mem(ptr %p2, i64 1024)
+  br label %loop
+
+loop:
+  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+  %cmp1 = icmp ne i64 %index, 64
+  br i1 %cmp1, label %search, label %loop.end
+
+search:
+  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+  %ld1 = load i8, ptr %arrayidx, align 1
+  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+  %ld2 = load i8, ptr %arrayidx1, align 1
+  %cmp3 = icmp eq i8 %ld1, %ld2
+  br i1 %cmp3, label %loop.end, label %loop.inc
+
+loop.inc:
+  %index.next = add i64 %index, 1
+  %exitcond = icmp ne i64 %index.next, 128
+  br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+  %retval = phi i64 [ 64, %loop ], [ %index, %search ], [ 128, %loop.inc ]
+  ret i64 %retval
+}
+
+
+define i64 @one_uncountable_two_countable_diff_exit() {
+; CHECK-LABEL: define i64 @one_uncountable_two_countable_diff_exit() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64
+; CHECK-NEXT: br i1 [[CMP1]], label 
[[SEARCH:%.*]], label [[LOOP_END:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END_EARLY:%.*]], label [[LOOP_INC]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end.early: +; CHECK-NEXT: [[RET_EARLY:%.*]] = phi i64 [ [[INDEX]], [[SEARCH]] ] +; CHECK-NEXT: ret i64 [[RET_EARLY]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP]] ], [ 128, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %cmp1 = icmp ne i64 %index, 64 + br i1 %cmp1, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end.early, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 128 + br i1 %exitcond, label %loop, label %loop.end + +loop.end.early: + %ret.early = phi i64 [ %index, %search ] + ret i64 %ret.early + +loop.end: + %retval = phi i64 [ 64, %loop ], [ 128, %loop.inc ] + ret i64 %retval +} diff --git a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll b/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll deleted file mode 100644 index d5e4f4d016c6..000000000000 --- a/llvm/test/Transforms/LoopVectorize/simple_early_exit.ll +++ /dev/null @@ -1,2118 +0,0 @@ -; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 -; REQUIRES: asserts -; RUN: opt -S < %s -p loop-vectorize -debug-only=loop-vectorize 2>%t | FileCheck %s --check-prefixes=CHECK -; RUN: cat %t | FileCheck %s --check-prefix=DEBUG - -declare void @init_mem(ptr, i64); - -define i64 @same_exit_block_pre_inc_use1() { -; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1' -; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 -; DEBUG-NEXT: LV: We can vectorize this loop! -; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. 
-; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LAND_RHS:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] -; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] -; CHECK: loop.end: -; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ] -; CHECK-NEXT: ret i64 [[START_0_LCSSA]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - 
br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds [1024 x i8], ptr %p1, i64 0, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds [1024 x i8], ptr %p2, i64 0, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [40 x i32], align 4 -; CHECK-NEXT: [[P2:%.*]] = alloca [40 x i32], align 4 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LAND_RHS:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] -; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] -; CHECK: loop.end: -; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ] -; CHECK-NEXT: ret i64 [[START_0_LCSSA]] -; -entry: - %p1 = alloca [40 x i32] - %p2 = alloca [40 x i32] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use2() { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use2() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr 
[[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ 67, %loop ], [ %index, %loop.inc ] - ret i64 %retval -} - -define i64 @same_exit_block_pre_inc_use3() { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use3() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[INDEX]], [[LOOP]] ] -; CHECK-NEXT: ret i64 [[INDEX_LCSSA]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - ret i64 %index -} - - -; In this example the early exit block appears in the list of ExitNotTaken -; SCEVs, but is not computable. 
-define i64 @same_exit_block_pre_inc_use4() { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use4() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i64], align 8 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i64], align 8 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i64] - %p2 = alloca [1024 x i64] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i64, ptr %p1, i64 %index - %ld1 = load i64, ptr %arrayidx, align 1 - %cmp3 = icmp ult i64 %index, %ld1 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @same_exit_block_post_inc_use() { -; CHECK-LABEL: define i64 @same_exit_block_post_inc_use() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[INDEX_NEXT]], [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - 
%cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ %index.next, %loop.inc ] - ret i64 %retval -} - -define i64 @same_exit_block_post_inc_use2() { -; CHECK-LABEL: define i64 @same_exit_block_post_inc_use2() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %index.next = add i64 %index, 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index.next, %loop ], [ %index, %loop.inc ] - ret i64 %retval -} - -define i64 @same_exit_block_phi_of_consts() { -; CHECK-LABEL: define i64 @same_exit_block_phi_of_consts() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: 
loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ 0, %loop ], [ 1, %loop.inc ] - ret i64 %retval -} - - -define i64 @diff_exit_block_pre_inc_use1() { -; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use1() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] -; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] -; CHECK-NEXT: ret i64 [[RETVAL1]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL2]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.early.exit - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.early.exit: - %retval1 = phi i64 [ %index, %loop ] - ret i64 %retval1 - -loop.end: - %retval2 = phi i64 [ 67, %loop.inc ] - ret i64 %retval2 -} - -define i64 @diff_exit_block_pre_inc_use2() { -; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use2() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ 
[[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] -; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP]] ] -; CHECK-NEXT: ret i64 [[RETVAL1]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL2]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.early.exit - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.early.exit: - %retval1 = phi i64 [ 67, %loop ] - ret i64 %retval1 - -loop.end: - %retval2 = phi i64 [ %index, %loop.inc ] - ret i64 %retval2 -} - -define i64 @diff_exit_block_pre_inc_use3() { -; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use3() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] -; CHECK: loop.early.exit: -; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] -; CHECK-NEXT: ret i64 [[INDEX_LCSSA]] -; CHECK: loop.end: -; CHECK-NEXT: [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[INDEX_LCSSA1]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - 
%arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.early.exit - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.early.exit: - ret i64 %index - -loop.end: - ret i64 %index -} - - -define i64 @diff_exit_block_phi_of_consts() { -; CHECK-LABEL: define i64 @diff_exit_block_phi_of_consts() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] -; CHECK: loop.early.exit: -; CHECK-NEXT: ret i64 0 -; CHECK: loop.end: -; CHECK-NEXT: ret i64 1 -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.early.exit - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.early.exit: - ret i64 0 - -loop.end: - ret i64 1 -} - - -define i64 @diff_exit_block_post_inc_use1() { -; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use1() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label 
[[LOOP_END:%.*]] -; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] -; CHECK-NEXT: ret i64 [[RETVAL1]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL2]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.early.exit - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.early.exit: - %retval1 = phi i64 [ %index, %loop ] - ret i64 %retval1 - -loop.end: - %retval2 = phi i64 [ %index.next, %loop.inc ] - ret i64 %retval2 -} - - -define i64 @diff_exit_block_post_inc_use2() { -; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use2() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] -; CHECK: loop.early.exit: -; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ] -; CHECK-NEXT: ret i64 [[RETVAL1]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL2]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %index.next = add i64 %index, 1 - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.early.exit - -loop.inc: - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.early.exit: - %retval1 = phi i64 [ %index.next, %loop ] - ret i64 %retval1 - -loop.end: - %retval2 = phi i64 [ %index, %loop.inc ] - ret i64 %retval2 -} - - -; The early exit (i.e. unknown exit-not-taken count) is the latch - we don't -; support this yet. 
-define i64 @early_exit_on_last_block() { -; DEBUG-LABEL: LV: Checking a loop in 'early_exit_on_last_block' -; DEBUG: LV: Not vectorizing: Early exit is not the latch predecessor. -; CHECK-LABEL: define i64 @early_exit_on_last_block() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LAND_RHS:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[SEARCH:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: search: -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]] -; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_END_LOOPEXIT]], label [[LAND_RHS]] -; CHECK: loop.end: -; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ 64, [[LAND_RHS]] ], [ [[INDEX]], [[SEARCH]] ] -; CHECK-NEXT: ret i64 [[START_0_LCSSA]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %search ], [ 3, %entry ] - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %search, label %loop.end - -search: - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.end, label %loop - -loop.end: - %retval = phi i64 [ 64, %loop ], [ %index, %search ] - ret i64 %retval -} - - -; There are multiple exit blocks - two of them have an exact representation for the -; exit-not-taken counts and the other is unknown, i.e. the "early exit". 
-define i64 @multiple_exiting_one_early_same_exit() { -; CHECK-LABEL: define i64 @multiple_exiting_one_early_same_exit() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 -; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] -; CHECK: search: -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP]] ], [ [[INDEX]], [[SEARCH]] ], [ 128, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %cmp1 = icmp ne i64 %index, 64 - br i1 %cmp1, label %search, label %loop.end - -search: - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.end, label %loop.inc - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 128 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ 64, %loop ], [ %index, %search ], [ 128, %loop.inc ] - ret i64 %retval -} - - -define i64 @multiple_exiting_one_early_same_exit_phi_of_consts() { -; CHECK-LABEL: define i64 @multiple_exiting_one_early_same_exit_phi_of_consts() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 -; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] -; CHECK: search: -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END]], label [[LOOP_INC]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; 
CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[SEARCH]] ], [ 0, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %cmp1 = icmp ne i64 %index, 64 - br i1 %cmp1, label %search, label %loop.end - -search: - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.end, label %loop.inc - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 128 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ 0, %loop ], [ 1, %search ], [ 0, %loop.inc ] - ret i64 %retval -} - - -define i64 @multiple_exiting_one_early_diff_exit() { -; CHECK-LABEL: define i64 @multiple_exiting_one_early_diff_exit() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 -; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] -; CHECK: search: -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END_EARLY:%.*]], label [[LOOP_INC]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end.early: -; CHECK-NEXT: [[RET_EARLY:%.*]] = phi i64 [ [[INDEX]], [[SEARCH]] ] -; CHECK-NEXT: ret i64 [[RET_EARLY]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 64, [[LOOP]] ], [ 128, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %cmp1 = icmp ne i64 %index, 64 - br i1 %cmp1, label %search, label %loop.end - -search: - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.end.early, label %loop.inc - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 128 - br i1 %exitcond, label %loop, label %loop.end - -loop.end.early: - %ret.early = phi i64 [ %index, %search ] - ret i64 
%ret.early - -loop.end: - %retval = phi i64 [ 64, %loop ], [ 128, %loop.inc ] - ret i64 %retval -} - -define i64 @multiple_exiting_one_early_diff_exit_no_phis() { -; CHECK-LABEL: define i64 @multiple_exiting_one_early_diff_exit_no_phis() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX]], 64 -; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH:%.*]], label [[LOOP_END:%.*]] -; CHECK: search: -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_END_EARLY:%.*]], label [[LOOP_INC]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 128 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end.early: -; CHECK-NEXT: ret i64 1 -; CHECK: loop.end: -; CHECK-NEXT: ret i64 0 -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %cmp1 = icmp ne i64 %index, 64 - br i1 %cmp1, label %search, label %loop.end - -search: - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.end.early, label %loop.inc - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 128 - br i1 %exitcond, label %loop, label %loop.end - -loop.end.early: - ret i64 1 - -loop.end: - ret i64 0 -} - - -; We don't currently support multiple early exits. -define i64 @multiple_early_exits() { -; DEBUG-LABEL: LV: Checking a loop in 'multiple_early_exits' -; DEBUG: LV: Not vectorizing: Loop has too many uncountable exits. 
-; CHECK-LABEL: define i64 @multiple_early_exits() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LAND_RHS:%.*]] -; CHECK: search1: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]] -; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_END_LOOPEXIT:%.*]], label [[FOR_INC:%.*]] -; CHECK: search2: -; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i8 [[TMP41]], 34 -; CHECK-NEXT: br i1 [[CMP2]], label [[FOR_END_LOOPEXIT]], label [[FOR_INC1]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] -; CHECK: loop.end: -; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 100, [[FOR_INC]] ], [ 43, [[FOR_INC1]] ] -; CHECK-NEXT: ret i64 [[START_0_LCSSA]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %search1 - -search1: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp1 = icmp eq i8 %ld1, %ld2 - br i1 %cmp1, label %loop.end, label %search2 - -search2: - %cmp2 = icmp ult i8 %ld1, 34 - br i1 %cmp2, label %loop.end, label %loop.inc - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %search1, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %search1 ], [ 100, %search2 ], [ 43, %loop.inc ] - ret i64 %retval -} - - -define i64 @early_exit_infinite_loop() { -; DEBUG-LABEL: LV: Checking a loop in 'early_exit_infinite_loop' -; DEBUG: LV: Not vectorizing: Cannot determine exact exit count for latch block. 
-; CHECK-LABEL: define i64 @early_exit_infinite_loop() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LAND_RHS:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] -; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br label [[LAND_RHS]] -; CHECK: loop.end: -; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ] -; CHECK-NEXT: ret i64 [[START_0_LCSSA]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br label %loop - -loop.end: - %retval = phi i64 [ %index, %loop ] - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use_inv_cond(i1 %cond) { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use_inv_cond( -; CHECK-SAME: i1 [[COND:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: [[CMP4:%.*]] = select i1 [[COND]], i1 [[CMP3]], i1 false -; CHECK-NEXT: br i1 [[CMP4]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, 
%loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - %cmp4 = select i1 %cond, i1 %cmp3, i1 false - br i1 %cmp4, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @loop_contains_safe_call() { -; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_safe_call' -; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 -; DEBUG-NEXT: LV: We can vectorize this loop! -; CHECK-LABEL: define i64 @loop_contains_safe_call() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]]) -; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ult float [[SQRT]], 3.000000e+00 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds float, ptr %p1, i64 %index - %ld1 = load float, ptr %arrayidx, align 1 - %sqrt = tail call fast float @llvm.sqrt.f32(float %ld1) - %cmp = fcmp fast ult float %sqrt, 3.0e+00 - br i1 %cmp, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @loop_contains_unsafe_call() { -; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_unsafe_call' -; DEBUG: LV: Not vectorizing: Early exit loop contains operations that cannot be speculatively executed. 
-; CHECK-LABEL: define i64 @loop_contains_unsafe_call() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[BAD_CALL:%.*]] = call i32 @foo(i32 [[LD1]]) #[[ATTR2:[0-9]+]] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[BAD_CALL]], 34 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index - %ld1 = load i32, ptr %arrayidx, align 1 - %bad_call = call i32 @foo(i32 %ld1) #0 - %cmp = icmp eq i32 %bad_call, 34 - br i1 %cmp, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @loop_contains_safe_div() { -; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_safe_div' -; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 -; DEBUG-NEXT: LV: We can vectorize this loop! 
-; CHECK-LABEL: define i64 @loop_contains_safe_div() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP1:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX2]] -; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[LD1]], 20000 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC1]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index - %ld1 = load i32, ptr %arrayidx, align 1 - %div = udiv i32 %ld1, 20000 - %cmp = icmp eq i32 %div, 1 - br i1 %cmp, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @loop_contains_unsafe_div() { -; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_unsafe_div' -; DEBUG: LV: Not vectorizing: Early exit loop contains operations that cannot be speculatively executed. 
-; CHECK-LABEL: define i64 @loop_contains_unsafe_div() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[DIV:%.*]] = udiv i32 20000, [[LD1]] -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i32, ptr %arrayidx, align 1 - %div = udiv i32 20000, %ld1 - %cmp = icmp eq i32 %div, 1 - br i1 %cmp, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @loop_contains_store(ptr %dest) { -; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_store' -; DEBUG: LV: Not vectorizing: Writes to memory unsupported in early exit loops -; CHECK-LABEL: define i64 @loop_contains_store( -; CHECK-SAME: ptr [[DEST:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DEST]], i64 [[INDEX]] -; CHECK-NEXT: store i32 [[LD1]], ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index - %ld1 = load i32, ptr %arrayidx, align 1 - %arrayidx2 = getelementptr inbounds i32, ptr %dest, i64 %index - store i32 %ld1, ptr %arrayidx2, align 4 - %cmp = icmp eq i32 
%ld1, 1 - br i1 %cmp, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @loop_contains_load_after_early_exit(ptr dereferenceable(1024) align(8) %p2) { -; DEBUG-LABEL: LV: Checking a loop in 'loop_contains_load_after_early_exit' -; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 63 -; DEBUG-NEXT: LV: We can vectorize this loop! -; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. -; CHECK-LABEL: define i64 @loop_contains_load_after_early_exit( -; CHECK-SAME: ptr align 8 dereferenceable(1024) [[P2:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 -; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8 -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[LD2]], [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index - %ld1 = load i32, ptr %arrayidx, align 1 - %cmp = icmp eq i32 %ld1, 1 - br i1 %cmp, label %loop.inc, label %loop.end - -loop.inc: - %arrayidx2 = getelementptr inbounds i64, ptr %p2, i64 %index - %ld2 = load i64, ptr %arrayidx2, align 8 - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ %ld2, %loop.inc ] - ret i64 %retval -} - - -define i64 @early_exit_in_conditional_block(ptr %mask) { -; DEBUG-LABEL: LV: Checking a loop in 'early_exit_in_conditional_block' -; DEBUG: LV: Not vectorizing: Early exit is not the latch predecessor. 
-; CHECK-LABEL: define i64 @early_exit_in_conditional_block( -; CHECK-SAME: ptr [[MASK:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[MASK]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[LD1]], 0 -; CHECK-NEXT: br i1 [[CMP1]], label [[LOOP_SEARCH:%.*]], label [[LOOP_INC]] -; CHECK: loop.search: -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 -; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1 -; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[LD2]], [[LD3]] -; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP_SEARCH]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx1 = getelementptr inbounds i8, ptr %mask, i64 %index - %ld1 = load i8, ptr %arrayidx1, align 1 - %cmp1 = icmp ne i8 %ld1, 0 - br i1 %cmp1, label %loop.search, label %loop.inc - -loop.search: - %arrayidx2 = getelementptr inbounds i8, ptr %p1, i64 %index - %ld2 = load i8, ptr %arrayidx2, align 1 - %arrayidx3 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld3 = load i8, ptr %arrayidx3, align 1 - %cmp2 = icmp eq i8 %ld2, %ld3 - br i1 %cmp2, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop.search ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use1_reverse() { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 1023, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; 
CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], -1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 1023, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, -1 - %exitcond = icmp eq i64 %index.next, 0 - br i1 %exitcond, label %loop.end, label %loop - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 1024, %loop.inc ] - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use1_with_reduction() { -; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_with_reduction' -; DEBUG: LV: Not vectorizing: Found reductions or recurrences in early-exit loop. -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_with_reduction() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LAND_RHS:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[RED_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[LD2_ZEXT:%.*]] = zext i8 [[TMP39]] to i64 -; CHECK-NEXT: [[RED_NEXT]] = add i64 [[RED]], [[LD2_ZEXT]] -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] -; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] -; CHECK: loop.end: -; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], [[FOR_INC]] ], [ [[RED_NEXT]], [[LAND_RHS]] ] -; CHECK-NEXT: [[FINAL_IND:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ] -; CHECK-NEXT: [[START_0_LCSSA:%.*]] = add i64 [[RED_NEXT_LCSSA]], [[FINAL_IND]] -; CHECK-NEXT: ret i64 [[START_0_LCSSA]] -; -entry: - %p1 = alloca [1024 x i8] - %p2 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %red = phi i64 [ %red.next, %loop.inc ], [ 0, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %ld2.zext = zext i8 %ld2 to i64 - 
%red.next = add i64 %red, %ld2.zext - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %final.ind = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - %retval = add i64 %red.next, %final.ind - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p1, ptr dereferenceable(1024) %p2) { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs( -; CHECK-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -; The form of the induction variables requires SCEV predicates. -define i32 @diff_exit_block_needs_scev_check(i32 %end) { -; DEBUG-LABEL: LV: Checking a loop in 'diff_exit_block_needs_scev_check' -; DEBUG: Found an early exit loop with symbolic max backedge taken count: (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32))) -; DEBUG-NEXT: LV: We can vectorize this loop! -; DEBUG-NEXT: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. 
-; CHECK-LABEL: define i32 @diff_exit_block_needs_scev_check( -; CHECK-SAME: i32 [[END:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i32], align 4 -; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i32], align 4 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: [[END_CLAMPED:%.*]] = and i32 [[END]], 1023 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]] -; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 -; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]] -; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 -; CHECK-NEXT: [[CMP_EARLY:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] -; CHECK-NEXT: br i1 [[CMP_EARLY]], label [[FOUND:%.*]], label [[FOR_INC]] -; CHECK: for.inc: -; CHECK-NEXT: [[IND_NEXT]] = add i8 [[IND]], 1 -; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32 -; CHECK-NEXT: [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1 -; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]] -; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT:%.*]] -; CHECK: found: -; CHECK-NEXT: ret i32 1 -; CHECK: exit: -; CHECK-NEXT: ret i32 0 -; -entry: - %p1 = alloca [1024 x i32] - %p2 = alloca [1024 x i32] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - %end.clamped = and i32 %end, 1023 - br label %for.body - -for.body: - %ind = phi i8 [ %ind.next, %for.inc ], [ 0, %entry ] - %gep.ind = phi i64 [ %gep.ind.next, %for.inc ], [ 0, %entry ] - %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %gep.ind - %0 = load i32, ptr %arrayidx1, align 4 - %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %gep.ind - %1 = load i32, ptr %arrayidx2, align 4 - %cmp.early = icmp eq i32 %0, %1 - br i1 %cmp.early, label %found, label %for.inc - -for.inc: - %ind.next = add i8 %ind, 1 - %conv = zext i8 %ind.next to i32 - %gep.ind.next = add i64 %gep.ind, 1 - %cmp = icmp ult i32 %conv, %end.clamped - br i1 %cmp, label %for.body, label %exit - -found: - ret i32 1 - -exit: - ret i32 0 -} - - -declare void @abort() - -; This is a variant of an early exit loop where the condition for leaving -; early is loop invariant. -define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) { -; DEBUG-LABEL: LV: Checking a loop in 'diff_blocks_invariant_early_exit_cond' -; DEBUG: LV: Found an early exit loop with symbolic max backedge taken count: 275 -; DEBUG: LV: Not vectorizing: Auto-vectorization of loops with uncountable early exit is not yet supported. 
-; CHECK-LABEL: define i32 @diff_blocks_invariant_early_exit_cond( -; CHECK-SAME: ptr [[S:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[SVAL:%.*]] = load i32, ptr [[S]], align 4 -; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[SVAL]], 0 -; CHECK-NEXT: br label [[FOR_BODY:%.*]] -; CHECK: for.body: -; CHECK-NEXT: [[IND:%.*]] = phi i32 [ -10, [[ENTRY:%.*]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ] -; CHECK-NEXT: br i1 [[COND]], label [[FOR_INC]], label [[EARLY_EXIT:%.*]] -; CHECK: for.inc: -; CHECK-NEXT: [[IND_NEXT]] = add nsw i32 [[IND]], 1 -; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IND_NEXT]], 266 -; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] -; CHECK: early.exit: -; CHECK-NEXT: tail call void @abort() -; CHECK-NEXT: unreachable -; CHECK: for.end: -; CHECK-NEXT: ret i32 0 -; -entry: - %sval = load i32, ptr %s, align 4 - %cond = icmp eq i32 %sval, 0 - br label %for.body - -for.body: - %ind = phi i32 [ -10, %entry ], [ %ind.next, %for.inc ] - br i1 %cond, label %for.inc, label %early.exit - -for.inc: - %ind.next = add nsw i32 %ind, 1 - %exitcond.not = icmp eq i32 %ind.next, 266 - br i1 %exitcond.not, label %for.end, label %for.body - -early.exit: - tail call void @abort() - unreachable - -for.end: - ret i32 0 -} - - -define i64 @early_exit_has_multiple_outside_successors() { -; DEBUG-LABEL: LV: Checking a loop in 'early_exit_has_multiple_outside_successors' -; DEBUG: LV: Not vectorizing: Loop contains an unsupported switch -; CHECK-LABEL: define i64 @early_exit_has_multiple_outside_successors() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: switch i8 [[LD1]], label [[LOOP_INC]] [ -; CHECK-NEXT: i8 2, label [[LOOP_END:%.*]] -; CHECK-NEXT: i8 3, label [[LOOP_SURPRISE:%.*]] -; CHECK-NEXT: ] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.surprise: -; CHECK-NEXT: ret i64 3 -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [1024 x i8] - call void @init_mem(ptr %p1, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - switch i8 %ld1, label %loop.inc [ - i8 2, label %loop.end - i8 3, label %loop.surprise - ] - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.surprise: - ret i64 3 - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { -; DEBUG-LABEL: LV: Checking a loop in 'same_exit_block_pre_inc_use1_too_small_allocas' -; DEBUG: LV: Not vectorizing: Loop may fault. 
-; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() { -; CHECK-NEXT: entry: -; CHECK-NEXT: [[P1:%.*]] = alloca [42 x i8], align 1 -; CHECK-NEXT: [[P2:%.*]] = alloca [42 x i8], align 1 -; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) -; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - %p1 = alloca [42 x i8] - %p2 = alloca [42 x i8] - call void @init_mem(ptr %p1, i64 1024) - call void @init_mem(ptr %p2, i64 1024) - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr %arrayidx1, align 1 - %cmp3 = icmp eq i8 %ld1, %ld2 - br i1 %cmp3, label %loop.inc, label %loop.end - -loop.inc: - %index.next = add i64 %index, 1 - %exitcond = icmp ne i64 %index.next, 67 - br i1 %exitcond, label %loop, label %loop.end - -loop.end: - %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] - ret i64 %retval -} - - -define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) { -; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs( -; CHECK-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) { -; CHECK-NEXT: entry: -; CHECK-NEXT: br label [[LOOP:%.*]] -; CHECK: loop: -; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] -; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] -; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 -; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] -; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 -; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] -; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] -; CHECK: loop.inc: -; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 -; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 -; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] -; CHECK: loop.end: -; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] -; CHECK-NEXT: ret i64 [[RETVAL]] -; -entry: - br label %loop - -loop: - %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] - %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index - %ld1 = load i8, ptr %arrayidx, align 1 - %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index - %ld2 = load i8, ptr 
%arrayidx1, align 1
-  %cmp3 = icmp eq i8 %ld1, %ld2
-  br i1 %cmp3, label %loop.inc, label %loop.end
-
-loop.inc:
-  %index.next = add i64 %index, 1
-  %exitcond = icmp ne i64 %index.next, 67
-  br i1 %exitcond, label %loop, label %loop.end
-
-loop.end:
-  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
-  ret i64 %retval
-}
-
-
-define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) {
-; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(
-; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) {
-; CHECK-NEXT: entry:
-; CHECK-NEXT: br label [[LOOP:%.*]]
-; CHECK: loop:
-; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
-; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
-; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
-; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
-; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
-; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
-; CHECK: loop.inc:
-; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
-; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
-; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
-; CHECK: loop.end:
-; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
-; CHECK-NEXT: ret i64 [[RETVAL]]
-;
-entry:
-  br label %loop
-
-loop:
-  %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
-  %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
-  %ld1 = load i8, ptr %arrayidx, align 1
-  %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
-  %ld2 = load i8, ptr %arrayidx1, align 1
-  %cmp3 = icmp eq i8 %ld1, %ld2
-  br i1 %cmp3, label %loop.inc, label %loop.end
-
-loop.inc:
-  %index.next = add i64 %index, 1
-  %exitcond = icmp ne i64 %index.next, 67
-  br i1 %exitcond, label %loop, label %loop.end
-
-loop.end:
-  %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
-  ret i64 %retval
-}
-
-
-
-declare i32 @foo(i32) readonly
-declare <vscale x 4 x i32> @foo_vec(<vscale x 4 x i32>)
-
-attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" }
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
new file mode 100644
index 000000000000..52f82d007de4
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit.ll
@@ -0,0 +1,220 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S < %s -p loop-vectorize | FileCheck %s
+
+declare void @init_mem(ptr, i64);
+
+
+define i64 @same_exit_block_phi_of_consts() {
+; CHECK-LABEL: define i64 @same_exit_block_phi_of_consts() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq
i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 0, [[LOOP]] ], [ 1, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ 0, %loop ], [ 1, %loop.inc ] + ret i64 %retval +} + + +define i64 @diff_exit_block_phi_of_consts() { +; CHECK-LABEL: define i64 @diff_exit_block_phi_of_consts() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK: loop.early.exit: +; CHECK-NEXT: ret i64 0 +; CHECK: loop.end: +; CHECK-NEXT: ret i64 1 +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.early.exit + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.early.exit: + ret i64 0 + +loop.end: + ret i64 1 +} + + +; The form of the induction variables requires SCEV predicates. 
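+; Illustrative note (not part of the autogenerated checks): %end.clamped =
+; and i32 %end, 1023 fits in 10 bits, while the loop is counted by a narrow
+; i8 IV %ind whose zext is compared against %end.clamped. Relating %ind to
+; the i64 GEP index %gep.ind therefore needs SCEV wrap predicates, which is
+; why the vectorizer's debug output for this loop reports the symbolic max
+; backedge-taken count as (-1 + (1 umax (zext i10 (trunc i32 %end to i10) to i32))).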
+define i32 @diff_exit_block_needs_scev_check(i32 %end) { +; CHECK-LABEL: define i32 @diff_exit_block_needs_scev_check( +; CHECK-SAME: i32 [[END:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i32], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: [[END_CLAMPED:%.*]] = and i32 [[END]], 1023 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IND:%.*]] = phi i8 [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 0, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[GEP_IND:%.*]] = phi i64 [ [[GEP_IND_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[GEP_IND]] +; CHECK-NEXT: [[TMP0:%.*]] = load i32, ptr [[ARRAYIDX1]], align 4 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[P2]], i64 [[GEP_IND]] +; CHECK-NEXT: [[TMP1:%.*]] = load i32, ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP_EARLY:%.*]] = icmp eq i32 [[TMP0]], [[TMP1]] +; CHECK-NEXT: br i1 [[CMP_EARLY]], label [[FOUND:%.*]], label [[FOR_INC]] +; CHECK: for.inc: +; CHECK-NEXT: [[IND_NEXT]] = add i8 [[IND]], 1 +; CHECK-NEXT: [[CONV:%.*]] = zext i8 [[IND_NEXT]] to i32 +; CHECK-NEXT: [[GEP_IND_NEXT]] = add i64 [[GEP_IND]], 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp ult i32 [[CONV]], [[END_CLAMPED]] +; CHECK-NEXT: br i1 [[CMP]], label [[FOR_BODY]], label [[EXIT:%.*]] +; CHECK: found: +; CHECK-NEXT: ret i32 1 +; CHECK: exit: +; CHECK-NEXT: ret i32 0 +; +entry: + %p1 = alloca [1024 x i32] + %p2 = alloca [1024 x i32] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + %end.clamped = and i32 %end, 1023 + br label %for.body + +for.body: + %ind = phi i8 [ %ind.next, %for.inc ], [ 0, %entry ] + %gep.ind = phi i64 [ %gep.ind.next, %for.inc ], [ 0, %entry ] + %arrayidx1 = getelementptr inbounds i32, ptr %p1, i64 %gep.ind + %0 = load i32, ptr %arrayidx1, align 4 + %arrayidx2 = getelementptr inbounds i32, ptr %p2, i64 %gep.ind + %1 = load i32, ptr %arrayidx2, align 4 + %cmp.early = icmp eq i32 %0, %1 + br i1 %cmp.early, label %found, label %for.inc + +for.inc: + %ind.next = add i8 %ind, 1 + %conv = zext i8 %ind.next to i32 + %gep.ind.next = add i64 %gep.ind, 1 + %cmp = icmp ult i32 %conv, %end.clamped + br i1 %cmp, label %for.body, label %exit + +found: + ret i32 1 + +exit: + ret i32 0 +} + + +declare void @abort() + +; This is a variant of an early exit loop where the condition for leaving +; early is loop invariant. 
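+; Illustrative note (not part of the autogenerated checks): %ind starts at
+; -10 and the counted exit fires when %ind.next == 266, i.e. after 276
+; iterations, so the backedge is taken at most 275 times; this matches the
+; symbolic max backedge taken count of 275 in the vectorizer's debug output.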
+define i32 @diff_blocks_invariant_early_exit_cond(ptr %s) { +; CHECK-LABEL: define i32 @diff_blocks_invariant_early_exit_cond( +; CHECK-SAME: ptr [[S:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[SVAL:%.*]] = load i32, ptr [[S]], align 4 +; CHECK-NEXT: [[COND:%.*]] = icmp eq i32 [[SVAL]], 0 +; CHECK-NEXT: br label [[FOR_BODY:%.*]] +; CHECK: for.body: +; CHECK-NEXT: [[IND:%.*]] = phi i32 [ -10, [[ENTRY:%.*]] ], [ [[IND_NEXT:%.*]], [[FOR_INC:%.*]] ] +; CHECK-NEXT: br i1 [[COND]], label [[FOR_INC]], label [[EARLY_EXIT:%.*]] +; CHECK: for.inc: +; CHECK-NEXT: [[IND_NEXT]] = add nsw i32 [[IND]], 1 +; CHECK-NEXT: [[EXITCOND_NOT:%.*]] = icmp eq i32 [[IND_NEXT]], 266 +; CHECK-NEXT: br i1 [[EXITCOND_NOT]], label [[FOR_END:%.*]], label [[FOR_BODY]] +; CHECK: early.exit: +; CHECK-NEXT: tail call void @abort() +; CHECK-NEXT: unreachable +; CHECK: for.end: +; CHECK-NEXT: ret i32 0 +; +entry: + %sval = load i32, ptr %s, align 4 + %cond = icmp eq i32 %sval, 0 + br label %for.body + +for.body: + %ind = phi i32 [ -10, %entry ], [ %ind.next, %for.inc ] + br i1 %cond, label %for.inc, label %early.exit + +for.inc: + %ind.next = add nsw i32 %ind, 1 + %exitcond.not = icmp eq i32 %ind.next, 266 + br i1 %exitcond.not, label %for.end, label %for.body + +early.exit: + tail call void @abort() + unreachable + +for.end: + ret i32 0 +} diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll new file mode 100644 index 000000000000..7889191c4b5b --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_live_outs.ll @@ -0,0 +1,997 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S < %s -p loop-vectorize | FileCheck %s + +declare void @init_mem(ptr, i64); + +define i64 @same_exit_block_pre_inc_use1() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] +; CHECK: loop.end: +; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ] +; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, 
i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc1_use_inv_cond(i1 %cond) { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc1_use_inv_cond( +; CHECK-SAME: i1 [[COND:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: [[CMP4:%.*]] = select i1 [[COND]], i1 [[CMP3]], i1 false +; CHECK-NEXT: br i1 [[CMP4]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + %cmp4 = select i1 %cond, i1 %cmp3, i1 false + br i1 %cmp4, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_gep_two_indices() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P1]], i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds [1024 x i8], ptr [[P2]], i64 0, i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], 
label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds [1024 x i8], ptr %p1, i64 0, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds [1024 x i8], ptr %p2, i64 0, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_alloca_diff_type() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [40 x i32], align 4 +; CHECK-NEXT: [[P2:%.*]] = alloca [40 x i32], align 4 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] +; CHECK: loop.end: +; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ] +; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; +entry: + %p1 = alloca [40 x i32] + %p2 = alloca [40 x i32] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use2() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use2() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: 
call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ 67, [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ 67, %loop ], [ %index, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use3() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use3() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ], [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: ret i64 [[INDEX_LCSSA]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = 
icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + ret i64 %index +} + + +; In this example the early exit block appears in the list of ExitNotTaken +; SCEVs, but is not computable. +define i64 @same_exit_block_pre_inc_use4() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use4() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i64], align 8 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i64], align 8 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i64, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i64, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp ult i64 [[INDEX]], [[LD1]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i64] + %p2 = alloca [1024 x i64] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i64, ptr %p1, i64 %index + %ld1 = load i64, ptr %arrayidx, align 1 + %cmp3 = icmp ult i64 %index, %ld1 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_post_inc_use() { +; CHECK-LABEL: define i64 @same_exit_block_post_inc_use() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[INDEX_NEXT]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr 
%p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ %index.next, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_post_inc_use2() { +; CHECK-LABEL: define i64 @same_exit_block_post_inc_use2() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ], [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %index.next = add i64 %index, 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index.next, %loop ], [ %index, %loop.inc ] + ret i64 %retval +} + + +define i64 @diff_exit_block_pre_inc_use1() { +; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use1() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK: loop.early.exit: +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: ret i64 [[RETVAL1]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL2]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.early.exit + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.early.exit: + %retval1 = phi i64 [ %index, %loop ] + ret i64 %retval1 + +loop.end: + %retval2 = phi i64 [ 67, %loop.inc ] + ret i64 %retval2 +} + + +define i64 @diff_exit_block_pre_inc_use2() { +; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use2() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK: loop.early.exit: +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ 67, [[LOOP]] ] +; CHECK-NEXT: ret i64 [[RETVAL1]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL2]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.early.exit + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.early.exit: + %retval1 = phi i64 [ 67, %loop ] + ret i64 
%retval1 + +loop.end: + %retval2 = phi i64 [ %index, %loop.inc ] + ret i64 %retval2 +} + + +define i64 @diff_exit_block_pre_inc_use3() { +; CHECK-LABEL: define i64 @diff_exit_block_pre_inc_use3() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK: loop.early.exit: +; CHECK-NEXT: [[INDEX_LCSSA:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: ret i64 [[INDEX_LCSSA]] +; CHECK: loop.end: +; CHECK-NEXT: [[INDEX_LCSSA1:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[INDEX_LCSSA1]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.early.exit + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.early.exit: + ret i64 %index + +loop.end: + ret i64 %index +} + + +define i64 @diff_exit_block_post_inc_use1() { +; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use1() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK: loop.early.exit: +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; CHECK-NEXT: ret i64 
[[RETVAL1]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL2]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.early.exit + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.early.exit: + %retval1 = phi i64 [ %index, %loop ] + ret i64 %retval1 + +loop.end: + %retval2 = phi i64 [ %index.next, %loop.inc ] + ret i64 %retval2 +} + + +define i64 @diff_exit_block_post_inc_use2() { +; CHECK-LABEL: define i64 @diff_exit_block_post_inc_use2() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_EARLY_EXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END:%.*]] +; CHECK: loop.early.exit: +; CHECK-NEXT: [[RETVAL1:%.*]] = phi i64 [ [[INDEX_NEXT]], [[LOOP]] ] +; CHECK-NEXT: ret i64 [[RETVAL1]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL2:%.*]] = phi i64 [ [[INDEX]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL2]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %index.next = add i64 %index, 1 + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.early.exit + +loop.inc: + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.early.exit: + %retval1 = phi i64 [ %index.next, %loop ] + ret i64 %retval1 + +loop.end: + %retval2 = phi i64 [ %index, %loop.inc ] + ret i64 %retval2 +} + + +define i64 @loop_contains_safe_call() { +; CHECK-LABEL: define i64 @loop_contains_safe_call() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr 
[[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds float, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load float, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[SQRT:%.*]] = tail call fast float @llvm.sqrt.f32(float [[LD1]]) +; CHECK-NEXT: [[CMP:%.*]] = fcmp fast ult float [[SQRT]], 3.000000e+00 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds float, ptr %p1, i64 %index + %ld1 = load float, ptr %arrayidx, align 1 + %sqrt = tail call fast float @llvm.sqrt.f32(float %ld1) + %cmp = fcmp fast ult float %sqrt, 3.0e+00 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_safe_div() { +; CHECK-LABEL: define i64 @loop_contains_safe_div() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP1:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX2:%.*]] = phi i64 [ [[INDEX_NEXT1:%.*]], [[LOOP_INC1:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX2]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 [[LD1]], 20000 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC1]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT1]] = add i64 [[INDEX2]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT1]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP1]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX2]], [[LOOP1]] ], [ 67, [[LOOP_INC1]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %div = udiv i32 %ld1, 20000 + %cmp = icmp eq i32 %div, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_load_after_early_exit(ptr 
dereferenceable(1024) align(8) %p2) { +; CHECK-LABEL: define i64 @loop_contains_load_after_early_exit( +; CHECK-SAME: ptr align 8 dereferenceable(1024) [[P2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i64, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i64, ptr [[ARRAYIDX2]], align 8 +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ [[LD2]], [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %cmp = icmp eq i32 %ld1, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %arrayidx2 = getelementptr inbounds i64, ptr %p2, i64 %index + %ld2 = load i64, ptr %arrayidx2, align 8 + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ %ld2, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_reverse() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_reverse() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 1023, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], -1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp eq i64 [[INDEX_NEXT]], 0 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP_END]], label [[LOOP]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 1024, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 1023, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load 
i8, ptr %arrayidx, align 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+ %ld2 = load i8, ptr %arrayidx1, align 1
+ %cmp3 = icmp eq i8 %ld1, %ld2
+ br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+ %index.next = add i64 %index, -1
+ %exitcond = icmp eq i64 %index.next, 0
+ br i1 %exitcond, label %loop.end, label %loop
+
+loop.end:
+ %retval = phi i64 [ %index, %loop ], [ 1024, %loop.inc ]
+ ret i64 %retval
+}
+
+
+define i64 @same_exit_block_pre_inc_use1_deref_ptrs(ptr dereferenceable(1024) %p1, ptr dereferenceable(1024) %p2) {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_deref_ptrs(
+; CHECK-SAME: ptr dereferenceable(1024) [[P1:%.*]], ptr dereferenceable(1024) [[P2:%.*]]) {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1
+; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]]
+; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]]
+; CHECK: loop.inc:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK: loop.end:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
+; CHECK-NEXT: ret i64 [[RETVAL]]
+;
+entry:
+ br label %loop
+
+loop:
+ %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+ %ld1 = load i8, ptr %arrayidx, align 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+ %ld2 = load i8, ptr %arrayidx1, align 1
+ %cmp3 = icmp eq i8 %ld1, %ld2
+ br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+ %index.next = add i64 %index, 1
+ %exitcond = icmp ne i64 %index.next, 67
+ br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+ %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+ ret i64 %retval
+}
+
+
+declare i32 @foo(i32) readonly
+declare <vscale x 4 x i32> @foo_vec(<vscale x 4 x i32>)
+
+attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" }
diff --git a/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll b/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll
new file mode 100644
index 000000000000..c68eeac19c9e
--- /dev/null
+++ b/llvm/test/Transforms/LoopVectorize/single_early_exit_unsafe_ptrs.ll
@@ -0,0 +1,143 @@
+; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4
+; RUN: opt -S < %s -p loop-vectorize | FileCheck %s
+
+declare void @init_mem(ptr, i64);
+
+
+define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
+; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_allocas() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = alloca [42 x i8], align 1
+; CHECK-NEXT: [[P2:%.*]] = alloca [42 x i8], align 1
+; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024)
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 
[[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [42 x i8] + %p2 = alloca [42 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs(ptr dereferenceable(42) %p1, ptr dereferenceable(42) %p2) { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_too_small_deref_ptrs( +; CHECK-SAME: ptr dereferenceable(42) [[P1:%.*]], ptr dereferenceable(42) [[P2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_unknown_ptrs(ptr %p1, ptr %p2) { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_unknown_ptrs( +; CHECK-SAME: ptr [[P1:%.*]], ptr [[P2:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: 
br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} diff --git a/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll b/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll new file mode 100644 index 000000000000..cd91d07120f9 --- /dev/null +++ b/llvm/test/Transforms/LoopVectorize/unsupported_early_exit.ll @@ -0,0 +1,494 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 4 +; RUN: opt -S < %s -p loop-vectorize | FileCheck %s + +declare void @init_mem(ptr, i64); + + +; The early exit (i.e. unknown exit-not-taken count) is the latch - we don't +; support this yet. 
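+; In @early_exit_on_last_block the counted exit (taken when %index.next
+; reaches 67) sits in the header block %loop, while the data-dependent
+; early exit sits in the latch block %search.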
+define i64 @early_exit_on_last_block() { +; CHECK-LABEL: define i64 @early_exit_on_last_block() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[SEARCH:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[CMP1]], label [[SEARCH]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: search: +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP41:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP42:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP41]], [[TMP42]] +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_END_LOOPEXIT]], label [[LAND_RHS]] +; CHECK: loop.end: +; CHECK-NEXT: [[START_0_LCSSA:%.*]] = phi i64 [ 64, [[LAND_RHS]] ], [ [[INDEX]], [[SEARCH]] ] +; CHECK-NEXT: ret i64 [[START_0_LCSSA]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %search ], [ 3, %entry ] + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %search, label %loop.end + +search: + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.end, label %loop + +loop.end: + %retval = phi i64 [ 64, %loop ], [ %index, %search ] + ret i64 %retval +} + + +; We don't currently support multiple early exits. 
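+; In @multiple_uncountable_exits both %search1 and %search2 branch to
+; %loop.end on conditions computed from loaded values, so the loop has two
+; uncountable early exits.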
+define i64 @multiple_uncountable_exits() { +; CHECK-LABEL: define i64 @multiple_uncountable_exits() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[SEARCH1:%.*]] +; CHECK: search1: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP1:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP1]], label [[LOOP_END:%.*]], label [[SEARCH2:%.*]] +; CHECK: search2: +; CHECK-NEXT: [[CMP2:%.*]] = icmp ult i8 [[LD1]], 34 +; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP_END]], label [[LOOP_INC]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[SEARCH1]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[SEARCH1]] ], [ 100, [[SEARCH2]] ], [ 43, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %search1 + +search1: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp1 = icmp eq i8 %ld1, %ld2 + br i1 %cmp1, label %loop.end, label %search2 + +search2: + %cmp2 = icmp ult i8 %ld1, 34 + br i1 %cmp2, label %loop.end, label %loop.inc + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %search1, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %search1 ], [ 100, %search2 ], [ 43, %loop.inc ] + ret i64 %retval +} + + +define i64 @uncountable_exit_infinite_loop() { +; CHECK-LABEL: define i64 @uncountable_exit_infinite_loop() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[LD1]], [[LD2]] +; CHECK-NEXT: br i1 [[CMP3]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br label [[LOOP]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ] +; 
CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i8, ptr %arrayidx, align 1 + %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld2 = load i8, ptr %arrayidx1, align 1 + %cmp3 = icmp eq i8 %ld1, %ld2 + br i1 %cmp3, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br label %loop + +loop.end: + %retval = phi i64 [ %index, %loop ] + ret i64 %retval +} + + +define i64 @loop_contains_unsafe_call() { +; CHECK-LABEL: define i64 @loop_contains_unsafe_call() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[BAD_CALL:%.*]] = call i32 @foo(i32 [[LD1]]) #[[ATTR1:[0-9]+]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[BAD_CALL]], 34 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %bad_call = call i32 @foo(i32 %ld1) #0 + %cmp = icmp eq i32 %bad_call, 34 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_unsafe_div() { +; CHECK-LABEL: define i64 @loop_contains_unsafe_div() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[DIV:%.*]] = udiv i32 20000, [[LD1]] +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[DIV]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; 
CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %div = udiv i32 20000, %ld1 + %cmp = icmp eq i32 %div, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @loop_contains_store(ptr %dest) { +; CHECK-LABEL: define i64 @loop_contains_store( +; CHECK-SAME: ptr [[DEST:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i32, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i32, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i32, ptr [[DEST]], i64 [[INDEX]] +; CHECK-NEXT: store i32 [[LD1]], ptr [[ARRAYIDX2]], align 4 +; CHECK-NEXT: [[CMP:%.*]] = icmp eq i32 [[LD1]], 1 +; CHECK-NEXT: br i1 [[CMP]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx = getelementptr inbounds i32, ptr %p1, i64 %index + %ld1 = load i32, ptr %arrayidx, align 1 + %arrayidx2 = getelementptr inbounds i32, ptr %dest, i64 %index + store i32 %ld1, ptr %arrayidx2, align 4 + %cmp = icmp eq i32 %ld1, 1 + br i1 %cmp, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @uncountable_exit_in_conditional_block(ptr %mask) { +; CHECK-LABEL: define i64 @uncountable_exit_in_conditional_block( +; CHECK-SAME: ptr [[MASK:%.*]]) { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LOOP:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[MASK]], i64 [[INDEX]] +; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 
+; CHECK-NEXT: [[CMP1:%.*]] = icmp ne i8 [[LD1]], 0 +; CHECK-NEXT: br i1 [[CMP1]], label [[LOOP_SEARCH:%.*]], label [[LOOP_INC]] +; CHECK: loop.search: +; CHECK-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[LD2:%.*]] = load i8, ptr [[ARRAYIDX2]], align 1 +; CHECK-NEXT: [[ARRAYIDX3:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[LD3:%.*]] = load i8, ptr [[ARRAYIDX3]], align 1 +; CHECK-NEXT: [[CMP2:%.*]] = icmp eq i8 [[LD2]], [[LD3]] +; CHECK-NEXT: br i1 [[CMP2]], label [[LOOP_INC]], label [[LOOP_END:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]] +; CHECK: loop.end: +; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP_SEARCH]] ], [ 67, [[LOOP_INC]] ] +; CHECK-NEXT: ret i64 [[RETVAL]] +; +entry: + %p1 = alloca [1024 x i8] + %p2 = alloca [1024 x i8] + call void @init_mem(ptr %p1, i64 1024) + call void @init_mem(ptr %p2, i64 1024) + br label %loop + +loop: + %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ] + %arrayidx1 = getelementptr inbounds i8, ptr %mask, i64 %index + %ld1 = load i8, ptr %arrayidx1, align 1 + %cmp1 = icmp ne i8 %ld1, 0 + br i1 %cmp1, label %loop.search, label %loop.inc + +loop.search: + %arrayidx2 = getelementptr inbounds i8, ptr %p1, i64 %index + %ld2 = load i8, ptr %arrayidx2, align 1 + %arrayidx3 = getelementptr inbounds i8, ptr %p2, i64 %index + %ld3 = load i8, ptr %arrayidx3, align 1 + %cmp2 = icmp eq i8 %ld2, %ld3 + br i1 %cmp2, label %loop.inc, label %loop.end + +loop.inc: + %index.next = add i64 %index, 1 + %exitcond = icmp ne i64 %index.next, 67 + br i1 %exitcond, label %loop, label %loop.end + +loop.end: + %retval = phi i64 [ %index, %loop.search ], [ 67, %loop.inc ] + ret i64 %retval +} + + +define i64 @same_exit_block_pre_inc_use1_with_reduction() { +; CHECK-LABEL: define i64 @same_exit_block_pre_inc_use1_with_reduction() { +; CHECK-NEXT: entry: +; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: [[P2:%.*]] = alloca [1024 x i8], align 1 +; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024) +; CHECK-NEXT: call void @init_mem(ptr [[P2]], i64 1024) +; CHECK-NEXT: br label [[LAND_RHS:%.*]] +; CHECK: loop: +; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[FOR_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ] +; CHECK-NEXT: [[RED:%.*]] = phi i64 [ [[RED_NEXT:%.*]], [[FOR_INC]] ], [ 0, [[ENTRY]] ] +; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP38:%.*]] = load i8, ptr [[ARRAYIDX]], align 1 +; CHECK-NEXT: [[ARRAYIDX1:%.*]] = getelementptr inbounds i8, ptr [[P2]], i64 [[INDEX]] +; CHECK-NEXT: [[TMP39:%.*]] = load i8, ptr [[ARRAYIDX1]], align 1 +; CHECK-NEXT: [[LD2_ZEXT:%.*]] = zext i8 [[TMP39]] to i64 +; CHECK-NEXT: [[RED_NEXT]] = add i64 [[RED]], [[LD2_ZEXT]] +; CHECK-NEXT: [[CMP3:%.*]] = icmp eq i8 [[TMP38]], [[TMP39]] +; CHECK-NEXT: br i1 [[CMP3]], label [[FOR_INC]], label [[FOR_END_LOOPEXIT:%.*]] +; CHECK: loop.inc: +; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1 +; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67 +; CHECK-NEXT: br i1 [[EXITCOND]], label [[LAND_RHS]], label [[FOR_END_LOOPEXIT]] +; CHECK: loop.end: +; CHECK-NEXT: [[RED_NEXT_LCSSA:%.*]] = phi i64 [ [[RED_NEXT]], [[FOR_INC]] ], [ [[RED_NEXT]], [[LAND_RHS]] ] +; CHECK-NEXT: [[FINAL_IND:%.*]] = phi i64 [ [[INDEX]], [[LAND_RHS]] ], [ 67, [[FOR_INC]] ] +; 
CHECK-NEXT: [[START_0_LCSSA:%.*]] = add i64 [[RED_NEXT_LCSSA]], [[FINAL_IND]]
+; CHECK-NEXT: ret i64 [[START_0_LCSSA]]
+;
+entry:
+ %p1 = alloca [1024 x i8]
+ %p2 = alloca [1024 x i8]
+ call void @init_mem(ptr %p1, i64 1024)
+ call void @init_mem(ptr %p2, i64 1024)
+ br label %loop
+
+loop:
+ %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+ %red = phi i64 [ %red.next, %loop.inc ], [ 0, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+ %ld1 = load i8, ptr %arrayidx, align 1
+ %arrayidx1 = getelementptr inbounds i8, ptr %p2, i64 %index
+ %ld2 = load i8, ptr %arrayidx1, align 1
+ %ld2.zext = zext i8 %ld2 to i64
+ %red.next = add i64 %red, %ld2.zext
+ %cmp3 = icmp eq i8 %ld1, %ld2
+ br i1 %cmp3, label %loop.inc, label %loop.end
+
+loop.inc:
+ %index.next = add i64 %index, 1
+ %exitcond = icmp ne i64 %index.next, 67
+ br i1 %exitcond, label %loop, label %loop.end
+
+loop.end:
+ %final.ind = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+ %retval = add i64 %red.next, %final.ind
+ ret i64 %retval
+}
+
+
+define i64 @uncountable_exit_has_multiple_outside_successors() {
+; CHECK-LABEL: define i64 @uncountable_exit_has_multiple_outside_successors() {
+; CHECK-NEXT: entry:
+; CHECK-NEXT: [[P1:%.*]] = alloca [1024 x i8], align 1
+; CHECK-NEXT: call void @init_mem(ptr [[P1]], i64 1024)
+; CHECK-NEXT: br label [[LOOP:%.*]]
+; CHECK: loop:
+; CHECK-NEXT: [[INDEX:%.*]] = phi i64 [ [[INDEX_NEXT:%.*]], [[LOOP_INC:%.*]] ], [ 3, [[ENTRY:%.*]] ]
+; CHECK-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[P1]], i64 [[INDEX]]
+; CHECK-NEXT: [[LD1:%.*]] = load i8, ptr [[ARRAYIDX]], align 1
+; CHECK-NEXT: switch i8 [[LD1]], label [[LOOP_INC]] [
+; CHECK-NEXT: i8 2, label [[LOOP_END:%.*]]
+; CHECK-NEXT: i8 3, label [[LOOP_SURPRISE:%.*]]
+; CHECK-NEXT: ]
+; CHECK: loop.inc:
+; CHECK-NEXT: [[INDEX_NEXT]] = add i64 [[INDEX]], 1
+; CHECK-NEXT: [[EXITCOND:%.*]] = icmp ne i64 [[INDEX_NEXT]], 67
+; CHECK-NEXT: br i1 [[EXITCOND]], label [[LOOP]], label [[LOOP_END]]
+; CHECK: loop.surprise:
+; CHECK-NEXT: ret i64 3
+; CHECK: loop.end:
+; CHECK-NEXT: [[RETVAL:%.*]] = phi i64 [ [[INDEX]], [[LOOP]] ], [ 67, [[LOOP_INC]] ]
+; CHECK-NEXT: ret i64 [[RETVAL]]
+;
+entry:
+ %p1 = alloca [1024 x i8]
+ call void @init_mem(ptr %p1, i64 1024)
+ br label %loop
+
+loop:
+ %index = phi i64 [ %index.next, %loop.inc ], [ 3, %entry ]
+ %arrayidx = getelementptr inbounds i8, ptr %p1, i64 %index
+ %ld1 = load i8, ptr %arrayidx, align 1
+ switch i8 %ld1, label %loop.inc [
+ i8 2, label %loop.end
+ i8 3, label %loop.surprise
+ ]
+
+loop.inc:
+ %index.next = add i64 %index, 1
+ %exitcond = icmp ne i64 %index.next, 67
+ br i1 %exitcond, label %loop, label %loop.end
+
+loop.surprise:
+ ret i64 3
+
+loop.end:
+ %retval = phi i64 [ %index, %loop ], [ 67, %loop.inc ]
+ ret i64 %retval
+}
+
+
+declare i32 @foo(i32) readonly
+declare <vscale x 4 x i32> @foo_vec(<vscale x 4 x i32>)
+
+attributes #0 = { "vector-function-abi-variant"="_ZGVsNxv_foo(foo_vec)" }
-- 
GitLab


From 256bbdb3f642c37268d6fa5dc35e01cd27a67b61 Mon Sep 17 00:00:00 2001
From: Simon Pilgrim
Date: Thu, 17 Oct 2024 16:29:56 +0100
Subject: [PATCH 277/329] [DAG] visitFCEIL/FTRUNC/FFLOOR/FNEG - use
 FoldConstantArithmetic to attempt to constant fold

Don't rely on isConstantFPBuildVectorOrConstantFP followed by getNode() to
constant fold - FoldConstantArithmetic will do all of this for us.
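To illustrate, the repeated pattern in each visitor (shown here for
visitFCEIL, copied from the diff below) changes from an explicit constant
check followed by rebuilding the node:

  // Before: only checks that N0 is constant; getNode() is trusted to fold.
  if (DAG.isConstantFPBuildVectorOrConstantFP(N0))
    return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0);

to a single call that performs the check and the fold together:

  // After: FoldConstantArithmetic returns the folded value, or null on failure.
  if (SDValue C = DAG.FoldConstantArithmetic(ISD::FCEIL, SDLoc(N), VT, {N0}))
    return C;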
Cleanup for #112682 --- llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp index 644054361dd3..18439b87a83b 100644 --- a/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/DAGCombiner.cpp @@ -18291,8 +18291,8 @@ SDValue DAGCombiner::visitFCEIL(SDNode *N) { EVT VT = N->getValueType(0); // fold (fceil c1) -> fceil(c1) - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FCEIL, SDLoc(N), VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FCEIL, SDLoc(N), VT, {N0})) + return C; return SDValue(); } @@ -18302,8 +18302,8 @@ SDValue DAGCombiner::visitFTRUNC(SDNode *N) { EVT VT = N->getValueType(0); // fold (ftrunc c1) -> ftrunc(c1) - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FTRUNC, SDLoc(N), VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FTRUNC, SDLoc(N), VT, {N0})) + return C; // fold ftrunc (known rounded int x) -> x // ftrunc is a part of fptosi/fptoui expansion on some targets, so this is @@ -18336,8 +18336,8 @@ SDValue DAGCombiner::visitFFLOOR(SDNode *N) { EVT VT = N->getValueType(0); // fold (ffloor c1) -> ffloor(c1) - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FFLOOR, SDLoc(N), VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FFLOOR, SDLoc(N), VT, {N0})) + return C; return SDValue(); } @@ -18348,8 +18348,8 @@ SDValue DAGCombiner::visitFNEG(SDNode *N) { SelectionDAG::FlagInserter FlagsInserter(DAG, N); // Constant fold FNEG. - if (DAG.isConstantFPBuildVectorOrConstantFP(N0)) - return DAG.getNode(ISD::FNEG, SDLoc(N), VT, N0); + if (SDValue C = DAG.FoldConstantArithmetic(ISD::FNEG, SDLoc(N), VT, {N0})) + return C; if (SDValue NegN0 = TLI.getNegatedExpression(N0, DAG, LegalOperations, ForCodeSize)) -- GitLab From 6f21a7bdeeca84bcc7cf94878e17b5d7ee7b4083 Mon Sep 17 00:00:00 2001 From: Congcong Cai Date: Thu, 17 Oct 2024 17:55:37 +0200 Subject: [PATCH 278/329] [clang-tidy] insert ``static`` keyword in correct position for misc-use-internal-linkage (#108792) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Fixes: #108760 --------- Co-authored-by: Danny Mösch --- .../misc/UseInternalLinkageCheck.cpp | 7 ++-- .../clang-tidy/utils/LexerUtils.cpp | 4 ++- clang-tools-extra/docs/ReleaseNotes.rst | 4 +++ .../misc/use-internal-linkage-func.cpp | 35 +++++++++++++++++++ .../misc/use-internal-linkage-var.cpp | 12 +++++++ 5 files changed, 59 insertions(+), 3 deletions(-) diff --git a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp index c086e7721e02..d900978f65a9 100644 --- a/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp +++ b/clang-tools-extra/clang-tidy/misc/UseInternalLinkageCheck.cpp @@ -8,12 +8,15 @@ #include "UseInternalLinkageCheck.h" #include "../utils/FileExtensionsUtils.h" +#include "../utils/LexerUtils.h" #include "clang/AST/Decl.h" #include "clang/ASTMatchers/ASTMatchFinder.h" #include "clang/ASTMatchers/ASTMatchers.h" #include "clang/ASTMatchers/ASTMatchersMacros.h" #include "clang/Basic/SourceLocation.h" #include "clang/Basic/Specifiers.h" +#include "clang/Basic/TokenKinds.h" +#include "clang/Lex/Token.h" #include "llvm/ADT/STLExtras.h" using namespace clang::ast_matchers; @@ -113,7 +116,7 @@ static constexpr StringRef Message = void 
UseInternalLinkageCheck::check(const MatchFinder::MatchResult &Result) {
   if (const auto *FD = Result.Nodes.getNodeAs<FunctionDecl>("fn")) {
     DiagnosticBuilder DB = diag(FD->getLocation(), Message) << "function" << FD;
-    SourceLocation FixLoc = FD->getTypeSpecStartLoc();
+    const SourceLocation FixLoc = FD->getInnerLocStart();
     if (FixLoc.isInvalid() || FixLoc.isMacroID())
       return;
     if (FixMode == FixModeKind::UseStatic)
@@ -128,7 +131,7 @@ void UseInternalLinkageCheck::check(const MatchFinder::MatchResult &Result) {
       return;
     DiagnosticBuilder DB = diag(VD->getLocation(), Message) << "variable" << VD;
-    SourceLocation FixLoc = VD->getTypeSpecStartLoc();
+    const SourceLocation FixLoc = VD->getInnerLocStart();
     if (FixLoc.isInvalid() || FixLoc.isMacroID())
       return;
     if (FixMode == FixModeKind::UseStatic)
diff --git a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp
index df2b0bef576c..92c3e0ed7894 100644
--- a/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp
+++ b/clang-tools-extra/clang-tidy/utils/LexerUtils.cpp
@@ -24,13 +24,15 @@ getPreviousTokenAndStart(SourceLocation Location, const SourceManager &SM,
   if (Location.isInvalid())
     return {Token, Location};
 
-  auto StartOfFile = SM.getLocForStartOfFile(SM.getFileID(Location));
+  const auto StartOfFile = SM.getLocForStartOfFile(SM.getFileID(Location));
   while (Location != StartOfFile) {
     Location = Lexer::GetBeginningOfToken(Location, SM, LangOpts);
     if (!Lexer::getRawToken(Location, Token, SM, LangOpts) &&
         (!SkipComments || !Token.is(tok::comment))) {
       break;
     }
+    if (Location == StartOfFile)
+      return {Token, Location};
     Location = Location.getLocWithOffset(-1);
   }
   return {Token, Location};
diff --git a/clang-tools-extra/docs/ReleaseNotes.rst b/clang-tools-extra/docs/ReleaseNotes.rst
index 95be0a89cd6c..e8148e06b6af 100644
--- a/clang-tools-extra/docs/ReleaseNotes.rst
+++ b/clang-tools-extra/docs/ReleaseNotes.rst
@@ -195,6 +195,10 @@ Changes in existing checks
   ` check to fix false positive when using loop
   variable in initializer of lambda capture.
 
+- Improved :doc:`misc-use-internal-linkage
+  <clang-tidy/checks/misc/use-internal-linkage>` check to insert ``static`` keyword
+  before type qualifiers such as ``const`` and ``volatile``.
+
 - Improved :doc:`modernize-min-max-use-initializer-list
   <clang-tidy/checks/modernize/min-max-use-initializer-list>` check by fixing a false positive when only an
   implicit conversion happened inside an
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp
index 9c91389542b0..8dc739da3a27 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-func.cpp
@@ -17,6 +17,41 @@ void func_cpp_inc();
 // CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_cpp_inc'
 // CHECK-FIXES: static void func_cpp_inc();
 
+int* func_cpp_inc_return_ptr();
+// CHECK-MESSAGES: :[[@LINE-1]]:6: warning: function 'func_cpp_inc_return_ptr'
+// CHECK-FIXES: static int* func_cpp_inc_return_ptr();
+
+const int* func_cpp_inc_return_const_ptr();
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: function 'func_cpp_inc_return_const_ptr'
+// CHECK-FIXES: static const int* func_cpp_inc_return_const_ptr();
+
+int const* func_cpp_inc_return_ptr_const();
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: function 'func_cpp_inc_return_ptr_const'
+// CHECK-FIXES: static int const* func_cpp_inc_return_ptr_const();
+
+int * const func_cpp_inc_return_const();
+// CHECK-MESSAGES: :[[@LINE-1]]:13: warning: function 'func_cpp_inc_return_const'
+// CHECK-FIXES: static int * const func_cpp_inc_return_const();
+
+volatile const int* func_cpp_inc_return_volatile_const_ptr();
+// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: function 'func_cpp_inc_return_volatile_const_ptr'
+// CHECK-FIXES: static volatile const int* func_cpp_inc_return_volatile_const_ptr();
+
+[[nodiscard]] void func_nodiscard();
+// CHECK-MESSAGES: :[[@LINE-1]]:20: warning: function 'func_nodiscard'
+// CHECK-FIXES: {{\[\[nodiscard\]\]}} static void func_nodiscard();
+
+#define NDS [[nodiscard]]
+#define NNDS
+
+NDS void func_nds();
+// CHECK-MESSAGES: :[[@LINE-1]]:10: warning: function 'func_nds'
+// CHECK-FIXES: NDS static void func_nds();
+
+NNDS void func_nnds();
+// CHECK-MESSAGES: :[[@LINE-1]]:11: warning: function 'func_nnds'
+// CHECK-FIXES: NNDS static void func_nnds();
+
 #include "func_cpp.inc"
 void func_h_inc();
diff --git a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp
index 6777ce4bb026..901272e40b8f 100644
--- a/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp
+++ b/clang-tools-extra/test/clang-tidy/checkers/misc/use-internal-linkage-var.cpp
@@ -13,6 +13,18 @@ T global_template;
 // CHECK-MESSAGES: :[[@LINE-1]]:3: warning: variable 'global_template'
 // CHECK-FIXES: static T global_template;
 
+int const* ptr_const_star;
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: variable 'ptr_const_star'
+// CHECK-FIXES: static int const* ptr_const_star;
+
+const int* const_ptr_star;
+// CHECK-MESSAGES: :[[@LINE-1]]:12: warning: variable 'const_ptr_star'
+// CHECK-FIXES: static const int* const_ptr_star;
+
+const volatile int* const_volatile_ptr_star;
+// CHECK-MESSAGES: :[[@LINE-1]]:21: warning: variable 'const_volatile_ptr_star'
+// CHECK-FIXES: static const volatile int* const_volatile_ptr_star;
+
 int gloabl_header;
 extern int global_extern;
-- 
GitLab


From 94643a45b4c549b27407803277ec88b78315e2d9 Mon Sep 17 00:00:00 2001
From: Lukacma
Date: Thu, 17 Oct 2024 17:08:43 +0100
Subject: [PATCH 279/329] [AArch64] Add armv9.6 features to AArch64AsmParser
 (#112722)

New features need to be added to ExtensionMap for .arch and
.arch_extension to work.
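For illustration only (a hypothetical snippet, not part of this patch: the
extension names come from the table added below, but the directives shown are
not taken from the test suite), this lets assembly sources enable the new
features directly:

    .arch_extension sve2p2
    .arch_extension cmpbr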
added to ExtensionMap for .arch and .arch_extension to work. --- .../lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp | 11 +++++++++++ 1 file changed, 11 insertions(+) diff --git a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp index df69c20b1359..a5165d45893f 100644 --- a/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp +++ b/llvm/lib/Target/AArch64/AsmParser/AArch64AsmParser.cpp @@ -3736,6 +3736,17 @@ static const struct Extension { {"sme-fa64", {AArch64::FeatureSMEFA64}}, {"cpa", {AArch64::FeatureCPA}}, {"tlbiw", {AArch64::FeatureTLBIW}}, + {"cmpbr", {AArch64::FeatureCMPBR}}, + {"f8f32mm", {AArch64::FeatureF8F32MM}}, + {"f8f16mm", {AArch64::FeatureF8F16MM}}, + {"fprcvt", {AArch64::FeatureFPRCVT}}, + {"lsfe", {AArch64::FeatureLSFE}}, + {"sme2p2", {AArch64::FeatureSME2p2}}, + {"ssve-aes", {AArch64::FeatureSSVE_AES}}, + {"sve2p2", {AArch64::FeatureSVE2p2}}, + {"sve-aes2", {AArch64::FeatureSVEAES2}}, + {"sve-bfscale", {AArch64::FeatureSVEBFSCALE}}, + {"sve-f16f32mm", {AArch64::FeatureSVE_F16F32MM}}, }; static void setRequiredFeatureString(FeatureBitset FBS, std::string &Str) { -- GitLab From 020566701030425f44eb80387d0ae76c5a867aa9 Mon Sep 17 00:00:00 2001 From: Shimin Cui Date: Thu, 17 Oct 2024 12:10:05 -0400 Subject: [PATCH 280/329] [LTO] Add function alias as function instead of data (#112599) MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit On AIX, for undefined functions, only the dotnamed symbols (the address of the function) are generated after linking (i.e., no named function symbol is generated). Currently, all alias symbols are added as defined data symbols when parsing symbols in LTOModule (the Link Time Optimization library used by the linker to optimize code at link time). On AIX, if the function alias is used in the native object, and only its dotnamed symbol is generated, the linker will have problems matching the dotnamed symbol from the native object with the defined symbol marked as data from the bitcode at LTO link time. This patch adds function aliases as functions instead of data. --- llvm/include/llvm/LTO/legacy/LTOModule.h | 2 +- llvm/lib/LTO/LTOModule.cpp | 17 +++++-- llvm/test/LTO/PowerPC/list-symbol.ll | 16 ++++++ llvm/tools/llvm-lto/llvm-lto.cpp | 65 +++++++++++++++++++++++- 4 files changed, 93 insertions(+), 7 deletions(-) create mode 100644 llvm/test/LTO/PowerPC/list-symbol.ll diff --git a/llvm/include/llvm/LTO/legacy/LTOModule.h b/llvm/include/llvm/LTO/legacy/LTOModule.h index 1b2de3b33385..e861a56bcbac 100644 --- a/llvm/include/llvm/LTO/legacy/LTOModule.h +++ b/llvm/include/llvm/LTO/legacy/LTOModule.h @@ -195,7 +195,7 @@ private: /// Add a function symbol as defined to the list. void addDefinedFunctionSymbol(ModuleSymbolTable::Symbol Sym); - void addDefinedFunctionSymbol(StringRef Name, const Function *F); + void addDefinedFunctionSymbol(StringRef Name, const GlobalValue *F); /// Add a global symbol from module-level ASM to the defined list.
void addAsmGlobalSymbol(StringRef, lto_symbol_attributes scope); diff --git a/llvm/lib/LTO/LTOModule.cpp b/llvm/lib/LTO/LTOModule.cpp index eac78069f4d2..00eb8adb4e10 100644 --- a/llvm/lib/LTO/LTOModule.cpp +++ b/llvm/lib/LTO/LTOModule.cpp @@ -406,11 +406,16 @@ void LTOModule::addDefinedFunctionSymbol(ModuleSymbolTable::Symbol Sym) { Buffer.c_str(); } - const Function *F = cast(cast(Sym)); - addDefinedFunctionSymbol(Buffer, F); + auto *GV = cast(Sym); + assert((isa(GV) || + (isa(GV) && + isa(cast(GV)->getAliasee()))) && + "Not function or function alias"); + + addDefinedFunctionSymbol(Buffer, GV); } -void LTOModule::addDefinedFunctionSymbol(StringRef Name, const Function *F) { +void LTOModule::addDefinedFunctionSymbol(StringRef Name, const GlobalValue *F) { // add to list of defined symbols addDefinedSymbol(Name, F, true); } @@ -611,7 +616,11 @@ void LTOModule::parseSymbols() { } assert(isa(GV)); - addDefinedDataSymbol(Sym); + + if (isa(cast(GV)->getAliasee())) + addDefinedFunctionSymbol(Sym); + else + addDefinedDataSymbol(Sym); } // make symbols for all undefines diff --git a/llvm/test/LTO/PowerPC/list-symbol.ll b/llvm/test/LTO/PowerPC/list-symbol.ll new file mode 100644 index 000000000000..75300b11f7f1 --- /dev/null +++ b/llvm/test/LTO/PowerPC/list-symbol.ll @@ -0,0 +1,16 @@ +; RUN: llvm-as < %s > %t +; RUN: llvm-lto -list-symbols-only %t | FileCheck %s +; REQUIRES: default_triple + +; CHECK-DAG: v { data defined default } +; CHECK-DAG: va { data defined default alias } +; CHECK-DAG: f { function defined default } +; CHECK-DAG: fa { function defined default alias } + +@v = global i32 0 +@va = alias i32, ptr @v +@fa = alias void (ptr), ptr @f + +define void @f() { + ret void +} diff --git a/llvm/tools/llvm-lto/llvm-lto.cpp b/llvm/tools/llvm-lto/llvm-lto.cpp index f5076f0b9751..4090faf4e3fd 100644 --- a/llvm/tools/llvm-lto/llvm-lto.cpp +++ b/llvm/tools/llvm-lto/llvm-lto.cpp @@ -407,6 +407,64 @@ static void printIndexStats() { } } +/// Print the lto symbol attributes. +static void printLTOSymbolAttributes(lto_symbol_attributes Attrs) { + outs() << "{ "; + unsigned Permission = Attrs & LTO_SYMBOL_PERMISSIONS_MASK; + switch (Permission) { + case LTO_SYMBOL_PERMISSIONS_CODE: + outs() << "function "; + break; + case LTO_SYMBOL_PERMISSIONS_DATA: + outs() << "data "; + break; + case LTO_SYMBOL_PERMISSIONS_RODATA: + outs() << "constant "; + break; + } + unsigned Definition = Attrs & LTO_SYMBOL_DEFINITION_MASK; + switch (Definition) { + case LTO_SYMBOL_DEFINITION_REGULAR: + outs() << "defined "; + break; + case LTO_SYMBOL_DEFINITION_TENTATIVE: + outs() << "common "; + break; + case LTO_SYMBOL_DEFINITION_WEAK: + outs() << "weak "; + break; + case LTO_SYMBOL_DEFINITION_UNDEFINED: + outs() << "extern "; + break; + case LTO_SYMBOL_DEFINITION_WEAKUNDEF: + outs() << "extern-weak "; + break; + } + unsigned Scope = Attrs & LTO_SYMBOL_SCOPE_MASK; + switch (Scope) { + case LTO_SYMBOL_SCOPE_INTERNAL: + outs() << "internal "; + break; + case LTO_SYMBOL_SCOPE_HIDDEN: + outs() << "hidden "; + break; + case LTO_SYMBOL_SCOPE_PROTECTED: + outs() << "protected "; + break; + case LTO_SYMBOL_SCOPE_DEFAULT: + outs() << "default "; + break; + case LTO_SYMBOL_SCOPE_DEFAULT_CAN_BE_HIDDEN: + outs() << "omitted "; + break; + } + if (Attrs & LTO_SYMBOL_COMDAT) + outs() << "comdat "; + if (Attrs & LTO_SYMBOL_ALIAS) + outs() << "alias "; + outs() << "}"; +} + /// Load each IR file and dump certain information based on active flags. 
/// /// The main point here is to provide lit-testable coverage for the LTOModule @@ -421,8 +479,11 @@ static void testLTOModule(const TargetOptions &Options) { if (ListSymbolsOnly) { // List the symbols. outs() << Filename << ":\n"; - for (int I = 0, E = Module->getSymbolCount(); I != E; ++I) - outs() << Module->getSymbolName(I) << "\n"; + for (int I = 0, E = Module->getSymbolCount(); I != E; ++I) { + outs() << Module->getSymbolName(I) << " "; + printLTOSymbolAttributes(Module->getSymbolAttributes(I)); + outs() << "\n"; + } } if (QueryHasCtorDtor) outs() << Filename -- GitLab From bf1a554312bd011cb2016a2c9d7e75d6fe3b02af Mon Sep 17 00:00:00 2001 From: Aaron Ballman Date: Thu, 17 Oct 2024 12:25:22 -0400 Subject: [PATCH 281/329] Document the requirement that commits have a public email address (#109318) See https://discourse.llvm.org/t/hidden-emails-on-github-should-we-do-something-about-it/74223 for details about why this is important to the community. Note, we currently have soft enforcement for this requirement in the form of a bot which posts comments letting patch authors know their email is private, so we're already setting expectations in practice; this PR is documenting those expectations for clarity. --- llvm/docs/DeveloperPolicy.rst | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/llvm/docs/DeveloperPolicy.rst b/llvm/docs/DeveloperPolicy.rst index caa4b31b949c..0ecf1423e603 100644 --- a/llvm/docs/DeveloperPolicy.rst +++ b/llvm/docs/DeveloperPolicy.rst @@ -105,6 +105,13 @@ When submitting patches, please do not add confidentiality or non-disclosure notices to the patches themselves. These notices conflict with the LLVM licensing terms and may result in your contribution being excluded. +The LLVM project uses email to communicate to contributors outside of the +GitHub platform about their past contributions. Primarily, our buildbot +infrastructure uses emails to contact contributors about build and test +failures. Therefore, the LLVM community requires contributors to have a public +email address associated with their GitHub commits, so please ensure that "Keep +my email addresses private" is disabled in your +`account settings `_. .. _code review: -- GitLab From feedb35e41522b2f6c11dab4f9263fd305a2c13f Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 17 Oct 2024 09:20:34 -0700 Subject: [PATCH 282/329] [RISCV][GISel] Correct RORIW patterns. We had two rotl patterns and no rotr pattern. The order was such that the incorrect rotl pattern was being used. 
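To make the direction mix-up concrete, here is a standalone C++ sketch of the two rotate semantics (illustration only, not LLVM code; the helper names `rotl32`/`rotr32` are invented for this note):

```cpp
#include <cassert>
#include <cstdint>

// Reference semantics of 32-bit rotates. The `& 31` mirrors how the
// hardware masks the rotate amount; the n == 0 guard avoids the
// undefined shift by 32.
constexpr uint32_t rotl32(uint32_t x, unsigned n) {
  n &= 31;
  return n == 0 ? x : (x << n) | (x >> (32 - n));
}

constexpr uint32_t rotr32(uint32_t x, unsigned n) {
  n &= 31;
  return n == 0 ? x : (x >> n) | (x << (32 - n));
}

int main() {
  uint32_t x = 0x12345678u;
  // A rotate-left by 15 equals a rotate-right by 17 (15 + 17 == 32)...
  assert(rotl32(x, 15) == rotr32(x, 17));
  // ...but not a rotate-right by the same amount, which is what the
  // misordered pattern effectively produced.
  assert(rotl32(x, 15) != rotr32(x, 15));
  return 0;
}
```

Since RORIW rotates right, the immediate pattern must match `rotr`; this is why the MIR test below now expects `RORIW [[COPY]], 17` for the rotate-left-by-15 case and a plain `RORIW [[COPY]], 15` for the rotate-right case.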
--- llvm/lib/Target/RISCV/RISCVGISel.td | 2 +- .../RISCV/GlobalISel/instruction-select/rotate-rv64.mir | 7 +++---- 2 files changed, 4 insertions(+), 5 deletions(-) diff --git a/llvm/lib/Target/RISCV/RISCVGISel.td b/llvm/lib/Target/RISCV/RISCVGISel.td index 0656928ca41f..67e93b812421 100644 --- a/llvm/lib/Target/RISCV/RISCVGISel.td +++ b/llvm/lib/Target/RISCV/RISCVGISel.td @@ -274,7 +274,7 @@ def : Pat<(i32 (xor GPR:$rs1, (not GPR:$rs2))), (XNOR GPR:$rs1, GPR:$rs2)>; def : PatGprGpr; def : PatGprGpr; -def : Pat<(i32 (rotl GPR:$rs1, uimm5i32:$imm)), +def : Pat<(i32 (rotr GPR:$rs1, uimm5i32:$imm)), (RORIW GPR:$rs1, (i64 (as_i64imm $imm)))>; def : Pat<(i32 (rotl GPR:$rs1, uimm5i32:$rs2)), diff --git a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv64.mir b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv64.mir index b75e926bb50c..50b96e0ee972 100644 --- a/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv64.mir +++ b/llvm/test/CodeGen/RISCV/GlobalISel/instruction-select/rotate-rv64.mir @@ -117,7 +117,7 @@ body: | ; CHECK: liveins: $x10 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 - ; CHECK-NEXT: [[RORIW:%[0-9]+]]:gpr = RORIW [[COPY]], 15 + ; CHECK-NEXT: [[RORIW:%[0-9]+]]:gpr = RORIW [[COPY]], 17 ; CHECK-NEXT: $x10 = COPY [[RORIW]] ; CHECK-NEXT: PseudoRET implicit $x10 %0:gprb(s64) = COPY $x10 @@ -165,9 +165,8 @@ body: | ; CHECK: liveins: $x10 ; CHECK-NEXT: {{ $}} ; CHECK-NEXT: [[COPY:%[0-9]+]]:gpr = COPY $x10 - ; CHECK-NEXT: [[ADDI:%[0-9]+]]:gpr = ADDI $x0, 15 - ; CHECK-NEXT: [[RORW:%[0-9]+]]:gpr = RORW [[COPY]], [[ADDI]] - ; CHECK-NEXT: $x10 = COPY [[RORW]] + ; CHECK-NEXT: [[RORIW:%[0-9]+]]:gpr = RORIW [[COPY]], 15 + ; CHECK-NEXT: $x10 = COPY [[RORIW]] ; CHECK-NEXT: PseudoRET implicit $x10 %0:gprb(s64) = COPY $x10 %1:gprb(s32) = G_TRUNC %0(s64) -- GitLab From 87645e920528802fb1864e159da3d2be1b733432 Mon Sep 17 00:00:00 2001 From: OverMighty Date: Thu, 17 Oct 2024 18:37:58 +0200 Subject: [PATCH 283/329] [libc][math][c23] Fix undefined behavior in expxf16.h (#112734) Fixes the left-shifting of potentially negative signed integers. 
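For readers unfamiliar with the hazard, here is a self-contained C++ sketch of the problem and the cast-based fix (plain C++ mirroring the patched lines; the input value is made up):

```cpp
#include <cassert>

int main() {
  int x_hi_mid = -5; // e.g. static_cast<int>(kf) for a negative input

  // If x_hi and x_mid stay signed and negative, a later `x_hi << k`
  // is undefined behavior in C++. Casting to unsigned first gives the
  // shifts fully defined two's-complement wrap-around semantics.
  unsigned x_hi = static_cast<unsigned>(x_hi_mid) >> 3;
  unsigned x_mid = static_cast<unsigned>(x_hi_mid) & 0x7;

  // The split is still lossless: reassembling recovers the bit pattern.
  assert(x_mid == 3);
  assert(((x_hi << 3) | x_mid) == static_cast<unsigned>(x_hi_mid));
  return 0;
}
```

Doing the cast once, where `x_hi` and `x_mid` are produced, keeps every downstream shift on those values well defined.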
--- libc/src/math/generic/expxf16.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h index 8de329bd2ab0..aba99a2914b4 100644 --- a/libc/src/math/generic/expxf16.h +++ b/libc/src/math/generic/expxf16.h @@ -108,8 +108,8 @@ LIBC_INLINE ExpRangeReduction exp2_range_reduction(float16 x) { float xf = x; float kf = fputil::nearest_integer(xf * 0x1.0p+3f); int x_hi_mid = static_cast(kf); - int x_hi = x_hi_mid >> 3; - int x_mid = x_hi_mid & 0x7; + unsigned x_hi = static_cast(x_hi_mid) >> 3; + unsigned x_mid = static_cast(x_hi_mid) & 0x7; // lo = x - (hi + mid) = round(x * 2^3) * (-2^(-3)) + x float lo = fputil::multiply_add(kf, -0x1.0p-3f, xf); @@ -155,8 +155,8 @@ LIBC_INLINE ExpRangeReduction exp10_range_reduction(float16 x) { float xf = x; float kf = fputil::nearest_integer(xf * (LOG2F_10 * 0x1.0p+3f)); int x_hi_mid = static_cast(kf); - int x_hi = x_hi_mid >> 3; - int x_mid = x_hi_mid & 0x7; + unsigned x_hi = static_cast(x_hi_mid) >> 3; + unsigned x_mid = static_cast(x_hi_mid) & 0x7; // lo = x - (hi + mid) = round(x * 2^3 * log2(10)) * log10(2) * (-2^(-3)) + x float lo = fputil::multiply_add(kf, LOG10F_2 * -0x1.0p-3f, xf); -- GitLab From 6d7712a70c163d2ae9e1dc928db31fcb45d9e404 Mon Sep 17 00:00:00 2001 From: Nick Riasanovsky Date: Thu, 17 Oct 2024 12:42:08 -0400 Subject: [PATCH 284/329] [clang-tidy][docs] Replace _not_ in reserved-identifier.rst with *not* (#112162) Fixes a documentation formatting error where `_not_` was used which has no special meaning in reST and replaces it with `*not*`. Closes #111691. --- .../docs/clang-tidy/checks/bugprone/reserved-identifier.rst | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/clang-tools-extra/docs/clang-tidy/checks/bugprone/reserved-identifier.rst b/clang-tools-extra/docs/clang-tidy/checks/bugprone/reserved-identifier.rst index a498ff8409af..3f6cee9b3bb5 100644 --- a/clang-tools-extra/docs/clang-tidy/checks/bugprone/reserved-identifier.rst +++ b/clang-tools-extra/docs/clang-tidy/checks/bugprone/reserved-identifier.rst @@ -28,7 +28,7 @@ Violating the naming rules above results in undefined behavior. int _g(); // disallowed in global namespace only The check can also be inverted, i.e. it can be configured to flag any -identifier that is _not_ a reserved identifier. This mode is for use by e.g. +identifier that is *not* a reserved identifier. This mode is for use by e.g. standard library implementors, to ensure they don't infringe on the user namespace. 
-- GitLab From 8f8d5f005a937bf8d5244c5bf22906095ff08c70 Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Thu, 17 Oct 2024 10:02:08 -0700 Subject: [PATCH 285/329] [rtsan] Add statistics for suppression count (#112718) --- compiler-rt/lib/rtsan/rtsan_assertions.h | 9 +++++++-- compiler-rt/lib/rtsan/rtsan_flags.h | 2 ++ compiler-rt/lib/rtsan/rtsan_stats.cpp | 13 +++++++++++++ compiler-rt/lib/rtsan/rtsan_stats.h | 1 + compiler-rt/lib/rtsan/rtsan_suppressions.cpp | 2 +- compiler-rt/test/rtsan/exit_stats.cpp | 11 +++++++++++ compiler-rt/test/rtsan/exit_stats.cpp.supp | 1 + compiler-rt/test/rtsan/stack_suppressions.cpp | 5 ++++- 8 files changed, 40 insertions(+), 4 deletions(-) create mode 100644 compiler-rt/test/rtsan/exit_stats.cpp.supp diff --git a/compiler-rt/lib/rtsan/rtsan_assertions.h b/compiler-rt/lib/rtsan/rtsan_assertions.h index 927b32e03cf0..28a272b64623 100644 --- a/compiler-rt/lib/rtsan/rtsan_assertions.h +++ b/compiler-rt/lib/rtsan/rtsan_assertions.h @@ -15,6 +15,7 @@ #include "rtsan/rtsan.h" #include "rtsan/rtsan_context.h" #include "rtsan/rtsan_diagnostics.h" +#include "rtsan/rtsan_stats.h" #include "rtsan/rtsan_suppressions.h" #include "sanitizer_common/sanitizer_stacktrace.h" @@ -28,8 +29,10 @@ void ExpectNotRealtime(Context &context, const DiagnosticsInfo &info, if (context.InRealtimeContext() && !context.IsBypassed()) { ScopedBypass sb{context}; - if (IsFunctionSuppressed(info.func_name)) + if (IsFunctionSuppressed(info.func_name)) { + IncrementSuppressedCount(); return; + } __sanitizer::BufferedStackTrace stack; @@ -38,8 +41,10 @@ void ExpectNotRealtime(Context &context, const DiagnosticsInfo &info, stack.Unwind(info.pc, info.bp, nullptr, __sanitizer::common_flags()->fast_unwind_on_fatal); - if (IsStackTraceSuppressed(stack)) + if (IsStackTraceSuppressed(stack)) { + IncrementSuppressedCount(); return; + } OnViolation(stack, info); } diff --git a/compiler-rt/lib/rtsan/rtsan_flags.h b/compiler-rt/lib/rtsan/rtsan_flags.h index 29025c29b6fc..f46e04933fa5 100644 --- a/compiler-rt/lib/rtsan/rtsan_flags.h +++ b/compiler-rt/lib/rtsan/rtsan_flags.h @@ -18,6 +18,8 @@ struct Flags { Type Name{DefaultValue}; #include "rtsan_flags.inc" #undef RTSAN_FLAG + + bool ContainsSuppresionFile() { return suppressions[0] != '\0'; } }; extern Flags flags_data; diff --git a/compiler-rt/lib/rtsan/rtsan_stats.cpp b/compiler-rt/lib/rtsan/rtsan_stats.cpp index dac7b23c3ef5..1562b73cf94c 100644 --- a/compiler-rt/lib/rtsan/rtsan_stats.cpp +++ b/compiler-rt/lib/rtsan/rtsan_stats.cpp @@ -11,6 +11,7 @@ //===----------------------------------------------------------------------===// #include "rtsan/rtsan_stats.h" +#include "rtsan/rtsan_flags.h" #include "sanitizer_common/sanitizer_atomic.h" #include "sanitizer_common/sanitizer_common.h" @@ -20,6 +21,7 @@ using namespace __rtsan; static atomic_uint32_t rtsan_total_error_count{0}; static atomic_uint32_t rtsan_unique_error_count{0}; +static atomic_uint32_t rtsan_suppressed_count{0}; void __rtsan::IncrementTotalErrorCount() { atomic_fetch_add(&rtsan_total_error_count, 1, memory_order_relaxed); @@ -37,9 +39,20 @@ static u32 GetUniqueErrorCount() { return atomic_load(&rtsan_unique_error_count, memory_order_relaxed); } +void __rtsan::IncrementSuppressedCount() { + atomic_fetch_add(&rtsan_suppressed_count, 1, memory_order_relaxed); +} + +static u32 GetSuppressedCount() { + return atomic_load(&rtsan_suppressed_count, memory_order_relaxed); +} + void __rtsan::PrintStatisticsSummary() { ScopedErrorReportLock l; Printf("RealtimeSanitizer exit stats:\n"); Printf(" Total 
error count: %u\n", GetTotalErrorCount()); Printf(" Unique error count: %u\n", GetUniqueErrorCount()); + + if (flags().ContainsSuppresionFile()) + Printf(" Suppression count: %u\n", GetSuppressedCount()); } diff --git a/compiler-rt/lib/rtsan/rtsan_stats.h b/compiler-rt/lib/rtsan/rtsan_stats.h index a72098792c89..a8a67ea2a44b 100644 --- a/compiler-rt/lib/rtsan/rtsan_stats.h +++ b/compiler-rt/lib/rtsan/rtsan_stats.h @@ -16,6 +16,7 @@ namespace __rtsan { void IncrementTotalErrorCount(); void IncrementUniqueErrorCount(); +void IncrementSuppressedCount(); void PrintStatisticsSummary(); diff --git a/compiler-rt/lib/rtsan/rtsan_suppressions.cpp b/compiler-rt/lib/rtsan/rtsan_suppressions.cpp index a7c3d42ac68a..2bcfbeed4195 100644 --- a/compiler-rt/lib/rtsan/rtsan_suppressions.cpp +++ b/compiler-rt/lib/rtsan/rtsan_suppressions.cpp @@ -56,7 +56,7 @@ void __rtsan::InitializeSuppressions() { CHECK_EQ(nullptr, suppression_ctx); // We will use suppression_ctx == nullptr as an early out - if (flags().suppressions[0] == '\0') + if (!flags().ContainsSuppresionFile()) return; suppression_ctx = new (suppression_placeholder) diff --git a/compiler-rt/test/rtsan/exit_stats.cpp b/compiler-rt/test/rtsan/exit_stats.cpp index d4d19ace778b..92ca58f1edde 100644 --- a/compiler-rt/test/rtsan/exit_stats.cpp +++ b/compiler-rt/test/rtsan/exit_stats.cpp @@ -1,6 +1,7 @@ // RUN: %clangxx -fsanitize=realtime %s -o %t // RUN: %env_rtsan_opts="halt_on_error=false,print_stats_on_exit=true" %run %t 2>&1 | FileCheck %s // RUN: %env_rtsan_opts="halt_on_error=true,print_stats_on_exit=true" not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-HALT +// RUN: %env_rtsan_opts="suppressions=%s.supp,print_stats_on_exit=true" not %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-SUPPRESSIONS // UNSUPPORTED: ios @@ -23,7 +24,17 @@ int main() { // CHECK: RealtimeSanitizer exit stats: // CHECK-NEXT: Total error count: 10 // CHECK-NEXT: Unique error count: 1 +// CHECK-NOT: Suppression count // CHECK-HALT: RealtimeSanitizer exit stats: // CHECK-HALT-NEXT: Total error count: 1 // CHECK-HALT-NEXT: Unique error count: 1 +// CHECK-HALT-NOT: Suppression count + +// We pass in intentionally_non_existant_function in the suppressions file +// This is just to ensure we only get the "Suppression count" metric if this +// file is passed at runtime, otherwise that statistic is omitted +// CHECK-SUPPRESSIONS: RealtimeSanitizer exit stats: +// CHECK-SUPPRESSIONS-NEXT: Total error count: 1 +// CHECK-SUPPRESSIONS-NEXT: Unique error count: 1 +// CHECK-SUPPRESSIONS-NEXT: Suppression count: 0 diff --git a/compiler-rt/test/rtsan/exit_stats.cpp.supp b/compiler-rt/test/rtsan/exit_stats.cpp.supp new file mode 100644 index 000000000000..b720bdb77080 --- /dev/null +++ b/compiler-rt/test/rtsan/exit_stats.cpp.supp @@ -0,0 +1 @@ +function-name-matches:intentionally_non_existant_function diff --git a/compiler-rt/test/rtsan/stack_suppressions.cpp b/compiler-rt/test/rtsan/stack_suppressions.cpp index b9b2d0957636..be1cf4963c7f 100644 --- a/compiler-rt/test/rtsan/stack_suppressions.cpp +++ b/compiler-rt/test/rtsan/stack_suppressions.cpp @@ -1,6 +1,6 @@ // RUN: %clangxx -fsanitize=realtime %s -o %t // RUN: %env_rtsan_opts=halt_on_error=false %run %t 2>&1 | FileCheck %s --check-prefix=CHECK-NOSUPPRESSIONS -// RUN: %env_rtsan_opts=suppressions='%s.supp' not %run %t 2>&1 | FileCheck %s +// RUN: %env_rtsan_opts=suppressions='%s.supp':print_stats_on_exit=true not %run %t 2>&1 | FileCheck %s // UNSUPPORTED: ios // Intent: Ensure that suppressions work as intended @@ -61,6 +61,9 @@ 
int main() { // CHECK-NOT: free // CHECK-NOT: BlockFunc +// CHECK: RealtimeSanitizer exit stats: +// CHECK: Suppression count: 7 + // CHECK-NOSUPPRESSIONS: malloc // CHECK-NOSUPPRESSIONS: vector // CHECK-NOSUPPRESSIONS: free -- GitLab From 1a609052b65e7b8ca78159d5ad14eafbeb039eb2 Mon Sep 17 00:00:00 2001 From: Danila Malyutin Date: Thu, 17 Oct 2024 21:04:04 +0400 Subject: [PATCH 286/329] [AArch64][InstCombine] Eliminate redundant barrier intrinsics (#112023) If there are no memory ops on the path from one dmb to another then one barrier can be eliminated. --- .../AArch64/AArch64TargetTransformInfo.cpp | 31 +++ .../InstCombine/AArch64/dmb-intrinsics.ll | 220 ++++++++++++++++++ 2 files changed, 251 insertions(+) create mode 100644 llvm/test/Transforms/InstCombine/AArch64/dmb-intrinsics.ll diff --git a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp index d33d0aa58554..7c6b789b9c1b 100644 --- a/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp +++ b/llvm/lib/Target/AArch64/AArch64TargetTransformInfo.cpp @@ -66,6 +66,10 @@ static cl::opt BaseHistCntCost("aarch64-base-histcnt-cost", cl::init(8), cl::Hidden, cl::desc("The cost of a histcnt instruction")); +static cl::opt DMBLookaheadThreshold( + "dmb-lookahead-threshold", cl::init(10), cl::Hidden, + cl::desc("The number of instructions to search for a redundant dmb")); + namespace { class TailFoldingOption { // These bitfields will only ever be set to something non-zero in operator=, @@ -2152,6 +2156,31 @@ static std::optional instCombineSVEInsr(InstCombiner &IC, return std::nullopt; } +static std::optional instCombineDMB(InstCombiner &IC, + IntrinsicInst &II) { + // If this barrier is post-dominated by identical one we can remove it + auto *NI = II.getNextNonDebugInstruction(); + unsigned LookaheadThreshold = DMBLookaheadThreshold; + auto CanSkipOver = [](Instruction *I) { + return !I->mayReadOrWriteMemory() && !I->mayHaveSideEffects(); + }; + while (LookaheadThreshold-- && CanSkipOver(NI)) { + auto *NIBB = NI->getParent(); + NI = NI->getNextNonDebugInstruction(); + if (!NI) { + if (auto *SuccBB = NIBB->getUniqueSuccessor()) + NI = SuccBB->getFirstNonPHIOrDbgOrLifetime(); + else + break; + } + } + auto *NextII = dyn_cast_or_null(NI); + if (NextII && II.isIdenticalTo(NextII)) + return IC.eraseInstFromFunction(II); + + return std::nullopt; +} + std::optional AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, IntrinsicInst &II) const { @@ -2159,6 +2188,8 @@ AArch64TTIImpl::instCombineIntrinsic(InstCombiner &IC, switch (IID) { default: break; + case Intrinsic::aarch64_dmb: + return instCombineDMB(IC, II); case Intrinsic::aarch64_sve_fcvt_bf16f32_v2: case Intrinsic::aarch64_sve_fcvt_f16f32: case Intrinsic::aarch64_sve_fcvt_f16f64: diff --git a/llvm/test/Transforms/InstCombine/AArch64/dmb-intrinsics.ll b/llvm/test/Transforms/InstCombine/AArch64/dmb-intrinsics.ll new file mode 100644 index 000000000000..dacdd4130136 --- /dev/null +++ b/llvm/test/Transforms/InstCombine/AArch64/dmb-intrinsics.ll @@ -0,0 +1,220 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --version 5 +; RUN: opt -S -passes=instcombine < %s | FileCheck %s +; ARM64 dmb intrinsics + +target triple = "aarch64-unknown-linux-gnu" + +declare void @llvm.aarch64.dmb(i32) +declare void @llvm.aarch64.dsb(i32) +declare void @clobber() +declare void @pure() memory(none) willreturn nounwind +declare i32 @llvm.ctlz.i32(i32, i1) + +define void @simple() #0 { +; CHECK-LABEL: define void 
@simple() { +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.dmb(i32 10) + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +; dmb ish (0xb) is technically stronger than ishst (0xa) but we don't merge for now +define void @simple_nonmatching() #0 { +; CHECK-LABEL: define void @simple_nonmatching() { +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 11) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.dmb(i32 10) + call void @llvm.aarch64.dmb(i32 11) + ret void +} + +define ptr @simple_safe_instruction(ptr %p) #0 { +; CHECK-LABEL: define ptr @simple_safe_instruction( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret ptr [[RES]] +; + call void @llvm.aarch64.dmb(i32 10) + %res = getelementptr inbounds i8, ptr %p, i32 8 + call void @llvm.aarch64.dmb(i32 10) + ret ptr %res +} + +define i32 @simple_safe_intrinsic(i32 %n) #0 { +; CHECK-LABEL: define i32 @simple_safe_intrinsic( +; CHECK-SAME: i32 [[N:%.*]]) { +; CHECK-NEXT: [[RES:%.*]] = call range(i32 0, 33) i32 @llvm.ctlz.i32(i32 [[N]], i1 false) +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret i32 [[RES]] +; + call void @llvm.aarch64.dmb(i32 10) + %res = call i32 @llvm.ctlz.i32(i32 %n, i1 false) + call void @llvm.aarch64.dmb(i32 10) + ret i32 %res +} + +define void @simple_unsafe_intrinsic() #0 { +; CHECK-LABEL: define void @simple_unsafe_intrinsic() { +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: call void @llvm.aarch64.dsb(i32 10) +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.dmb(i32 10) + call void @llvm.aarch64.dsb(i32 10) + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @simple_safe_unsafe_instruction(ptr %p) #0 { +; CHECK-LABEL: define void @simple_safe_unsafe_instruction( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: store i32 42, ptr [[P]], align 4 +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.dmb(i32 10) + store i32 42, ptr %p + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @simple_safe_unsafe_call(ptr %p) #0 { +; CHECK-LABEL: define void @simple_safe_unsafe_call( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: call void @clobber() +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.dmb(i32 10) + call void @clobber() + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @simple_safe_safe_call(ptr %p) #0 { +; CHECK-LABEL: define void @simple_safe_safe_call( +; CHECK-SAME: ptr [[P:%.*]]) { +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; + call void @llvm.aarch64.dmb(i32 10) + call void @pure() + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @multiple_bbs1(i1 %f) #0 { +; CHECK-LABEL: define void @multiple_bbs1( +; CHECK-SAME: i1 [[F:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[F]], label %[[BB_T:.*]], label %[[BB_F:.*]] +; CHECK: [[BB_T]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BB_F]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; +entry: + br i1 %f, label %bb_t, label %bb_f +bb_t: + call void 
@llvm.aarch64.dmb(i32 10) + br label %exit +bb_f: + call void @llvm.aarch64.dmb(i32 10) + br label %exit +exit: + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @multiple_bbs2(i1 %f) #0 { +; CHECK-LABEL: define void @multiple_bbs2( +; CHECK-SAME: i1 [[F:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[F]], label %[[BB_T:.*]], label %[[BB_F:.*]] +; CHECK: [[BB_T]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BB_F]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; +entry: + br i1 %f, label %bb_t, label %bb_f +bb_t: + call void @llvm.aarch64.dmb(i32 10) + br label %exit +bb_f: + br label %exit +exit: + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @multiple_bbs3(i1 %f, ptr %p) #0 { +; CHECK-LABEL: define void @multiple_bbs3( +; CHECK-SAME: i1 [[F:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[F]], label %[[BB_T:.*]], label %[[BB_F:.*]] +; CHECK: [[BB_T]]: +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BB_F]]: +; CHECK-NEXT: store i32 42, ptr [[P]], align 4 +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; +entry: + br i1 %f, label %bb_t, label %bb_f +bb_t: + call void @llvm.aarch64.dmb(i32 10) + br label %exit +bb_f: + store i32 42, ptr %p + br label %exit +exit: + call void @llvm.aarch64.dmb(i32 10) + ret void +} + +define void @multiple_bbs_unsafe(i1 %f, ptr %p) #0 { +; CHECK-LABEL: define void @multiple_bbs_unsafe( +; CHECK-SAME: i1 [[F:%.*]], ptr [[P:%.*]]) { +; CHECK-NEXT: [[ENTRY:.*:]] +; CHECK-NEXT: br i1 [[F]], label %[[BB_T:.*]], label %[[BB_F:.*]] +; CHECK: [[BB_T]]: +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: store i32 42, ptr [[P]], align 4 +; CHECK-NEXT: br label %[[EXIT:.*]] +; CHECK: [[BB_F]]: +; CHECK-NEXT: br label %[[EXIT]] +; CHECK: [[EXIT]]: +; CHECK-NEXT: call void @llvm.aarch64.dmb(i32 10) +; CHECK-NEXT: ret void +; +entry: + br i1 %f, label %bb_t, label %bb_f +bb_t: + call void @llvm.aarch64.dmb(i32 10) + store i32 42, ptr %p + br label %exit +bb_f: + call void @llvm.aarch64.dmb(i32 10) + br label %exit +exit: + call void @llvm.aarch64.dmb(i32 10) + ret void +} + -- GitLab From 2ef24e05defb6aa470fd4234853b2c11401cd660 Mon Sep 17 00:00:00 2001 From: Xing Xue Date: Thu, 17 Oct 2024 13:07:18 -0400 Subject: [PATCH 287/329] [libunwind][AIX] Remove weak declaration "__xlcxx_personality_v0" (#112436) `__xlcxx_personality_v0` is the personality routine in `libc++abi` for the EH of applications generated by the legacy IBM C++ compiler. Since the EH info generated by the legacy compiler does not provide the location of the personality routine, this routine is hard-coded as the handler for legacy EH in the unwinder. The symbol is resolved dynamically using `dlopen()` to avoid a hard dependency of `libunwind` on `libc++abi` for cases such as non-C++ applications. The weak declaration of `__xlcxx_personality_v0` was originally intended to bypass `dlopen()` if the C++ application generated by the legacy compiler is statically linked with the new LLVM C++ compiler. Unfortunately, this causes problems with runtime linking for Clang-compiled code using the unwinder that does not link with `libc++abi`. On the other hand, the C++ runtime libraries shipped for AIX are actually stripped and statically linking is not supported. So, we can fix the problem by removing the `__xlcxx_personality_v0` weak declaration. 
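For reference, the resolution path that remains after this change looks roughly as follows. This is a simplified sketch rather than the exact unwinder code: the personality signature is abbreviated, error handling is reduced to asserts, and `RTLD_MEMBER` is the AIX-specific flag for loading a member of a shared-library archive.

```cpp
#include <cassert>
#include <cerrno>
#include <dlfcn.h>

// Abbreviated stand-in for the real personality routine signature.
using xlcxx_personality_t = int (*)();

static xlcxx_personality_t resolveXlcPersonality() {
  // AIX dlopen() sets errno to 0 on success, so save and restore it
  // to avoid clobbering the caller's errno.
  int savedErrno = errno;
  void *handle = dlopen("libc++abi.a(libc++abi.so.1)", RTLD_MEMBER | RTLD_NOW);
  assert(handle && "dlopen() failed");
  auto fn = reinterpret_cast<xlcxx_personality_t>(
      dlsym(handle, "__xlcxx_personality_v0"));
  assert(fn && "dlsym() failed");
  dlclose(handle);
  errno = savedErrno;
  return fn;
}
```

With the weak declaration gone, this dlopen()-based lookup is taken unconditionally instead of only when static linking has not already resolved the symbol.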
Besides, `dlopen()` would work as long as the libc++abi shared library is available. --- libunwind/src/UnwindCursor.hpp | 52 ++++++++++-------------- libunwind/test/aix_runtime_link.pass.cpp | 20 +++++++++ 2 files changed, 42 insertions(+), 30 deletions(-) create mode 100644 libunwind/test/aix_runtime_link.pass.cpp diff --git a/libunwind/src/UnwindCursor.hpp b/libunwind/src/UnwindCursor.hpp index ce6dced535e7..2a3aba28fb6c 100644 --- a/libunwind/src/UnwindCursor.hpp +++ b/libunwind/src/UnwindCursor.hpp @@ -2033,7 +2033,6 @@ typedef _Unwind_Reason_Code __xlcxx_personality_v0_t(int, _Unwind_Action, uint64_t, _Unwind_Exception *, struct _Unwind_Context *); -__attribute__((__weak__)) __xlcxx_personality_v0_t __xlcxx_personality_v0; } static __xlcxx_personality_v0_t *xlcPersonalityV0; @@ -2126,42 +2125,35 @@ bool UnwindCursor::getInfoFromTBTable(pint_t pc, R ®isters) { // function __xlcxx_personality_v0(), which is the personality for the state // table and is exported from libc++abi, is directly assigned as the // handler here. When a legacy XLC++ frame is encountered, the symbol - // is resolved dynamically using dlopen() to avoid hard dependency from - // libunwind on libc++abi. + // is resolved dynamically using dlopen() to avoid a hard dependency of + // libunwind on libc++abi in cases such as non-C++ applications. // Resolve the function pointer to the state table personality if it has - // not already. + // not already been done. if (xlcPersonalityV0 == NULL) { xlcPersonalityV0InitLock.lock(); if (xlcPersonalityV0 == NULL) { - // If libc++abi is statically linked in, symbol __xlcxx_personality_v0 - // has been resolved at the link time. - xlcPersonalityV0 = &__xlcxx_personality_v0; + // Resolve __xlcxx_personality_v0 using dlopen(). + const char *libcxxabi = "libc++abi.a(libc++abi.so.1)"; + void *libHandle; + // The AIX dlopen() sets errno to 0 when it is successful, which + // clobbers the value of errno from the user code. This is an AIX + // bug because according to POSIX it should not set errno to 0. To + // workaround before AIX fixes the bug, errno is saved and restored. + int saveErrno = errno; + libHandle = dlopen(libcxxabi, RTLD_MEMBER | RTLD_NOW); + if (libHandle == NULL) { + _LIBUNWIND_TRACE_UNWINDING("dlopen() failed with errno=%d\n", errno); + assert(0 && "dlopen() failed"); + } + xlcPersonalityV0 = reinterpret_cast<__xlcxx_personality_v0_t *>( + dlsym(libHandle, "__xlcxx_personality_v0")); if (xlcPersonalityV0 == NULL) { - // libc++abi is dynamically linked. Resolve __xlcxx_personality_v0 - // using dlopen(). - const char libcxxabi[] = "libc++abi.a(libc++abi.so.1)"; - void *libHandle; - // The AIX dlopen() sets errno to 0 when it is successful, which - // clobbers the value of errno from the user code. This is an AIX - // bug because according to POSIX it should not set errno to 0. To - // workaround before AIX fixes the bug, errno is saved and restored. 
- int saveErrno = errno; - libHandle = dlopen(libcxxabi, RTLD_MEMBER | RTLD_NOW); - if (libHandle == NULL) { - _LIBUNWIND_TRACE_UNWINDING("dlopen() failed with errno=%d\n", - errno); - assert(0 && "dlopen() failed"); - } - xlcPersonalityV0 = reinterpret_cast<__xlcxx_personality_v0_t *>( - dlsym(libHandle, "__xlcxx_personality_v0")); - if (xlcPersonalityV0 == NULL) { - _LIBUNWIND_TRACE_UNWINDING("dlsym() failed with errno=%d\n", errno); - assert(0 && "dlsym() failed"); - } - dlclose(libHandle); - errno = saveErrno; + _LIBUNWIND_TRACE_UNWINDING("dlsym() failed with errno=%d\n", errno); + assert(0 && "dlsym() failed"); } + dlclose(libHandle); + errno = saveErrno; } xlcPersonalityV0InitLock.unlock(); } diff --git a/libunwind/test/aix_runtime_link.pass.cpp b/libunwind/test/aix_runtime_link.pass.cpp new file mode 100644 index 000000000000..deb192c07981 --- /dev/null +++ b/libunwind/test/aix_runtime_link.pass.cpp @@ -0,0 +1,20 @@ +//===----------------------------------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +// Test that libunwind loads successfully independently of libc++abi with +// runtime linking on AIX. + +// REQUIRES: target={{.+}}-aix{{.*}} +// ADDITIONAL_COMPILE_FLAGS: -Wl,-brtl + +#include +extern "C" int printf(const char *, ...); +int main(void) { + void *fp = (void *)&_Unwind_Backtrace; + printf("%p\n", fp); +} -- GitLab From 03888a9046167c50c20e219e790d616b42b91608 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Thu, 17 Oct 2024 19:07:47 +0200 Subject: [PATCH 288/329] [clang][bytecode] Handle non-arrays in initElem{,Pop} (#112719) ... provided the given index is 0. Skip the atIndex() in that case. --- clang/lib/AST/ByteCode/Interp.h | 37 ++++++++++++++++++----- clang/test/AST/ByteCode/placement-new.cpp | 14 +++++++++ 2 files changed, 43 insertions(+), 8 deletions(-) diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index dece95971b76..a1a92562cc5e 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1863,13 +1863,24 @@ bool InitPop(InterpState &S, CodePtr OpPC) { template ::T> bool InitElem(InterpState &S, CodePtr OpPC, uint32_t Idx) { const T &Value = S.Stk.pop(); - const Pointer &Ptr = S.Stk.peek().atIndex(Idx); + const Pointer &Ptr = S.Stk.peek(); + if (Ptr.isUnknownSizeArray()) return false; - if (!CheckInit(S, OpPC, Ptr)) + + // In the unlikely event that we're initializing the first item of + // a non-array, skip the atIndex(). + if (Idx == 0 && !Ptr.getFieldDesc()->isArray()) { + Ptr.initialize(); + new (&Ptr.deref()) T(Value); + return true; + } + + const Pointer &ElemPtr = Ptr.atIndex(Idx); + if (!CheckInit(S, OpPC, ElemPtr)) return false; - Ptr.initialize(); - new (&Ptr.deref()) T(Value); + ElemPtr.initialize(); + new (&ElemPtr.deref()) T(Value); return true; } @@ -1877,13 +1888,23 @@ bool InitElem(InterpState &S, CodePtr OpPC, uint32_t Idx) { template ::T> bool InitElemPop(InterpState &S, CodePtr OpPC, uint32_t Idx) { const T &Value = S.Stk.pop(); - const Pointer &Ptr = S.Stk.pop().atIndex(Idx); + const Pointer &Ptr = S.Stk.pop(); if (Ptr.isUnknownSizeArray()) return false; - if (!CheckInit(S, OpPC, Ptr)) + + // In the unlikely event that we're initializing the first item of + // a non-array, skip the atIndex(). 
+ if (Idx == 0 && !Ptr.getFieldDesc()->isArray()) { + Ptr.initialize(); + new (&Ptr.deref()) T(Value); + return true; + } + + const Pointer &ElemPtr = Ptr.atIndex(Idx); + if (!CheckInit(S, OpPC, ElemPtr)) return false; - Ptr.initialize(); - new (&Ptr.deref()) T(Value); + ElemPtr.initialize(); + new (&ElemPtr.deref()) T(Value); return true; } diff --git a/clang/test/AST/ByteCode/placement-new.cpp b/clang/test/AST/ByteCode/placement-new.cpp index caf3ac97fd1c..6bd83f2372ea 100644 --- a/clang/test/AST/ByteCode/placement-new.cpp +++ b/clang/test/AST/ByteCode/placement-new.cpp @@ -52,6 +52,20 @@ consteval auto ok4() { } static_assert(ok4() == 37); +consteval int ok5() { + int i; + new (&i) int[1]{1}; + + struct S { + int a; int b; + } s; + new (&s) S[1]{{12, 13}}; + + return 25; + // return s.a + s.b; FIXME: Broken in the current interpreter. +} +static_assert(ok5() == 25); + /// FIXME: Broken in both interpreters. #if 0 consteval int ok5() { -- GitLab From f35a14dd507b6fc90fe8e0b606c2f787d7dfedea Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Thu, 17 Oct 2024 10:17:09 -0700 Subject: [PATCH 289/329] [HLSL] Simplify debug check in ResourceBindings::addDeclBindingInfo (#112661) Follow-up for https://github.com/llvm/llvm-project/pull/111203#pullrequestreview-2373679837. --- clang/lib/Sema/SemaHLSL.cpp | 10 +--------- 1 file changed, 1 insertion(+), 9 deletions(-) diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index 0d23c4935e91..efb0fbaa432d 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -110,15 +110,7 @@ DeclBindingInfo *ResourceBindings::addDeclBindingInfo(const VarDecl *VD, ResourceClass ResClass) { assert(getDeclBindingInfo(VD, ResClass) == nullptr && "DeclBindingInfo already added"); -#ifndef NDEBUG - // Verify that existing bindings for this decl are stored sequentially - // and at the end of the BindingsList - auto I = DeclToBindingListIndex.find(VD); - if (I != DeclToBindingListIndex.end()) { - for (unsigned Index = I->getSecond(); Index < BindingsList.size(); ++Index) - assert(BindingsList[Index].Decl == VD); - } -#endif + assert(!hasBindingInfoForDecl(VD) || BindingsList.back().Decl == VD); // VarDecl may have multiple entries for different resource classes. // DeclToBindingListIndex stores the index of the first binding we saw // for this decl. If there are any additional ones then that index -- GitLab From dea213cb9b2e1ce7a6032ae4bc5306f74ebfc604 Mon Sep 17 00:00:00 2001 From: Tex Riddell Date: Thu, 17 Oct 2024 10:37:32 -0700 Subject: [PATCH 290/329] Add atan2 test case for prior change in X86SelLowering.cpp (#112616) When updating X86SelLowering.cpp for atan2, based on #96222, it was known that a needed change was missing which was merged later in #101268. However, the corresponding test update to `fp-strict-libcalls-msvc32.ll` was missed. This change rectifies that oversight. This also adds a missing label to the tanh test, since it's produced by update_llc_test_checks.py Part of: Implement the atan2 HLSL Function #70096. 
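For background on the function under test: `atan2(y, x)` is the two-argument arctangent, which keeps the quadrant information that `atan(y/x)` loses and tolerates `x == 0`. A quick plain-C++ sketch of the semantics (illustration only; the `llvm.experimental.constrained.atan2` form in the test adds strict floating-point semantics on top of the same math):

```cpp
#include <cassert>
#include <cmath>

int main() {
  const double pi = std::acos(-1.0);
  // First quadrant: same answer as atan(y/x).
  assert(std::fabs(std::atan2(1.0, 1.0) - pi / 4) < 1e-12);
  // Second quadrant: atan(1.0 / -1.0) would wrongly give -pi/4.
  assert(std::fabs(std::atan2(1.0, -1.0) - 3 * pi / 4) < 1e-12);
  // x == 0 is fine; no division is performed.
  assert(std::fabs(std::atan2(1.0, 0.0) - pi / 2) < 1e-12);
  return 0;
}
```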
--- .../CodeGen/X86/fp-strict-libcalls-msvc32.ll | 22 +++++++++++++++++++ 1 file changed, 22 insertions(+) diff --git a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll index 5d4e86afc8ac..74291fbb75e8 100644 --- a/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll +++ b/llvm/test/CodeGen/X86/fp-strict-libcalls-msvc32.ll @@ -228,6 +228,26 @@ define float @atan(float %x) #0 { ret float %result } +define float @atan2(float %x, float %y) #0 { +; CHECK-LABEL: atan2: +; CHECK: # %bb.0: +; CHECK-NEXT: subl $20, %esp +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: fxch %st(1) +; CHECK-NEXT: fstpl {{[0-9]+}}(%esp) +; CHECK-NEXT: fstpl (%esp) +; CHECK-NEXT: wait +; CHECK-NEXT: calll _atan2 +; CHECK-NEXT: fstps {{[0-9]+}}(%esp) +; CHECK-NEXT: flds {{[0-9]+}}(%esp) +; CHECK-NEXT: wait +; CHECK-NEXT: addl $20, %esp +; CHECK-NEXT: retl + %result = call float @llvm.experimental.constrained.atan2.f32(float %x, float %y, metadata !"round.dynamic", metadata !"fpexcept.strict") #0 + ret float %result +} + define float @cosh(float %x) #0 { ; CHECK-LABEL: cosh: ; CHECK: # %bb.0: @@ -263,6 +283,7 @@ define float @sinh(float %x) #0 { } define float @tanh(float %x) #0 { +; CHECK-LABEL: tanh: ; CHECK: # %bb.0: ; CHECK-NEXT: subl $12, %esp ; CHECK-NEXT: flds {{[0-9]+}}(%esp) @@ -293,6 +314,7 @@ declare float @llvm.experimental.constrained.tan.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.acos.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.asin.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.atan.f32(float, metadata, metadata) +declare float @llvm.experimental.constrained.atan2.f32(float, float, metadata, metadata) declare float @llvm.experimental.constrained.cosh.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.sinh.f32(float, metadata, metadata) declare float @llvm.experimental.constrained.tanh.f32(float, metadata, metadata) -- GitLab From 2bebeea2a1c74b78d1be32dbe3a7d724da1af102 Mon Sep 17 00:00:00 2001 From: Tex Riddell Date: Thu, 17 Oct 2024 10:39:36 -0700 Subject: [PATCH 291/329] [WebAssembly] Add atan2 to RuntimeLibcallSignatureTable (#112613) This change is part of this proposal: https://discourse.llvm.org/t/rfc-all-the-math-intrinsics/78294 - `WebAssemblyRuntimeLibcallSignatures.cpp`: Add `RTLIB::ATAN2*` to RuntimeLibcallSignatureTable - Add atan2 calls to `CodeGen/WebAssembly/libcalls-trig.ll` and update test checks Part of: Implement the atan2 HLSL Function #70096. 
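To make the table mechanism concrete, here is a condensed C++ model of it (the enumerators below are illustrative stand-ins, not the real `RTLIB::*` or wasm signature constants): each runtime libcall is mapped to a calling signature, and a libcall without an entry cannot be emitted.

```cpp
#include <cassert>

// Toy model of RuntimeLibcallSignatureTable. The real table is indexed
// by RTLIB::Libcall and stores tags like f64_func_f64_f64.
enum Libcall { ATAN_F64, ATAN2_F32, ATAN2_F64, NUM_LIBCALLS };
enum Signature { unsupported = 0, f32_func_f32_f32, f64_func_f64, f64_func_f64_f64 };

struct SignatureTable {
  Signature Table[NUM_LIBCALLS] = {}; // zero-initialized: all unsupported
  SignatureTable() {
    Table[ATAN_F64] = f64_func_f64;      // double atan(double)
    // The entries this patch adds: atan2 takes two arguments.
    Table[ATAN2_F32] = f32_func_f32_f32; // float atan2f(float, float)
    Table[ATAN2_F64] = f64_func_f64_f64; // double atan2(double, double)
  }
};

int main() {
  SignatureTable T;
  assert(T.Table[ATAN2_F64] == f64_func_f64_f64);
  return 0;
}
```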
--- .../WebAssemblyRuntimeLibcallSignatures.cpp | 3 + .../test/CodeGen/WebAssembly/libcalls-trig.ll | 316 ++++++++++-------- 2 files changed, 173 insertions(+), 146 deletions(-) diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp index ba3ab5164af2..aaa522567072 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyRuntimeLibcallSignatures.cpp @@ -213,6 +213,9 @@ struct RuntimeLibcallSignatureTable { Table[RTLIB::ATAN_F32] = f32_func_f32; Table[RTLIB::ATAN_F64] = f64_func_f64; Table[RTLIB::ATAN_F128] = i64_i64_func_i64_i64; + Table[RTLIB::ATAN2_F32] = f32_func_f32_f32; + Table[RTLIB::ATAN2_F64] = f64_func_f64_f64; + Table[RTLIB::ATAN2_F128] = i64_i64_func_i64_i64_i64_i64; Table[RTLIB::SINH_F32] = f32_func_f32; Table[RTLIB::SINH_F64] = f64_func_f64; Table[RTLIB::SINH_F128] = i64_i64_func_i64_i64; diff --git a/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll b/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll index 8bc9c043fcf8..7850559b49b7 100644 --- a/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll +++ b/llvm/test/CodeGen/WebAssembly/libcalls-trig.ll @@ -10,10 +10,11 @@ declare fp128 @llvm.cos.f128(fp128) declare fp128 @llvm.tan.f128(fp128) declare fp128 @llvm.asin.f128(fp128) declare fp128 @llvm.acos.f128(fp128) -declare fp128 @llvm.atan.f128.i32(fp128) +declare fp128 @llvm.atan.f128(fp128) declare fp128 @llvm.sinh.f128(fp128) declare fp128 @llvm.cosh.f128(fp128) declare fp128 @llvm.tanh.f128(fp128) +declare fp128 @llvm.atan2.f128(fp128, fp128) declare double @llvm.sin.f64(double) declare double @llvm.cos.f64(double) @@ -24,6 +25,7 @@ declare double @llvm.atan.f64(double) declare double @llvm.sinh.f64(double) declare double @llvm.cosh.f64(double) declare double @llvm.tanh.f64(double) +declare double @llvm.atan2.f64(double, double) declare float @llvm.sin.f32(float) declare float @llvm.cos.f32(float) @@ -34,6 +36,7 @@ declare float @llvm.atan.f32(float) declare float @llvm.sinh.f32(float) declare float @llvm.cosh.f32(float) declare float @llvm.tanh.f32(float) +declare float @llvm.atan2.f32(float, float) define fp128 @fp128libcalls(fp128 %x) { @@ -42,154 +45,171 @@ define fp128 @fp128libcalls(fp128 %x) { ; CHECK: .functype fp128libcalls (i32, i64, i64) -> () ; CHECK-NEXT: .local i32 ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: global.get $push28=, __stack_pointer -; CHECK-NEXT: i32.const $push29=, 144 -; CHECK-NEXT: i32.sub $push73=, $pop28, $pop29 -; CHECK-NEXT: local.tee $push72=, 3, $pop73 -; CHECK-NEXT: global.set __stack_pointer, $pop72 -; CHECK-NEXT: local.get $push74=, 3 -; CHECK-NEXT: i32.const $push62=, 128 -; CHECK-NEXT: i32.add $push63=, $pop74, $pop62 -; CHECK-NEXT: local.get $push76=, 1 -; CHECK-NEXT: local.get $push75=, 2 -; CHECK-NEXT: call sinl, $pop63, $pop76, $pop75 -; CHECK-NEXT: local.get $push77=, 3 -; CHECK-NEXT: i32.const $push58=, 112 -; CHECK-NEXT: i32.add $push59=, $pop77, $pop58 -; CHECK-NEXT: local.get $push78=, 3 -; CHECK-NEXT: i64.load $push3=, 128($pop78) -; CHECK-NEXT: local.get $push79=, 3 -; CHECK-NEXT: i32.const $push60=, 128 -; CHECK-NEXT: i32.add $push61=, $pop79, $pop60 -; CHECK-NEXT: i32.const $push0=, 8 -; CHECK-NEXT: i32.add $push1=, $pop61, $pop0 -; CHECK-NEXT: i64.load $push2=, 0($pop1) -; CHECK-NEXT: call cosl, $pop59, $pop3, $pop2 -; CHECK-NEXT: local.get $push80=, 3 -; CHECK-NEXT: i32.const $push54=, 96 -; CHECK-NEXT: i32.add $push55=, $pop80, $pop54 -; CHECK-NEXT: local.get $push81=, 3 -; 
CHECK-NEXT: i64.load $push6=, 112($pop81) -; CHECK-NEXT: local.get $push82=, 3 -; CHECK-NEXT: i32.const $push56=, 112 -; CHECK-NEXT: i32.add $push57=, $pop82, $pop56 -; CHECK-NEXT: i32.const $push71=, 8 -; CHECK-NEXT: i32.add $push4=, $pop57, $pop71 -; CHECK-NEXT: i64.load $push5=, 0($pop4) -; CHECK-NEXT: call tanl, $pop55, $pop6, $pop5 -; CHECK-NEXT: local.get $push83=, 3 -; CHECK-NEXT: i32.const $push50=, 80 -; CHECK-NEXT: i32.add $push51=, $pop83, $pop50 -; CHECK-NEXT: local.get $push84=, 3 -; CHECK-NEXT: i64.load $push9=, 96($pop84) -; CHECK-NEXT: local.get $push85=, 3 -; CHECK-NEXT: i32.const $push52=, 96 -; CHECK-NEXT: i32.add $push53=, $pop85, $pop52 -; CHECK-NEXT: i32.const $push70=, 8 -; CHECK-NEXT: i32.add $push7=, $pop53, $pop70 -; CHECK-NEXT: i64.load $push8=, 0($pop7) -; CHECK-NEXT: call asinl, $pop51, $pop9, $pop8 -; CHECK-NEXT: local.get $push86=, 3 -; CHECK-NEXT: i32.const $push46=, 64 -; CHECK-NEXT: i32.add $push47=, $pop86, $pop46 -; CHECK-NEXT: local.get $push87=, 3 -; CHECK-NEXT: i64.load $push12=, 80($pop87) -; CHECK-NEXT: local.get $push88=, 3 -; CHECK-NEXT: i32.const $push48=, 80 -; CHECK-NEXT: i32.add $push49=, $pop88, $pop48 -; CHECK-NEXT: i32.const $push69=, 8 -; CHECK-NEXT: i32.add $push10=, $pop49, $pop69 -; CHECK-NEXT: i64.load $push11=, 0($pop10) -; CHECK-NEXT: call acosl, $pop47, $pop12, $pop11 -; CHECK-NEXT: local.get $push89=, 3 -; CHECK-NEXT: i32.const $push42=, 48 -; CHECK-NEXT: i32.add $push43=, $pop89, $pop42 -; CHECK-NEXT: local.get $push90=, 3 -; CHECK-NEXT: i64.load $push15=, 64($pop90) -; CHECK-NEXT: local.get $push91=, 3 -; CHECK-NEXT: i32.const $push44=, 64 -; CHECK-NEXT: i32.add $push45=, $pop91, $pop44 -; CHECK-NEXT: i32.const $push68=, 8 -; CHECK-NEXT: i32.add $push13=, $pop45, $pop68 -; CHECK-NEXT: i64.load $push14=, 0($pop13) -; CHECK-NEXT: call atanl, $pop43, $pop15, $pop14 -; CHECK-NEXT: local.get $push92=, 3 -; CHECK-NEXT: i32.const $push38=, 32 -; CHECK-NEXT: i32.add $push39=, $pop92, $pop38 -; CHECK-NEXT: local.get $push93=, 3 -; CHECK-NEXT: i64.load $push18=, 48($pop93) -; CHECK-NEXT: local.get $push94=, 3 -; CHECK-NEXT: i32.const $push40=, 48 -; CHECK-NEXT: i32.add $push41=, $pop94, $pop40 -; CHECK-NEXT: i32.const $push67=, 8 -; CHECK-NEXT: i32.add $push16=, $pop41, $pop67 -; CHECK-NEXT: i64.load $push17=, 0($pop16) -; CHECK-NEXT: call sinhl, $pop39, $pop18, $pop17 -; CHECK-NEXT: local.get $push95=, 3 -; CHECK-NEXT: i32.const $push34=, 16 -; CHECK-NEXT: i32.add $push35=, $pop95, $pop34 -; CHECK-NEXT: local.get $push96=, 3 -; CHECK-NEXT: i64.load $push21=, 32($pop96) -; CHECK-NEXT: local.get $push97=, 3 -; CHECK-NEXT: i32.const $push36=, 32 -; CHECK-NEXT: i32.add $push37=, $pop97, $pop36 -; CHECK-NEXT: i32.const $push66=, 8 -; CHECK-NEXT: i32.add $push19=, $pop37, $pop66 -; CHECK-NEXT: i64.load $push20=, 0($pop19) -; CHECK-NEXT: call coshl, $pop35, $pop21, $pop20 -; CHECK-NEXT: local.get $push100=, 3 -; CHECK-NEXT: local.get $push98=, 3 -; CHECK-NEXT: i64.load $push24=, 16($pop98) -; CHECK-NEXT: local.get $push99=, 3 -; CHECK-NEXT: i32.const $push32=, 16 -; CHECK-NEXT: i32.add $push33=, $pop99, $pop32 -; CHECK-NEXT: i32.const $push65=, 8 -; CHECK-NEXT: i32.add $push22=, $pop33, $pop65 -; CHECK-NEXT: i64.load $push23=, 0($pop22) -; CHECK-NEXT: call tanhl, $pop100, $pop24, $pop23 -; CHECK-NEXT: local.get $push102=, 0 -; CHECK-NEXT: local.get $push101=, 3 -; CHECK-NEXT: i32.const $push64=, 8 -; CHECK-NEXT: i32.add $push25=, $pop101, $pop64 -; CHECK-NEXT: i64.load $push26=, 0($pop25) -; CHECK-NEXT: i64.store 8($pop102), $pop26 -; 
CHECK-NEXT: local.get $push104=, 0 -; CHECK-NEXT: local.get $push103=, 3 -; CHECK-NEXT: i64.load $push27=, 0($pop103) -; CHECK-NEXT: i64.store 0($pop104), $pop27 -; CHECK-NEXT: local.get $push105=, 3 -; CHECK-NEXT: i32.const $push30=, 144 -; CHECK-NEXT: i32.add $push31=, $pop105, $pop30 -; CHECK-NEXT: global.set __stack_pointer, $pop31 +; CHECK-NEXT: global.get $push31=, __stack_pointer +; CHECK-NEXT: i32.const $push32=, 160 +; CHECK-NEXT: i32.sub $push81=, $pop31, $pop32 +; CHECK-NEXT: local.tee $push80=, 3, $pop81 +; CHECK-NEXT: global.set __stack_pointer, $pop80 +; CHECK-NEXT: local.get $push82=, 3 +; CHECK-NEXT: i32.const $push69=, 144 +; CHECK-NEXT: i32.add $push70=, $pop82, $pop69 +; CHECK-NEXT: local.get $push84=, 1 +; CHECK-NEXT: local.get $push83=, 2 +; CHECK-NEXT: call sinl, $pop70, $pop84, $pop83 +; CHECK-NEXT: local.get $push85=, 3 +; CHECK-NEXT: i32.const $push65=, 128 +; CHECK-NEXT: i32.add $push66=, $pop85, $pop65 +; CHECK-NEXT: local.get $push86=, 3 +; CHECK-NEXT: i64.load $push3=, 144($pop86) +; CHECK-NEXT: local.get $push87=, 3 +; CHECK-NEXT: i32.const $push67=, 144 +; CHECK-NEXT: i32.add $push68=, $pop87, $pop67 +; CHECK-NEXT: i32.const $push0=, 8 +; CHECK-NEXT: i32.add $push1=, $pop68, $pop0 +; CHECK-NEXT: i64.load $push2=, 0($pop1) +; CHECK-NEXT: call cosl, $pop66, $pop3, $pop2 +; CHECK-NEXT: local.get $push88=, 3 +; CHECK-NEXT: i32.const $push61=, 112 +; CHECK-NEXT: i32.add $push62=, $pop88, $pop61 +; CHECK-NEXT: local.get $push89=, 3 +; CHECK-NEXT: i64.load $push6=, 128($pop89) +; CHECK-NEXT: local.get $push90=, 3 +; CHECK-NEXT: i32.const $push63=, 128 +; CHECK-NEXT: i32.add $push64=, $pop90, $pop63 +; CHECK-NEXT: i32.const $push79=, 8 +; CHECK-NEXT: i32.add $push4=, $pop64, $pop79 +; CHECK-NEXT: i64.load $push5=, 0($pop4) +; CHECK-NEXT: call tanl, $pop62, $pop6, $pop5 +; CHECK-NEXT: local.get $push91=, 3 +; CHECK-NEXT: i32.const $push57=, 96 +; CHECK-NEXT: i32.add $push58=, $pop91, $pop57 +; CHECK-NEXT: local.get $push92=, 3 +; CHECK-NEXT: i64.load $push9=, 112($pop92) +; CHECK-NEXT: local.get $push93=, 3 +; CHECK-NEXT: i32.const $push59=, 112 +; CHECK-NEXT: i32.add $push60=, $pop93, $pop59 +; CHECK-NEXT: i32.const $push78=, 8 +; CHECK-NEXT: i32.add $push7=, $pop60, $pop78 +; CHECK-NEXT: i64.load $push8=, 0($pop7) +; CHECK-NEXT: call asinl, $pop58, $pop9, $pop8 +; CHECK-NEXT: local.get $push94=, 3 +; CHECK-NEXT: i32.const $push53=, 80 +; CHECK-NEXT: i32.add $push54=, $pop94, $pop53 +; CHECK-NEXT: local.get $push95=, 3 +; CHECK-NEXT: i64.load $push12=, 96($pop95) +; CHECK-NEXT: local.get $push96=, 3 +; CHECK-NEXT: i32.const $push55=, 96 +; CHECK-NEXT: i32.add $push56=, $pop96, $pop55 +; CHECK-NEXT: i32.const $push77=, 8 +; CHECK-NEXT: i32.add $push10=, $pop56, $pop77 +; CHECK-NEXT: i64.load $push11=, 0($pop10) +; CHECK-NEXT: call acosl, $pop54, $pop12, $pop11 +; CHECK-NEXT: local.get $push97=, 3 +; CHECK-NEXT: i32.const $push49=, 64 +; CHECK-NEXT: i32.add $push50=, $pop97, $pop49 +; CHECK-NEXT: local.get $push98=, 3 +; CHECK-NEXT: i64.load $push15=, 80($pop98) +; CHECK-NEXT: local.get $push99=, 3 +; CHECK-NEXT: i32.const $push51=, 80 +; CHECK-NEXT: i32.add $push52=, $pop99, $pop51 +; CHECK-NEXT: i32.const $push76=, 8 +; CHECK-NEXT: i32.add $push13=, $pop52, $pop76 +; CHECK-NEXT: i64.load $push14=, 0($pop13) +; CHECK-NEXT: call atanl, $pop50, $pop15, $pop14 +; CHECK-NEXT: local.get $push100=, 3 +; CHECK-NEXT: i32.const $push45=, 48 +; CHECK-NEXT: i32.add $push46=, $pop100, $pop45 +; CHECK-NEXT: local.get $push101=, 3 +; CHECK-NEXT: i64.load $push18=, 64($pop101) +; 
CHECK-NEXT: local.get $push102=, 3 +; CHECK-NEXT: i32.const $push47=, 64 +; CHECK-NEXT: i32.add $push48=, $pop102, $pop47 +; CHECK-NEXT: i32.const $push75=, 8 +; CHECK-NEXT: i32.add $push16=, $pop48, $pop75 +; CHECK-NEXT: i64.load $push17=, 0($pop16) +; CHECK-NEXT: call sinhl, $pop46, $pop18, $pop17 +; CHECK-NEXT: local.get $push103=, 3 +; CHECK-NEXT: i32.const $push41=, 32 +; CHECK-NEXT: i32.add $push42=, $pop103, $pop41 +; CHECK-NEXT: local.get $push104=, 3 +; CHECK-NEXT: i64.load $push21=, 48($pop104) +; CHECK-NEXT: local.get $push105=, 3 +; CHECK-NEXT: i32.const $push43=, 48 +; CHECK-NEXT: i32.add $push44=, $pop105, $pop43 +; CHECK-NEXT: i32.const $push74=, 8 +; CHECK-NEXT: i32.add $push19=, $pop44, $pop74 +; CHECK-NEXT: i64.load $push20=, 0($pop19) +; CHECK-NEXT: call coshl, $pop42, $pop21, $pop20 +; CHECK-NEXT: local.get $push106=, 3 +; CHECK-NEXT: i32.const $push37=, 16 +; CHECK-NEXT: i32.add $push38=, $pop106, $pop37 +; CHECK-NEXT: local.get $push107=, 3 +; CHECK-NEXT: i64.load $push24=, 32($pop107) +; CHECK-NEXT: local.get $push108=, 3 +; CHECK-NEXT: i32.const $push39=, 32 +; CHECK-NEXT: i32.add $push40=, $pop108, $pop39 +; CHECK-NEXT: i32.const $push73=, 8 +; CHECK-NEXT: i32.add $push22=, $pop40, $pop73 +; CHECK-NEXT: i64.load $push23=, 0($pop22) +; CHECK-NEXT: call tanhl, $pop38, $pop24, $pop23 +; CHECK-NEXT: local.get $push113=, 3 +; CHECK-NEXT: local.get $push112=, 1 +; CHECK-NEXT: local.get $push111=, 2 +; CHECK-NEXT: local.get $push109=, 3 +; CHECK-NEXT: i64.load $push27=, 16($pop109) +; CHECK-NEXT: local.get $push110=, 3 +; CHECK-NEXT: i32.const $push35=, 16 +; CHECK-NEXT: i32.add $push36=, $pop110, $pop35 +; CHECK-NEXT: i32.const $push72=, 8 +; CHECK-NEXT: i32.add $push25=, $pop36, $pop72 +; CHECK-NEXT: i64.load $push26=, 0($pop25) +; CHECK-NEXT: call atan2l, $pop113, $pop112, $pop111, $pop27, $pop26 +; CHECK-NEXT: local.get $push115=, 0 +; CHECK-NEXT: local.get $push114=, 3 +; CHECK-NEXT: i32.const $push71=, 8 +; CHECK-NEXT: i32.add $push28=, $pop114, $pop71 +; CHECK-NEXT: i64.load $push29=, 0($pop28) +; CHECK-NEXT: i64.store 8($pop115), $pop29 +; CHECK-NEXT: local.get $push117=, 0 +; CHECK-NEXT: local.get $push116=, 3 +; CHECK-NEXT: i64.load $push30=, 0($pop116) +; CHECK-NEXT: i64.store 0($pop117), $pop30 +; CHECK-NEXT: local.get $push118=, 3 +; CHECK-NEXT: i32.const $push33=, 160 +; CHECK-NEXT: i32.add $push34=, $pop118, $pop33 +; CHECK-NEXT: global.set __stack_pointer, $pop34 ; CHECK-NEXT: return ; libm calls %d = call fp128 @llvm.sin.f128(fp128 %x) %e = call fp128 @llvm.cos.f128(fp128 %d) %f = call fp128 @llvm.tan.f128(fp128 %e) - %g = call fp128 @llvm.asin.f128.i32(fp128 %f) + %g = call fp128 @llvm.asin.f128(fp128 %f) %h = call fp128 @llvm.acos.f128(fp128 %g) %i = call fp128 @llvm.atan.f128(fp128 %h) %a = call fp128 @llvm.sinh.f128(fp128 %i) %b = call fp128 @llvm.cosh.f128(fp128 %a) %c = call fp128 @llvm.tanh.f128(fp128 %b) - ret fp128 %c + %j = call fp128 @llvm.atan2.f128(fp128 %x, fp128 %c) + ret fp128 %j } define double @f64libcalls(double %x) { ; CHECK-LABEL: f64libcalls: ; CHECK: .functype f64libcalls (f64) -> (f64) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get $push9=, 0 -; CHECK-NEXT: call $push0=, sin, $pop9 -; CHECK-NEXT: call $push1=, cos, $pop0 -; CHECK-NEXT: call $push2=, tan, $pop1 -; CHECK-NEXT: call $push3=, asin, $pop2 -; CHECK-NEXT: call $push4=, acos, $pop3 -; CHECK-NEXT: call $push5=, atan, $pop4 -; CHECK-NEXT: call $push6=, sinh, $pop5 -; CHECK-NEXT: call $push7=, cosh, $pop6 -; CHECK-NEXT: call $push8=, tanh, $pop7 -; CHECK-NEXT: return $pop8 
+; CHECK-NEXT: local.get $push11=, 0 +; CHECK-NEXT: local.get $push10=, 0 +; CHECK-NEXT: call $push0=, sin, $pop10 +; CHECK-NEXT: call $push1=, cos, $pop0 +; CHECK-NEXT: call $push2=, tan, $pop1 +; CHECK-NEXT: call $push3=, asin, $pop2 +; CHECK-NEXT: call $push4=, acos, $pop3 +; CHECK-NEXT: call $push5=, atan, $pop4 +; CHECK-NEXT: call $push6=, sinh, $pop5 +; CHECK-NEXT: call $push7=, cosh, $pop6 +; CHECK-NEXT: call $push8=, tanh, $pop7 +; CHECK-NEXT: call $push9=, atan2, $pop11, $pop8 +; CHECK-NEXT: return $pop9 %k = call double @llvm.sin.f64(double %x) @@ -201,24 +221,27 @@ define double @f64libcalls(double %x) { %f = call double @llvm.sinh.f64(double %e) %g = call double @llvm.cosh.f64(double %f) %h = call double @llvm.tanh.f64(double %g) - ret double %h + %i = call double @llvm.atan2.f64(double %x, double %h) + ret double %i } define float @f32libcalls(float %x) { ; CHECK-LABEL: f32libcalls: ; CHECK: .functype f32libcalls (f32) -> (f32) ; CHECK-NEXT: # %bb.0: -; CHECK-NEXT: local.get $push9=, 0 -; CHECK-NEXT: call $push0=, sinf, $pop9 -; CHECK-NEXT: call $push1=, cosf, $pop0 -; CHECK-NEXT: call $push2=, tanf, $pop1 -; CHECK-NEXT: call $push3=, asinf, $pop2 -; CHECK-NEXT: call $push4=, acosf, $pop3 -; CHECK-NEXT: call $push5=, atanf, $pop4 -; CHECK-NEXT: call $push6=, sinhf, $pop5 -; CHECK-NEXT: call $push7=, coshf, $pop6 -; CHECK-NEXT: call $push8=, tanhf, $pop7 -; CHECK-NEXT: return $pop8 +; CHECK-NEXT: local.get $push11=, 0 +; CHECK-NEXT: local.get $push10=, 0 +; CHECK-NEXT: call $push0=, sinf, $pop10 +; CHECK-NEXT: call $push1=, cosf, $pop0 +; CHECK-NEXT: call $push2=, tanf, $pop1 +; CHECK-NEXT: call $push3=, asinf, $pop2 +; CHECK-NEXT: call $push4=, acosf, $pop3 +; CHECK-NEXT: call $push5=, atanf, $pop4 +; CHECK-NEXT: call $push6=, sinhf, $pop5 +; CHECK-NEXT: call $push7=, coshf, $pop6 +; CHECK-NEXT: call $push8=, tanhf, $pop7 +; CHECK-NEXT: call $push9=, atan2f, $pop11, $pop8 +; CHECK-NEXT: return $pop9 %k = call float @llvm.sin.f32(float %x) @@ -230,5 +253,6 @@ define float @f32libcalls(float %x) { %f = call float @llvm.sinh.f32(float %e) %g = call float @llvm.cosh.f32(float %f) %h = call float @llvm.tanh.f32(float %g) - ret float %h + %i = call float @llvm.atan2.f32(float %x, float %h) + ret float %i } -- GitLab From 2c93598b32c217c605dc4eeea8e37eae2ba5799a Mon Sep 17 00:00:00 2001 From: Krzysztof Parzyszek Date: Thu, 17 Oct 2024 12:50:10 -0500 Subject: [PATCH 292/329] [flang] Update printing values in dump-parse-tree (#112709) Remove 'if std::string' that is covered by another branch of the if-statement. Add printing of 'bool' and 'int' values, since they have corresponding `GetNodeName` definitions. --- flang/include/flang/Parser/dump-parse-tree.h | 6 ++++-- flang/test/Parser/OpenMP/allocate-tree.f90 | 13 +++++++++++++ 2 files changed, 17 insertions(+), 2 deletions(-) diff --git a/flang/include/flang/Parser/dump-parse-tree.h b/flang/include/flang/Parser/dump-parse-tree.h index 5d243b4e5d3e..ccbe5475d051 100644 --- a/flang/include/flang/Parser/dump-parse-tree.h +++ b/flang/include/flang/Parser/dump-parse-tree.h @@ -884,8 +884,10 @@ protected: } else if constexpr (HasSource<T>::value) { return x.source.ToString(); #endif - } else if constexpr (std::is_same_v<T, std::string>) { - return x; + } else if constexpr (std::is_same_v<T, int>) { + return std::to_string(x); + } else if constexpr (std::is_same_v<T, bool>) { + return x ? 
"true" : "false"; } else { return ""; } diff --git a/flang/test/Parser/OpenMP/allocate-tree.f90 b/flang/test/Parser/OpenMP/allocate-tree.f90 index 9de257b00dc3..bf413d591baf 100644 --- a/flang/test/Parser/OpenMP/allocate-tree.f90 +++ b/flang/test/Parser/OpenMP/allocate-tree.f90 @@ -18,6 +18,19 @@ program allocate_tree allocate(w, xarray(4), zarray(t, z)) end program allocate_tree +!CHECK: | | DeclarationConstruct -> SpecificationConstruct -> TypeDeclarationStmt +!CHECK-NEXT: | | | DeclarationTypeSpec -> IntrinsicTypeSpec -> IntegerTypeSpec -> +!CHECK-NEXT: | | | AttrSpec -> Allocatable +!CHECK-NEXT: | | | EntityDecl +!CHECK-NEXT: | | | | Name = 'w' +!CHECK-NEXT: | | | EntityDecl +!CHECK-NEXT: | | | | Name = 'xarray' +!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '1' +!CHECK-NEXT: | | | EntityDecl +!CHECK-NEXT: | | | | Name = 'zarray' +!CHECK-NEXT: | | | | ArraySpec -> DeferredShapeSpecList -> int = '2' + + !CHECK: | | ExecutionPartConstruct -> ExecutableConstruct -> OpenMPConstruct -> OpenMPExecutableAllocate !CHECK-NEXT: | | | Verbatim !CHECK-NEXT: | | | OmpClauseList -> -- GitLab From ed7868de03c7b93809f87ed1a01103b926564feb Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 17 Oct 2024 11:07:43 -0700 Subject: [PATCH 293/329] [nfc][sanitizer] Replace mmap with InternalMmapVector in test (#112756) --- .../lib/sanitizer_common/tests/sanitizer_posix_test.cpp | 6 ++---- 1 file changed, 2 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp index bed19d15a8ec..803c8d39362e 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp @@ -65,8 +65,8 @@ TEST(SanitizerCommon, PthreadDestructorIterations) { TEST(SanitizerCommon, IsAccessibleMemoryRange) { const int page_size = GetPageSize(); - uptr mem = (uptr)mmap(0, 3 * page_size, PROT_READ | PROT_WRITE, - MAP_PRIVATE | MAP_ANON, -1, 0); + InternalMmapVector buffer(3 * page_size); + uptr mem = reinterpret_cast(buffer.data()); // Protect the middle page. mprotect((void *)(mem + page_size), page_size, PROT_NONE); EXPECT_TRUE(IsAccessibleMemoryRange(mem, page_size - 1)); @@ -78,8 +78,6 @@ TEST(SanitizerCommon, IsAccessibleMemoryRange) { EXPECT_TRUE(IsAccessibleMemoryRange(mem + 2 * page_size, page_size)); EXPECT_FALSE(IsAccessibleMemoryRange(mem, 3 * page_size)); EXPECT_FALSE(IsAccessibleMemoryRange(0x0, 2)); - - munmap((void *)mem, 3 * page_size); } } // namespace __sanitizer -- GitLab From ed3d05178274890fb804f43ae1bcdfd33b5fd8f0 Mon Sep 17 00:00:00 2001 From: OverMighty Date: Thu, 17 Oct 2024 20:44:23 +0200 Subject: [PATCH 294/329] [libc][math][c23] Add sinhf16 and coshf16 C23 math functions (#105947) Part of #95250. 
--- libc/config/gpu/entrypoints.txt | 2 + libc/config/linux/x86_64/entrypoints.txt | 2 + libc/docs/math/index.rst | 4 +- libc/spec/stdc.td | 4 + libc/src/math/CMakeLists.txt | 4 + libc/src/math/coshf16.h | 21 ++++ libc/src/math/generic/CMakeLists.txt | 39 ++++++ libc/src/math/generic/coshf16.cpp | 103 ++++++++++++++++ libc/src/math/generic/expxf16.h | 114 +++++++++++++++++ libc/src/math/generic/sinhf16.cpp | 144 ++++++++++++++++++++++ libc/src/math/sinhf16.h | 21 ++++ libc/test/src/math/CMakeLists.txt | 22 ++++ libc/test/src/math/coshf16_test.cpp | 40 ++++++ libc/test/src/math/sinhf16_test.cpp | 40 ++++++ libc/test/src/math/smoke/CMakeLists.txt | 26 ++++ libc/test/src/math/smoke/coshf16_test.cpp | 90 ++++++++++++++ libc/test/src/math/smoke/sinhf16_test.cpp | 88 +++++++++++++ 17 files changed, 762 insertions(+), 2 deletions(-) create mode 100644 libc/src/math/coshf16.h create mode 100644 libc/src/math/generic/coshf16.cpp create mode 100644 libc/src/math/generic/sinhf16.cpp create mode 100644 libc/src/math/sinhf16.h create mode 100644 libc/test/src/math/coshf16_test.cpp create mode 100644 libc/test/src/math/sinhf16_test.cpp create mode 100644 libc/test/src/math/smoke/coshf16_test.cpp create mode 100644 libc/test/src/math/smoke/sinhf16_test.cpp diff --git a/libc/config/gpu/entrypoints.txt b/libc/config/gpu/entrypoints.txt index 251ad43ece8d..4bb81f5d3b2d 100644 --- a/libc/config/gpu/entrypoints.txt +++ b/libc/config/gpu/entrypoints.txt @@ -521,6 +521,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.canonicalizef16 libc.src.math.ceilf16 libc.src.math.copysignf16 + libc.src.math.coshf16 libc.src.math.exp10f16 libc.src.math.exp10m1f16 libc.src.math.exp2f16 @@ -585,6 +586,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.scalbnf16 libc.src.math.setpayloadf16 libc.src.math.setpayloadsigf16 + libc.src.math.sinhf16 libc.src.math.totalorderf16 libc.src.math.totalordermagf16 libc.src.math.truncf16 diff --git a/libc/config/linux/x86_64/entrypoints.txt b/libc/config/linux/x86_64/entrypoints.txt index 3ca14ec03de3..39f451d6b5fc 100644 --- a/libc/config/linux/x86_64/entrypoints.txt +++ b/libc/config/linux/x86_64/entrypoints.txt @@ -610,6 +610,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.canonicalizef16 libc.src.math.ceilf16 libc.src.math.copysignf16 + libc.src.math.coshf16 libc.src.math.exp10f16 libc.src.math.exp10m1f16 libc.src.math.exp2f16 @@ -678,6 +679,7 @@ if(LIBC_TYPES_HAS_FLOAT16) libc.src.math.scalbnf16 libc.src.math.setpayloadf16 libc.src.math.setpayloadsigf16 + libc.src.math.sinhf16 libc.src.math.sinpif16 libc.src.math.totalorderf16 libc.src.math.totalordermagf16 diff --git a/libc/docs/math/index.rst b/libc/docs/math/index.rst index 95ac7f4f12f9..902645c9e001 100644 --- a/libc/docs/math/index.rst +++ b/libc/docs/math/index.rst @@ -278,7 +278,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | cos | |check| | |check| | | | | 7.12.4.5 | F.10.1.5 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| cosh | |check| | | | | | 7.12.5.4 | F.10.2.4 | +| cosh | |check| | | | |check| | | 7.12.5.4 | F.10.2.4 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | cospi | |check| | | | | | 7.12.4.12 | F.10.1.12 
| +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ @@ -340,7 +340,7 @@ Higher Math Functions +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | sincos | |check| | |check| | | | | | | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ -| sinh | |check| | | | | | 7.12.5.5 | F.10.2.5 | +| sinh | |check| | | | |check| | | 7.12.5.5 | F.10.2.5 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ | sinpi | |check| | | | |check| | | 7.12.4.13 | F.10.1.13 | +-----------+------------------+-----------------+------------------------+----------------------+------------------------+------------------------+----------------------------+ diff --git a/libc/spec/stdc.td b/libc/spec/stdc.td index ea032ba5f66e..e4e46e7e13a5 100644 --- a/libc/spec/stdc.td +++ b/libc/spec/stdc.td @@ -792,7 +792,11 @@ def StdC : StandardSpec<"stdc"> { FunctionSpec<"pow", RetValSpec<DoubleType>, [ArgSpec<DoubleType>, ArgSpec<DoubleType>]>, FunctionSpec<"coshf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>, + GuardedFunctionSpec<"coshf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, + FunctionSpec<"sinhf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>, + GuardedFunctionSpec<"sinhf16", RetValSpec<Float16Type>, [ArgSpec<Float16Type>], "LIBC_TYPES_HAS_FLOAT16">, + FunctionSpec<"tanhf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>, FunctionSpec<"acosf", RetValSpec<FloatType>, [ArgSpec<FloatType>]>, diff --git a/libc/src/math/CMakeLists.txt b/libc/src/math/CMakeLists.txt index ecf639684814..2f76b57d19e9 100644 --- a/libc/src/math/CMakeLists.txt +++ b/libc/src/math/CMakeLists.txt @@ -89,8 +89,11 @@ add_math_entrypoint_object(copysignf128) add_math_entrypoint_object(cos) add_math_entrypoint_object(cosf) + add_math_entrypoint_object(cosh) add_math_entrypoint_object(coshf) +add_math_entrypoint_object(coshf16) + add_math_entrypoint_object(cospif) add_math_entrypoint_object(daddl) @@ -481,6 +484,7 @@ add_math_entrypoint_object(sinpif16) add_math_entrypoint_object(sinh) add_math_entrypoint_object(sinhf) +add_math_entrypoint_object(sinhf16) add_math_entrypoint_object(sqrt) add_math_entrypoint_object(sqrtf) diff --git a/libc/src/math/coshf16.h b/libc/src/math/coshf16.h new file mode 100644 index 000000000000..55c9d4941d4a --- /dev/null +++ b/libc/src/math/coshf16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for coshf16 -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_COSHF16_H +#define LLVM_LIBC_SRC_MATH_COSHF16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 coshf16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_COSHF16_H diff --git a/libc/src/math/generic/CMakeLists.txt b/libc/src/math/generic/CMakeLists.txt index ffa74970a2ab..4a3de8f0400d 100644 --- a/libc/src/math/generic/CMakeLists.txt +++ b/libc/src/math/generic/CMakeLists.txt @@ -4218,6 +4218,25 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + coshf16 + SRCS + coshf16.cpp + HDRS + ../coshf16.h + DEPENDS + .expxf16 + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( sinhf SRCS @@ -4233,6 +4252,25 @@ add_entrypoint_object( -O3 ) +add_entrypoint_object( + sinhf16 + SRCS + sinhf16.cpp + HDRS + ../sinhf16.h + DEPENDS + .expxf16 + libc.hdr.errno_macros + libc.hdr.fenv_macros + libc.src.__support.FPUtil.except_value_utils + libc.src.__support.FPUtil.fenv_impl + libc.src.__support.FPUtil.fp_bits + libc.src.__support.FPUtil.rounding_mode + libc.src.__support.macros.optimization + COMPILE_OPTIONS + -O3 +) + add_entrypoint_object( tanhf SRCS @@ -5297,6 +5335,7 @@ add_header_library( expxf16.h DEPENDS libc.src.__support.CPP.array + libc.src.__support.FPUtil.cast libc.src.__support.FPUtil.fp_bits libc.src.__support.FPUtil.multiply_add libc.src.__support.FPUtil.nearest_integer diff --git a/libc/src/math/generic/coshf16.cpp b/libc/src/math/generic/coshf16.cpp new file mode 100644 index 000000000000..cca7581c70e0 --- /dev/null +++ b/libc/src/math/generic/coshf16.cpp @@ -0,0 +1,103 @@ +//===-- Half-precision cosh(x) function -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/coshf16.h" +#include "expxf16.h" +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" + +namespace LIBC_NAMESPACE_DECL { + +static constexpr fputil::ExceptValues<float16, 9> COSHF16_EXCEPTS_POS = {{ + // x = 0x1.6ap-5, coshf16(x) = 0x1p+0 (RZ) + {0x29a8U, 0x3c00U, 1U, 0U, 1U}, + // x = 0x1.8c4p+0, coshf16(x) = 0x1.3a8p+1 (RZ) + {0x3e31U, 0x40eaU, 1U, 0U, 0U}, + // x = 0x1.994p+0, coshf16(x) = 0x1.498p+1 (RZ) + {0x3e65U, 0x4126U, 1U, 0U, 0U}, + // x = 0x1.b6p+0, coshf16(x) = 0x1.6d8p+1 (RZ) + {0x3ed8U, 0x41b6U, 1U, 0U, 1U}, + // x = 0x1.aap+1, coshf16(x) = 0x1.be8p+3 (RZ) + {0x42a8U, 0x4afaU, 1U, 0U, 1U}, + // x = 0x1.cc4p+1, coshf16(x) = 0x1.23cp+4 (RZ) + {0x4331U, 0x4c8fU, 1U, 0U, 0U}, + // x = 0x1.288p+2, coshf16(x) = 0x1.9b4p+5 (RZ) + {0x44a2U, 0x526dU, 1U, 0U, 0U}, + // x = 0x1.958p+2, coshf16(x) = 0x1.1a4p+8 (RZ) + {0x4656U, 0x5c69U, 1U, 0U, 0U}, + // x = 0x1.5fp+3, coshf16(x) = 0x1.c54p+14 (RZ) + {0x497cU, 0x7715U, 1U, 0U, 1U}, +}}; + +static constexpr fputil::ExceptValues<float16, 4> COSHF16_EXCEPTS_NEG = {{ + // x = -0x1.6ap-5, coshf16(x) = 0x1p+0 (RZ) + {0xa9a8U, 0x3c00U, 1U, 0U, 1U}, + // x = -0x1.b6p+0, coshf16(x) = 0x1.6d8p+1 (RZ) + {0xbed8U, 0x41b6U, 1U, 0U, 1U}, + // x = -0x1.288p+2, coshf16(x) = 0x1.9b4p+5 (RZ) + {0xc4a2U, 0x526dU, 1U, 0U, 0U}, + // x = -0x1.5fp+3, coshf16(x) = 0x1.c54p+14 (RZ) + {0xc97cU, 0x7715U, 1U, 0U, 1U}, +}}; + +LLVM_LIBC_FUNCTION(float16, coshf16, (float16 x)) { + using FPBits = fputil::FPBits<float16>; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + uint16_t x_abs = x_u & 0x7fffU; + + // When |x| >= acosh(2^16), or x is NaN. + if (LIBC_UNLIKELY(x_abs >= 0x49e5U)) { + // cosh(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // When |x| >= acosh(2^16). 
+ if (x_abs >= 0x49e5U) { + // cosh(+/-inf) = +inf + if (x_bits.is_inf()) + return FPBits::inf().get_val(); + + switch (fputil::quick_get_round()) { + case FE_TONEAREST: + case FE_UPWARD: + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT); + return FPBits::inf().get_val(); + default: + return FPBits::max_normal().get_val(); + } + } + } + + if (x_bits.is_pos()) { + if (auto r = COSHF16_EXCEPTS_POS.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); + } else { + if (auto r = COSHF16_EXCEPTS_NEG.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); + } + + return eval_sinh_or_cosh</*IsSinh=*/false>(x); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/generic/expxf16.h b/libc/src/math/generic/expxf16.h index aba99a2914b4..7202b1b11319 100644 --- a/libc/src/math/generic/expxf16.h +++ b/libc/src/math/generic/expxf16.h @@ -12,6 +12,7 @@ #include "src/__support/CPP/array.h" #include "src/__support/FPUtil/FPBits.h" #include "src/__support/FPUtil/PolyEval.h" +#include "src/__support/FPUtil/cast.h" #include "src/__support/FPUtil/multiply_add.h" #include "src/__support/FPUtil/nearest_integer.h" #include "src/__support/macros/attributes.h" @@ -174,6 +175,119 @@ LIBC_INLINE ExpRangeReduction exp10_range_reduction(float16 x) { return {exp2_hi_mid, exp10_lo}; } +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > round(log2(exp(1)), SG, RN); +static constexpr float LOG2F_E = 0x1.715476p+0f; + +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > round(log(2), SG, RN); +static constexpr float LOGF_2 = 0x1.62e43p-1f; + +// Generated by Sollya with the following commands: +// > display = hexadecimal; +// > for i from 0 to 31 do printsingle(round(2^(i * 2^-5), SG, RN)); +static constexpr cpp::array<uint32_t, 32> EXP2_MID_5_BITS = { + 0x3f80'0000U, 0x3f82'cd87U, 0x3f85'aac3U, 0x3f88'980fU, 0x3f8b'95c2U, + 0x3f8e'a43aU, 0x3f91'c3d3U, 0x3f94'f4f0U, 0x3f98'37f0U, 0x3f9b'8d3aU, + 0x3f9e'f532U, 0x3fa2'7043U, 0x3fa5'fed7U, 0x3fa9'a15bU, 0x3fad'583fU, + 0x3fb1'23f6U, 0x3fb5'04f3U, 0x3fb8'fbafU, 0x3fbd'08a4U, 0x3fc1'2c4dU, + 0x3fc5'672aU, 0x3fc9'b9beU, 0x3fce'248cU, 0x3fd2'a81eU, 0x3fd7'44fdU, + 0x3fdb'fbb8U, 0x3fe0'ccdfU, 0x3fe5'b907U, 0x3fea'c0c7U, 0x3fef'e4baU, + 0x3ff5'257dU, 0x3ffa'83b3U, +}; + +// This function correctly calculates sinh(x) and cosh(x) by calculating exp(x) +// and exp(-x) simultaneously. +// To compute e^x, we perform the following range reduction: +// find hi, mid, lo such that: +// x = (hi + mid) * log(2) + lo, in which +// hi is an integer, +// 0 <= mid * 2^5 < 32 is an integer +// -2^(-5) <= lo * log2(e) <= 2^-5. +// In particular, +// hi + mid = round(x * log2(e) * 2^5) * 2^(-5). +// Then, +// e^x = 2^(hi + mid) * e^lo = 2^hi * 2^mid * e^lo. +// We store 2^mid in the lookup table EXP2_MID_5_BITS, and compute 2^hi * 2^mid +// by adding hi to the exponent field of 2^mid. +// e^lo is computed using a degree-3 minimax polynomial generated by Sollya: +// e^lo ~ P(lo) +// = 1 + lo + c2 * lo^2 + ... 
+ c5 * lo^5 +// = (1 + c2*lo^2 + c4*lo^4) + lo * (1 + c3*lo^2 + c5*lo^4) +// = P_even + lo * P_odd +// To compute e^(-x), notice that: +// e^(-x) = 2^(-(hi + mid)) * e^(-lo) +// ~ 2^(-(hi + mid)) * P(-lo) +// = 2^(-(hi + mid)) * (P_even - lo * P_odd) +// So: +// sinh(x) = (e^x - e^(-x)) / 2 +// ~ 0.5 * (2^(hi + mid) * (P_even + lo * P_odd) - +// 2^(-(hi + mid)) * (P_even - lo * P_odd)) +// = 0.5 * (P_even * (2^(hi + mid) - 2^(-(hi + mid))) + +// lo * P_odd * (2^(hi + mid) + 2^(-(hi + mid)))) +// And similarly: +// cosh(x) = (e^x + e^(-x)) / 2 +// ~ 0.5 * (P_even * (2^(hi + mid) + 2^(-(hi + mid))) + +// lo * P_odd * (2^(hi + mid) - 2^(-(hi + mid)))) +// The main point of these formulas is that the expensive part of calculating +// the polynomials approximating lower parts of e^x and e^(-x) is shared and +// only done once. +template <bool IsSinh> LIBC_INLINE float16 eval_sinh_or_cosh(float16 x) { + float xf = x; + float kf = fputil::nearest_integer(xf * (LOG2F_E * 0x1.0p+5f)); + int x_hi_mid_p = static_cast<int>(kf); + int x_hi_mid_m = -x_hi_mid_p; + + unsigned x_hi_p = static_cast<unsigned>(x_hi_mid_p) >> 5; + unsigned x_hi_m = static_cast<unsigned>(x_hi_mid_m) >> 5; + unsigned x_mid_p = static_cast<unsigned>(x_hi_mid_p) & 0x1f; + unsigned x_mid_m = static_cast<unsigned>(x_hi_mid_m) & 0x1f; + + uint32_t exp2_hi_mid_bits_p = + EXP2_MID_5_BITS[x_mid_p] + + static_cast<uint32_t>(x_hi_p << fputil::FPBits<float>::FRACTION_LEN); + uint32_t exp2_hi_mid_bits_m = + EXP2_MID_5_BITS[x_mid_m] + + static_cast<uint32_t>(x_hi_m << fputil::FPBits<float>::FRACTION_LEN); + // exp2_hi_mid_p = 2^(hi + mid) + float exp2_hi_mid_p = fputil::FPBits<float>(exp2_hi_mid_bits_p).get_val(); + // exp2_hi_mid_m = 2^(-(hi + mid)) + float exp2_hi_mid_m = fputil::FPBits<float>(exp2_hi_mid_bits_m).get_val(); + + // exp2_hi_mid_sum = 2^(hi + mid) + 2^(-(hi + mid)) + float exp2_hi_mid_sum = exp2_hi_mid_p + exp2_hi_mid_m; + // exp2_hi_mid_diff = 2^(hi + mid) - 2^(-(hi + mid)) + float exp2_hi_mid_diff = exp2_hi_mid_p - exp2_hi_mid_m; + + // lo = x - (hi + mid) = round(x * log2(e) * 2^5) * log(2) * (-2^(-5)) + x + float lo = fputil::multiply_add(kf, LOGF_2 * -0x1.0p-5f, xf); + float lo_sq = lo * lo; + + // Degree-3 minimax polynomial generated by Sollya with the following + // commands: + // > display = hexadecimal; + // > P = fpminimax(expm1(x)/x, 2, [|SG...|], [-2^-5, 2^-5]); + // > 1 + x * P; + constexpr cpp::array<float, 4> COEFFS = {0x1p+0f, 0x1p+0f, 0x1.0004p-1f, + 0x1.555778p-3f}; + float half_p_odd = + fputil::polyeval(lo_sq, COEFFS[1] * 0.5f, COEFFS[3] * 0.5f); + float half_p_even = + fputil::polyeval(lo_sq, COEFFS[0] * 0.5f, COEFFS[2] * 0.5f); + + // sinh(x) = lo * (0.5 * P_odd * (2^(hi + mid) + 2^(-(hi + mid)))) + + // (0.5 * P_even * (2^(hi + mid) - 2^(-(hi + mid)))) + if constexpr (IsSinh) + return fputil::cast<float16>(fputil::multiply_add( + lo, half_p_odd * exp2_hi_mid_sum, half_p_even * exp2_hi_mid_diff)); + // cosh(x) = lo * (0.5 * P_odd * (2^(hi + mid) - 2^(-(hi + mid)))) + + // (0.5 * P_even * (2^(hi + mid) + 2^(-(hi + mid)))) + return fputil::cast<float16>(fputil::multiply_add( + lo, half_p_odd * exp2_hi_mid_diff, half_p_even * exp2_hi_mid_sum)); +} + } // namespace LIBC_NAMESPACE_DECL #endif // LLVM_LIBC_SRC_MATH_GENERIC_EXPXF16_H diff --git a/libc/src/math/generic/sinhf16.cpp b/libc/src/math/generic/sinhf16.cpp new file mode 100644 index 000000000000..e2dd009dc72c --- /dev/null +++ b/libc/src/math/generic/sinhf16.cpp @@ -0,0 +1,144 @@ +//===-- Half-precision sinh(x) function -----------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. 
+// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sinhf16.h" +#include "expxf16.h" +#include "hdr/errno_macros.h" +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/FEnvImpl.h" +#include "src/__support/FPUtil/FPBits.h" +#include "src/__support/FPUtil/except_value_utils.h" +#include "src/__support/FPUtil/rounding_mode.h" +#include "src/__support/common.h" +#include "src/__support/macros/config.h" +#include "src/__support/macros/optimization.h" + +namespace LIBC_NAMESPACE_DECL { + +static constexpr fputil::ExceptValues<float16, 16> SINHF16_EXCEPTS_POS = {{ + // x = 0x1.714p-5, sinhf16(x) = 0x1.714p-5 (RZ) + {0x29c5U, 0x29c5U, 1U, 0U, 1U}, + // x = 0x1.25p-4, sinhf16(x) = 0x1.25p-4 (RZ) + {0x2c94U, 0x2c94U, 1U, 0U, 1U}, + // x = 0x1.f5p-4, sinhf16(x) = 0x1.f64p-4 (RZ) + {0x2fd4U, 0x2fd9U, 1U, 0U, 0U}, + // x = 0x1.b1cp-3, sinhf16(x) = 0x1.b4cp-3 (RZ) + {0x32c7U, 0x32d3U, 1U, 0U, 1U}, + // x = 0x1.6e8p-2, sinhf16(x) = 0x1.764p-2 (RZ) + {0x35baU, 0x35d9U, 1U, 0U, 1U}, + // x = 0x1.6b4p-1, sinhf16(x) = 0x1.8a4p-1 (RZ) + {0x39adU, 0x3a29U, 1U, 0U, 1U}, + // x = 0x1.a58p-1, sinhf16(x) = 0x1.d68p-1 (RZ) + {0x3a96U, 0x3b5aU, 1U, 0U, 1U}, + // x = 0x1.574p+0, sinhf16(x) = 0x1.c78p+0 (RZ) + {0x3d5dU, 0x3f1eU, 1U, 0U, 1U}, + // x = 0x1.648p+1, sinhf16(x) = 0x1.024p+3 (RZ) + {0x4192U, 0x4809U, 1U, 0U, 0U}, + // x = 0x1.cdcp+1, sinhf16(x) = 0x1.26cp+4 (RZ) + {0x4337U, 0x4c9bU, 1U, 0U, 0U}, + // x = 0x1.d0cp+1, sinhf16(x) = 0x1.2d8p+4 (RZ) + {0x4343U, 0x4cb6U, 1U, 0U, 1U}, + // x = 0x1.018p+2, sinhf16(x) = 0x1.bfp+4 (RZ) + {0x4406U, 0x4efcU, 1U, 0U, 0U}, + // x = 0x1.2fcp+2, sinhf16(x) = 0x1.cc4p+5 (RZ) + {0x44bfU, 0x5331U, 1U, 0U, 1U}, + // x = 0x1.4ecp+2, sinhf16(x) = 0x1.75cp+6 (RZ) + {0x453bU, 0x55d7U, 1U, 0U, 0U}, + // x = 0x1.8a4p+2, sinhf16(x) = 0x1.d94p+7 (RZ) + {0x4629U, 0x5b65U, 1U, 0U, 1U}, + // x = 0x1.5fp+3, sinhf16(x) = 0x1.c54p+14 (RZ) + {0x497cU, 0x7715U, 1U, 0U, 1U}, +}}; + +static constexpr fputil::ExceptValues<float16, 12> SINHF16_EXCEPTS_NEG = {{ + // x = -0x1.714p-5, sinhf16(x) = -0x1.714p-5 (RZ) + {0xa9c5U, 0xa9c5U, 0U, 1U, 1U}, + // x = -0x1.25p-4, sinhf16(x) = -0x1.25p-4 (RZ) + {0xac94U, 0xac94U, 0U, 1U, 1U}, + // x = -0x1.f5p-4, sinhf16(x) = -0x1.f64p-4 (RZ) + {0xafd4U, 0xafd9U, 0U, 1U, 0U}, + // x = -0x1.6e8p-2, sinhf16(x) = -0x1.764p-2 (RZ) + {0xb5baU, 0xb5d9U, 0U, 1U, 1U}, + // x = -0x1.a58p-1, sinhf16(x) = -0x1.d68p-1 (RZ) + {0xba96U, 0xbb5aU, 0U, 1U, 1U}, + // x = -0x1.cdcp+1, sinhf16(x) = -0x1.26cp+4 (RZ) + {0xc337U, 0xcc9bU, 0U, 1U, 0U}, + // x = -0x1.d0cp+1, sinhf16(x) = -0x1.2d8p+4 (RZ) + {0xc343U, 0xccb6U, 0U, 1U, 1U}, + // x = -0x1.018p+2, sinhf16(x) = -0x1.bfp+4 (RZ) + {0xc406U, 0xcefcU, 0U, 1U, 0U}, + // x = -0x1.2fcp+2, sinhf16(x) = -0x1.cc4p+5 (RZ) + {0xc4bfU, 0xd331U, 0U, 1U, 1U}, + // x = -0x1.4ecp+2, sinhf16(x) = -0x1.75cp+6 (RZ) + {0xc53bU, 0xd5d7U, 0U, 1U, 0U}, + // x = -0x1.8a4p+2, sinhf16(x) = -0x1.d94p+7 (RZ) + {0xc629U, 0xdb65U, 0U, 1U, 1U}, + // x = -0x1.5fp+3, sinhf16(x) = -0x1.c54p+14 (RZ) + {0xc97cU, 0xf715U, 0U, 1U, 1U}, +}}; + +LLVM_LIBC_FUNCTION(float16, sinhf16, (float16 x)) { + using FPBits = fputil::FPBits<float16>; + FPBits x_bits(x); + + uint16_t x_u = x_bits.uintval(); + uint16_t x_abs = x_u & 0x7fffU; + + // When |x| = 0, or -2^(-14) <= x <= -2^(-9), or |x| >= asinh(2^16), or x is + // NaN. 
+ if (LIBC_UNLIKELY(x_abs == 0U || (x_u >= 0x8400U && x_u <= 0xa400U) || + x_abs >= 0x49e5U)) { + // sinh(NaN) = NaN + if (x_bits.is_nan()) { + if (x_bits.is_signaling_nan()) { + fputil::raise_except_if_required(FE_INVALID); + return FPBits::quiet_nan().get_val(); + } + + return x; + } + + // sinh(+/-0) = +/-0 + if (x_abs == 0U) + return FPBits::zero(x_bits.sign()).get_val(); + + // When |x| >= asinh(2^16). + if (x_abs >= 0x49e5U) { + // sinh(+/-inf) = +/-inf + if (x_bits.is_inf()) + return FPBits::inf(x_bits.sign()).get_val(); + + int rounding_mode = fputil::quick_get_round(); + if (rounding_mode == FE_TONEAREST || + (x_bits.is_pos() && rounding_mode == FE_UPWARD) || + (x_bits.is_neg() && rounding_mode == FE_DOWNWARD)) { + fputil::set_errno_if_required(ERANGE); + fputil::raise_except_if_required(FE_OVERFLOW | FE_INEXACT); + return FPBits::inf(x_bits.sign()).get_val(); + } + return FPBits::max_normal(x_bits.sign()).get_val(); + } + + // When -2^(-14) <= x <= -2^(-9). + if (fputil::fenv_is_round_down()) + return FPBits(static_cast<uint16_t>(x_u + 1)).get_val(); + return FPBits(static_cast<uint16_t>(x_u)).get_val(); + } + + if (x_bits.is_pos()) { + if (auto r = SINHF16_EXCEPTS_POS.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); + } else { + if (auto r = SINHF16_EXCEPTS_NEG.lookup(x_u); LIBC_UNLIKELY(r.has_value())) + return r.value(); + } + + return eval_sinh_or_cosh</*IsSinh=*/true>(x); +} + +} // namespace LIBC_NAMESPACE_DECL diff --git a/libc/src/math/sinhf16.h b/libc/src/math/sinhf16.h new file mode 100644 index 000000000000..8b8c1b64e7ec --- /dev/null +++ b/libc/src/math/sinhf16.h @@ -0,0 +1,21 @@ +//===-- Implementation header for sinhf16 -----------------------*- C++ -*-===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#ifndef LLVM_LIBC_SRC_MATH_SINHF16_H +#define LLVM_LIBC_SRC_MATH_SINHF16_H + +#include "src/__support/macros/config.h" +#include "src/__support/macros/properties/types.h" + +namespace LIBC_NAMESPACE_DECL { + +float16 sinhf16(float16 x); + +} // namespace LIBC_NAMESPACE_DECL + +#endif // LLVM_LIBC_SRC_MATH_SINHF16_H diff --git a/libc/test/src/math/CMakeLists.txt b/libc/test/src/math/CMakeLists.txt index 5dff0b49125b..381a3f478f37 100644 --- a/libc/test/src/math/CMakeLists.txt +++ b/libc/test/src/math/CMakeLists.txt @@ -1916,6 +1916,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + coshf16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + coshf16_test.cpp + DEPENDS + libc.src.math.coshf16 +) + add_fp_unittest( sinhf_test NEED_MPFR @@ -1932,6 +1943,17 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + sinhf16_test + NEED_MPFR + SUITE + libc-math-unittests + SRCS + sinhf16_test.cpp + DEPENDS + libc.src.math.sinhf16 +) + add_fp_unittest( tanhf_test NEED_MPFR diff --git a/libc/test/src/math/coshf16_test.cpp b/libc/test/src/math/coshf16_test.cpp new file mode 100644 index 000000000000..a0d1fd211047 --- /dev/null +++ b/libc/test/src/math/coshf16_test.cpp @@ -0,0 +1,40 @@ +//===-- Exhaustive test for coshf16 ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. 
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/coshf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcCoshf16Test = LIBC_NAMESPACE::testing::FPTest<float16>; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +// Range: [0, Inf]; +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7c00U; + +// Range: [-Inf, 0]; +static constexpr uint16_t NEG_START = 0x8000U; +static constexpr uint16_t NEG_STOP = 0xfc00U; + +TEST_F(LlvmLibcCoshf16Test, PositiveRange) { + for (uint16_t v = POS_START; v <= POS_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cosh, x, + LIBC_NAMESPACE::coshf16(x), 0.5); + } +} + +TEST_F(LlvmLibcCoshf16Test, NegativeRange) { + for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Cosh, x, + LIBC_NAMESPACE::coshf16(x), 0.5); + } +} diff --git a/libc/test/src/math/sinhf16_test.cpp b/libc/test/src/math/sinhf16_test.cpp new file mode 100644 index 000000000000..a16ab9279c45 --- /dev/null +++ b/libc/test/src/math/sinhf16_test.cpp @@ -0,0 +1,40 @@ +//===-- Exhaustive test for sinhf16 ---------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "src/math/sinhf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" +#include "utils/MPFRWrapper/MPFRUtils.h" + +using LlvmLibcSinhf16Test = LIBC_NAMESPACE::testing::FPTest<float16>; + +namespace mpfr = LIBC_NAMESPACE::testing::mpfr; + +// Range: [0, Inf]; +static constexpr uint16_t POS_START = 0x0000U; +static constexpr uint16_t POS_STOP = 0x7c00U; + +// Range: [-Inf, 0]; +static constexpr uint16_t NEG_START = 0x8000U; +static constexpr uint16_t NEG_STOP = 0xfc00U; + +TEST_F(LlvmLibcSinhf16Test, PositiveRange) { + for (uint16_t v = POS_START; v <= POS_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinh, x, + LIBC_NAMESPACE::sinhf16(x), 0.5); + } +} + +TEST_F(LlvmLibcSinhf16Test, NegativeRange) { + for (uint16_t v = NEG_START; v <= NEG_STOP; ++v) { + float16 x = FPBits(v).get_val(); + EXPECT_MPFR_MATCH_ALL_ROUNDING(mpfr::Operation::Sinh, x, + LIBC_NAMESPACE::sinhf16(x), 0.5); + } +} diff --git a/libc/test/src/math/smoke/CMakeLists.txt b/libc/test/src/math/smoke/CMakeLists.txt index 6b3623dc0d0d..f713430ee27c 100644 --- a/libc/test/src/math/smoke/CMakeLists.txt +++ b/libc/test/src/math/smoke/CMakeLists.txt @@ -3728,6 +3728,19 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + coshf16_test + SUITE + libc-math-smoke-tests + SRCS + coshf16_test.cpp + DEPENDS + libc.hdr.fenv_macros + libc.src.errno.errno + libc.src.math.coshf16 + libc.src.__support.FPUtil.cast +) + add_fp_unittest( sinhf_test SUITE @@ -3741,6 +3754,19 @@ add_fp_unittest( libc.src.__support.FPUtil.fp_bits ) +add_fp_unittest( + sinhf16_test + SUITE + libc-math-smoke-tests + SRCS + sinhf16_test.cpp + DEPENDS + libc.hdr.fenv_macros + libc.src.errno.errno + libc.src.math.sinhf16 + libc.src.__support.FPUtil.cast +) + add_fp_unittest( tanhf_test SUITE diff 
--git a/libc/test/src/math/smoke/coshf16_test.cpp b/libc/test/src/math/smoke/coshf16_test.cpp new file mode 100644 index 000000000000..08d05ecce86b --- /dev/null +++ b/libc/test/src/math/smoke/coshf16_test.cpp @@ -0,0 +1,90 @@ +//===-- Unittests for coshf16 ---------------------------------------------===// +// +// Part of the LLVM Project, under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/cast.h" +#include "src/errno/libc_errno.h" +#include "src/math/coshf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcCoshf16Test = LIBC_NAMESPACE::testing::FPTest<float16>; + +TEST_F(LlvmLibcCoshf16Test, SpecialNumbers) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::coshf16(aNaN)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::coshf16(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::coshf16(inf)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::coshf16(neg_inf)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast<float16>(1.0), + LIBC_NAMESPACE::coshf16(zero)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(LIBC_NAMESPACE::fputil::cast<float16>(1.0), + LIBC_NAMESPACE::coshf16(neg_zero)); + EXPECT_MATH_ERRNO(0); +} + +TEST_F(LlvmLibcCoshf16Test, Overflow) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::coshf16(max_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::coshf16(neg_max_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + // round(acosh(2^16), HP, RU); + float16 x = LIBC_NAMESPACE::fputil::cast<float16>(0x1.794p+3); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(inf, LIBC_NAMESPACE::coshf16(x), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(inf, LIBC_NAMESPACE::coshf16(x), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD( + max_normal, LIBC_NAMESPACE::coshf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO( + max_normal, LIBC_NAMESPACE::coshf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + // round(-acosh(2^16), HP, RD); + x = LIBC_NAMESPACE::fputil::cast<float16>(-0x1.794p+3); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(inf, LIBC_NAMESPACE::coshf16(x), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(inf, LIBC_NAMESPACE::coshf16(x), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD( + max_normal, LIBC_NAMESPACE::coshf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO( + max_normal, LIBC_NAMESPACE::coshf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); +} diff --git a/libc/test/src/math/smoke/sinhf16_test.cpp b/libc/test/src/math/smoke/sinhf16_test.cpp new file mode 100644 index 000000000000..4f21d33ba78e --- /dev/null +++ b/libc/test/src/math/smoke/sinhf16_test.cpp @@ -0,0 +1,88 @@ +//===-- Unittests for sinhf16 ---------------------------------------------===// +// +// Part of the LLVM Project, 
under the Apache License v2.0 with LLVM Exceptions. +// See https://llvm.org/LICENSE.txt for license information. +// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception +// +//===----------------------------------------------------------------------===// + +#include "hdr/fenv_macros.h" +#include "src/__support/FPUtil/cast.h" +#include "src/errno/libc_errno.h" +#include "src/math/sinhf16.h" +#include "test/UnitTest/FPMatcher.h" +#include "test/UnitTest/Test.h" + +using LlvmLibcSinhf16Test = LIBC_NAMESPACE::testing::FPTest<float16>; + +TEST_F(LlvmLibcSinhf16Test, SpecialNumbers) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_ALL_ROUNDING(aNaN, LIBC_NAMESPACE::sinhf16(aNaN)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION(aNaN, LIBC_NAMESPACE::sinhf16(sNaN), FE_INVALID); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(inf, LIBC_NAMESPACE::sinhf16(inf)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(neg_inf, LIBC_NAMESPACE::sinhf16(neg_inf)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(zero, LIBC_NAMESPACE::sinhf16(zero)); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_ALL_ROUNDING(neg_zero, LIBC_NAMESPACE::sinhf16(neg_zero)); + EXPECT_MATH_ERRNO(0); +} + +TEST_F(LlvmLibcSinhf16Test, Overflow) { + LIBC_NAMESPACE::libc_errno = 0; + + EXPECT_FP_EQ_WITH_EXCEPTION(inf, LIBC_NAMESPACE::sinhf16(max_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION(neg_inf, LIBC_NAMESPACE::sinhf16(neg_max_normal), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + // round(asinh(2^16), HP, RU); + float16 x = LIBC_NAMESPACE::fputil::cast<float16>(0x1.794p+3); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST(inf, LIBC_NAMESPACE::sinhf16(x), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD(inf, LIBC_NAMESPACE::sinhf16(x), + FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD( + max_normal, LIBC_NAMESPACE::sinhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO( + max_normal, LIBC_NAMESPACE::sinhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + // round(asinh(-2^16), HP, RD); + x = LIBC_NAMESPACE::fputil::cast<float16>(-0x1.794p+3); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_NEAREST( + neg_inf, LIBC_NAMESPACE::sinhf16(x), FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_UPWARD( + neg_max_normal, LIBC_NAMESPACE::sinhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_DOWNWARD( + neg_inf, LIBC_NAMESPACE::sinhf16(x), FE_OVERFLOW | FE_INEXACT); + EXPECT_MATH_ERRNO(ERANGE); + + EXPECT_FP_EQ_WITH_EXCEPTION_ROUNDING_TOWARD_ZERO( + neg_max_normal, LIBC_NAMESPACE::sinhf16(x), FE_INEXACT); + EXPECT_MATH_ERRNO(0); +} -- GitLab From ab7518050183162f09724ef8682a580cc68709bc Mon Sep 17 00:00:00 2001 From: Greg Roth Date: Thu, 17 Oct 2024 12:54:52 -0600 Subject: [PATCH 295/329] [DirectX] Remove trivially dead functions at linkage finalize (#106146) Functions are not removed even when made internal by DXILFinalizeLinkage. The removal code is called from alwaysinliner and globalopt, which are invoked too early to remove functions made internal by this pass. This adds a check similar to that in alwaysinliner that removes trivially dead functions after being marked internal. It refactors that code a bit to make it simpler, including reversing what is stored in the work queue. 
Tests both the pass in isolation and the full inlining, linkage finalization, and function removal steps. Fixes #106139 --- .../Target/DirectX/DXILFinalizeLinkage.cpp | 18 +- .../DirectX/ShaderFlags/double-extensions.ll | 3 +- .../CodeGen/DirectX/ShaderFlags/doubles.ll | 4 +- .../DirectX/conflicting-bitcast-insert.ll | 10 +- .../finalize-linkage-remove-dead-lib.ll | 222 ++++++++++++++++++ .../DirectX/finalize-linkage-remove-dead.ll | 156 ++++++++++++ llvm/test/CodeGen/DirectX/finalize_linkage.ll | 2 +- llvm/test/CodeGen/DirectX/fneg-conversion.ll | 6 +- .../CodeGen/DirectX/omit-bitcast-insert.ll | 14 +- llvm/test/CodeGen/DirectX/scalar-load.ll | 10 +- llvm/test/CodeGen/DirectX/scalar-store.ll | 6 +- .../CodeGen/DirectX/scalarize-two-calls.ll | 4 +- llvm/test/CodeGen/DirectX/strip-fn-attrs.ll | 2 +- 13 files changed, 425 insertions(+), 32 deletions(-) create mode 100644 llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll create mode 100644 llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll diff --git a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp index d32dda2a67c9..aa1f55c572df 100644 --- a/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp +++ b/llvm/lib/Target/DirectX/DXILFinalizeLinkage.cpp @@ -19,20 +19,20 @@ using namespace llvm; static bool finalizeLinkage(Module &M) { - SmallPtrSet<Function *, 8> EntriesAndExports; + SmallPtrSet<Function *, 8> Funcs; - // Find all entry points and export functions + // Collect non-entry and non-exported functions to set to internal linkage. for (Function &EF : M.functions()) { - if (!EF.hasFnAttribute("hlsl.shader") && !EF.hasFnAttribute("hlsl.export")) + if (EF.hasFnAttribute("hlsl.shader") || EF.hasFnAttribute("hlsl.export")) continue; - EntriesAndExports.insert(&EF); + Funcs.insert(&EF); } - for (Function &F : M.functions()) { - if (F.getLinkage() == GlobalValue::ExternalLinkage && - !EntriesAndExports.contains(&F)) { - F.setLinkage(GlobalValue::InternalLinkage); - } + for (Function *F : Funcs) { + if (F->getLinkage() == GlobalValue::ExternalLinkage) + F->setLinkage(GlobalValue::InternalLinkage); + if (F->isDefTriviallyDead()) + M.getFunctionList().erase(F); } return false; diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll index d027216e4213..a8d5f9c78f0b 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/double-extensions.ll @@ -9,11 +9,12 @@ target triple = "dxil-pc-shadermodel6.7-library" ; CHECK-NEXT: ; Double-precision extensions for 11.1 ; CHECK-NEXT: ; Note: extra DXIL module flags: ; CHECK-NEXT: {{^;$}} -define double @div(double %a, double %b) { +define double @div(double %a, double %b) #0 { %res = fdiv double %a, %b ret double %res } +attributes #0 = { convergent norecurse nounwind "hlsl.export"} ; DXC: - Name: SFI0 ; DXC-NEXT: Size: 8 diff --git a/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll b/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll index c1a4c219a169..e9b44240e10b 100644 --- a/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll +++ b/llvm/test/CodeGen/DirectX/ShaderFlags/doubles.ll @@ -9,11 +9,13 @@ target triple = "dxil-pc-shadermodel6.7-library" ; CHECK-NEXT: ; Note: extra DXIL module flags: ; CHECK-NEXT: {{^;$}} -define double @add(double %a, double %b) { +define double @add(double %a, double %b) #0 { %sum = fadd double %a, %b ret double %sum } +attributes #0 = { convergent norecurse nounwind "hlsl.export"} + ; DXC: - Name: SFI0 ; DXC-NEXT: 
Size: 8 ; DXC-NEXT: Flags: diff --git a/llvm/test/CodeGen/DirectX/conflicting-bitcast-insert.ll b/llvm/test/CodeGen/DirectX/conflicting-bitcast-insert.ll index 8f5d3ae86417..39e21daceea8 100644 --- a/llvm/test/CodeGen/DirectX/conflicting-bitcast-insert.ll +++ b/llvm/test/CodeGen/DirectX/conflicting-bitcast-insert.ll @@ -1,25 +1,27 @@ ; RUN: llc --filetype=asm %s -o - | FileCheck %s target triple = "dxil-unknown-shadermodel6.7-library" -define i64 @test(ptr %p) { +define i64 @test(ptr %p) #0 { store i32 0, ptr %p %v = load i64, ptr %p ret i64 %v } -; CHECK: define internal i64 @test(ptr %p) { +; CHECK: define i64 @test(ptr %p) #0 { ; CHECK-NEXT: %1 = bitcast ptr %p to ptr ; CHECK-NEXT: store i32 0, ptr %1, align 4 ; CHECK-NEXT: %2 = bitcast ptr %p to ptr ; CHECK-NEXT: %3 = load i64, ptr %2, align 8 -define i64 @testGEP(ptr %p) { +define i64 @testGEP(ptr %p) #0 { %ptr = getelementptr i32, ptr %p, i32 4 %val = load i64, ptr %p ret i64 %val } -; CHECK: define internal i64 @testGEP(ptr %p) { +attributes #0 = { convergent norecurse nounwind "hlsl.export"} + +; CHECK: define i64 @testGEP(ptr %p) #0 { ; CHECK-NEXT: %1 = bitcast ptr %p to ptr ; CHECK-NEXT: %ptr = getelementptr i32, ptr %1, i32 4 ; CHECK-NEXT: %2 = bitcast ptr %p to ptr diff --git a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll new file mode 100644 index 000000000000..202609c8156a --- /dev/null +++ b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead-lib.ll @@ -0,0 +1,222 @@ +; RUN: opt -S -dxil-finalize-linkage -mtriple=dxil-unknown-shadermodel6.5-library %s | FileCheck %s +; RUN: llc %s --filetype=asm -o - | FileCheck %s + +target triple = "dxilv1.5-pc-shadermodel6.5-compute" + +; Confirm that DXILFinalizeLinkage will remove functions that have compatible +; linkage and are not called from anywhere. This should be any function that +; is not explicitly marked export and is not an entry point. + +; Has no specified inlining/linking behavior and is uncalled, this should be removed. +; CHECK-NOT: define {{.*}}doNothingUncalled +define void @"?doNothingUncalled@@YAXXZ"() #2 { +entry: + ret void +} + +; Alwaysinline and uncalled, this should be removed. +; CHECK-NOT: define {{.*}}doAlwaysInlineUncalled +define void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline and uncalled, this should be removed. +; CHECK-NOT: define {{.*}}doNoinlineUncalled +define void @"?doNoinlineUncalled@@YAXXZ"() #4 { +entry: + ret void +} + +; No inlining attribute, internal, and uncalled; this should be removed. +; CHECK-NOT: define {{.*}}doInternalUncalled +define internal void @"?doInternalUncalled@@YAXXZ"() #2 { +entry: + ret void +} + +; Alwaysinline, internal, and uncalled; this should be removed. +; CHECK-NOT: define {{.*}}doAlwaysInlineInternalUncalled +define internal void @"?doAlwaysInlineInternalUncalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, internal, and uncalled; this should be removed. +; CHECK-NOT: define {{.*}}doNoinlineInternalUncalled +define internal void @"?doNoinlineInternalUncalled@@YAXXZ"() #4 { +entry: + ret void +} + +; Marked external and uncalled, this should become internal and be removed. +; CHECK-NOT: define {{.*}}doExternalUncalled +define external void @"?doExternalUncalled@@YAXXZ"() #2 { +entry: + ret void +} + +; Alwaysinline, external and uncalled, this should become internal and be removed. 
+; CHECK-NOT: define {{.*}}doAlwaysInlineExternalUncalled +define external void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, external and uncalled, this should become internal and be removed. +; CHECK-NOT: define {{.*}}doNoinlineExternalUncalled +define external void @"?doNoinlineExternalUncalled@@YAXXZ"() #4 { +entry: + ret void +} + +; No inlining attribute and called, this should stay. +; CHECK: define {{.*}}doNothingCalled +define void @"?doNothingCalled@@YAXXZ"() #2 { +entry: + ret void +} + +; Alwaysinline and called, this should stay. +; CHECK: define {{.*}}doAlwaysInlineCalled +define void @"?doAlwaysInlineCalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline and called, this should stay. +; CHECK: define {{.*}}doNoinlineCalled +define void @"?doNoinlineCalled@@YAXXZ"() #4 { +entry: + ret void +} + +; No inlining attribute, internal, and called; this should stay. +; CHECK: define {{.*}}doInternalCalled +define internal void @"?doInternalCalled@@YAXXZ"() #2 { +entry: + ret void +} + +; Alwaysinline, internal, and called; this should stay. +; CHECK: define {{.*}}doAlwaysInlineInternalCalled +define internal void @"?doAlwaysInlineInternalCalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, internal, and called; this should stay. +; CHECK: define {{.*}}doNoinlineInternalCalled +define internal void @"?doNoinlineInternalCalled@@YAXXZ"() #4 { +entry: + ret void +} + +; Marked external and called, this should become internal and stay. +; CHECK: define {{.*}}doExternalCalled +define external void @"?doExternalCalled@@YAXXZ"() #2 { +entry: + ret void +} + +; Always inlined, external and called, this should become internal and stay. +; CHECK: define {{.*}}doAlwaysInlineExternalCalled +define external void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, external and called, this should become internal and stay. +; CHECK: define {{.*}}doNoinlineExternalCalled +define external void @"?doNoinlineExternalCalled@@YAXXZ"() #4 { +entry: + ret void +} + +; No inlining attribute and exported, this should stay. +; CHECK: define {{.*}}doNothingExported +define void @"?doNothingExported@@YAXXZ"() #3 { +entry: + ret void +} + +; Alwaysinline and exported, this should stay. +; CHECK: define {{.*}}doAlwaysInlineExported +define void @"?doAlwaysInlineExported@@YAXXZ"() #1 { +entry: + ret void +} + +; Noinline attribute and exported, this should stay. +; CHECK: define {{.*}}doNoinlineExported +define void @"?doNoinlineExported@@YAXXZ"() #5 { +entry: + ret void +} + +; No inlining attribute, internal, and exported; this should stay. +; CHECK: define {{.*}}doInternalExported +define internal void @"?doInternalExported@@YAXXZ"() #3 { +entry: + ret void +} + +; Alwaysinline, internal, and exported; this should stay. +; CHECK: define {{.*}}doAlwaysInlineInternalExported +define internal void @"?doAlwaysInlineInternalExported@@YAXXZ"() #1 { +entry: + ret void +} + +; Noinline, internal, and exported; this should stay. +; CHECK: define {{.*}}doNoinlineInternalExported +define internal void @"?doNoinlineInternalExported@@YAXXZ"() #5 { +entry: + ret void +} + +; Marked external and exported, this should stay. +; CHECK: define {{.*}}doExternalExported +define external void @"?doExternalExported@@YAXXZ"() #3 { +entry: + ret void +} + +; Alwaysinline, external and exported, this should stay. 
+; CHECK: define {{.*}}doAlwaysInlineExternalExported +define external void @"?doAlwaysInlineExternalExported@@YAXXZ"() #1 { +entry: + ret void +} + +; Noinline, external and exported, this should stay. +; CHECK: define {{.*}}doNoinlineExternalExported +define external void @"?doNoinlineExternalExported@@YAXXZ"() #5 { +entry: + ret void +} + +; Entry point function, this should stay. +; CHECK: define void @main() +define void @main() #6 { +entry: + call void @"?doNothingCalled@@YAXXZ"() #7 + call void @"?doAlwaysInlineCalled@@YAXXZ"() #7 + call void @"?doNoinlineCalled@@YAXXZ"() #7 + call void @"?doInternalCalled@@YAXXZ"() #7 + call void @"?doAlwaysInlineInternalCalled@@YAXXZ"() #7 + call void @"?doNoinlineInternalCalled@@YAXXZ"() #7 + call void @"?doExternalCalled@@YAXXZ"() #7 + call void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #7 + call void @"?doNoinlineExternalCalled@@YAXXZ"() #7 + ret void +} + +attributes #0 = { alwaysinline convergent norecurse nounwind } +attributes #1 = { alwaysinline convergent norecurse nounwind "hlsl.export"} +attributes #2 = { convergent norecurse nounwind } +attributes #3 = { convergent norecurse nounwind "hlsl.export"} +attributes #4 = { convergent noinline norecurse nounwind } +attributes #5 = { convergent noinline norecurse nounwind "hlsl.export"} +attributes #6 = { convergent noinline norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } +attributes #7 = { convergent } diff --git a/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll new file mode 100644 index 000000000000..49c3bda621d7 --- /dev/null +++ b/llvm/test/CodeGen/DirectX/finalize-linkage-remove-dead.ll @@ -0,0 +1,156 @@ +; RUN: opt -S -dxil-finalize-linkage -mtriple=dxil-unknown-shadermodel6.5-compute %s | FileCheck %s +; RUN: llc %s --filetype=asm -o - | FileCheck %s + +target triple = "dxilv1.5-pc-shadermodel6.5-compute" + +; Confirm that DXILFinalizeLinkage will remove functions that have compatible +; linkage and are not called from anywhere. This should be any function that +; is not an entry point. + +; Has no specified inlining/linking behavior and is uncalled, this should be removed. +; CHECK-NOT: define {{.*}}doNothingUncalled +define void @"?doNothingUncalled@@YAXXZ"() #1 { +entry: + ret void +} + +; Alwaysinline and uncalled, this should be removed. +; CHECK-NOT: define {{.*}}doAlwaysInlineUncalled +define void @"?doAlwaysInlineUncalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline and uncalled, this should be removed. +; CHECK-NOT: define {{.*}}doNoinlineUncalled +define void @"?doNoinlineUncalled@@YAXXZ"() #3 { +entry: + ret void +} + +; No inlining attribute, internal, and uncalled; this should be removed. +; CHECK-NOT: define {{.*}}doInternalUncalled +define internal void @"?doInternalUncalled@@YAXXZ"() #1 { +entry: + ret void +} + +; Alwaysinline, internal, and uncalled; this should be removed. +; CHECK-NOT: define {{.*}}doAlwaysInlineInternalUncalled +define internal void @"?doAlwaysInlineInternalUncalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, internal, and uncalled; this should be removed. +; CHECK-NOT: define {{.*}}doNoinlineInternalUncalled +define internal void @"?doNoinlineInternalUncalled@@YAXXZ"() #3 { +entry: + ret void +} + +; Marked external and uncalled, this should become internal and be removed. 
+; CHECK-NOT: define {{.*}}doExternalUncalled +define external void @"?doExternalUncalled@@YAXXZ"() #1 { +entry: + ret void +} + +; Alwaysinline, external and uncalled, this should become internal and be removed. +; CHECK-NOT: define {{.*}}doAlwaysInlineExternalUncalled +define external void @"?doAlwaysInlineExternalUncalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, external and uncalled, this should become internal and be removed. +; CHECK-NOT: define {{.*}}doNoinlineExternalUncalled +define external void @"?doNoinlineExternalUncalled@@YAXXZ"() #3 { +entry: + ret void +} + +; No inlining attribute and called, this should stay. +; CHECK: define {{.*}}doNothingCalled +define void @"?doNothingCalled@@YAXXZ"() #1 { +entry: + ret void +} + +; Alwaysinline and called, this should stay. +; CHECK: define {{.*}}doAlwaysInlineCalled +define void @"?doAlwaysInlineCalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline and called, this should stay. +; CHECK: define {{.*}}doNoinlineCalled +define void @"?doNoinlineCalled@@YAXXZ"() #3 { +entry: + ret void +} + +; No inlining attribute, internal, and called; this should stay. +; CHECK: define {{.*}}doInternalCalled +define internal void @"?doInternalCalled@@YAXXZ"() #1 { +entry: + ret void +} + +; Alwaysinline, internal, and called; this should stay. +; CHECK: define {{.*}}doAlwaysInlineInternalCalled +define internal void @"?doAlwaysInlineInternalCalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, internal, and called; this should stay. +; CHECK: define {{.*}}doNoinlineInternalCalled +define internal void @"?doNoinlineInternalCalled@@YAXXZ"() #3 { +entry: + ret void +} + +; Marked external and called, this should become internal and stay. +; CHECK: define {{.*}}doExternalCalled +define external void @"?doExternalCalled@@YAXXZ"() #1 { +entry: + ret void +} + +; Always inlined, external and called, this should become internal and stay. +; CHECK: define {{.*}}doAlwaysInlineExternalCalled +define external void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #0 { +entry: + ret void +} + +; Noinline, external and called, this should become internal and stay. +; CHECK: define {{.*}}doNoinlineExternalCalled +define external void @"?doNoinlineExternalCalled@@YAXXZ"() #3 { +entry: + ret void +} + +; Entry point function, this should stay. 
+; CHECK: define void @main() +define void @main() #4 { +entry: + call void @"?doNothingCalled@@YAXXZ"() #5 + call void @"?doAlwaysInlineCalled@@YAXXZ"() #5 + call void @"?doNoinlineCalled@@YAXXZ"() #5 + call void @"?doInternalCalled@@YAXXZ"() #5 + call void @"?doAlwaysInlineInternalCalled@@YAXXZ"() #5 + call void @"?doNoinlineInternalCalled@@YAXXZ"() #5 + call void @"?doExternalCalled@@YAXXZ"() #5 + call void @"?doAlwaysInlineExternalCalled@@YAXXZ"() #5 + call void @"?doNoinlineExternalCalled@@YAXXZ"() #5 + ret void +} + +attributes #0 = { alwaysinline convergent norecurse nounwind } +attributes #1 = { convergent norecurse nounwind } +attributes #3 = { convergent noinline norecurse nounwind } +attributes #4 = { convergent noinline norecurse "hlsl.numthreads"="1,1,1" "hlsl.shader"="compute" } +attributes #5 = { convergent } diff --git a/llvm/test/CodeGen/DirectX/finalize_linkage.ll b/llvm/test/CodeGen/DirectX/finalize_linkage.ll index b6da9f6cb392..c761a79a5c28 100644 --- a/llvm/test/CodeGen/DirectX/finalize_linkage.ll +++ b/llvm/test/CodeGen/DirectX/finalize_linkage.ll @@ -6,7 +6,7 @@ target triple = "dxilv1.5-pc-shadermodel6.5-compute" ; DXILFinalizeLinkage changes linkage of all functions that are not ; entry points or exported function to internal. -; CHECK: define internal void @"?f1@@YAXXZ"() +; CHECK-NOT: define internal void @"?f1@@YAXXZ"() define void @"?f1@@YAXXZ"() #0 { entry: ret void diff --git a/llvm/test/CodeGen/DirectX/fneg-conversion.ll b/llvm/test/CodeGen/DirectX/fneg-conversion.ll index a397c18398c5..3acf4790de4b 100644 --- a/llvm/test/CodeGen/DirectX/fneg-conversion.ll +++ b/llvm/test/CodeGen/DirectX/fneg-conversion.ll @@ -1,14 +1,16 @@ ; RUN: llc %s --filetype=asm -o - | FileCheck %s target triple = "dxil-unknown-shadermodel6.7-library" -define float @negateF(float %0) { +define float @negateF(float %0) #0 { ; CHECK: %2 = fsub float -0.000000e+00, %0 %2 = fneg float %0 ret float %2 } -define double @negateD(double %0) { +define double @negateD(double %0) #0 { ; CHECK: %2 = fsub double -0.000000e+00, %0 %2 = fneg double %0 ret double %2 } + +attributes #0 = { convergent norecurse nounwind "hlsl.export"} \ No newline at end of file diff --git a/llvm/test/CodeGen/DirectX/omit-bitcast-insert.ll b/llvm/test/CodeGen/DirectX/omit-bitcast-insert.ll index 6066a0033e45..734ff1b4dd2d 100644 --- a/llvm/test/CodeGen/DirectX/omit-bitcast-insert.ll +++ b/llvm/test/CodeGen/DirectX/omit-bitcast-insert.ll @@ -1,32 +1,34 @@ ; RUN: llc --filetype=asm %s -o - | FileCheck %s target triple = "dxil-unknown-shadermodel6.7-library" -define i64 @test(ptr %p) { +define i64 @test(ptr %p) #0 { %v = load i64, ptr %p ret i64 %v } -; CHECK: define internal i64 @test(ptr %p) { +; CHECK: define i64 @test(ptr %p) #0 { ; CHECK-NEXT: %v = load i64, ptr %p, align 8 ; CHECK-NEXT: ret i64 %v -define i64 @test2(ptr %p) { +define i64 @test2(ptr %p) #0 { store i64 0, ptr %p %v = load i64, ptr %p ret i64 %v } -; CHECK: define internal i64 @test2(ptr %p) { +; CHECK: define i64 @test2(ptr %p) #0 { ; CHECK-NEXT: store i64 0, ptr %p ; CHECK-NEXT: %v = load i64, ptr %p, align 8 ; CHECK-NEXT: ret i64 %v -define i32 @test3(ptr %0) { +define i32 @test3(ptr %0) #0 { %2 = getelementptr i32, ptr %0, i32 4 %3 = load i32, ptr %2 ret i32 %3 } -; CHECK: define internal i32 @test3(ptr %0) { +attributes #0 = { convergent norecurse nounwind "hlsl.export"} + +; CHECK: define i32 @test3(ptr %0) #0 { ; CHECK-NEXT: %2 = getelementptr i32, ptr %0, i32 4 ; CHECK-NEXT: %3 = load i32, ptr %2 diff --git 
a/llvm/test/CodeGen/DirectX/scalar-load.ll b/llvm/test/CodeGen/DirectX/scalar-load.ll index 612e5646f5a0..b911a8f7855b 100644 --- a/llvm/test/CodeGen/DirectX/scalar-load.ll +++ b/llvm/test/CodeGen/DirectX/scalar-load.ll @@ -20,7 +20,7 @@ ; CHECK-LABEL: load_array_vec_test -define <4 x i32> @load_array_vec_test() { +define <4 x i32> @load_array_vec_test() #0 { ; CHECK-COUNT-8: load i32, ptr addrspace(3) {{(.*@arrayofVecData.scalarized.*|%.*)}}, align 4 ; CHECK-NOT: load i32, ptr addrspace(3) {{.*}}, align 4 %1 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([2 x <4 x i32>], [2 x <4 x i32>] addrspace(3)* @"arrayofVecData", i32 0, i32 0), align 4 @@ -30,7 +30,7 @@ define <4 x i32> @load_array_vec_test() { } ; CHECK-LABEL: load_vec_test -define <4 x i32> @load_vec_test() { +define <4 x i32> @load_vec_test() #0 { ; CHECK-COUNT-4: load i32, ptr addrspace(3) {{(@vecData.scalarized|getelementptr \(i32, ptr addrspace\(3\) @vecData.scalarized, i32 .*\)|%.*)}}, align {{.*}} ; CHECK-NOT: load i32, ptr addrspace(3) {{.*}}, align 4 %1 = load <4 x i32>, <4 x i32> addrspace(3)* @"vecData", align 4 @@ -38,7 +38,7 @@ define <4 x i32> @load_vec_test() { } ; CHECK-LABEL: load_static_array_of_vec_test -define <4 x i32> @load_static_array_of_vec_test(i32 %index) { +define <4 x i32> @load_static_array_of_vec_test(i32 %index) #0 { ; CHECK: getelementptr [3 x [4 x i32]], ptr @staticArrayOfVecData.scalarized, i32 0, i32 %index ; CHECK-COUNT-4: load i32, ptr {{.*}}, align 4 ; CHECK-NOT: load i32, ptr {{.*}}, align 4 @@ -48,7 +48,7 @@ define <4 x i32> @load_static_array_of_vec_test(i32 %index) { } ; CHECK-LABEL: multid_load_test -define <4 x i32> @multid_load_test() { +define <4 x i32> @multid_load_test() #0 { ; CHECK-COUNT-8: load i32, ptr addrspace(3) {{(.*@groushared2dArrayofVectors.scalarized.*|%.*)}}, align 4 ; CHECK-NOT: load i32, ptr addrspace(3) {{.*}}, align 4 %1 = load <4 x i32>, <4 x i32> addrspace(3)* getelementptr inbounds ([3 x [3 x <4 x i32>]], [3 x [3 x <4 x i32>]] addrspace(3)* @"groushared2dArrayofVectors", i32 0, i32 0, i32 0), align 4 @@ -56,3 +56,5 @@ define <4 x i32> @multid_load_test() { %3 = add <4 x i32> %1, %2 ret <4 x i32> %3 } + +attributes #0 = { convergent norecurse nounwind "hlsl.export"} diff --git a/llvm/test/CodeGen/DirectX/scalar-store.ll b/llvm/test/CodeGen/DirectX/scalar-store.ll index 7734d32bca58..c45481e8cae1 100644 --- a/llvm/test/CodeGen/DirectX/scalar-store.ll +++ b/llvm/test/CodeGen/DirectX/scalar-store.ll @@ -12,7 +12,7 @@ ; CHECK-NOT: @vecData ; CHECK-LABEL: store_array_vec_test -define void @store_array_vec_test () local_unnamed_addr { +define void @store_array_vec_test () local_unnamed_addr #0 { ; CHECK-COUNT-6: store float {{1|2|3|4|6}}.000000e+00, ptr addrspace(3) {{(.*@arrayofVecData.scalarized.*|%.*)}}, align {{4|8|16}} ; CHECK-NOT: store float {{1|2|3|4|6}}.000000e+00, ptr addrspace(3) {{(.*@arrayofVecData.scalarized.*|%.*)}}, align {{4|8|16}} store <3 x float> , ptr addrspace(3) @"arrayofVecData", align 16 @@ -21,9 +21,11 @@ define void @store_array_vec_test () local_unnamed_addr { } ; CHECK-LABEL: store_vec_test -define void @store_vec_test(<4 x i32> %inputVec) { +define void @store_vec_test(<4 x i32> %inputVec) #0 { ; CHECK-COUNT-4: store i32 %inputVec.{{.*}}, ptr addrspace(3) {{(@vecData.scalarized|getelementptr \(i32, ptr addrspace\(3\) @vecData.scalarized, i32 .*\)|%.*)}}, align 4 ; CHECK-NOT: store i32 %inputVec.{{.*}}, ptr addrspace(3) store <4 x i32> %inputVec, <4 x i32> addrspace(3)* @"vecData", align 4 ret void } + +attributes #0 = { 
convergent norecurse nounwind "hlsl.export"} diff --git a/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll b/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll index a14c1de5cc42..0546a5505416 100644 --- a/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll +++ b/llvm/test/CodeGen/DirectX/scalarize-two-calls.ll @@ -2,7 +2,7 @@ ; CHECK: target triple = "dxilv1.3-pc-shadermodel6.3-library" ; CHECK-LABEL: cos_sin_float_test -define noundef <4 x float> @cos_sin_float_test(<4 x float> noundef %a) { +define noundef <4 x float> @cos_sin_float_test(<4 x float> noundef %a) #0 { ; CHECK: [[ee0:%.*]] = extractelement <4 x float> %a, i64 0 ; CHECK: [[ie0:%.*]] = call float @dx.op.unary.f32(i32 13, float [[ee0]]) ; CHECK: [[ee1:%.*]] = extractelement <4 x float> %a, i64 1 @@ -23,3 +23,5 @@ define noundef <4 x float> @cos_sin_float_test(<4 x float> noundef %a) { %3 = tail call <4 x float> @llvm.cos.v4f32(<4 x float> %2) ret <4 x float> %3 } + +attributes #0 = { convergent norecurse nounwind "hlsl.export"} diff --git a/llvm/test/CodeGen/DirectX/strip-fn-attrs.ll b/llvm/test/CodeGen/DirectX/strip-fn-attrs.ll index b0dd89cf90f2..4223c9094319 100644 --- a/llvm/test/CodeGen/DirectX/strip-fn-attrs.ll +++ b/llvm/test/CodeGen/DirectX/strip-fn-attrs.ll @@ -12,4 +12,4 @@ define dso_local float @fma(float %0, float %1, float %2) local_unnamed_addr #0 ; CHECK: attributes #0 = { nounwind memory(none) } ; CHECK-NOT: attributes # -attributes #0 = { norecurse nounwind readnone willreturn } +attributes #0 = { norecurse nounwind readnone willreturn "hlsl.export"} -- GitLab From a9d39ce5d2e93e76598732e2caeaae0dbe155f1c Mon Sep 17 00:00:00 2001 From: Craig Topper Date: Thu, 17 Oct 2024 11:55:31 -0700 Subject: [PATCH 296/329] [RISCV][GISel] Pass APInt by const reference. NFC --- llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp | 2 +- llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp index d1449f751b40..c06ab061ddc3 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.cpp @@ -670,7 +670,7 @@ bool RISCVLegalizerInfo::legalizeVAStart(MachineInstr &MI, return true; } -bool RISCVLegalizerInfo::shouldBeInConstantPool(APInt APImm, +bool RISCVLegalizerInfo::shouldBeInConstantPool(const APInt &APImm, bool ShouldOptForSize) const { assert(APImm.getBitWidth() == 32 || APImm.getBitWidth() == 64); int64_t Imm = APImm.getSExtValue(); diff --git a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h index d2afb175ae42..ee8b08bc9e31 100644 --- a/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h +++ b/llvm/lib/Target/RISCV/GISel/RISCVLegalizerInfo.h @@ -38,7 +38,7 @@ public: MachineInstr &MI) const override; private: - bool shouldBeInConstantPool(APInt APImm, bool ShouldOptForSize) const; + bool shouldBeInConstantPool(const APInt &APImm, bool ShouldOptForSize) const; bool legalizeShlAshrLshr(MachineInstr &MI, MachineIRBuilder &MIRBuilder, GISelChangeObserver &Observer) const; -- GitLab From 29097dd2f39cfd7e5f2e389b0f0a7701188d7570 Mon Sep 17 00:00:00 2001 From: Chris Apple Date: Thu, 17 Oct 2024 11:58:07 -0700 Subject: [PATCH 297/329] [rtsan][NFC] Remove rtsan_ prefix from stats variables (#112762) --- compiler-rt/lib/rtsan/rtsan_stats.cpp | 18 +++++++++--------- 1 file changed, 9 insertions(+), 9 deletions(-) diff --git a/compiler-rt/lib/rtsan/rtsan_stats.cpp 
b/compiler-rt/lib/rtsan/rtsan_stats.cpp index 1562b73cf94c..277182a7abc8 100644 --- a/compiler-rt/lib/rtsan/rtsan_stats.cpp +++ b/compiler-rt/lib/rtsan/rtsan_stats.cpp @@ -19,32 +19,32 @@ using namespace __sanitizer; using namespace __rtsan; -static atomic_uint32_t rtsan_total_error_count{0}; -static atomic_uint32_t rtsan_unique_error_count{0}; -static atomic_uint32_t rtsan_suppressed_count{0}; +static atomic_uint32_t total_error_count{0}; +static atomic_uint32_t unique_error_count{0}; +static atomic_uint32_t suppressed_count{0}; void __rtsan::IncrementTotalErrorCount() { - atomic_fetch_add(&rtsan_total_error_count, 1, memory_order_relaxed); + atomic_fetch_add(&total_error_count, 1, memory_order_relaxed); } void __rtsan::IncrementUniqueErrorCount() { - atomic_fetch_add(&rtsan_unique_error_count, 1, memory_order_relaxed); + atomic_fetch_add(&unique_error_count, 1, memory_order_relaxed); } static u32 GetTotalErrorCount() { - return atomic_load(&rtsan_total_error_count, memory_order_relaxed); + return atomic_load(&total_error_count, memory_order_relaxed); } static u32 GetUniqueErrorCount() { - return atomic_load(&rtsan_unique_error_count, memory_order_relaxed); + return atomic_load(&unique_error_count, memory_order_relaxed); } void __rtsan::IncrementSuppressedCount() { - atomic_fetch_add(&rtsan_suppressed_count, 1, memory_order_relaxed); + atomic_fetch_add(&suppressed_count, 1, memory_order_relaxed); } static u32 GetSuppressedCount() { - return atomic_load(&rtsan_suppressed_count, memory_order_relaxed); + return atomic_load(&suppressed_count, memory_order_relaxed); } void __rtsan::PrintStatisticsSummary() { -- GitLab From 32b55f375feaf6bcc2c870964a0bf087cf3c22bf Mon Sep 17 00:00:00 2001 From: Sylvestre Ledru Date: Thu, 17 Oct 2024 21:32:20 +0200 Subject: [PATCH 298/329] Add support of the next Ubuntu (Ubuntu 25.04 - Plucky Puffin) --- clang/include/clang/Driver/Distro.h | 3 ++- clang/lib/Driver/Distro.cpp | 1 + 2 files changed, 3 insertions(+), 1 deletion(-) diff --git a/clang/include/clang/Driver/Distro.h b/clang/include/clang/Driver/Distro.h index 1404e1686848..b4d485dac8a2 100644 --- a/clang/include/clang/Driver/Distro.h +++ b/clang/include/clang/Driver/Distro.h @@ -80,6 +80,7 @@ public: UbuntuMantic, UbuntuNoble, UbuntuOracular, + UbuntuPlucky, UnknownDistro }; @@ -131,7 +132,7 @@ public: } bool IsUbuntu() const { - return DistroVal >= UbuntuHardy && DistroVal <= UbuntuOracular; + return DistroVal >= UbuntuHardy && DistroVal <= UbuntuPlucky; } bool IsAlpineLinux() const { return DistroVal == AlpineLinux; } diff --git a/clang/lib/Driver/Distro.cpp b/clang/lib/Driver/Distro.cpp index 6f49e641104c..3d1bce027f66 100644 --- a/clang/lib/Driver/Distro.cpp +++ b/clang/lib/Driver/Distro.cpp @@ -96,6 +96,7 @@ static Distro::DistroType DetectLsbRelease(llvm::vfs::FileSystem &VFS) { .Case("mantic", Distro::UbuntuMantic) .Case("noble", Distro::UbuntuNoble) .Case("oracular", Distro::UbuntuOracular) + .Case("plucky", Distro::UbuntuPlucky) .Default(Distro::UnknownDistro); return Version; } -- GitLab From 70865844cbf619e1f4011cd3383a028ab4ec2081 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 17 Oct 2024 13:07:02 -0700 Subject: [PATCH 299/329] [sanitizer] Large range support in IsAccessibleMemoryRange (#112665) The comment stated that it's slow, but likely it's a deadlock, as write can be blocked. Also we can't be sure that `page_size * 10` is appropriate size. Still most likely this is NFC, as the max `size` we use is 32, and should fit in any buffer. 
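
For readers unfamiliar with the underlying trick: the routine checks
readability by asking the kernel to read the range on its behalf via a
pipe write, so the process never touches the memory directly. A rough
standalone sketch of the technique follows (plain POSIX calls; the
helper name and error policy are illustrative, not the sanitizer's
internal API):

    #include <cerrno>
    #include <cstddef>
    #include <fcntl.h>
    #include <unistd.h>

    // Returns true if every byte in [beg, beg + size) is readable.
    static bool ProbeReadable(const char *beg, size_t size) {
      while (size) {
        int fds[2];
        if (pipe(fds))
          return false;
        // Non-blocking write end: a full pipe buffer must fail with a
        // short count instead of blocking (the deadlock risk the patch
        // removes).
        int flags = fcntl(fds[1], F_GETFL, 0);
        fcntl(fds[1], F_SETFL, flags | O_NONBLOCK);
        // The kernel reads from `beg`; an unreadable source page makes
        // the call fail with EFAULT.
        ssize_t written = write(fds[1], beg, size);
        int saved_errno = errno;
        close(fds[0]);
        close(fds[1]);
        if (written < 0) {
          if (saved_errno == EINTR)
            continue;     // interrupted: retry the same range
          return false;   // EFAULT: range is not accessible
        }
        // Short write: the pipe buffer filled up. Recreate the pipe and
        // keep probing from where the kernel stopped.
        beg += written;
        size -= static_cast<size_t>(written);
      }
      return true;
    }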
--- .../sanitizer_posix_libcdep.cpp | 52 ++++++++++++------- .../tests/sanitizer_posix_test.cpp | 6 +++ 2 files changed, 40 insertions(+), 18 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp index 9ffb36f812c4..3ab83977a4ee 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp @@ -288,26 +288,42 @@ bool SignalContext::IsStackOverflow() const { #endif // SANITIZER_GO +static void SetNonBlock(int fd) { + int res = fcntl(fd, F_GETFL, 0); + CHECK(!internal_iserror(res, nullptr)); + + res |= O_NONBLOCK; + res = fcntl(fd, F_SETFL, res); + CHECK(!internal_iserror(res, nullptr)); +} + bool IsAccessibleMemoryRange(uptr beg, uptr size) { - uptr page_size = GetPageSizeCached(); - // Checking too large memory ranges is slow. - CHECK_LT(size, page_size * 10); - int sock_pair[2]; - if (pipe(sock_pair)) - return false; - uptr bytes_written = - internal_write(sock_pair[1], reinterpret_cast(beg), size); - int write_errno; - bool result; - if (internal_iserror(bytes_written, &write_errno)) { - CHECK_EQ(EFAULT, write_errno); - result = false; - } else { - result = (bytes_written == size); + while (size) { + // `read` from `fds[0]` into a dummy buffer to free up the pipe buffer for + // more `write` is slower than just recreating a pipe. + int fds[2]; + CHECK_EQ(0, pipe(fds)); + + auto cleanup = at_scope_exit([&]() { + internal_close(fds[0]); + internal_close(fds[1]); + }); + + SetNonBlock(fds[1]); + + int write_errno; + uptr w = internal_write(fds[1], reinterpret_cast(beg), size); + if (internal_iserror(w, &write_errno)) { + if (write_errno == EINTR) + continue; + CHECK_EQ(EFAULT, write_errno); + return false; + } + size -= w; + beg += w; } - internal_close(sock_pair[0]); - internal_close(sock_pair[1]); - return result; + + return true; } void PlatformPrepareForSandboxing(void *args) { diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp index 803c8d39362e..04890f2f5e2a 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp @@ -80,6 +80,12 @@ TEST(SanitizerCommon, IsAccessibleMemoryRange) { EXPECT_FALSE(IsAccessibleMemoryRange(0x0, 2)); } +TEST(SanitizerCommon, IsAccessibleMemoryRangeLarge) { + InternalMmapVector buffer(10000 * GetPageSize()); + EXPECT_TRUE(IsAccessibleMemoryRange(reinterpret_cast(buffer.data()), + buffer.size())); +} + } // namespace __sanitizer #endif // SANITIZER_POSIX -- GitLab From 2cd10f5292992a2fb549eb69806447679dfc15db Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 17 Oct 2024 16:14:39 -0400 Subject: [PATCH 300/329] [libc++] Remove obsolete Appveyor files (#112744) We don't use Appveyor to run Windows CI anymore (in fact I don't know if we ever did). Our full Windows CI setup is based on Github actions, so these files are effectively dead code. 
--- libcxx/appveyor-reqs-install.cmd | 53 ------------------------ libcxx/appveyor.yml | 71 -------------------------------- 2 files changed, 124 deletions(-) delete mode 100644 libcxx/appveyor-reqs-install.cmd delete mode 100644 libcxx/appveyor.yml diff --git a/libcxx/appveyor-reqs-install.cmd b/libcxx/appveyor-reqs-install.cmd deleted file mode 100644 index e3bd018dd374..000000000000 --- a/libcxx/appveyor-reqs-install.cmd +++ /dev/null @@ -1,53 +0,0 @@ -@echo on - -if NOT EXIST C:\projects\deps ( - mkdir C:\projects\deps -) -cd C:\projects\deps - -::########################################################################### -:: Setup Compiler -::########################################################################### -if NOT EXIST llvm-installer.exe ( - appveyor DownloadFile https://prereleases.llvm.org/win-snapshots/LLVM-9.0.0-r357435-win32.exe -FileName llvm-installer.exe -) -if "%CLANG_VERSION%"=="ToT" ( - START /WAIT llvm-installer.exe /S /D=C:\"Program Files\LLVM" -) -if DEFINED CLANG_VERSION @set PATH="C:\Program Files\LLVM\bin";%PATH% -if DEFINED CLANG_VERSION clang-cl -v - -if DEFINED MINGW_PATH rename "C:\Program Files\Git\usr\bin\sh.exe" "sh-ignored.exe" -if DEFINED MINGW_PATH @set "PATH=%PATH:C:\Program Files (x86)\Git\bin=%" -if DEFINED MINGW_PATH @set "PATH=%PATH%;%MINGW_PATH%" -if DEFINED MINGW_PATH g++ -v - -::########################################################################### -:: Install a recent CMake -::########################################################################### -if NOT EXIST cmake ( - appveyor DownloadFile https://cmake.org/files/v3.7/cmake-3.7.2-win64-x64.zip -FileName cmake.zip - 7z x cmake.zip -oC:\projects\deps > nul - move C:\projects\deps\cmake-* C:\projects\deps\cmake - rm cmake.zip -) -@set PATH=C:\projects\deps\cmake\bin;%PATH% -cmake --version - -::########################################################################### -:: Install Ninja -::########################################################################### -if NOT EXIST ninja ( - appveyor DownloadFile https://github.com/ninja-build/ninja/releases/download/v1.6.0/ninja-win.zip -FileName ninja.zip - 7z x ninja.zip -oC:\projects\deps\ninja > nul - rm ninja.zip -) -@set PATH=C:\projects\deps\ninja;%PATH% -ninja --version - -::########################################################################### -:: Setup the cached copy of LLVM -::########################################################################### -git clone --depth=1 http://llvm.org/git/llvm.git - -@echo off diff --git a/libcxx/appveyor.yml b/libcxx/appveyor.yml deleted file mode 100644 index 8a69cb9e7dde..000000000000 --- a/libcxx/appveyor.yml +++ /dev/null @@ -1,71 +0,0 @@ -version: '{build}' - -shallow_clone: true - -build: - verbosity: detailed - -configuration: - - Debug - -environment: - matrix: - - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2017 - CMAKE_OPTIONS: -DCMAKE_C_COMPILER=clang-cl.exe -DCMAKE_CXX_COMPILER=clang-cl.exe - CLANG_VERSION: ToT - MSVC_SETUP_PATH: C:\Program Files (x86)\Microsoft Visual Studio\2017\Community\VC\Auxiliary\Build\vcvarsall.bat - MSVC_SETUP_ARG: x86 - GENERATOR: Ninja - MAKE_PROGRAM: ninja - APPVEYOR_SAVE_CACHE_ON_ERROR: true -# TODO: Maybe re-enable this configuration? Do we want to support MSVC 2015's runtime? 
-# - APPVEYOR_BUILD_WORKER_IMAGE: Visual Studio 2015
-#   MINGW_PATH: C:\mingw-w64\i686-6.3.0-posix-dwarf-rt_v5-rev1\mingw32\bin
-#   GENERATOR: MinGW Makefiles
-#   MAKE_PROGRAM: mingw32-make
-#   APPVEYOR_SAVE_CACHE_ON_ERROR: true
-
-install:
-  ############################################################################
-  # All external dependencies are installed in C:\projects\deps
-  ############################################################################
-  - call "%APPVEYOR_BUILD_FOLDER%\\appveyor-reqs-install.cmd"
-
-before_build:
-  - if DEFINED MSVC_SETUP_PATH call "%MSVC_SETUP_PATH%" %MSVC_SETUP_ARG%
-  - cd %APPVEYOR_BUILD_FOLDER%
-
-build_script:
-  - md C:\projects\build-libcxx
-  - cd C:\projects\build-libcxx
-  - echo %configuration%
-
-  #############################################################################
-  # Configuration Step
-  #############################################################################
-  - cmake -G "%GENERATOR%" %CMAKE_OPTIONS%
-    "-DCMAKE_BUILD_TYPE=%configuration%"
-    "-DLLVM_PATH=C:\projects\deps\llvm"
-    -DLLVM_LIT_ARGS="-v --show-xfail --show-unsupported"
-    %APPVEYOR_BUILD_FOLDER%
-
-  #############################################################################
-  # Build Step
-  #############################################################################
-  - "%MAKE_PROGRAM%"
-
-test_script:
-  - "%MAKE_PROGRAM% check-cxx"
-
-on_failure:
-  - appveyor PushArtifact CMakeFiles/CMakeOutput.log
-  - appveyor PushArtifact CMakeFiles/CMakeError.log
-
-artifacts:
-  - path: '_build/CMakeFiles/*.log'
-    name: logs
-
-cache:
-  - C:\projects\deps\ninja
-  - C:\projects\deps\cmake
-  - C:\projects\deps\llvm-installer.exe
-- 
GitLab


From 8c77f4c5087ac5a8e5dc08e472cf06897689a68b Mon Sep 17 00:00:00 2001
From: Louis Dionne
Date: Thu, 17 Oct 2024 16:15:33 -0400
Subject: [PATCH 301/329] [runtimes] Avoid cluttering the top-level build
 directory with test artifacts (#112717)

Instead of placing artifacts for testing the runtimes at <build>/test,
place those artifacts at <build>/<runtime>/test. This prevents
cluttering the build directory with the runtimes' test artifacts for
everyone else.

As a drive-by, remove LIBCXX_BINARY_INCLUDE_DIR which wasn't used
anymore.
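
For illustration (the paths are schematic, not taken from the patch):
test artifacts that used to accumulate under a single shared directory

    <build>/test/...

now land in a per-runtime tree next to that runtime's other build
outputs, e.g.

    <build>/libcxx/test/...
    <build>/libcxxabi/test/...
    <build>/libunwind/test/...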
--- libcxx/CMakeLists.txt | 3 --- libcxx/test/configs/cmake-bridge.cfg.in | 2 +- libcxxabi/test/configs/cmake-bridge.cfg.in | 2 +- libunwind/test/configs/cmake-bridge.cfg.in | 2 +- 4 files changed, 3 insertions(+), 6 deletions(-) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index 75c926f5432a..6befb87e5556 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -19,7 +19,6 @@ set(CMAKE_FOLDER "libc++") set(LIBCXX_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR}) set(LIBCXX_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR}) -set(LIBCXX_BINARY_INCLUDE_DIR "${LIBCXX_BINARY_DIR}/include/c++build") include(GNUInstallDirs) include(WarningFlags) @@ -443,8 +442,6 @@ else() "Path where target-specific libc++ headers should be installed.") endif() -file(MAKE_DIRECTORY "${LIBCXX_BINARY_INCLUDE_DIR}") - set(CMAKE_ARCHIVE_OUTPUT_DIRECTORY ${LIBCXX_LIBRARY_DIR}) set(CMAKE_LIBRARY_OUTPUT_DIRECTORY ${LIBCXX_LIBRARY_DIR}) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY ${LIBCXX_LIBRARY_DIR}) diff --git a/libcxx/test/configs/cmake-bridge.cfg.in b/libcxx/test/configs/cmake-bridge.cfg.in index bc9bb0e03911..139a6cafa2cf 100644 --- a/libcxx/test/configs/cmake-bridge.cfg.in +++ b/libcxx/test/configs/cmake-bridge.cfg.in @@ -20,7 +20,7 @@ config.name = os.path.basename('@LIBCXX_TEST_CONFIG@') config.test_source_root = os.path.join('@LIBCXX_SOURCE_DIR@', 'test') config.test_format = libcxx.test.format.CxxStandardLibraryTest() config.recursiveExpansionLimit = 10 -config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test') +config.test_exec_root = os.path.join('@LIBCXX_BINARY_DIR@', 'test') # Add substitutions for bootstrapping the test suite configuration config.substitutions.append(('%{libcxx-dir}', '@LIBCXX_SOURCE_DIR@')) diff --git a/libcxxabi/test/configs/cmake-bridge.cfg.in b/libcxxabi/test/configs/cmake-bridge.cfg.in index b00eb642750c..1dd6b3367e43 100644 --- a/libcxxabi/test/configs/cmake-bridge.cfg.in +++ b/libcxxabi/test/configs/cmake-bridge.cfg.in @@ -21,7 +21,7 @@ config.name = os.path.basename('@LIBCXXABI_TEST_CONFIG@') config.test_source_root = os.path.join('@LIBCXXABI_SOURCE_DIR@', 'test') config.test_format = libcxx.test.format.CxxStandardLibraryTest() config.recursiveExpansionLimit = 10 -config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test') +config.test_exec_root = os.path.join('@LIBCXXABI_BINARY_DIR@', 'test') # TODO: This is a non-standard Lit attribute and we should have another way of accessing this. 
config.host_triple = '@LLVM_HOST_TRIPLE@' diff --git a/libunwind/test/configs/cmake-bridge.cfg.in b/libunwind/test/configs/cmake-bridge.cfg.in index f627f401f9f7..20b61e788ab1 100644 --- a/libunwind/test/configs/cmake-bridge.cfg.in +++ b/libunwind/test/configs/cmake-bridge.cfg.in @@ -20,7 +20,7 @@ config.name = os.path.basename('@LIBUNWIND_TEST_CONFIG@') config.test_source_root = os.path.join('@LIBUNWIND_SOURCE_DIR@', 'test') config.test_format = libcxx.test.format.CxxStandardLibraryTest() config.recursiveExpansionLimit = 10 -config.test_exec_root = os.path.join('@CMAKE_BINARY_DIR@', 'test') +config.test_exec_root = os.path.join('@LIBUNWIND_BINARY_DIR@', 'test') # Add a few features that are common to all the configurations if @LIBUNWIND_USES_ARM_EHABI@: -- GitLab From e2d07fc3d8737c08d351e841f82911a5c3ddf433 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 17 Oct 2024 16:16:15 -0400 Subject: [PATCH 302/329] [libc++] Mark libc++ deallocation helpers as noexcept (#110884) They already can't throw exceptions and they are called from noexcept functions, but they were not marked as noexcept. Depending on compiler inlining, this might not make a difference or this might improve the codegen a bit by removing the implicit try-catch block that Clang generates around non-noexcept functions called from noexcept functions. The original issue also mentioned that one occurrence of std::allocator::deallocate was missing noexcept, however it has since then been removed. Fixes #66100 --- libcxx/include/new | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/libcxx/include/new b/libcxx/include/new index 2947ee179510..75e2b8742df6 100644 --- a/libcxx/include/new +++ b/libcxx/include/new @@ -281,7 +281,7 @@ _LIBCPP_HIDE_FROM_ABI void* __libcpp_operator_new(_Args... __args) { } template -_LIBCPP_HIDE_FROM_ABI void __libcpp_operator_delete(_Args... __args) { +_LIBCPP_HIDE_FROM_ABI void __libcpp_operator_delete(_Args... __args) _NOEXCEPT { #if __has_builtin(__builtin_operator_new) && __has_builtin(__builtin_operator_delete) __builtin_operator_delete(__args...); #else @@ -302,7 +302,7 @@ inline _LIBCPP_HIDE_FROM_ABI void* __libcpp_allocate(size_t __size, size_t __ali } template -_LIBCPP_HIDE_FROM_ABI void __do_deallocate_handle_size(void* __ptr, size_t __size, _Args... __args) { +_LIBCPP_HIDE_FROM_ABI void __do_deallocate_handle_size(void* __ptr, size_t __size, _Args... 
__args) _NOEXCEPT { #if !_LIBCPP_HAS_SIZED_DEALLOCATION (void)__size; return std::__libcpp_operator_delete(__ptr, __args...); @@ -311,7 +311,7 @@ _LIBCPP_HIDE_FROM_ABI void __do_deallocate_handle_size(void* __ptr, size_t __siz #endif } -inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate(void* __ptr, size_t __size, size_t __align) { +inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate(void* __ptr, size_t __size, size_t __align) _NOEXCEPT { #if !_LIBCPP_HAS_ALIGNED_ALLOCATION (void)__align; return __do_deallocate_handle_size(__ptr, __size); @@ -325,7 +325,7 @@ inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate(void* __ptr, size_t __size #endif } -inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate_unsized(void* __ptr, size_t __align) { +inline _LIBCPP_HIDE_FROM_ABI void __libcpp_deallocate_unsized(void* __ptr, size_t __align) _NOEXCEPT { #if !_LIBCPP_HAS_ALIGNED_ALLOCATION (void)__align; return __libcpp_operator_delete(__ptr); -- GitLab From e67442486d5efd48235f62b438557bc95193fc48 Mon Sep 17 00:00:00 2001 From: Louis Dionne Date: Thu, 17 Oct 2024 16:17:40 -0400 Subject: [PATCH 303/329] [runtimes] Improve the documentation for LIBCXX_ADDITIONAL_COMPILE_FLAGS (#112733) This clarifies how that option is meant to be used to avoid confusion. As a drive-by, also fix an incorrect usage in the recently-added GPU caches. --- libcxx/CMakeLists.txt | 4 +++- libcxx/cmake/caches/AMDGPU.cmake | 4 ++-- libcxx/docs/VendorDocumentation.rst | 12 ++++-------- libcxxabi/CMakeLists.txt | 3 +-- libunwind/CMakeLists.txt | 3 +-- 5 files changed, 11 insertions(+), 15 deletions(-) diff --git a/libcxx/CMakeLists.txt b/libcxx/CMakeLists.txt index 6befb87e5556..574b262018cd 100644 --- a/libcxx/CMakeLists.txt +++ b/libcxx/CMakeLists.txt @@ -456,7 +456,9 @@ set(LIBCXX_COMPILE_FLAGS "") set(LIBCXX_LINK_FLAGS "") set(LIBCXX_LIBRARIES "") set(LIBCXX_ADDITIONAL_COMPILE_FLAGS "" CACHE STRING - "Additional Compile only flags which can be provided in cache") + "Additional compile flags to use when building libc++. This should be a CMake ;-delimited list of individual + compiler options to use. For options that must be passed as-is to the compiler without deduplication (e.g. + `-Xclang -foo` option groups), consider using `SHELL:` (https://cmake.org/cmake/help/latest/command/add_compile_options.html#option-de-duplication).") set(LIBCXX_ADDITIONAL_LIBRARIES "" CACHE STRING "Additional libraries libc++ is linked to which can be provided in cache") diff --git a/libcxx/cmake/caches/AMDGPU.cmake b/libcxx/cmake/caches/AMDGPU.cmake index c7d6afc854a5..1a6bfd85a50b 100644 --- a/libcxx/cmake/caches/AMDGPU.cmake +++ b/libcxx/cmake/caches/AMDGPU.cmake @@ -29,7 +29,7 @@ set(LIBCXXABI_USE_LLVM_UNWINDER OFF CACHE BOOL "") # Necessary compile flags for AMDGPU. 
 set(LIBCXX_ADDITIONAL_COMPILE_FLAGS
-    "-nogpulib;-flto;-fconvergent-functions;-Xclang;-mcode-object-version=none" CACHE STRING "")
+    "-nogpulib;-flto;-fconvergent-functions;SHELL:-Xclang -mcode-object-version=none" CACHE STRING "")
 set(LIBCXXABI_ADDITIONAL_COMPILE_FLAGS
-    "-nogpulib;-flto;-fconvergent-functions;-Xclang;-mcode-object-version=none" CACHE STRING "")
+    "-nogpulib;-flto;-fconvergent-functions;SHELL:-Xclang -mcode-object-version=none" CACHE STRING "")
 set(CMAKE_REQUIRED_FLAGS "-nogpulib" CACHE STRING "")
diff --git a/libcxx/docs/VendorDocumentation.rst b/libcxx/docs/VendorDocumentation.rst
index 3a3d1cdb1ea7..3795381264c9 100644
--- a/libcxx/docs/VendorDocumentation.rst
+++ b/libcxx/docs/VendorDocumentation.rst
@@ -213,11 +213,13 @@ General purpose options
 
   Output name for the shared libc++ runtime library.
 
-.. option:: LIBCXX_ADDITIONAL_COMPILE_FLAGS:STRING
+.. option:: {LIBCXX,LIBCXXABI,LIBUNWIND}_ADDITIONAL_COMPILE_FLAGS:STRING
 
   **Default**: ``""``
 
-  Additional Compile only flags which can be provided in cache.
+  Additional compile flags to use when building the runtimes. This should be a CMake ``;``-delimited list of individual
+  compiler options to use. For options that must be passed as-is to the compiler without deduplication (e.g.
+  ``-Xclang -foo`` option groups), consider using ``SHELL:`` as `documented here <https://cmake.org/cmake/help/latest/command/add_compile_options.html#option-de-duplication>`_.
 
 .. option:: LIBCXX_ADDITIONAL_LIBRARIES:STRING
 
   **Default**: ``""``
 
@@ -346,12 +348,6 @@ The following options allow building libc++ for a different ABI version.
 
   Build and use the LLVM unwinder. Note: This option can only be used when libc++abi is the C++ ABI library used.
 
-.. option:: LIBCXXABI_ADDITIONAL_COMPILE_FLAGS:STRING
-
-  **Default**: ``""``
-
-  Additional Compile only flags which can be provided in cache.
-
 .. option:: LIBCXXABI_ADDITIONAL_LIBRARIES:STRING
 
   **Default**: ``""``
diff --git a/libcxxabi/CMakeLists.txt b/libcxxabi/CMakeLists.txt
index ac1ee69d5f11..da0e8b286cdd 100644
--- a/libcxxabi/CMakeLists.txt
+++ b/libcxxabi/CMakeLists.txt
@@ -222,8 +222,7 @@ set(LIBCXXABI_CXX_FLAGS "")
 set(LIBCXXABI_COMPILE_FLAGS "")
 set(LIBCXXABI_LINK_FLAGS "")
 set(LIBCXXABI_LIBRARIES "")
-set(LIBCXXABI_ADDITIONAL_COMPILE_FLAGS "" CACHE STRING
-    "Additional Compile only flags which can be provided in cache")
+set(LIBCXXABI_ADDITIONAL_COMPILE_FLAGS "" CACHE STRING "See documentation for LIBCXX_ADDITIONAL_COMPILE_FLAGS")
 set(LIBCXXABI_ADDITIONAL_LIBRARIES "" CACHE STRING
     "Additional libraries libc++abi is linked to which can be provided in cache")
diff --git a/libunwind/CMakeLists.txt b/libunwind/CMakeLists.txt
index b911f482fc26..ea06dc8a67b9 100644
--- a/libunwind/CMakeLists.txt
+++ b/libunwind/CMakeLists.txt
@@ -162,8 +162,7 @@ set(LIBUNWIND_C_FLAGS "")
 set(LIBUNWIND_CXX_FLAGS "")
 set(LIBUNWIND_COMPILE_FLAGS "")
 set(LIBUNWIND_LINK_FLAGS "")
-set(LIBUNWIND_ADDITIONAL_COMPILE_FLAGS "" CACHE STRING
-    "Additional Compile only flags which can be provided in cache")
+set(LIBUNWIND_ADDITIONAL_COMPILE_FLAGS "" CACHE STRING "See documentation for LIBCXX_ADDITIONAL_COMPILE_FLAGS")
 set(LIBUNWIND_ADDITIONAL_LIBRARIES "" CACHE STRING
     "Additional libraries libunwind is linked to which can be provided in cache")
-- 
GitLab


From e913a33fcfbd667e4e3a35919b6bd9c5876a90a3 Mon Sep 17 00:00:00 2001
From: Malavika Samak
Date: Thu, 17 Oct 2024 13:48:35 -0700
Subject: [PATCH 304/329] [-Wunsafe-buffer-usage] Emit a warning if pointer
 returned by vector::data and array::data is cast to larger type (#111910)

Emit a warning when raw pointers retrieved from std::vector and
std::array instances are cast to a larger
type. Such a cast followed by a field dereference to the resulting pointer could cause an OOB access. This is similar to the existing span::data warning. (rdar://136704278) Co-authored-by: MalavikaSamak --- .../clang/Basic/DiagnosticSemaKinds.td | 2 +- clang/lib/Analysis/UnsafeBufferUsage.cpp | 7 +- clang/lib/Sema/AnalysisBasedWarnings.cpp | 9 ++ ...e-buffer-usage-warning-data-invocation.cpp | 97 +++++++++++++------ 4 files changed, 82 insertions(+), 33 deletions(-) diff --git a/clang/include/clang/Basic/DiagnosticSemaKinds.td b/clang/include/clang/Basic/DiagnosticSemaKinds.td index 487dd8990d88..883db838ca01 100644 --- a/clang/include/clang/Basic/DiagnosticSemaKinds.td +++ b/clang/include/clang/Basic/DiagnosticSemaKinds.td @@ -12499,7 +12499,7 @@ def warn_unsafe_buffer_variable : Warning< InGroup, DefaultIgnore; def warn_unsafe_buffer_operation : Warning< "%select{unsafe pointer operation|unsafe pointer arithmetic|" - "unsafe buffer access|function introduces unsafe buffer manipulation|unsafe invocation of span::data|" + "unsafe buffer access|function introduces unsafe buffer manipulation|unsafe invocation of %1|" "field %1 prone to unsafe buffer manipulation}0">, InGroup, DefaultIgnore; def warn_unsafe_buffer_libc_call : Warning< diff --git a/clang/lib/Analysis/UnsafeBufferUsage.cpp b/clang/lib/Analysis/UnsafeBufferUsage.cpp index 97f1c4f16b8f..5e0ec9ecc92e 100644 --- a/clang/lib/Analysis/UnsafeBufferUsage.cpp +++ b/clang/lib/Analysis/UnsafeBufferUsage.cpp @@ -1499,8 +1499,11 @@ public: } static Matcher matcher() { - Matcher callExpr = cxxMemberCallExpr( - callee(cxxMethodDecl(hasName("data"), ofClass(hasName("std::span"))))); + + Matcher callExpr = cxxMemberCallExpr(callee( + cxxMethodDecl(hasName("data"), + ofClass(anyOf(hasName("std::span"), hasName("std::array"), + hasName("std::vector")))))); return stmt( explicitCastExpr(anyOf(has(callExpr), has(parenExpr(has(callExpr))))) .bind(OpTag)); diff --git a/clang/lib/Sema/AnalysisBasedWarnings.cpp b/clang/lib/Sema/AnalysisBasedWarnings.cpp index 6496a33b8f5a..c76733e9a774 100644 --- a/clang/lib/Sema/AnalysisBasedWarnings.cpp +++ b/clang/lib/Sema/AnalysisBasedWarnings.cpp @@ -2279,9 +2279,18 @@ public: QualType srcType = ECE->getSubExpr()->getType(); const uint64_t sSize = Ctx.getTypeSize(srcType.getTypePtr()->getPointeeType()); + if (sSize >= dSize) return; + if (const auto *CE = dyn_cast( + ECE->getSubExpr()->IgnoreParens())) { + D = CE->getMethodDecl(); + } + + if (!D) + return; + MsgParam = 4; } Loc = Operation->getBeginLoc(); diff --git a/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp b/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp index 08707d7ff545..0228e42652bd 100644 --- a/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp +++ b/clang/test/SemaCXX/warn-unsafe-buffer-usage-warning-data-invocation.cpp @@ -32,38 +32,68 @@ void foo(int *p){} namespace std{ template class span { - T *elements; + T *elements; - span(T *, unsigned){} + span(T *, unsigned){} - public: + public: - constexpr span subspan(size_t offset, size_t count) const { - return span (elements+offset, count); // expected-warning{{unsafe pointer arithmetic}} - } + constexpr span subspan(size_t offset, size_t count) const { + return span (elements+offset, count); // expected-warning{{unsafe pointer arithmetic}} + } - constexpr T* data() const noexcept { - return elements; - } + constexpr T* data() const noexcept { + return elements; + } + + constexpr T* hello() const noexcept { + return elements; + } 
+ }; + + template class vector { + + T *elements; + + public: + + vector(size_t n) { + elements = new T[n]; + } + + constexpr T* data() const noexcept { + return elements; + } + + ~vector() { + delete[] elements; + } + }; + + template + class array { + T elements[N]; + + public: + + constexpr const T* data() const noexcept { + return elements; + } + + }; - - constexpr T* hello() const noexcept { - return elements; - } -}; - template class span_duplicate { - span_duplicate(T *, unsigned){} + span_duplicate(T *, unsigned){} - T array[10]; + T array[10]; - public: + public: - T* data() { - return array; - } + T* data() { + return array; + } -}; + }; } using namespace std; @@ -89,21 +119,28 @@ void cast_without_data(int *ptr) { float *p = (float*) ptr; } -void warned_patterns(std::span span_ptr, std::span base_span, span span_without_qual) { - A *a1 = (A*)span_ptr.data(); // expected-warning{{unsafe invocation of span::data}} - a1 = (A*)span_ptr.data(); // expected-warning{{unsafe invocation of span::data}} +void warned_patterns_span(std::span span_ptr, std::span base_span, span span_without_qual) { + A *a1 = (A*)span_ptr.data(); // expected-warning{{unsafe invocation of 'data'}} + a1 = (A*)span_ptr.data(); // expected-warning{{unsafe invocation of 'data'}} - a1 = (A*)(span_ptr.data()); // expected-warning{{unsafe invocation of span::data}} - A *a2 = (A*) (span_without_qual.data()); // expected-warning{{unsafe invocation of span::data}} + a1 = (A*)(span_ptr.data()); // expected-warning{{unsafe invocation of 'data'}} + A *a2 = (A*) (span_without_qual.data()); // expected-warning{{unsafe invocation of 'data'}} - a2 = (A*) span_without_qual.data(); // expected-warning{{unsafe invocation of span::data}} + a2 = (A*) span_without_qual.data(); // expected-warning{{unsafe invocation of 'data'}} // TODO:: Should we warn when we cast from base to derived type? - Derived *b = dynamic_cast (base_span.data());// expected-warning{{unsafe invocation of span::data}} + Derived *b = dynamic_cast (base_span.data());// expected-warning{{unsafe invocation of 'data'}} // TODO:: This pattern is safe. We can add special handling for it, if we decide this // is the recommended fixit for the unsafe invocations. - A *a3 = (A*)span_ptr.subspan(0, sizeof(A)).data(); // expected-warning{{unsafe invocation of span::data}} + A *a3 = (A*)span_ptr.subspan(0, sizeof(A)).data(); // expected-warning{{unsafe invocation of 'data'}} +} + +void warned_patterns_array(std::array array_ptr, std::array base_span, span span_without_qual) { + const A *a1 = (A*)array_ptr.data(); // expected-warning{{unsafe invocation of 'data'}} + a1 = (A*)array_ptr.data(); // expected-warning{{unsafe invocation of 'data'}} + + a1 = (A*)(array_ptr.data()); // expected-warning{{unsafe invocation of 'data'}} } void not_warned_patterns(std::span span_ptr, std::span base_span) { -- GitLab From b060661da8b3b53db55644e5e358bb2dca8b56d7 Mon Sep 17 00:00:00 2001 From: Florian Hahn Date: Thu, 17 Oct 2024 13:55:20 -0700 Subject: [PATCH 305/329] [SCEVExpander] Expand UDiv avoiding UB when in seq_min/max. (#92177) Update SCEVExpander to introduce an SafeUDivMode, which is set when expanding operands of SCEVSequentialMinMaxExpr. In this mode, the expander will make sure that the divisor of the expanded UDiv is neither 0 nor poison. Fixes https://github.com/llvm/llvm-project/issues/89958. 
PR https://github.com/llvm/llvm-project/pull/92177 --- .../Utils/ScalarEvolutionExpander.h | 5 +++ .../Utils/ScalarEvolutionExpander.cpp | 20 ++++++++++- .../trip-count-expansion-may-introduce-ub.ll | 36 ++++++++++--------- 3 files changed, 44 insertions(+), 17 deletions(-) diff --git a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h index 5697d983c9ad..7dd754a2bc0d 100644 --- a/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h +++ b/llvm/include/llvm/Transforms/Utils/ScalarEvolutionExpander.h @@ -126,6 +126,11 @@ class SCEVExpander : public SCEVVisitor { /// "expanded" form. bool LSRMode; + /// When true, rewrite any divisors of UDiv expressions that may be 0 to + /// umax(Divisor, 1) to avoid introducing UB. If the divisor may be poison, + /// freeze it first. + bool SafeUDivMode = false; + typedef IRBuilder BuilderType; BuilderType Builder; diff --git a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp index c412d0398b95..39da38e49181 100644 --- a/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp +++ b/llvm/lib/Transforms/Utils/ScalarEvolutionExpander.cpp @@ -681,7 +681,21 @@ Value *SCEVExpander::visitUDivExpr(const SCEVUDivExpr *S) { SCEV::FlagAnyWrap, /*IsSafeToHoist*/ true); } - Value *RHS = expand(S->getRHS()); + const SCEV *RHSExpr = S->getRHS(); + Value *RHS = expand(RHSExpr); + if (SafeUDivMode) { + bool GuaranteedNotPoison = + ScalarEvolution::isGuaranteedNotToBePoison(RHSExpr); + if (!GuaranteedNotPoison) + RHS = Builder.CreateFreeze(RHS); + + // We need an umax if either RHSExpr is not known to be zero, or if it is + // not guaranteed to be non-poison. In the later case, the frozen poison may + // be 0. + if (!SE.isKnownNonZero(RHSExpr) || !GuaranteedNotPoison) + RHS = Builder.CreateIntrinsic(RHS->getType(), Intrinsic::umax, + {RHS, ConstantInt::get(RHS->getType(), 1)}); + } return InsertBinop(Instruction::UDiv, LHS, RHS, SCEV::FlagAnyWrap, /*IsSafeToHoist*/ SE.isKnownNonZero(S->getRHS())); } @@ -1376,11 +1390,14 @@ Value *SCEVExpander::visitSignExtendExpr(const SCEVSignExtendExpr *S) { Value *SCEVExpander::expandMinMaxExpr(const SCEVNAryExpr *S, Intrinsic::ID IntrinID, Twine Name, bool IsSequential) { + bool PrevSafeMode = SafeUDivMode; + SafeUDivMode |= IsSequential; Value *LHS = expand(S->getOperand(S->getNumOperands() - 1)); Type *Ty = LHS->getType(); if (IsSequential) LHS = Builder.CreateFreeze(LHS); for (int i = S->getNumOperands() - 2; i >= 0; --i) { + SafeUDivMode = (IsSequential && i != 0) || PrevSafeMode; Value *RHS = expand(S->getOperand(i)); if (IsSequential && i != 0) RHS = Builder.CreateFreeze(RHS); @@ -1395,6 +1412,7 @@ Value *SCEVExpander::expandMinMaxExpr(const SCEVNAryExpr *S, } LHS = Sel; } + SafeUDivMode = PrevSafeMode; return LHS; } diff --git a/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll b/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll index 7dfd80a688f3..1b646200e1c7 100644 --- a/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll +++ b/llvm/test/Transforms/LoopVectorize/trip-count-expansion-may-introduce-ub.ll @@ -459,12 +459,12 @@ exit: ret i64 %p } -; FIXME: currently the expansion of the loop bounds may introduce UB through the division. 
define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch(ptr %dst, i64 %N) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[TMP10:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[TMP10]] ; CHECK-NEXT: [[TMP8:%.*]] = freeze i64 [[TMP0]] ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP8]], i64 [[SMAX]]) @@ -529,13 +529,14 @@ exit: declare void @foo() -; FIXME: currently the expansion of the loop bounds may introduce UB through the division. define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_call_before_loop(ptr %dst, i64 %N) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_call_before_loop( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: call void @foo() -; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = freeze i64 [[TMP2]] ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[SMAX]]) @@ -599,14 +600,15 @@ exit: ret i64 %p } -; FIXME: currently the expansion of the loop bounds may introduce UB through the division. define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_loop_may_not_execute(ptr %dst, i64 %N, i1 %c) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_loop_may_not_execute( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i1 [[C:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: br i1 [[C]], label [[LOOP_HEADER_PREHEADER:%.*]], label [[EXIT:%.*]] ; CHECK: loop.header.preheader: -; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[N]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = freeze i64 [[TMP2]] ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[SMAX]]) @@ -672,12 +674,13 @@ exit: ret i64 %p } -; FIXME: currently the expansion of the loop bounds may introduce UB through the division. define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds(ptr %dst, i64 %N, i64 %M) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch_different_bounds( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[M:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[M]] +; CHECK-NEXT: [[TMP0:%.*]] = freeze i64 [[M]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP0]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[TMP1]] ; CHECK-NEXT: [[TMP3:%.*]] = freeze i64 [[TMP2]] ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP3]], i64 [[SMAX]]) @@ -740,13 +743,13 @@ exit: ret i64 %p } -; FIXME: currently the expansion of the loop bounds may introduce UB through the division. 
define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch(ptr %dst, i64 %N) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_frozen_value_in_latch( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[FR_N:%.*]] = freeze i64 [[N]] -; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[FR_N]] +; CHECK-NEXT: [[TMP1:%.*]] = call i64 @llvm.umax.i64(i64 [[FR_N]], i64 1) +; CHECK-NEXT: [[TMP2:%.*]] = udiv i64 42, [[TMP1]] ; CHECK-NEXT: [[TMP10:%.*]] = freeze i64 [[TMP2]] ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP10]], i64 [[SMAX]]) @@ -931,12 +934,12 @@ exit: ret void } -; FIXME: currently the expansion of the loop bounds may introduce UB through the division. define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch(ptr %dst, i64 %N) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_urem_by_value_in_latch( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[TMP12:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[TMP0:%.*]] = udiv i64 42, [[TMP12]] ; CHECK-NEXT: [[TMP1:%.*]] = mul nuw i64 [[N]], [[TMP0]] ; CHECK-NEXT: [[TMP2:%.*]] = sub i64 42, [[TMP1]] ; CHECK-NEXT: [[SMAX1:%.*]] = call i64 @llvm.smax.i64(i64 [[TMP2]], i64 0) @@ -1002,7 +1005,6 @@ exit: ret i64 %p } -; FIXME: currently the expansion of the loop bounds may introduce UB through the division. define i64 @multi_exit_4_exit_count_with_urem_by_constant_in_latch(ptr %dst, i64 %N) { ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_urem_by_constant_in_latch( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { @@ -1156,7 +1158,8 @@ define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1(ptr %dst, i64 % ; CHECK-LABEL: define i64 @multi_exit_4_exit_count_with_udiv_by_value_in_latch1( ; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]]) { ; CHECK-NEXT: entry: -; CHECK-NEXT: [[TMP9:%.*]] = udiv i64 42, [[N]] +; CHECK-NEXT: [[TMP8:%.*]] = call i64 @llvm.umax.i64(i64 [[N]], i64 1) +; CHECK-NEXT: [[TMP9:%.*]] = udiv i64 42, [[TMP8]] ; CHECK-NEXT: [[TMP10:%.*]] = freeze i64 [[TMP9]] ; CHECK-NEXT: [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0) ; CHECK-NEXT: [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP10]], i64 [[SMAX]]) @@ -1262,13 +1265,14 @@ exit: ret i64 %p } -; FIXME: currently the expansion of the loop bounds may introduce UB through the division. 
define i64 @multi_exit_count_with_udiv_by_value_in_latch_different_bounds_divisor_non_zero_may_be_poison(ptr %dst, i64 %N, i64 %M) {
; CHECK-LABEL: define i64 @multi_exit_count_with_udiv_by_value_in_latch_different_bounds_divisor_non_zero_may_be_poison(
; CHECK-SAME: ptr [[DST:%.*]], i64 [[N:%.*]], i64 [[M:%.*]]) {
; CHECK-NEXT:  entry:
; CHECK-NEXT:    [[M_1:%.*]] = call i64 @llvm.umax.i64(i64 [[M]], i64 1)
-; CHECK-NEXT:    [[TMP0:%.*]] = udiv i64 42, [[M_1]]
+; CHECK-NEXT:    [[TMP9:%.*]] = freeze i64 [[M_1]]
+; CHECK-NEXT:    [[TMP10:%.*]] = call i64 @llvm.umax.i64(i64 [[TMP9]], i64 1)
+; CHECK-NEXT:    [[TMP0:%.*]] = udiv i64 42, [[TMP10]]
; CHECK-NEXT:    [[TMP1:%.*]] = freeze i64 [[TMP0]]
; CHECK-NEXT:    [[SMAX:%.*]] = call i64 @llvm.smax.i64(i64 [[N]], i64 0)
; CHECK-NEXT:    [[UMIN:%.*]] = call i64 @llvm.umin.i64(i64 [[TMP1]], i64 [[SMAX]])
-- 
GitLab


From 8c62bf54df76e37d0978f4901c6be6554e978b53 Mon Sep 17 00:00:00 2001
From: Bill Wendling
Date: Thu, 17 Oct 2024 21:52:40 +0000
Subject: [PATCH 306/329] [Clang] Disable use of the counted_by attribute for
 whole struct pointers (#112636)

The whole struct is specified in the __bdos. The calculation of the
whole size of the structure can be done in two ways:

  1) sizeof(struct S) + count * sizeof(typeof(fam))
  2) offsetof(struct S, fam) + count * sizeof(typeof(fam))

The first will add any remaining padding that might exist after the
allocation, while the second method is more precise, but not quite what
programmers expect. (For example, with struct S { int a; char c;
char fam[]; }, sizeof(struct S) is 8 while offsetof(struct S, fam) is
5, so the two formulas differ by the three bytes of tail padding.) See
[1] for a discussion of the topic.

GCC isn't (currently) able to calculate __bdos on a pointer to the whole
structure. Therefore, because of the above issue, we'll choose to match
what GCC does for consistency's sake.

[1] https://lore.kernel.org/lkml/ZvV6X5FPBBW7CO1f@archlinux/

Co-authored-by: Eli Friedman
---
 clang/lib/CodeGen/CGBuiltin.cpp      |  45 +++--
 clang/test/CodeGen/attr-counted-by.c | 259 ++++++++++-----------------
 2 files changed, 110 insertions(+), 194 deletions(-)

diff --git a/clang/lib/CodeGen/CGBuiltin.cpp b/clang/lib/CodeGen/CGBuiltin.cpp
index f6d7db2c204c..a048a566a092 100644
--- a/clang/lib/CodeGen/CGBuiltin.cpp
+++ b/clang/lib/CodeGen/CGBuiltin.cpp
@@ -1013,6 +1013,24 @@ CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type,
     // Can't find the field referenced by the "counted_by" attribute.
     return nullptr;
 
+  if (isa<DeclRefExpr>(Base))
+    // The whole struct is specified in the __bdos. The calculation of the
+    // whole size of the structure can be done in two ways:
+    //
+    //   1) sizeof(struct S) + count * sizeof(typeof(fam))
+    //   2) offsetof(struct S, fam) + count * sizeof(typeof(fam))
+    //
+    // The first will add additional padding after the end of the array
+    // allocation, while the second method is more precise, but not quite
+    // expected from programmers. See
+    // https://lore.kernel.org/lkml/ZvV6X5FPBBW7CO1f@archlinux/ for a
+    // discussion of the topic.
+    //
+    // GCC isn't (currently) able to calculate __bdos on a pointer to the whole
+    // structure. Therefore, because of the above issue, we'll choose to match
+    // what GCC does for consistency's sake.
+    return nullptr;
+
   // Build a load of the counted_by field.
bool IsSigned = CountedByFD->getType()->isSignedIntegerType(); Value *CountedByInst = EmitLoadOfCountedByField(Base, FAMDecl, CountedByFD); @@ -1043,32 +1061,9 @@ CodeGenFunction::emitFlexibleArrayMemberSize(const Expr *E, unsigned Type, CharUnits Size = Ctx.getTypeSizeInChars(ArrayTy->getElementType()); llvm::Constant *ElemSize = llvm::ConstantInt::get(ResType, Size.getQuantity(), IsSigned); - Value *FAMSize = + Value *Res = Builder.CreateMul(CountedByInst, ElemSize, "", !IsSigned, IsSigned); - FAMSize = Builder.CreateIntCast(FAMSize, ResType, IsSigned); - Value *Res = FAMSize; - - if (isa(Base)) { - // The whole struct is specificed in the __bdos. - const ASTRecordLayout &Layout = Ctx.getASTRecordLayout(OuterRD); - - // Get the offset of the FAM. - llvm::Constant *FAMOffset = ConstantInt::get(ResType, Offset, IsSigned); - Value *OffsetAndFAMSize = - Builder.CreateAdd(FAMOffset, Res, "", !IsSigned, IsSigned); - - // Get the full size of the struct. - llvm::Constant *SizeofStruct = - ConstantInt::get(ResType, Layout.getSize().getQuantity(), IsSigned); - - // max(sizeof(struct s), - // offsetof(struct s, array) + p->count * sizeof(*p->array)) - Res = IsSigned - ? Builder.CreateBinaryIntrinsic(llvm::Intrinsic::smax, - OffsetAndFAMSize, SizeofStruct) - : Builder.CreateBinaryIntrinsic(llvm::Intrinsic::umax, - OffsetAndFAMSize, SizeofStruct); - } + Res = Builder.CreateIntCast(Res, ResType, IsSigned); // A negative \p IdxInst or \p CountedByInst means that the index lands // outside of the flexible array member. If that's the case, we want to diff --git a/clang/test/CodeGen/attr-counted-by.c b/clang/test/CodeGen/attr-counted-by.c index 4a130c5e3d40..f70e552bca26 100644 --- a/clang/test/CodeGen/attr-counted-by.c +++ b/clang/test/CodeGen/attr-counted-by.c @@ -66,7 +66,7 @@ struct anon_struct { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3:![0-9]+]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10:[0-9]+]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB1:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9:[0-9]+]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 @@ -114,7 +114,7 @@ void test1(struct annotated *p, int index, int val) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB3:[0-9]+]], i64 [[INDEX]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 @@ -197,42 +197,26 @@ size_t test2_bdos(struct 
annotated *p) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOT_COUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[DOTCOUNTED_BY_LOAD]] to i64, !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[INDEX]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB4:[0-9]+]], i64 [[INDEX]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = shl nsw i64 [[TMP2]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP3]], i64 4) -// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = trunc i64 [[TMP4]] to i32 -// SANITIZE-WITH-ATTR-NEXT: [[TMP6:%.*]] = add i32 [[TMP5]], 12 -// SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 -// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP6]] -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test3( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 4) -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 12 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 
[[DOT_COUNTED_BY_LOAD]], 0 -// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds nuw [0 x i32], ptr [[ARRAY]], i64 0, i64 [[INDEX]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test3( @@ -254,34 +238,18 @@ size_t test2_bdos(struct annotated *p) { void test3(struct annotated *p, size_t index) { // This test differs from 'test2' by checking bdos on the whole array and not // just the FAM. - p->array[index] = __builtin_dynamic_object_size(p, 1); + p->array[index] = __builtin_dynamic_object_size(p, 0); } -// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 8589934601) i64 @test3_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos( +// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 4) -// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 12 -// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], -1 -// SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 0 -// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP5]] +// SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // -// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 0, 8589934601) i64 @test3_bdos( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test3_bdos( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 4) -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = add nuw nsw i64 [[TMP2]], 12 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], -1 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP5:%.*]] = select i1 [[TMP4]], i64 [[TMP3]], i64 0 -// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP5]] +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test3_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2:[0-9]+]] { @@ -294,7 +262,7 @@ void test3(struct 
annotated *p, size_t index) { // NO-SANITIZE-WITHOUT-ATTR-NEXT: ret i64 -1 // size_t test3_bdos(struct annotated *p) { - return __builtin_dynamic_object_size(p, 1); + return __builtin_dynamic_object_size(p, 0); } // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test4( @@ -308,7 +276,7 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT4:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB5:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont4: // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD]], 2 @@ -325,7 +293,7 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP7:%.*]] = icmp ult i64 [[IDXPROM12]], [[TMP6]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP7]], label [[CONT19:%.*]], label [[HANDLER_OUT_OF_BOUNDS15:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds15: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM12]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB6:[0-9]+]], i64 [[IDXPROM12]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont19: // SANITIZE-WITH-ATTR-NEXT: [[TMP8:%.*]] = icmp sgt i32 [[DOT_COUNTED_BY_LOAD6]], 3 @@ -342,7 +310,7 @@ size_t test3_bdos(struct annotated *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP13:%.*]] = icmp ult i64 [[IDXPROM28]], [[TMP12]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP13]], label [[CONT35:%.*]], label [[HANDLER_OUT_OF_BOUNDS31:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds31: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 [[IDXPROM28]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB7:[0-9]+]], i64 [[IDXPROM28]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont35: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX33:%.*]] = getelementptr inbounds [0 x i32], ptr [[ARRAY]], i64 0, i64 [[IDXPROM28]] @@ -488,39 +456,27 @@ size_t test4_bdos(struct annotated *p, int index) { // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test5( // SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOT_COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize 
[[META2]] +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 +// SANITIZE-WITH-ATTR-NEXT: [[DOTCOUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOTCOUNTED_BY_GEP]], align 4 +// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOTCOUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB8:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP1]], i64 0, i64 [[IDXPROM]] -// SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD_TR:%.*]] = trunc i64 [[DOT_COUNTED_BY_LOAD]] to i32 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD_TR]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = add i32 [[TMP2]], 16 -// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP3]] -// SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] +// SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA4]] // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test5( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[P:%.*]], i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD_TR:%.*]] = trunc i64 [[DOT_COUNTED_BY_LOAD]] to i32 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl i32 [[DOT_COUNTED_BY_LOAD_TR]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = add i32 [[TMP0]], 16 -// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP1]] -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 +// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 -// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP2]], i64 0, i64 [[IDXPROM]] -// NO-SANITIZE-WITH-ATTR-NEXT: store i32 [[CONV]], ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] +// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr [[TMP0]], i64 0, i64 [[IDXPROM]] +// NO-SANITIZE-WITH-ATTR-NEXT: store i32 -1, ptr [[ARRAYIDX]], align 4, !tbaa [[TBAA2]] // NO-SANITIZE-WITH-ATTR-NEXT: ret void // // SANITIZE-WITHOUT-ATTR-LABEL: define 
dso_local void @test5( @@ -545,27 +501,15 @@ void test5(struct anon_struct *p, int index) { p->array[index] = __builtin_dynamic_object_size(p, 1); } -// SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 16, 1) i64 @test5_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test5_bdos( +// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl nuw i64 [[DOT_COUNTED_BY_LOAD]], 2 -// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 16 -// SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 -// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = select i1 [[DOTINV]], i64 0, i64 [[TMP1]] -// SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP2]] +// SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // -// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local range(i64 16, 1) i64 @test5_bdos( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { +// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test5_bdos( +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i64, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = shl nuw i64 [[DOT_COUNTED_BY_LOAD]], 2 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = add nuw i64 [[TMP0]], 16 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i64 [[DOT_COUNTED_BY_LOAD]], 0 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = select i1 [[DOTINV]], i64 0, i64 [[TMP1]] -// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 [[TMP2]] +// NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i64 @test5_bdos( // SANITIZE-WITHOUT-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR2]] { @@ -590,7 +534,7 @@ size_t test5_bdos(struct anon_struct *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i64 [[DOT_COUNTED_BY_LOAD]], [[IDXPROM]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB9:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 16 @@ -683,7 +627,7 @@ size_t test6_bdos(struct anon_struct *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: 
handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB11:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont7: // SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 9 @@ -723,12 +667,12 @@ void test7(struct union_of_fams *p, int index) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test7_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test7_bdos( -// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -756,7 +700,7 @@ size_t test7_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB12:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont7: // SANITIZE-WITH-ATTR-NEXT: [[INTS:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 9 @@ -837,7 +781,7 @@ size_t test8_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP1]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB14:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB14:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont7: // SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 @@ -877,12 +821,12 @@ void test9(struct union_of_fams *p, int index) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test9_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i64 @test9_bdos( -// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) 
local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -910,7 +854,7 @@ size_t test9_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT7:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB15:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB15:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont7: // SANITIZE-WITH-ATTR-NEXT: [[BYTES:%.*]] = getelementptr inbounds i8, ptr [[P]], i64 12 @@ -997,7 +941,7 @@ size_t test10_bdos(struct union_of_fams *p) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB16:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB16:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 12 @@ -1037,12 +981,12 @@ void test11(struct annotated *p, int index) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test11_bdos( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 4 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test11_bdos( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 4 // @@ -1076,16 +1020,16 @@ struct hang { int test12_a, test12_b; // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test12( -// SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: [[BAZ:%.*]] = alloca [[STRUCT_HANG:%.*]], align 4 -// SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[BAZ]]) #[[ATTR11:[0-9]+]] +// SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[BAZ]]) #[[ATTR10:[0-9]+]] // SANITIZE-WITH-ATTR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) [[BAZ]], ptr noundef nonnull align 4 dereferenceable(24) 
@test12_bar, i64 24, i1 false), !tbaa.struct [[TBAA_STRUCT9:![0-9]+]] // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ult i32 [[INDEX]], 6 // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[INDEX]] to i64 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB18:[0-9]+]], i64 [[TMP1]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB18:[0-9]+]], i64 [[TMP1]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [6 x i32], ptr [[BAZ]], i64 0, i64 [[TMP1]] @@ -1095,17 +1039,17 @@ int test12_a, test12_b; // SANITIZE-WITH-ATTR-NEXT: [[DOTNOT:%.*]] = icmp eq i32 [[DOTCOUNTED_BY_LOAD]], 0 // SANITIZE-WITH-ATTR-NEXT: br i1 [[DOTNOT]], label [[HANDLER_OUT_OF_BOUNDS4:%.*]], label [[HANDLER_TYPE_MISMATCH6:%.*]], !prof [[PROF10:![0-9]+]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds4: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 0) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB20:[0-9]+]], i64 0) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.type_mismatch6: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB21:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4) to i64)) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_type_mismatch_v1_abort(ptr nonnull @[[GLOB21:[0-9]+]], i64 ptrtoint (ptr getelementptr inbounds nuw (i8, ptr @test12_foo, i64 4) to i64)) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test12( -// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR5:[0-9]+]] { +// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR4:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[BAZ:%.*]] = alloca [[STRUCT_HANG:%.*]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[BAZ]]) #[[ATTR12:[0-9]+]] +// NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.lifetime.start.p0(i64 24, ptr nonnull [[BAZ]]) #[[ATTR11:[0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr noundef nonnull align 4 dereferenceable(24) [[BAZ]], ptr noundef nonnull align 4 dereferenceable(24) @test12_bar, i64 24, i1 false), !tbaa.struct [[TBAA_STRUCT7:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[INDEX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [6 x i32], ptr [[BAZ]], i64 0, i64 [[IDXPROM]] @@ -1188,7 +1132,7 @@ struct test13_bar { // SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = icmp ult i64 [[INDEX]], [[TMP1]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP2]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // 
SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 [[INDEX]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB24:[0-9]+]], i64 [[INDEX]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont5: // SANITIZE-WITH-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16 @@ -1197,7 +1141,7 @@ struct test13_bar { // SANITIZE-WITH-ATTR-NEXT: ret i32 0 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i32 @test13( -// NO-SANITIZE-WITH-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR8:[0-9]+]] { +// NO-SANITIZE-WITH-ATTR-SAME: i64 noundef [[INDEX:%.*]]) local_unnamed_addr #[[ATTR7:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr @test13_f, align 8, !tbaa [[TBAA8:![0-9]+]] // NO-SANITIZE-WITH-ATTR-NEXT: [[REVMAP:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP0]], i64 16 @@ -1249,14 +1193,14 @@ struct test14_foo { // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[TRAP:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB25:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB25:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: trap: -// SANITIZE-WITH-ATTR-NEXT: tail call void @llvm.trap() #[[ATTR10]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @llvm.trap() #[[ATTR9]] // SANITIZE-WITH-ATTR-NEXT: unreachable // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test14( -// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[DOTCOMPOUNDLITERAL:%.*]] = alloca [[STRUCT_TEST14_FOO:%.*]], align 4 // NO-SANITIZE-WITH-ATTR-NEXT: store i32 1, ptr [[DOTCOMPOUNDLITERAL]], align 4, !tbaa [[TBAA2]] @@ -1305,14 +1249,14 @@ int test14(int idx) { // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[TRAP:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: // SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB27:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB27:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: trap: -// SANITIZE-WITH-ATTR-NEXT: tail call void @llvm.trap() #[[ATTR10]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @llvm.trap() #[[ATTR9]] // SANITIZE-WITH-ATTR-NEXT: unreachable // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test15( -// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) 
local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i32], ptr getelementptr inbounds nuw (i8, ptr @__const.test15.foo, i64 8), i64 0, i64 [[IDXPROM]] @@ -1350,12 +1294,12 @@ int test15(int idx) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test19( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test19( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -1375,12 +1319,12 @@ size_t test19(struct annotated *p) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test20( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test20( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -1400,12 +1344,12 @@ size_t test20(struct annotated *p) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test21( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test21( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -1425,12 +1369,12 @@ size_t test21(struct annotated *p) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test22( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test22( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -1450,12 +1394,12 @@ size_t test22(struct 
annotated *p) { } // SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test23( -// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local noundef i64 @test23( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR4]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readnone [[P:%.*]]) local_unnamed_addr #[[ATTR3]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: ret i64 -1 // @@ -1487,7 +1431,7 @@ struct tests_foo { // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT4:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB28:[0-9]+]], i64 10) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont4: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX2:%.*]] = getelementptr inbounds i8, ptr [[VAR]], i64 84 @@ -1528,7 +1472,7 @@ int test24(int c, struct tests_foo *var) { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ugt i32 [[DOTCOUNTED_BY_LOAD]], 10 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB29:[0-9]+]], i64 10) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB29:[0-9]+]], i64 10) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont5: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 44 @@ -1536,7 +1480,7 @@ int test24(int c, struct tests_foo *var) { // SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP2]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test25( -// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr nocapture noundef readonly [[VAR:%.*]]) local_unnamed_addr #[[ATTR9:[0-9]+]] { +// NO-SANITIZE-WITH-ATTR-SAME: i32 noundef [[C:%.*]], ptr nocapture noundef readonly [[VAR:%.*]]) local_unnamed_addr #[[ATTR8:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[VAR]], align 8, !tbaa [[TBAA11]] // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds i8, ptr [[TMP0]], i64 44 @@ -1580,7 +1524,7 @@ struct test26_foo { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT5:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB30:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// 
SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB30:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont5: // SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[FOO]], i64 8 @@ -1651,7 +1595,7 @@ struct test27_foo { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP0]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP1]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB32:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB32:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ENTRIES:%.*]] = getelementptr inbounds nuw i8, ptr [[P]], i64 24 @@ -1717,7 +1661,7 @@ struct test28_foo { // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM]], [[TMP3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label [[CONT17:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB34:[0-9]+]], i64 [[IDXPROM]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont17: // SANITIZE-WITH-ATTR-NEXT: [[ARR:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12 @@ -1726,7 +1670,7 @@ struct test28_foo { // SANITIZE-WITH-ATTR-NEXT: ret i32 [[TMP5]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test28( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR9]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[P:%.*]], i32 noundef [[I:%.*]]) local_unnamed_addr #[[ATTR8]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = load ptr, ptr [[P]], align 8, !tbaa [[TBAA11]] // NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = load ptr, ptr [[TMP0]], align 8, !tbaa [[TBAA11]] @@ -1779,7 +1723,7 @@ struct annotated_struct_array { // SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = zext i32 [[IDX1]] to i64 // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP0]], label [[CONT3:%.*]], label [[HANDLER_OUT_OF_BOUNDS:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB36:[0-9]+]], i64 [[TMP1]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB36:[0-9]+]], i64 [[TMP1]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont3: // SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[TMP1]] @@ -1791,7 +1735,7 @@ struct 
annotated_struct_array { // SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = icmp ult i64 [[IDXPROM15]], [[TMP3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: br i1 [[TMP4]], label [[CONT20:%.*]], label [[HANDLER_OUT_OF_BOUNDS16:%.*]], !prof [[PROF3]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR: handler.out_of_bounds16: -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB37:[0-9]+]], i64 [[IDXPROM15]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB37:[0-9]+]], i64 [[IDXPROM15]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // SANITIZE-WITH-ATTR: cont20: // SANITIZE-WITH-ATTR-NEXT: [[ARRAY:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP2]], i64 12 @@ -1803,7 +1747,7 @@ struct annotated_struct_array { // SANITIZE-WITH-ATTR-NEXT: ret void // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test29( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[ANN:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR10:[0-9]+]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[ANN:%.*]], i32 noundef [[IDX1:%.*]], i32 noundef [[IDX2:%.*]]) local_unnamed_addr #[[ATTR9:[0-9]+]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 [[IDX1]] to i64 // NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [10 x ptr], ptr [[ANN]], i64 0, i64 [[IDXPROM]] @@ -1865,26 +1809,19 @@ struct test30_struct { }; // SANITIZE-WITH-ATTR-LABEL: define dso_local void @test30( -// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR5]] { +// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR4]] { // SANITIZE-WITH-ATTR-NEXT: entry: // SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = zext i32 [[IDX]] to i64, !nosanitize [[META2]] -// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB39:[0-9]+]], i64 [[TMP0]]) #[[ATTR10]], !nosanitize [[META2]] +// SANITIZE-WITH-ATTR-NEXT: tail call void @__ubsan_handle_out_of_bounds_abort(ptr nonnull @[[GLOB39:[0-9]+]], i64 [[TMP0]]) #[[ATTR9]], !nosanitize [[META2]] // SANITIZE-WITH-ATTR-NEXT: unreachable, !nosanitize [[META2]] // // NO-SANITIZE-WITH-ATTR-LABEL: define dso_local void @test30( -// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR1]] { +// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR0]] { // NO-SANITIZE-WITH-ATTR-NEXT: entry: -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_GEP:%.*]] = getelementptr inbounds i8, ptr [[PTR]], i64 8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[DOT_COUNTED_BY_GEP]], align 4 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = tail call i32 @llvm.smax.i32(i32 [[DOT_COUNTED_BY_LOAD]], i32 4) -// NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = trunc i32 [[TMP0]] to i8 -// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = add i8 [[TMP1]], 12 -// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i8 0, i8 [[TMP2]] // NO-SANITIZE-WITH-ATTR-NEXT: [[PCPU_REFCNT:%.*]] = getelementptr inbounds nuw i8, ptr [[PTR]], i64 12 // NO-SANITIZE-WITH-ATTR-NEXT: [[IDXPROM:%.*]] = sext i32 
[[IDX]] to i64
// NO-SANITIZE-WITH-ATTR-NEXT: [[ARRAYIDX:%.*]] = getelementptr inbounds [0 x i8], ptr [[PCPU_REFCNT]], i64 0, i64 [[IDXPROM]]
-// NO-SANITIZE-WITH-ATTR-NEXT: store i8 [[CONV]], ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]]
+// NO-SANITIZE-WITH-ATTR-NEXT: store i8 -1, ptr [[ARRAYIDX]], align 1, !tbaa [[TBAA6]]
// NO-SANITIZE-WITH-ATTR-NEXT: ret void
//
// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local void @test30(
@@ -1916,30 +1853,14 @@ struct test31_struct {
};
// SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test31(
-// SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] {
// SANITIZE-WITH-ATTR-NEXT: entry:
-// SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[PTR]], align 4
-// SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64
-// SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2
-// SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0)
-// SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
-// SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 4
-// SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0
-// SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]]
-// SANITIZE-WITH-ATTR-NEXT: ret i32 [[CONV]]
+// SANITIZE-WITH-ATTR-NEXT: ret i32 -1
//
// NO-SANITIZE-WITH-ATTR-LABEL: define dso_local i32 @test31(
-// NO-SANITIZE-WITH-ATTR-SAME: ptr nocapture noundef readonly [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] {
+// NO-SANITIZE-WITH-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR3]] {
// NO-SANITIZE-WITH-ATTR-NEXT: entry:
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOT_COUNTED_BY_LOAD:%.*]] = load i32, ptr [[PTR]], align 4
-// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP0:%.*]] = sext i32 [[DOT_COUNTED_BY_LOAD]] to i64
-// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP1:%.*]] = shl nsw i64 [[TMP0]], 2
-// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP2:%.*]] = tail call i64 @llvm.smax.i64(i64 [[TMP1]], i64 0)
-// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP3:%.*]] = trunc i64 [[TMP2]] to i32
-// NO-SANITIZE-WITH-ATTR-NEXT: [[TMP4:%.*]] = add i32 [[TMP3]], 4
-// NO-SANITIZE-WITH-ATTR-NEXT: [[DOTINV:%.*]] = icmp slt i32 [[DOT_COUNTED_BY_LOAD]], 0
-// NO-SANITIZE-WITH-ATTR-NEXT: [[CONV:%.*]] = select i1 [[DOTINV]], i32 0, i32 [[TMP4]]
-// NO-SANITIZE-WITH-ATTR-NEXT: ret i32 [[CONV]]
+// NO-SANITIZE-WITH-ATTR-NEXT: ret i32 -1
//
// SANITIZE-WITHOUT-ATTR-LABEL: define dso_local i32 @test31(
// SANITIZE-WITHOUT-ATTR-SAME: ptr noundef [[PTR:%.*]], i32 noundef [[IDX:%.*]]) local_unnamed_addr #[[ATTR2]] {
-- 
GitLab

From 5033ea73bb01061feb09b3216c74619e1fbefdeb Mon Sep 17 00:00:00 2001
From: Jacob Lalonde
Date: Thu, 17 Oct 2024 15:26:05 -0700
Subject: [PATCH 307/329] [LLDB][Minidump] Add breakpoint stop reasons to the
 minidump. (#108448)

Recently my coworker @jeffreytan81 pointed out that Minidumps don't
show breakpoints when collected. This was previously blocked because
Minidumps could only contain one exception; now that we support N
signals/sections, we can save all the threads stopped on breakpoints.
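To make the encoding concrete, here is a rough sketch of the packing
scheme (illustrative only: PackStopDescription is a made-up helper,
and the field types follow llvm::minidump::Exception as extended by
this patch):

    // Hypothetical helper showing the packing scheme; error handling elided.
    #include "llvm/BinaryFormat/Minidump.h"
    #include <algorithm>
    #include <cstring>
    #include <string>

    using llvm::minidump::Exception;

    void PackStopDescription(Exception &record, const std::string &desc) {
      // Tag the parameter area as holding an LLDB stop description.
      record.ExceptionFlags = Exception::LLDB_FLAG;
      record.NumberParameters = 1;
      // Copy at most MaxParameterBytes (15 * 8 == 120 bytes) so a long
      // description cannot overflow the fixed-size parameter array.
      memcpy(&record.ExceptionInformation, desc.c_str(),
             std::min(desc.size(), Exception::MaxParameterBytes));
    }

A consumer does the inverse: when ExceptionFlags equals LLDB_FLAG, it
reinterprets ExceptionInformation as the description bytes, which is
what ProcessMinidump::RefreshStateAfterStop does below.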
--- .../Minidump/MinidumpFileBuilder.cpp | 82 +++++++++---------- .../ObjectFile/Minidump/MinidumpFileBuilder.h | 1 - .../Process/minidump/ProcessMinidump.cpp | 12 ++- .../minidump-new/TestMiniDumpNew.py | 20 +++++ .../linux-x86_64-exceptiondescription.yaml | 37 +++++++++ llvm/include/llvm/BinaryFormat/Minidump.h | 2 + 6 files changed, 107 insertions(+), 47 deletions(-) create mode 100644 lldb/test/API/functionalities/postmortem/minidump-new/linux-x86_64-exceptiondescription.yaml diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp index f6c16b6e3d96..bcac5edbc1a7 100644 --- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp +++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.cpp @@ -75,8 +75,7 @@ Status MinidumpFileBuilder::AddHeaderAndCalculateDirectories() { StopInfoSP stop_info_sp = thread_sp->GetStopInfo(); if (stop_info_sp) { const StopReason &stop_reason = stop_info_sp->GetStopReason(); - if (stop_reason == StopReason::eStopReasonException || - stop_reason == StopReason::eStopReasonSignal) + if (stop_reason != lldb::eStopReasonInvalid) m_expected_directories++; } } @@ -685,50 +684,45 @@ Status MinidumpFileBuilder::AddExceptions() { Status error; for (const ThreadSP &thread_sp : thread_list) { StopInfoSP stop_info_sp = thread_sp->GetStopInfo(); - bool add_exception = false; - if (stop_info_sp) { - switch (stop_info_sp->GetStopReason()) { - case eStopReasonSignal: - case eStopReasonException: - add_exception = true; - break; - default: - break; - } - } - if (add_exception) { - constexpr size_t minidump_exception_size = - sizeof(llvm::minidump::ExceptionStream); - error = AddDirectory(StreamType::Exception, minidump_exception_size); - if (error.Fail()) - return error; + // If we don't have a stop info, or if it's invalid, skip. 
+ if (!stop_info_sp ||
+ stop_info_sp->GetStopReason() == lldb::eStopReasonInvalid)
+ continue;

- StopInfoSP stop_info_sp = thread_sp->GetStopInfo();
- RegisterContextSP reg_ctx_sp(thread_sp->GetRegisterContext());
- Exception exp_record = {};
- exp_record.ExceptionCode =
- static_cast<llvm::support::ulittle32_t>(stop_info_sp->GetValue());
- exp_record.ExceptionFlags = static_cast<llvm::support::ulittle32_t>(0);
- exp_record.ExceptionRecord = static_cast<llvm::support::ulittle64_t>(0);
- exp_record.ExceptionAddress = reg_ctx_sp->GetPC();
- exp_record.NumberParameters = static_cast<llvm::support::ulittle32_t>(0);
- exp_record.UnusedAlignment = static_cast<llvm::support::ulittle32_t>(0);
- // exp_record.ExceptionInformation;
-
- ExceptionStream exp_stream;
- exp_stream.ThreadId =
- static_cast<llvm::support::ulittle32_t>(thread_sp->GetID());
- exp_stream.UnusedAlignment = static_cast<llvm::support::ulittle32_t>(0);
- exp_stream.ExceptionRecord = exp_record;
- auto Iter = m_tid_to_reg_ctx.find(thread_sp->GetID());
- if (Iter != m_tid_to_reg_ctx.end()) {
- exp_stream.ThreadContext = Iter->second;
- } else {
- exp_stream.ThreadContext.DataSize = 0;
- exp_stream.ThreadContext.RVA = 0;
- }
- m_data.AppendData(&exp_stream, minidump_exception_size);
+ constexpr size_t minidump_exception_size =
+ sizeof(llvm::minidump::ExceptionStream);
+ error = AddDirectory(StreamType::Exception, minidump_exception_size);
+ if (error.Fail())
+ return error;
+
+ RegisterContextSP reg_ctx_sp(thread_sp->GetRegisterContext());
+ Exception exp_record = {};
+ exp_record.ExceptionCode =
+ static_cast<llvm::support::ulittle32_t>(stop_info_sp->GetValue());
+ exp_record.ExceptionFlags =
+ static_cast<llvm::support::ulittle32_t>(Exception::LLDB_FLAG);
+ exp_record.ExceptionRecord = static_cast<llvm::support::ulittle64_t>(0);
+ exp_record.ExceptionAddress = reg_ctx_sp->GetPC();
+ exp_record.NumberParameters = static_cast<llvm::support::ulittle32_t>(1);
+ std::string description = stop_info_sp->GetDescription();
+ // We have 120 bytes to work with and it's unlikely description will
+ // overflow, but we have to check.
+ memcpy(&exp_record.ExceptionInformation, description.c_str(),
+ std::min(description.size(), Exception::MaxParameterBytes));
+ exp_record.UnusedAlignment = static_cast<llvm::support::ulittle32_t>(0);
+ ExceptionStream exp_stream;
+ exp_stream.ThreadId =
+ static_cast<llvm::support::ulittle32_t>(thread_sp->GetID());
+ exp_stream.UnusedAlignment = static_cast<llvm::support::ulittle32_t>(0);
+ exp_stream.ExceptionRecord = exp_record;
+ auto Iter = m_tid_to_reg_ctx.find(thread_sp->GetID());
+ if (Iter != m_tid_to_reg_ctx.end()) {
+ exp_stream.ThreadContext = Iter->second;
+ } else {
+ exp_stream.ThreadContext.DataSize = 0;
+ exp_stream.ThreadContext.RVA = 0;
}
+ m_data.AppendData(&exp_stream, minidump_exception_size);
}
return error;
diff --git a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
index a4240f871c8a..58b284608bd5 100644
--- a/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
+++ b/lldb/source/Plugins/ObjectFile/Minidump/MinidumpFileBuilder.h
@@ -175,5 +175,4 @@ private:
lldb::FileUP m_core_file;
lldb_private::SaveCoreOptions m_save_core_options;
};
-
#endif // LLDB_SOURCE_PLUGINS_OBJECTFILE_MINIDUMP_MINIDUMPFILEBUILDER_H
diff --git a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp
index 5ea3db23f114..5b0df72130c1 100644
--- a/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp
+++ b/lldb/source/Plugins/Process/minidump/ProcessMinidump.cpp
@@ -276,8 +276,16 @@ void ProcessMinidump::RefreshStateAfterStop() {
// No stop.
return; } - - stop_info = StopInfo::CreateStopReasonWithSignal(*stop_thread, signo); + const char *description = nullptr; + if (exception_stream.ExceptionRecord.ExceptionFlags == + llvm::minidump::Exception::LLDB_FLAG) + description = reinterpret_cast( + exception_stream.ExceptionRecord.ExceptionInformation); + + llvm::StringRef description_str(description, + Exception::MaxParameterBytes); + stop_info = StopInfo::CreateStopReasonWithSignal( + *stop_thread, signo, description_str.str().c_str()); } else if (arch.GetTriple().getVendor() == llvm::Triple::Apple) { stop_info = StopInfoMachException::CreateStopReasonWithMachException( *stop_thread, exception_stream.ExceptionRecord.ExceptionCode, 2, diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py index 5a0b6e790a42..8776d72ecbc0 100644 --- a/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py +++ b/lldb/test/API/functionalities/postmortem/minidump-new/TestMiniDumpNew.py @@ -524,3 +524,23 @@ class MiniDumpNewTestCase(TestBase): self.assertStopReason(thread.GetStopReason(), lldb.eStopReasonSignal) stop_description = thread.GetStopDescription(256) self.assertIn("SIGSEGV", stop_description) + + def test_breakpoint_on_minidump(self): + """ + Test that LLDB breakpoints are recorded in Minidumps + """ + yaml = "linux-x86_64-exceptiondescription.yaml" + core = self.getBuildArtifact("breakpoint.core.dmp") + self.yaml2obj(yaml, core) + try: + # Create a target with the object file we just created from YAML + target = self.dbg.CreateTarget(None) + self.assertTrue(target, VALID_TARGET) + process = target.LoadCore(core) + self.assertTrue(process, VALID_PROCESS) + thread = process.GetThreadAtIndex(0) + stop_reason = thread.GetStopDescription(256) + self.assertIn("breakpoint 1.1", stop_reason) + finally: + if os.path.isfile(core): + os.unlink(core) diff --git a/lldb/test/API/functionalities/postmortem/minidump-new/linux-x86_64-exceptiondescription.yaml b/lldb/test/API/functionalities/postmortem/minidump-new/linux-x86_64-exceptiondescription.yaml new file mode 100644 index 000000000000..bf26e05cd775 --- /dev/null +++ b/lldb/test/API/functionalities/postmortem/minidump-new/linux-x86_64-exceptiondescription.yaml @@ -0,0 +1,37 @@ +--- !minidump +Streams: + - Type: SystemInfo + Processor Arch: AMD64 + Processor Level: 6 + Processor Revision: 15876 + Number of Processors: 40 + Platform ID: Linux + CSD Version: 'Linux 3.13.0-91-generic' + CPU: + Vendor ID: GenuineIntel + Version Info: 0x00000000 + Feature Info: 0x00000000 + - Type: ThreadList + Threads: + - Thread Id: 0x31F222 + Context: 00000000000000 + Stack: + Start of Memory Range: 0x7FFFFFFFD660 + Content: '' + - Type: Exception + Thread ID: 0x31F222 + Exception Record: + Exception Code: 0x2 + Exception Flags: 0x4C4C4442 + Exception Address: 0x555555556671 + Number of Parameters: 1 + Parameter 0: 0x696F706B61657262 + Parameter 1: 0x312E3120746E + Parameter 2: 0x1 + Parameter 3: 0x8000000000000000 + Parameter 4: 0x200000002 + Parameter 5: 0x8000000000000002 + Parameter 7: 0x555555556671 + Parameter 8: 0x1 + Thread Context: '' +... 
diff --git a/llvm/include/llvm/BinaryFormat/Minidump.h b/llvm/include/llvm/BinaryFormat/Minidump.h
index 8054e81322a9..addff4298235 100644
--- a/llvm/include/llvm/BinaryFormat/Minidump.h
+++ b/llvm/include/llvm/BinaryFormat/Minidump.h
@@ -246,6 +246,8 @@ static_assert(sizeof(Thread) == 48);
struct Exception {
static constexpr size_t MaxParameters = 15;
+ static constexpr size_t MaxParameterBytes = MaxParameters * sizeof(uint64_t);
+ static const uint32_t LLDB_FLAG = 'LLDB';
support::ulittle32_t ExceptionCode;
support::ulittle32_t ExceptionFlags;
-- 
GitLab

From 71b81e93d28c8db3f9cfa1d715c925a98ae4b153 Mon Sep 17 00:00:00 2001
From: Ryosuke Niwa
Date: Thu, 17 Oct 2024 16:52:31 -0700
Subject: [PATCH 308/329] [alpha.webkit.UncountedLocalVarsChecker] Recursive
 functions are erroneously treated as non-trivial (#110973)

This PR fixes a bug where alpha.webkit.UncountedLocalVarsChecker
erroneously treats a trivial recursive function as non-trivial. The
cause was TrivialFunctionAnalysis::isTrivialImpl, which takes a
statement as an argument: while traversing the statement of a
recursive function to determine its triviality,
TrivialFunctionAnalysisVisitor's WithCachedResult populated the cache
with "false". Because IsFunctionTrivial honors an entry in the cache,
this resulted in the whole function being treated as non-trivial.
Thankfully, TrivialFunctionAnalysisVisitor::IsFunctionTrivial already
handles recursive functions correctly, so this PR applies the same
logic to TrivialFunctionAnalysisVisitor::WithCachedResult by sharing
code between the two functions. This avoids pre-populating the cache
with "false" while traversing statements in a recursive function.
---
 .../Checkers/WebKit/PtrTypesSemantics.cpp     | 58 +++++++------------
 .../Checkers/WebKit/uncounted-local-vars.cpp  | 39 +++++++++++++
 .../Checkers/WebKit/uncounted-obj-arg.cpp     | 12 ++++
 3 files changed, 73 insertions(+), 36 deletions(-)

diff --git a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
index 2298fe39850d..e043806eadd6 100644
--- a/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
+++ b/clang/lib/StaticAnalyzer/Checkers/WebKit/PtrTypesSemantics.cpp
@@ -273,16 +273,29 @@ class TrivialFunctionAnalysisVisitor
return true;
}
- template <typename CheckFunction>
- bool WithCachedResult(const Stmt *S, CheckFunction Function) {
- // If the statement isn't in the cache, conservatively assume that
- // it's not trivial until analysis completes. Insert false to the cache
- // first to avoid infinite recursion.
- auto [It, IsNew] = Cache.insert(std::make_pair(S, false));
+ template <typename StmtOrDecl, typename CheckFunction>
+ bool WithCachedResult(const StmtOrDecl *S, CheckFunction Function) {
+ auto CacheIt = Cache.find(S);
+ if (CacheIt != Cache.end())
+ return CacheIt->second;
+
+ // Treat a recursive statement to be trivial until proven otherwise.
+ auto [RecursiveIt, IsNew] = RecursiveFn.insert(std::make_pair(S, true)); if (!IsNew) - return It->second; + return RecursiveIt->second; + bool Result = Function(); + + if (!Result) { + for (auto &It : RecursiveFn) + It.second = false; + } + RecursiveIt = RecursiveFn.find(S); + assert(RecursiveIt != RecursiveFn.end()); + Result = RecursiveIt->second; + RecursiveFn.erase(RecursiveIt); Cache[S] = Result; + return Result; } @@ -292,16 +305,7 @@ public: TrivialFunctionAnalysisVisitor(CacheTy &Cache) : Cache(Cache) {} bool IsFunctionTrivial(const Decl *D) { - auto CacheIt = Cache.find(D); - if (CacheIt != Cache.end()) - return CacheIt->second; - - // Treat a recursive function call to be trivial until proven otherwise. - auto [RecursiveIt, IsNew] = RecursiveFn.insert(std::make_pair(D, true)); - if (!IsNew) - return RecursiveIt->second; - - bool Result = [&]() { + return WithCachedResult(D, [&]() { if (auto *CtorDecl = dyn_cast(D)) { for (auto *CtorInit : CtorDecl->inits()) { if (!Visit(CtorInit->getInit())) @@ -312,20 +316,7 @@ public: if (!Body) return false; return Visit(Body); - }(); - - if (!Result) { - // D and its mutually recursive callers are all non-trivial. - for (auto &It : RecursiveFn) - It.second = false; - } - RecursiveIt = RecursiveFn.find(D); - assert(RecursiveIt != RecursiveFn.end()); - Result = RecursiveIt->second; - RecursiveFn.erase(RecursiveIt); - Cache[D] = Result; - - return Result; + }); } bool VisitStmt(const Stmt *S) { @@ -586,11 +577,6 @@ bool TrivialFunctionAnalysis::isTrivialImpl( bool TrivialFunctionAnalysis::isTrivialImpl( const Stmt *S, TrivialFunctionAnalysis::CacheTy &Cache) { - // If the statement isn't in the cache, conservatively assume that - // it's not trivial until analysis completes. Unlike a function case, - // we don't insert an entry into the cache until Visit returns - // since Visit* functions themselves make use of the cache. 
-
   TrivialFunctionAnalysisVisitor V(Cache);
   bool Result = V.Visit(S);
   assert(Cache.contains(S) && "Top-level statement not properly cached!");
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
index 25776870dd3a..b5f6b8535bf4 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-local-vars.cpp
@@ -289,3 +289,42 @@ void foo() {
 }
 
 } // namespace local_assignment_to_global
+
+namespace local_var_in_recursive_function {
+
+struct TreeNode {
+  Ref<TreeNode> create() { return Ref(*new TreeNode); }
+
+  void ref() const { ++refCount; }
+  void deref() const {
+    if (!--refCount)
+      delete this;
+  }
+
+  int recursiveCost();
+  int recursiveWeight();
+  int weight();
+
+  int cost { 0 };
+  mutable unsigned refCount { 0 };
+  TreeNode* nextSibling { nullptr };
+  TreeNode* firstChild { nullptr };
+};
+
+int TreeNode::recursiveCost() {
+  // no warnings
+  unsigned totalCost = cost;
+  for (TreeNode* node = firstChild; node; node = node->nextSibling)
+    totalCost += recursiveCost();
+  return totalCost;
+}
+
+int TreeNode::recursiveWeight() {
+  unsigned totalCost = weight();
+  for (TreeNode* node = firstChild; node; node = node->nextSibling)
+    // expected-warning@-1{{Local variable 'node' is uncounted and unsafe [alpha.webkit.UncountedLocalVarsChecker]}}
+    totalCost += recursiveWeight();
+  return totalCost;
+}
+
+} // namespace local_var_in_recursive_function
diff --git a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
index 1a42de90105a..10da776f8157 100644
--- a/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
+++ b/clang/test/Analysis/Checkers/WebKit/uncounted-obj-arg.cpp
@@ -259,6 +259,15 @@ public:
   void mutuallyRecursive8() { mutuallyRecursive9(); someFunction(); }
   void mutuallyRecursive9() { mutuallyRecursive8(); }
 
+  int recursiveCost() {
+    unsigned totalCost = 0;
+    for (unsigned i = 0; i < sizeof(children)/sizeof(*children); ++i) {
+      if (auto* child = children[i])
+        totalCost += child->recursiveCost();
+    }
+    return totalCost;
+  }
+
   int trivial1() { return 123; }
   float trivial2() { return 0.3; }
   float trivial3() { return (float)0.4; }
@@ -448,6 +457,7 @@ public:
   Number* number { nullptr };
   ComplexNumber complex;
   Enum enumValue { Enum::Value1 };
+  RefCounted* children[4];
 };
 
 unsigned RefCounted::s_v = 0;
@@ -558,6 +568,8 @@ public:
     getFieldTrivial().mutuallyRecursive9();
     // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
 
+    getFieldTrivial().recursiveCost(); // no-warning
+
     getFieldTrivial().someFunction();
     // expected-warning@-1{{Call argument for 'this' parameter is uncounted and unsafe}}
     getFieldTrivial().nonTrivial1();
-- 
GitLab


From 46df20ab63ee8c14c5d4eef07e2a7cccd466c064 Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Thu, 17 Oct 2024 16:59:49 -0700
Subject: [PATCH 309/329] [sanitizer] Add TryMemCpy (#112668)

The POSIX implementation is similar to `IsAccessibleMemoryRange`, using
`pipe`. We need this because we can't rely on a non-atomic
`IsAccessibleMemoryRange` + `memcpy` pair: the protection or mapping may
change between the check and the copy, and we may crash.
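The key observation: write() makes the kernel copy from the source range
on our behalf, and an unreadable page surfaces as an EFAULT error code
instead of a fatal SIGSEGV. A standalone sketch of just that probe
(simplified from the implementation below: no EINTR retries and no
chunked read-back, so it only handles sizes under the default pipe
capacity):

  #include <unistd.h>

  #include <cstddef>

  // Returns true if all n bytes at src were readable at the time of the call.
  bool ProbeReadable(const void *src, size_t n) {
    int fds[2];
    if (pipe(fds) != 0)
      return false;
    // Kernel-side copy from src: a bad page yields EFAULT, not a crash.
    bool ok = write(fds[1], src, n) == static_cast<ssize_t>(n);
    close(fds[0]);
    close(fds[1]);
    return ok;
  }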
--- .../lib/sanitizer_common/sanitizer_common.h | 6 +++ .../sanitizer_common/sanitizer_fuchsia.cpp | 5 ++ .../sanitizer_posix_libcdep.cpp | 44 +++++++++++++++++ .../lib/sanitizer_common/sanitizer_win.cpp | 5 ++ .../tests/sanitizer_posix_test.cpp | 49 +++++++++++++++++-- 5 files changed, 105 insertions(+), 4 deletions(-) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h index 082d2158e579..3a28420ed02d 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h @@ -268,7 +268,13 @@ class ScopedErrorReportLock { extern uptr stoptheworld_tracer_pid; extern uptr stoptheworld_tracer_ppid; +// Returns true if the entire range can be read. bool IsAccessibleMemoryRange(uptr beg, uptr size); +// Attempts to copy `n` bytes from memory range starting at `src` to `dest`. +// Returns true if the entire range can be read. Returns `false` if any part of +// the source range cannot be read, in which case the contents of `dest` are +// undefined. +bool TryMemCpy(void *dest, const void *src, uptr n); // Error report formatting. const char *StripPathPrefix(const char *filepath, diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp index 75dcf546729f..c2ace46c9465 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_fuchsia.cpp @@ -444,6 +444,11 @@ bool IsAccessibleMemoryRange(uptr beg, uptr size) { return status == ZX_OK; } +bool TryMemCpy(void *dest, const void *src, uptr n) { + // TODO: implement. + return false; +} + // FIXME implement on this platform. void GetMemoryProfile(fill_profile_f cb, uptr *stats) {} diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp index 3ab83977a4ee..7ee2319456d2 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_posix_libcdep.cpp @@ -326,6 +326,50 @@ bool IsAccessibleMemoryRange(uptr beg, uptr size) { return true; } +bool TryMemCpy(void *dest, const void *src, uptr n) { + if (!n) + return true; + int fds[2]; + CHECK_EQ(0, pipe(fds)); + + auto cleanup = at_scope_exit([&]() { + internal_close(fds[0]); + internal_close(fds[1]); + }); + + SetNonBlock(fds[0]); + SetNonBlock(fds[1]); + + char *d = static_cast(dest); + const char *s = static_cast(src); + + while (n) { + int e; + uptr w = internal_write(fds[1], s, n); + if (internal_iserror(w, &e)) { + if (e == EINTR) + continue; + CHECK_EQ(EFAULT, e); + return false; + } + s += w; + n -= w; + + while (w) { + uptr r = internal_read(fds[0], d, w); + if (internal_iserror(r, &e)) { + CHECK_EQ(EINTR, e); + continue; + } + + d += r; + w -= r; + } + } + + return true; +} + void PlatformPrepareForSandboxing(void *args) { // Some kinds of sandboxes may forbid filesystem access, so we won't be able // to read the file mappings from /proc/self/maps. Luckily, neither the diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp index 6fb947aa6d6c..ea513d5f263f 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_win.cpp @@ -968,6 +968,11 @@ bool IsAccessibleMemoryRange(uptr beg, uptr size) { return true; } +bool TryMemCpy(void *dest, const void *src, uptr n) { + // TODO: implement. 
+ return false; +} + bool SignalContext::IsStackOverflow() const { return (DWORD)GetType() == EXCEPTION_STACK_OVERFLOW; } diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp index 04890f2f5e2a..658ca60175b3 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp @@ -13,11 +13,14 @@ #include "sanitizer_common/sanitizer_platform.h" #if SANITIZER_POSIX -#include "sanitizer_common/sanitizer_common.h" -#include "gtest/gtest.h" +# include +# include -#include -#include +# include +# include + +# include "gtest/gtest.h" +# include "sanitizer_common/sanitizer_common.h" namespace __sanitizer { @@ -86,6 +89,44 @@ TEST(SanitizerCommon, IsAccessibleMemoryRangeLarge) { buffer.size())); } +TEST(SanitizerCommon, TryMemCpy) { + std::vector src(10000000); + std::iota(src.begin(), src.end(), 123); + std::vector dst; + + // Don't use ::testing::ElementsAreArray or similar, as the huge output on an + // error is not helpful. + + dst.assign(1, 0); + EXPECT_TRUE(TryMemCpy(dst.data(), src.data(), dst.size())); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), src.begin())); + + dst.assign(100, 0); + EXPECT_TRUE(TryMemCpy(dst.data(), src.data(), dst.size())); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), src.begin())); + + dst.assign(534, 0); + EXPECT_TRUE(TryMemCpy(dst.data(), src.data(), dst.size())); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), src.begin())); + + dst.assign(GetPageSize(), 0); + EXPECT_TRUE(TryMemCpy(dst.data(), src.data(), dst.size())); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), src.begin())); + + dst.assign(src.size(), 0); + EXPECT_TRUE(TryMemCpy(dst.data(), src.data(), dst.size())); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), src.begin())); + + dst.assign(src.size() - 1, 0); + EXPECT_TRUE(TryMemCpy(dst.data(), src.data(), dst.size())); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), src.begin())); +} + +TEST(SanitizerCommon, TryMemCpyNull) { + std::vector dst(100); + EXPECT_FALSE(TryMemCpy(dst.data(), nullptr, dst.size())); +} + } // namespace __sanitizer #endif // SANITIZER_POSIX -- GitLab From 7106de9573c29db5d107a2f4ab02d8621eea2510 Mon Sep 17 00:00:00 2001 From: Vitaly Buka Date: Thu, 17 Oct 2024 17:27:57 -0700 Subject: [PATCH 310/329] [sanitizer] Add MemCpyAccessible (#112794) A layer over `TryMemCpy` to copy only available pages. --- .../lib/sanitizer_common/sanitizer_common.h | 2 + .../sanitizer_common_libcdep.cpp | 26 +++++++++++++ .../tests/sanitizer_posix_test.cpp | 37 +++++++++++++++++++ 3 files changed, 65 insertions(+) diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common.h b/compiler-rt/lib/sanitizer_common/sanitizer_common.h index 3a28420ed02d..0b5e68c5fd79 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common.h +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common.h @@ -275,6 +275,8 @@ bool IsAccessibleMemoryRange(uptr beg, uptr size); // the source range cannot be read, in which case the contents of `dest` are // undefined. bool TryMemCpy(void *dest, const void *src, uptr n); +// Copies accessible memory, and zero fill inaccessible. +void MemCpyAccessible(void *dest, const void *src, uptr n); // Error report formatting. 
const char *StripPathPrefix(const char *filepath, diff --git a/compiler-rt/lib/sanitizer_common/sanitizer_common_libcdep.cpp b/compiler-rt/lib/sanitizer_common/sanitizer_common_libcdep.cpp index 684720963a8d..f275e81ff041 100644 --- a/compiler-rt/lib/sanitizer_common/sanitizer_common_libcdep.cpp +++ b/compiler-rt/lib/sanitizer_common/sanitizer_common_libcdep.cpp @@ -219,6 +219,32 @@ static void StopStackDepotBackgroundThread() { static void StopStackDepotBackgroundThread() {} #endif +void MemCpyAccessible(void *dest, const void *src, uptr n) { + if (TryMemCpy(dest, src, n)) + return; + + const uptr page_size = GetPageSize(); + uptr b = reinterpret_cast(src); + uptr b_up = RoundUpTo(b, page_size); + + uptr e = reinterpret_cast(src) + n; + uptr e_down = RoundDownTo(e, page_size); + + auto copy_or_zero = [dest, src](uptr beg, uptr end) { + const uptr udest = reinterpret_cast(dest); + const uptr usrc = reinterpret_cast(src); + void *d = reinterpret_cast(udest + (beg - usrc)); + const uptr size = end - beg; + if (!TryMemCpy(d, reinterpret_cast(beg), size)) + internal_memset(d, 0, size); + }; + + copy_or_zero(b, b_up); + for (uptr p = b_up; p < e_down; p += page_size) + copy_or_zero(p, p + page_size); + copy_or_zero(e_down, e); +} + } // namespace __sanitizer SANITIZER_INTERFACE_WEAK_DEF(void, __sanitizer_sandbox_on_notify, diff --git a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp index 658ca60175b3..5016b09c1530 100644 --- a/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp +++ b/compiler-rt/lib/sanitizer_common/tests/sanitizer_posix_test.cpp @@ -127,6 +127,43 @@ TEST(SanitizerCommon, TryMemCpyNull) { EXPECT_FALSE(TryMemCpy(dst.data(), nullptr, dst.size())); } +TEST(SanitizerCommon, MemCpyAccessible) { + const int page_num = 1000; + const int page_size = GetPageSize(); + InternalMmapVector src(page_num * page_size); + std::iota(src.begin(), src.end(), 123); + std::vector dst; + std::vector exp = {src.begin(), src.end()}; + + // Protect some pages. + for (int i = 7; i < page_num; i *= 2) { + mprotect(src.data() + i * page_size, page_size, PROT_NONE); + std::fill(exp.data() + i * page_size, exp.data() + (i + 1) * page_size, 0); + } + + dst.assign(src.size(), 0); + EXPECT_FALSE(TryMemCpy(dst.data(), src.data(), dst.size())); + + // Full page aligned range with mprotect pages. + dst.assign(src.size(), 0); + MemCpyAccessible(dst.data(), src.data(), dst.size()); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), exp.begin())); + + // Misaligned range with mprotect pages. + size_t offb = 3; + size_t offe = 7; + dst.assign(src.size() - offb - offe, 0); + MemCpyAccessible(dst.data(), src.data() + offb, dst.size()); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), exp.begin() + offb)); + + // Misaligned range with ends in mprotect pages. + offb = 3 + 7 * page_size; + offe = 7 + 14 * page_size; + dst.assign(src.size() - offb - offe, 0); + MemCpyAccessible(dst.data(), src.data() + offb, dst.size()); + EXPECT_TRUE(std::equal(dst.begin(), dst.end(), exp.begin() + offb)); +} + } // namespace __sanitizer #endif // SANITIZER_POSIX -- GitLab From 7dbfa7b981417773d01c52b0d716d592870081bb Mon Sep 17 00:00:00 2001 From: Helena Kotas Date: Thu, 17 Oct 2024 17:59:08 -0700 Subject: [PATCH 311/329] [HLSL] Add handle initialization for simple resource declarations (#111207) Adds `@_init_resource_bindings()` function to module initialization that includes `handle.fromBinding` intrinsic calls for simple resource declarations. 
Arrays of resources or resources inside user-defined types are not
supported yet.

While this unblocks our progress on the [Compile a runnable shader from
clang](https://github.com/llvm/wg-hlsl/issues/7) milestone, this is
probably not the way we would like to handle resource binding
initialization going forward. Ideally, it should be done via the
resource class constructors in order to support dynamic resource binding
or unbounded arrays of resources.

Depends on PRs #110327 and #111203.

Part 1 of #105076
---
 clang/include/clang/AST/Type.h                |  4 +
 clang/lib/AST/Type.cpp                        | 15 ++++
 clang/lib/CodeGen/CGDeclCXX.cpp               |  8 ++
 clang/lib/CodeGen/CGHLSLRuntime.cpp           | 90 +++++++++++++++++++
 clang/lib/CodeGen/CGHLSLRuntime.h             |  9 ++
 clang/lib/CodeGen/CodeGenModule.cpp           |  3 +
 clang/lib/Sema/SemaHLSL.cpp                   | 22 +----
 .../builtins/RWBuffer-constructor.hlsl        | 32 ++++---
 .../StructuredBuffer-constructor.hlsl         | 31 ++++---
 9 files changed, 170 insertions(+), 44 deletions(-)

diff --git a/clang/include/clang/AST/Type.h b/clang/include/clang/AST/Type.h
index deda5b3f70f3..40e617bf8f3b 100644
--- a/clang/include/clang/AST/Type.h
+++ b/clang/include/clang/AST/Type.h
@@ -6320,6 +6320,10 @@ public:
   static bool classof(const Type *T) {
     return T->getTypeClass() == HLSLAttributedResource;
   }
+
+  // Returns handle type from HLSL resource, if the type is a resource
+  static const HLSLAttributedResourceType *
+  findHandleTypeOnResource(const Type *RT);
 };
 
 class TemplateTypeParmType : public Type, public llvm::FoldingSetNode {
diff --git a/clang/lib/AST/Type.cpp b/clang/lib/AST/Type.cpp
index 6f23a1a13d05..5232efae4e36 100644
--- a/clang/lib/AST/Type.cpp
+++ b/clang/lib/AST/Type.cpp
@@ -5335,3 +5335,18 @@ std::string FunctionEffectWithCondition::description() const {
     Result += "(expr)";
   return Result;
 }
+
+const HLSLAttributedResourceType *
+HLSLAttributedResourceType::findHandleTypeOnResource(const Type *RT) {
+  // If the type RT is an HLSL resource class, the first field must
+  // be the resource handle of type HLSLAttributedResourceType
+  const clang::Type *Ty = RT->getUnqualifiedDesugaredType();
+  if (const RecordDecl *RD = Ty->getAsCXXRecordDecl()) {
+    if (!RD->fields().empty()) {
+      const auto &FirstFD = RD->fields().begin();
+      return dyn_cast<HLSLAttributedResourceType>(
+          FirstFD->getType().getTypePtr());
+    }
+  }
+  return nullptr;
+}
diff --git a/clang/lib/CodeGen/CGDeclCXX.cpp b/clang/lib/CodeGen/CGDeclCXX.cpp
index 8dcb5f610061..b4f1a68cfe87 100644
--- a/clang/lib/CodeGen/CGDeclCXX.cpp
+++ b/clang/lib/CodeGen/CGDeclCXX.cpp
@@ -1121,6 +1121,14 @@ CodeGenFunction::GenerateCXXGlobalInitFunc(llvm::Function *Fn,
     if (Decls[i])
      EmitRuntimeCall(Decls[i]);
 
+  if (getLangOpts().HLSL) {
+    CGHLSLRuntime &CGHLSL = CGM.getHLSLRuntime();
+    if (CGHLSL.needsResourceBindingInitFn()) {
+      llvm::Function *ResInitFn = CGHLSL.createResourceBindingInitFn();
+      Builder.CreateCall(llvm::FunctionCallee(ResInitFn), {});
+    }
+  }
+
   Scope.ForceCleanup();
 
   if (ExitBlock) {
diff --git a/clang/lib/CodeGen/CGHLSLRuntime.cpp b/clang/lib/CodeGen/CGHLSLRuntime.cpp
index 3237d93ca31c..2cce2936fe5a 100644
--- a/clang/lib/CodeGen/CGHLSLRuntime.cpp
+++ b/clang/lib/CodeGen/CGHLSLRuntime.cpp
@@ -18,8 +18,13 @@
 #include "TargetInfo.h"
 #include "clang/AST/Decl.h"
 #include "clang/Basic/TargetOptions.h"
+#include "llvm/IR/GlobalVariable.h"
+#include "llvm/IR/LLVMContext.h"
 #include "llvm/IR/Metadata.h"
 #include "llvm/IR/Module.h"
+#include "llvm/IR/Value.h"
+#include "llvm/Support/Alignment.h"
+
 #include "llvm/Support/FormatVariadic.h"
 
 using namespace clang;
@@ -489,3 +494,88 @@ void CGHLSLRuntime::generateGlobalCtorDtorCalls() {
     GV->eraseFromParent();
   }
 }
+
+void CGHLSLRuntime::handleGlobalVarDefinition(const VarDecl *VD,
+                                              llvm::GlobalVariable *GV) {
+  // If the global variable has resource binding, add it to the list of globals
+  // that need resource binding initialization.
+  const HLSLResourceBindingAttr *RBA = VD->getAttr<HLSLResourceBindingAttr>();
+  if (!RBA)
+    return;
+
+  if (!HLSLAttributedResourceType::findHandleTypeOnResource(
+          VD->getType().getTypePtr()))
+    // FIXME: Only simple declarations of resources are supported for now.
+    // Arrays of resources or resources in user defined classes are
+    // not implemented yet.
+    return;
+
+  ResourcesToBind.emplace_back(VD, GV);
+}
+
+bool CGHLSLRuntime::needsResourceBindingInitFn() {
+  return !ResourcesToBind.empty();
+}
+
+llvm::Function *CGHLSLRuntime::createResourceBindingInitFn() {
+  // No resources to bind
+  assert(needsResourceBindingInitFn() && "no resources to bind");
+
+  LLVMContext &Ctx = CGM.getLLVMContext();
+  llvm::Type *Int1Ty = llvm::Type::getInt1Ty(Ctx);
+
+  llvm::Function *InitResBindingsFunc =
+      llvm::Function::Create(llvm::FunctionType::get(CGM.VoidTy, false),
+                             llvm::GlobalValue::InternalLinkage,
+                             "_init_resource_bindings", CGM.getModule());
+
+  llvm::BasicBlock *EntryBB =
+      llvm::BasicBlock::Create(Ctx, "entry", InitResBindingsFunc);
+  CGBuilderTy Builder(CGM, Ctx);
+  const DataLayout &DL = CGM.getModule().getDataLayout();
+  Builder.SetInsertPoint(EntryBB);
+
+  for (const auto &[VD, GV] : ResourcesToBind) {
+    for (Attr *A : VD->getAttrs()) {
+      HLSLResourceBindingAttr *RBA = dyn_cast<HLSLResourceBindingAttr>(A);
+      if (!RBA)
+        continue;
+
+      const HLSLAttributedResourceType *AttrResType =
+          HLSLAttributedResourceType::findHandleTypeOnResource(
+              VD->getType().getTypePtr());
+
+      // FIXME: Only simple declarations of resources are supported for now.
+      // Arrays of resources or resources in user defined classes are
+      // not implemented yet.
+ assert(AttrResType != nullptr && + "Resource class must have a handle of HLSLAttributedResourceType"); + + llvm::Type *TargetTy = + CGM.getTargetCodeGenInfo().getHLSLType(CGM, AttrResType); + assert(TargetTy != nullptr && + "Failed to convert resource handle to target type"); + + auto *Space = llvm::ConstantInt::get(CGM.IntTy, RBA->getSpaceNumber()); + auto *Slot = llvm::ConstantInt::get(CGM.IntTy, RBA->getSlotNumber()); + // FIXME: resource arrays are not yet implemented + auto *Range = llvm::ConstantInt::get(CGM.IntTy, 1); + auto *Index = llvm::ConstantInt::get(CGM.IntTy, 0); + // FIXME: NonUniformResourceIndex bit is not yet implemented + auto *NonUniform = llvm::ConstantInt::get(Int1Ty, false); + llvm::Value *Args[] = {Space, Slot, Range, Index, NonUniform}; + + llvm::Value *CreateHandle = Builder.CreateIntrinsic( + /*ReturnType=*/TargetTy, getCreateHandleFromBindingIntrinsic(), Args, + nullptr, Twine(VD->getName()).concat("_h")); + + llvm::Value *HandleRef = + Builder.CreateStructGEP(GV->getValueType(), GV, 0); + Builder.CreateAlignedStore(CreateHandle, HandleRef, + HandleRef->getPointerAlignment(DL)); + } + } + + Builder.CreateRetVoid(); + return InitResBindingsFunc; +} diff --git a/clang/lib/CodeGen/CGHLSLRuntime.h b/clang/lib/CodeGen/CGHLSLRuntime.h index f7621ee20b12..ff7df41b5c62 100644 --- a/clang/lib/CodeGen/CGHLSLRuntime.h +++ b/clang/lib/CodeGen/CGHLSLRuntime.h @@ -92,6 +92,8 @@ public: GENERATE_HLSL_INTRINSIC_FUNCTION(WaveIsFirstLane, wave_is_first_lane) GENERATE_HLSL_INTRINSIC_FUNCTION(WaveReadLaneAt, wave_readlane) + GENERATE_HLSL_INTRINSIC_FUNCTION(CreateHandleFromBinding, handle_fromBinding) + //===----------------------------------------------------------------------===// // End of reserved area for HLSL intrinsic getters. //===----------------------------------------------------------------------===// @@ -137,6 +139,10 @@ public: void emitEntryFunction(const FunctionDecl *FD, llvm::Function *Fn); void setHLSLFunctionAttributes(const FunctionDecl *FD, llvm::Function *Fn); + void handleGlobalVarDefinition(const VarDecl *VD, llvm::GlobalVariable *Var); + + bool needsResourceBindingInitFn(); + llvm::Function *createResourceBindingInitFn(); private: void addBufferResourceAnnotation(llvm::GlobalVariable *GV, @@ -148,6 +154,9 @@ private: void addBufferDecls(const DeclContext *DC, Buffer &CB); llvm::Triple::ArchType getArch(); llvm::SmallVector Buffers; + + llvm::SmallVector> + ResourcesToBind; }; } // namespace CodeGen diff --git a/clang/lib/CodeGen/CodeGenModule.cpp b/clang/lib/CodeGen/CodeGenModule.cpp index b3e805a67768..9a84a11973b1 100644 --- a/clang/lib/CodeGen/CodeGenModule.cpp +++ b/clang/lib/CodeGen/CodeGenModule.cpp @@ -5634,6 +5634,9 @@ void CodeGenModule::EmitGlobalVarDefinition(const VarDecl *D, getCUDARuntime().handleVarRegistration(D, *GV); } + if (LangOpts.HLSL) + getHLSLRuntime().handleGlobalVarDefinition(D, GV); + GV->setInitializer(Init); if (emitter) emitter->finalize(GV); diff --git a/clang/lib/Sema/SemaHLSL.cpp b/clang/lib/Sema/SemaHLSL.cpp index efb0fbaa432d..1d18a6308e2a 100644 --- a/clang/lib/Sema/SemaHLSL.cpp +++ b/clang/lib/Sema/SemaHLSL.cpp @@ -1039,21 +1039,6 @@ SemaHLSL::TakeLocForHLSLAttribute(const HLSLAttributedResourceType *RT) { return LocInfo; } -// Returns handle type of a resource, if the type is a resource -static const HLSLAttributedResourceType * -findHandleTypeOnResource(const Type *Ty) { - // If Ty is a resource class, the first field must - // be the resource handle of type HLSLAttributedResourceType - if (RecordDecl *RD = 
Ty->getAsCXXRecordDecl()) { - if (!RD->fields().empty()) { - const auto &FirstFD = RD->fields().begin(); - return dyn_cast( - FirstFD->getType().getTypePtr()); - } - } - return nullptr; -} - // Walks though the global variable declaration, collects all resource binding // requirements and adds them to Bindings void SemaHLSL::collectResourcesOnUserRecordDecl(const VarDecl *VD, @@ -1075,7 +1060,7 @@ void SemaHLSL::collectResourcesOnUserRecordDecl(const VarDecl *VD, continue; if (const HLSLAttributedResourceType *AttrResType = - findHandleTypeOnResource(Ty)) { + HLSLAttributedResourceType::findHandleTypeOnResource(Ty)) { // Add a new DeclBindingInfo to Bindings if it does not already exist ResourceClass RC = AttrResType->getAttrs().ResourceClass; DeclBindingInfo *DBI = Bindings.getDeclBindingInfo(VD, RC); @@ -1126,7 +1111,8 @@ static bool DiagnoseLocalRegisterBinding(Sema &S, SourceLocation &ArgLoc, // Resource if (const HLSLAttributedResourceType *AttrResType = - findHandleTypeOnResource(VD->getType().getTypePtr())) { + HLSLAttributedResourceType::findHandleTypeOnResource( + VD->getType().getTypePtr())) { if (RegType == getRegisterType(AttrResType->getAttrs().ResourceClass)) return true; @@ -2369,7 +2355,7 @@ void SemaHLSL::collectResourcesOnVarDecl(VarDecl *VD) { // Resource (or array of resources) if (const HLSLAttributedResourceType *AttrResType = - findHandleTypeOnResource(Ty)) { + HLSLAttributedResourceType::findHandleTypeOnResource(Ty)) { Bindings.addDeclBindingInfo(VD, AttrResType->getAttrs().ResourceClass); return; } diff --git a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl index 19699dcf14d9..3949f7b943cf 100644 --- a/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl +++ b/clang/test/CodeGenHLSL/builtins/RWBuffer-constructor.hlsl @@ -1,19 +1,25 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL +// FIXME: SPIR-V codegen of llvm.spv.handle.fromBinding is not yet implemented +// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -// XFAIL: * -// This expectedly fails because create.handle is no longer called -// from RWBuffer constructor and the replacement has not been -// implemented yet. This test should be updated to expect -// dx.create.handleFromBinding as part of issue #105076. 
+// NOTE: SPIRV codegen for resource types is not yet implemented -RWBuffer Buf; +RWBuffer Buf : register(u5, space3); -// CHECK: define linkonce_odr noundef ptr @"??0?$RWBuffer@M@hlsl@@QAA@XZ" +// CHECK: %"class.hlsl::RWBuffer" = type { target("dx.TypedBuffer", float, 1, 0, 0), float } +// CHECK: @Buf = global %"class.hlsl::RWBuffer" zeroinitializer, align 4 + +// CHECK: define linkonce_odr void @_ZN4hlsl8RWBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(8) %this) // CHECK-NEXT: entry: -// CHECK: %[[HandleRes:[0-9]+]] = call ptr @llvm.dx.create.handle(i8 1) -// CHECK: store ptr %[[HandleRes]], ptr %h, align 4 +// CHECK: define internal void @_GLOBAL__sub_I_RWBuffer_constructor.hlsl() +// CHECK-NEXT: entry: +// CHECK-NEXT: call void @__cxx_global_var_init() +// CHECK-NEXT: call void @_init_resource_bindings() -// CHECK-SPIRV: %[[HandleRes:[0-9]+]] = call ptr @llvm.spv.create.handle(i8 1) -// CHECK-SPIRV: store ptr %[[HandleRes]], ptr %h, align 8 +// CHECK: define internal void @_init_resource_bindings() { +// CHECK-NEXT: entry: +// CHECK-DXIL-NEXT: %Buf_h = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.dx.handle.fromBinding.tdx.TypedBuffer_f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false) +// CHECK-DXIL-NEXT: store target("dx.TypedBuffer", float, 1, 0, 0) %Buf_h, ptr @Buf, align 4 +// CHECK-SPIRV-NEXT: %Buf_h = call target("dx.TypedBuffer", float, 1, 0, 0) @llvm.spv.handle.fromBinding.tdx.TypedBuffer_f32_1_0_0t(i32 3, i32 5, i32 1, i32 0, i1 false) +// CHECK-SPIRV-NEXT: store target("dx.TypedBuffer", float, 1, 0, 0) %Buf_h, ptr @Buf, align 4 diff --git a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl index f65090410ce6..4dbca9bc0a4d 100644 --- a/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl +++ b/clang/test/CodeGenHLSL/builtins/StructuredBuffer-constructor.hlsl @@ -1,19 +1,24 @@ -// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s -// RUN: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefix=CHECK-SPIRV +// RUN: %clang_cc1 -triple dxil-pc-shadermodel6.3-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-DXIL +// RUN-DISABLED: %clang_cc1 -triple spirv-vulkan-library -x hlsl -emit-llvm -disable-llvm-passes -o - %s | FileCheck %s --check-prefixes=CHECK,CHECK-SPIRV -// XFAIL: * -// This expectedly fails because create.handle is no longer invoked -// from StructuredBuffer constructor and the replacement has not been -// implemented yet. This test should be updated to expect -// dx.create.handleFromBinding as part of issue #105076. 
+// NOTE: SPIRV codegen for resource types is not yet implemented
 
-StructuredBuffer<float> Buf;
+StructuredBuffer<float> Buf : register(u10);
 
-// CHECK: define linkonce_odr noundef ptr @"??0?$StructuredBuffer@M@hlsl@@QAA@XZ"
+// CHECK: %"class.hlsl::StructuredBuffer" = type { target("dx.RawBuffer", float, 1, 0), float }
+// CHECK: @Buf = global %"class.hlsl::StructuredBuffer" zeroinitializer, align 4
+
+// CHECK: define linkonce_odr void @_ZN4hlsl16StructuredBufferIfEC2Ev(ptr noundef nonnull align 4 dereferenceable(8) %this)
 // CHECK-NEXT: entry:
 
+// CHECK: define internal void @_GLOBAL__sub_I_StructuredBuffer_constructor.hlsl()
+// CHECK-NEXT: entry:
+// CHECK-NEXT: call void @__cxx_global_var_init()
+// CHECK-NEXT: call void @_init_resource_bindings()
+
+// CHECK: define internal void @_init_resource_bindings() {
+// CHECK-NEXT: entry:
+// CHECK-DXIL-NEXT: %Buf_h = call target("dx.RawBuffer", float, 1, 0) @llvm.dx.handle.fromBinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 10, i32 1, i32 0, i1 false)
+// CHECK-DXIL-NEXT: store target("dx.RawBuffer", float, 1, 0) %Buf_h, ptr @Buf, align 4
+// CHECK-SPIRV-NEXT: %Buf_h = call target("dx.RawBuffer", float, 1, 0) @llvm.spv.handle.fromBinding.tdx.RawBuffer_f32_1_0t(i32 0, i32 10, i32 1, i32 0, i1 false)
+// CHECK-SPIRV-NEXT: store target("dx.RawBuffer", float, 1, 0) %Buf_h, ptr @Buf, align 4
-- 
GitLab


From e9eec14bb3566f6578950797559de98678f16985 Mon Sep 17 00:00:00 2001
From: tangaac
Date: Fri, 18 Oct 2024 09:06:29 +0800
Subject: [PATCH 312/329] [LoongArch] [CodeGen] Add options for Clang to
 generate LoongArch-specific frecipe & frsqrte instructions (#109917)

Two options, `-mfrecipe` and `-mno-frecipe`, enable or disable
generation of the frecipe.{s/d} and frsqrte.{s/d} instructions. The
default is `-mno-frecipe`.
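The backend half of the patch plugs these estimates into SelectionDAG's
reciprocal refinement machinery with one Newton-Raphson step for f32 and
two for f64. For reference, the standard iterations (not spelled out in
the patch itself) are:

  x_{n+1} = x_n * (2 - a * x_n)           refines an estimate of 1/a
  x_{n+1} = (x_n / 2) * (3 - a * x_n^2)   refines an estimate of 1/sqrt(a)

Each step roughly squares the relative error, so starting from the
instructions' 2^-14 accuracy, one step reaches about 2^-28, past the
24-bit float significand, and two steps reach about 2^-56, past the
53-bit double significand.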
--- clang/include/clang/Driver/Options.td | 4 + .../lib/Driver/ToolChains/Arch/LoongArch.cpp | 9 + clang/test/Driver/loongarch-mfrecipe.c | 30 + .../LoongArch/LoongArchFloat32InstrInfo.td | 6 + .../LoongArch/LoongArchFloat64InstrInfo.td | 2 + .../LoongArch/LoongArchISelLowering.cpp | 67 ++ .../Target/LoongArch/LoongArchISelLowering.h | 15 + .../LoongArch/LoongArchLASXInstrInfo.td | 10 + .../Target/LoongArch/LoongArchLSXInstrInfo.td | 13 + .../LoongArch/fdiv-reciprocal-estimate.ll | 80 ++ .../LoongArch/fsqrt-reciprocal-estimate.ll | 797 ++++++++++++++++++ .../lasx/fdiv-reciprocal-estimate.ll | 114 +++ .../lasx/fsqrt-reciprocal-estimate.ll | 75 ++ .../LoongArch/lsx/fdiv-reciprocal-estimate.ll | 114 +++ .../lsx/fsqrt-reciprocal-estimate.ll | 75 ++ 15 files changed, 1411 insertions(+) create mode 100644 clang/test/Driver/loongarch-mfrecipe.c create mode 100644 llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll create mode 100644 llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll create mode 100644 llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll create mode 100644 llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll create mode 100644 llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll diff --git a/clang/include/clang/Driver/Options.td b/clang/include/clang/Driver/Options.td index 379e75b197cf..4eb013d587eb 100644 --- a/clang/include/clang/Driver/Options.td +++ b/clang/include/clang/Driver/Options.td @@ -5387,6 +5387,10 @@ def mno_lasx : Flag<["-"], "mno-lasx">, Group, let Flags = [TargetSpecific] in { def msimd_EQ : Joined<["-"], "msimd=">, Group, HelpText<"Select the SIMD extension(s) to be enabled in LoongArch either 'none', 'lsx', 'lasx'.">; +def mfrecipe : Flag<["-"], "mfrecipe">, Group, + HelpText<"Enable frecipe.{s/d} and frsqrte.{s/d}">; +def mno_frecipe : Flag<["-"], "mno-frecipe">, Group, + HelpText<"Disable frecipe.{s/d} and frsqrte.{s/d}">; def mannotate_tablejump : Flag<["-"], "mannotate-tablejump">, Group, HelpText<"Enable annotate table jump instruction to correlate it with the jump table.">; def mno_annotate_tablejump : Flag<["-"], "mno-annotate-tablejump">, Group, diff --git a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp index 771adade9381..355253e4b3b0 100644 --- a/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp +++ b/clang/lib/Driver/ToolChains/Arch/LoongArch.cpp @@ -251,6 +251,15 @@ void loongarch::getLoongArchTargetFeatures(const Driver &D, } else /*-mno-lasx*/ Features.push_back("-lasx"); } + + // Select frecipe feature determined by -m[no-]frecipe. + if (const Arg *A = + Args.getLastArg(options::OPT_mfrecipe, options::OPT_mno_frecipe)) { + if (A->getOption().matches(options::OPT_mfrecipe)) + Features.push_back("+frecipe"); + else + Features.push_back("-frecipe"); + } } std::string loongarch::postProcessTargetCPUString(const std::string &CPU, diff --git a/clang/test/Driver/loongarch-mfrecipe.c b/clang/test/Driver/loongarch-mfrecipe.c new file mode 100644 index 000000000000..14afd54af0b9 --- /dev/null +++ b/clang/test/Driver/loongarch-mfrecipe.c @@ -0,0 +1,30 @@ +/// Test -m[no]frecipe options. 
+ +// RUN: %clang --target=loongarch64 -mfrecipe -fsyntax-only %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-FRECIPE +// RUN: %clang --target=loongarch64 -mno-frecipe -fsyntax-only %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-NO-FRECIPE +// RUN: %clang --target=loongarch64 -mno-frecipe -mfrecipe -fsyntax-only %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-FRECIPE +// RUN: %clang --target=loongarch64 -mfrecipe -mno-frecipe -fsyntax-only %s -### 2>&1 | \ +// RUN: FileCheck %s --check-prefix=CC1-NO-FRECIPE + +// RUN: %clang --target=loongarch64 -mfrecipe -S -emit-llvm %s -o - | \ +// RUN: FileCheck %s --check-prefix=IR-FRECIPE +// RUN: %clang --target=loongarch64 -mno-frecipe -S -emit-llvm %s -o - | \ +// RUN: FileCheck %s --check-prefix=IR-NO-FRECIPE +// RUN: %clang --target=loongarch64 -mno-frecipe -mfrecipe -S -emit-llvm %s -o - | \ +// RUN: FileCheck %s --check-prefix=IR-FRECIPE +// RUN: %clang --target=loongarch64 -mfrecipe -mno-frecipe -S -emit-llvm %s -o - | \ +// RUN: FileCheck %s --check-prefix=IR-NO-FRECIPE + + +// CC1-FRECIPE: "-target-feature" "+frecipe" +// CC1-NO-FRECIPE: "-target-feature" "-frecipe" + +// IR-FRECIPE: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}+frecipe{{(,.*)?}}" +// IR-NO-FRECIPE: attributes #[[#]] ={{.*}}"target-features"="{{(.*,)?}}-frecipe{{(,.*)?}}" + +int foo(void) { + return 42; +} \ No newline at end of file diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td index d6a83c0c8cd8..65802d660432 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat32InstrInfo.td @@ -19,12 +19,16 @@ def SDT_LoongArchMOVGR2FR_W_LA64 def SDT_LoongArchMOVFR2GR_S_LA64 : SDTypeProfile<1, 1, [SDTCisVT<0, i64>, SDTCisVT<1, f32>]>; def SDT_LoongArchFTINT : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; +def SDT_LoongArchFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; +def SDT_LoongArchFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisFP<1>]>; def loongarch_movgr2fr_w_la64 : SDNode<"LoongArchISD::MOVGR2FR_W_LA64", SDT_LoongArchMOVGR2FR_W_LA64>; def loongarch_movfr2gr_s_la64 : SDNode<"LoongArchISD::MOVFR2GR_S_LA64", SDT_LoongArchMOVFR2GR_S_LA64>; def loongarch_ftint : SDNode<"LoongArchISD::FTINT", SDT_LoongArchFTINT>; +def loongarch_frecipe : SDNode<"LoongArchISD::FRECIPE", SDT_LoongArchFRECIPE>; +def loongarch_frsqrte : SDNode<"LoongArchISD::FRSQRTE", SDT_LoongArchFRSQRTE>; //===----------------------------------------------------------------------===// // Instructions @@ -286,6 +290,8 @@ let Predicates = [HasFrecipe] in { // FP approximate reciprocal operation def : Pat<(int_loongarch_frecipe_s FPR32:$src), (FRECIPE_S FPR32:$src)>; def : Pat<(int_loongarch_frsqrte_s FPR32:$src), (FRSQRTE_S FPR32:$src)>; +def : Pat<(loongarch_frecipe FPR32:$src), (FRECIPE_S FPR32:$src)>; +def : Pat<(loongarch_frsqrte FPR32:$src), (FRSQRTE_S FPR32:$src)>; } // fmadd.s: fj * fk + fa diff --git a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td index 30cce8439640..b98025643903 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchFloat64InstrInfo.td @@ -253,6 +253,8 @@ let Predicates = [HasFrecipe] in { // FP approximate reciprocal operation def : Pat<(int_loongarch_frecipe_d FPR64:$src), (FRECIPE_D FPR64:$src)>; def : Pat<(int_loongarch_frsqrte_d FPR64:$src), (FRSQRTE_D FPR64:$src)>; +def : 
Pat<(loongarch_frecipe FPR64:$src), (FRECIPE_D FPR64:$src)>; +def : Pat<(loongarch_frsqrte FPR64:$src), (FRSQRTE_D FPR64:$src)>; } // fmadd.d: fj * fk + fa diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp index fc5f0fc1bf0d..676d43ef22c4 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.cpp @@ -4697,6 +4697,8 @@ const char *LoongArchTargetLowering::getTargetNodeName(unsigned Opcode) const { NODE_NAME_CASE(VANY_ZERO) NODE_NAME_CASE(VALL_NONZERO) NODE_NAME_CASE(VANY_NONZERO) + NODE_NAME_CASE(FRECIPE) + NODE_NAME_CASE(FRSQRTE) } #undef NODE_NAME_CASE return nullptr; @@ -5900,6 +5902,71 @@ Register LoongArchTargetLowering::getExceptionSelectorRegister( return LoongArch::R5; } +//===----------------------------------------------------------------------===// +// Target Optimization Hooks +//===----------------------------------------------------------------------===// + +static int getEstimateRefinementSteps(EVT VT, + const LoongArchSubtarget &Subtarget) { + // Feature FRECIPE instrucions relative accuracy is 2^-14. + // IEEE float has 23 digits and double has 52 digits. + int RefinementSteps = VT.getScalarType() == MVT::f64 ? 2 : 1; + return RefinementSteps; +} + +SDValue LoongArchTargetLowering::getSqrtEstimate(SDValue Operand, + SelectionDAG &DAG, int Enabled, + int &RefinementSteps, + bool &UseOneConstNR, + bool Reciprocal) const { + if (Subtarget.hasFrecipe()) { + SDLoc DL(Operand); + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32 || (VT == MVT::f64 && Subtarget.hasBasicD()) || + (VT == MVT::v4f32 && Subtarget.hasExtLSX()) || + (VT == MVT::v2f64 && Subtarget.hasExtLSX()) || + (VT == MVT::v8f32 && Subtarget.hasExtLASX()) || + (VT == MVT::v4f64 && Subtarget.hasExtLASX())) { + + if (RefinementSteps == ReciprocalEstimate::Unspecified) + RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); + + SDValue Estimate = DAG.getNode(LoongArchISD::FRSQRTE, DL, VT, Operand); + if (Reciprocal) + Estimate = DAG.getNode(ISD::FMUL, DL, VT, Operand, Estimate); + + return Estimate; + } + } + + return SDValue(); +} + +SDValue LoongArchTargetLowering::getRecipEstimate(SDValue Operand, + SelectionDAG &DAG, + int Enabled, + int &RefinementSteps) const { + if (Subtarget.hasFrecipe()) { + SDLoc DL(Operand); + EVT VT = Operand.getValueType(); + + if (VT == MVT::f32 || (VT == MVT::f64 && Subtarget.hasBasicD()) || + (VT == MVT::v4f32 && Subtarget.hasExtLSX()) || + (VT == MVT::v2f64 && Subtarget.hasExtLSX()) || + (VT == MVT::v8f32 && Subtarget.hasExtLASX()) || + (VT == MVT::v4f64 && Subtarget.hasExtLASX())) { + + if (RefinementSteps == ReciprocalEstimate::Unspecified) + RefinementSteps = getEstimateRefinementSteps(VT, Subtarget); + + return DAG.getNode(LoongArchISD::FRECIPE, DL, VT, Operand); + } + } + + return SDValue(); +} + //===----------------------------------------------------------------------===// // LoongArch Inline Assembly Support //===----------------------------------------------------------------------===// diff --git a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h index 6177884bd195..df6a55a2b831 100644 --- a/llvm/lib/Target/LoongArch/LoongArchISelLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchISelLowering.h @@ -141,6 +141,10 @@ enum NodeType : unsigned { VALL_NONZERO, VANY_NONZERO, + // Floating point approximate reciprocal operation + FRECIPE, + FRSQRTE + // Intrinsic operations end 
============================================= }; } // end namespace LoongArchISD @@ -216,6 +220,17 @@ public: Register getExceptionSelectorRegister(const Constant *PersonalityFn) const override; + bool isFsqrtCheap(SDValue Operand, SelectionDAG &DAG) const override { + return true; + } + + SDValue getSqrtEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + int &RefinementSteps, bool &UseOneConstNR, + bool Reciprocal) const override; + + SDValue getRecipEstimate(SDValue Operand, SelectionDAG &DAG, int Enabled, + int &RefinementSteps) const override; + ISD::NodeType getExtendForAtomicOps() const override { return ISD::SIGN_EXTEND; } diff --git a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td index dd7e5713e45f..d13cc9af135b 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLASXInstrInfo.td @@ -10,6 +10,7 @@ // //===----------------------------------------------------------------------===// +// Target nodes. def loongarch_xvpermi: SDNode<"LoongArchISD::XVPERMI", SDT_LoongArchV1RUimm>; def lasxsplati8 @@ -2094,6 +2095,15 @@ foreach Inst = ["XVFRECIPE_S", "XVFRSQRTE_S"] in foreach Inst = ["XVFRECIPE_D", "XVFRSQRTE_D"] in def : Pat<(deriveLASXIntrinsic.ret (v4f64 LASX256:$xj)), (!cast(Inst) LASX256:$xj)>; + +def : Pat<(loongarch_vfrecipe v8f32:$src), + (XVFRECIPE_S v8f32:$src)>; +def : Pat<(loongarch_vfrecipe v4f64:$src), + (XVFRECIPE_D v4f64:$src)>; +def : Pat<(loongarch_vfrsqrte v8f32:$src), + (XVFRSQRTE_S v8f32:$src)>; +def : Pat<(loongarch_vfrsqrte v4f64:$src), + (XVFRSQRTE_D v4f64:$src)>; } def : Pat<(int_loongarch_lasx_xvpickve_w_f v8f32:$xj, timm:$imm), diff --git a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td index e7ac9f3bd04c..86aa6dcfd826 100644 --- a/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td +++ b/llvm/lib/Target/LoongArch/LoongArchLSXInstrInfo.td @@ -23,6 +23,8 @@ def SDT_LoongArchV2R : SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0, 1>, SDTCisSameAs<1, 2>]>; def SDT_LoongArchV1RUimm: SDTypeProfile<1, 2, [SDTCisVec<0>, SDTCisSameAs<0,1>, SDTCisVT<2, i64>]>; +def SDT_LoongArchVFRECIPE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>]>; +def SDT_LoongArchVFRSQRTE : SDTypeProfile<1, 1, [SDTCisFP<0>, SDTCisVec<0>, SDTCisSameAs<0, 1>]>; // Target nodes. 
def loongarch_vreplve : SDNode<"LoongArchISD::VREPLVE", SDT_LoongArchVreplve>; @@ -50,6 +52,8 @@ def loongarch_vilvh: SDNode<"LoongArchISD::VILVH", SDT_LoongArchV2R>; def loongarch_vshuf4i: SDNode<"LoongArchISD::VSHUF4I", SDT_LoongArchV1RUimm>; def loongarch_vreplvei: SDNode<"LoongArchISD::VREPLVEI", SDT_LoongArchV1RUimm>; +def loongarch_vfrecipe: SDNode<"LoongArchISD::FRECIPE", SDT_LoongArchVFRECIPE>; +def loongarch_vfrsqrte: SDNode<"LoongArchISD::FRSQRTE", SDT_LoongArchVFRSQRTE>; def immZExt1 : ImmLeaf(Imm);}]>; def immZExt2 : ImmLeaf(Imm);}]>; @@ -2238,6 +2242,15 @@ foreach Inst = ["VFRECIPE_S", "VFRSQRTE_S"] in foreach Inst = ["VFRECIPE_D", "VFRSQRTE_D"] in def : Pat<(deriveLSXIntrinsic.ret (v2f64 LSX128:$vj)), (!cast(Inst) LSX128:$vj)>; + +def : Pat<(loongarch_vfrecipe v4f32:$src), + (VFRECIPE_S v4f32:$src)>; +def : Pat<(loongarch_vfrecipe v2f64:$src), + (VFRECIPE_D v2f64:$src)>; +def : Pat<(loongarch_vfrsqrte v4f32:$src), + (VFRSQRTE_S v4f32:$src)>; +def : Pat<(loongarch_vfrsqrte v2f64:$src), + (VFRSQRTE_D v2f64:$src)>; } // load diff --git a/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll new file mode 100644 index 000000000000..3f38bbed881a --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/fdiv-reciprocal-estimate.ll @@ -0,0 +1,80 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,-frecipe < %s | FileCheck %s --check-prefix=LA32F +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,+frecipe < %s | FileCheck %s --check-prefix=LA32F-FRECIPE +; RUN: llc --mtriple=loongarch64 --mattr=+d,-frecipe < %s | FileCheck %s --check-prefix=LA64D +; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s --check-prefix=LA64D-FRECIPE + +;; Exercise the 'fdiv' LLVM IR: https://llvm.org/docs/LangRef.html#fdiv-instruction + +define float @fdiv_s(float %x, float %y) { +; LA32F-LABEL: fdiv_s: +; LA32F: # %bb.0: +; LA32F-NEXT: fdiv.s $fa0, $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: fdiv_s: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frecipe.s $fa2, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA32F-FRECIPE-NEXT: fnmsub.s $fa0, $fa1, $fa3, $fa0 +; LA32F-FRECIPE-NEXT: fmadd.s $fa0, $fa2, $fa0, $fa3 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: fdiv_s: +; LA64D: # %bb.0: +; LA64D-NEXT: fdiv.s $fa0, $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: fdiv_s: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frecipe.s $fa2, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fnmsub.s $fa0, $fa1, $fa3, $fa0 +; LA64D-FRECIPE-NEXT: fmadd.s $fa0, $fa2, $fa0, $fa3 +; LA64D-FRECIPE-NEXT: ret + %div = fdiv fast float %x, %y + ret float %div +} + +define double @fdiv_d(double %x, double %y) { +; LA32F-LABEL: fdiv_d: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: .cfi_def_cfa_offset 16 +; LA32F-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: .cfi_offset 1, -4 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: fdiv_d: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -16 +; LA32F-FRECIPE-NEXT: .cfi_def_cfa_offset 16 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: .cfi_offset 1, -4 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; 
LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 16 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: fdiv_d: +; LA64D: # %bb.0: +; LA64D-NEXT: fdiv.d $fa0, $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: fdiv_d: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) +; LA64D-FRECIPE-NEXT: frecipe.d $fa3, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa1, $fa3, $fa2 +; LA64D-FRECIPE-NEXT: fnmsub.d $fa2, $fa2, $fa3, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa3, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fnmsub.d $fa0, $fa1, $fa3, $fa0 +; LA64D-FRECIPE-NEXT: fmadd.d $fa0, $fa2, $fa0, $fa3 +; LA64D-FRECIPE-NEXT: ret + %div = fdiv fast double %x, %y + ret double %div +} diff --git a/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll new file mode 100644 index 000000000000..388ae6321f66 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/fsqrt-reciprocal-estimate.ll @@ -0,0 +1,797 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,-frecipe < %s | FileCheck %s --check-prefix=LA32F +; RUN: llc --mtriple=loongarch32 --mattr=+f,-d,+frecipe < %s | FileCheck %s --check-prefix=LA32F-FRECIPE +; RUN: llc --mtriple=loongarch64 --mattr=+d,-frecipe < %s | FileCheck %s --check-prefix=LA64D +; RUN: llc --mtriple=loongarch64 --mattr=+d,+frecipe < %s | FileCheck %s --check-prefix=LA64D-FRECIPE + + +declare float @llvm.sqrt.f32(float) +declare double @llvm.sqrt.f64(double) + +define float @frsqrt_f32(float %a) nounwind { +; LA32F-LABEL: frsqrt_f32: +; LA32F: # %bb.0: +; LA32F-NEXT: frsqrt.s $fa0, $fa0 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: frsqrt_f32: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA32F-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) +; LA32F-FRECIPE-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI0_0) +; LA32F-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI0_1) +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa1, $fa0 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: frsqrt_f32: +; LA64D: # %bb.0: +; LA64D-NEXT: frsqrt.s $fa0, $fa0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: frsqrt_f32: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_0) +; LA64D-FRECIPE-NEXT: fld.s $fa2, $a0, %pc_lo12(.LCPI0_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI0_1) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a0, %pc_lo12(.LCPI0_1) +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.s $fa0, $fa0, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa1, $fa0 +; LA64D-FRECIPE-NEXT: ret + + %1 = call fast float @llvm.sqrt.f32(float %a) + %2 = fdiv fast float 1.0, %1 + ret float %2 +} + +define double @frsqrt_f64(double %a) nounwind { +; LA32F-LABEL: frsqrt_f64: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -16 +; LA32F-NEXT: st.w $ra, $sp, 12 +; LA32F-NEXT: bl %plt(sqrt) +; LA32F-NEXT: move $a2, $a0 +; LA32F-NEXT: move $a3, $a1 +; LA32F-NEXT: lu12i.w $a1, 261888 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: ld.w $ra, $sp, 12 +; LA32F-NEXT: 
addi.w $sp, $sp, 16 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: frsqrt_f64: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -16 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: bl %plt(sqrt) +; LA32F-FRECIPE-NEXT: move $a2, $a0 +; LA32F-FRECIPE-NEXT: move $a3, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 261888 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 16 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: frsqrt_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: frsqrt.d $fa0, $fa0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: frsqrt_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a0, %pc_lo12(.LCPI1_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a0, %pc_hi20(.LCPI1_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a0, %pc_lo12(.LCPI1_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa0, $fa0, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa1, $fa0 +; LA64D-FRECIPE-NEXT: ret + %1 = call fast double @llvm.sqrt.f64(double %a) + %2 = fdiv fast double 1.0, %1 + ret double %2 +} + +define double @sqrt_simplify_before_recip_3_uses_f64(double %x, ptr %p1, ptr %p2) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -32 +; LA32F-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32F-NEXT: st.w $fp, $sp, 24 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s1, $sp, 16 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s2, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s3, $sp, 8 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s4, $sp, 4 # 4-byte Folded Spill +; LA32F-NEXT: move $fp, $a3 +; LA32F-NEXT: move $s0, $a2 +; LA32F-NEXT: bl %plt(sqrt) +; LA32F-NEXT: move $s1, $a0 +; LA32F-NEXT: move $s2, $a1 +; LA32F-NEXT: lu12i.w $a1, 261888 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s1 +; LA32F-NEXT: move $a3, $s2 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: move $s3, $a0 +; LA32F-NEXT: move $s4, $a1 +; LA32F-NEXT: lu12i.w $a1, 263248 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s1 +; LA32F-NEXT: move $a3, $s2 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: st.w $s3, $s0, 0 +; LA32F-NEXT: st.w $s4, $s0, 4 +; LA32F-NEXT: st.w $a0, $fp, 0 +; LA32F-NEXT: st.w $a1, $fp, 4 +; LA32F-NEXT: move $a0, $s1 +; LA32F-NEXT: move $a1, $s2 +; LA32F-NEXT: ld.w $s4, $sp, 4 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s3, $sp, 8 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s2, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s1, $sp, 16 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 32 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -32 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w 
$fp, $sp, 24 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s1, $sp, 16 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s2, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s3, $sp, 8 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s4, $sp, 4 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: move $fp, $a3 +; LA32F-FRECIPE-NEXT: move $s0, $a2 +; LA32F-FRECIPE-NEXT: bl %plt(sqrt) +; LA32F-FRECIPE-NEXT: move $s1, $a0 +; LA32F-FRECIPE-NEXT: move $s2, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 261888 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s1 +; LA32F-FRECIPE-NEXT: move $a3, $s2 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: move $s3, $a0 +; LA32F-FRECIPE-NEXT: move $s4, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263248 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s1 +; LA32F-FRECIPE-NEXT: move $a3, $s2 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: st.w $s3, $s0, 0 +; LA32F-FRECIPE-NEXT: st.w $s4, $s0, 4 +; LA32F-FRECIPE-NEXT: st.w $a0, $fp, 0 +; LA32F-FRECIPE-NEXT: st.w $a1, $fp, 4 +; LA32F-FRECIPE-NEXT: move $a0, $s1 +; LA32F-FRECIPE-NEXT: move $a1, $s2 +; LA32F-FRECIPE-NEXT: ld.w $s4, $sp, 4 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s3, $sp, 8 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s2, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s1, $sp, 16 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 32 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) +; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI2_0) +; LA64D-NEXT: fsqrt.d $fa1, $fa0 +; LA64D-NEXT: frsqrt.d $fa0, $fa0 +; LA64D-NEXT: fdiv.d $fa2, $fa2, $fa1 +; LA64D-NEXT: fst.d $fa0, $a0, 0 +; LA64D-NEXT: fst.d $fa2, $a1, 0 +; LA64D-NEXT: fmov.d $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI2_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI2_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI2_2) +; LA64D-FRECIPE-NEXT: fld.d $fa5, $a2, %pc_lo12(.LCPI2_2) +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.d $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: ret + %sqrt = tail call fast double @llvm.sqrt.f64(double %x) + %rsqrt = fdiv fast double 1.0, %sqrt + %r = fdiv fast double 42.0, %sqrt + %sqrt_fast = fdiv fast double %x, %sqrt + store double %rsqrt, ptr %p1, align 8 + store double %r, ptr %p2, align 8 + ret double 
%sqrt_fast +} + + +define double @sqrt_simplify_before_recip_3_uses_order_f64(double %x, ptr %p1, ptr %p2) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_order_f64: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -32 +; LA32F-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32F-NEXT: st.w $fp, $sp, 24 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s1, $sp, 16 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s2, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s3, $sp, 8 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s4, $sp, 4 # 4-byte Folded Spill +; LA32F-NEXT: move $fp, $a3 +; LA32F-NEXT: move $s0, $a2 +; LA32F-NEXT: bl %plt(sqrt) +; LA32F-NEXT: move $s1, $a0 +; LA32F-NEXT: move $s2, $a1 +; LA32F-NEXT: lu12i.w $a1, 263248 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s1 +; LA32F-NEXT: move $a3, $s2 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: move $s3, $a0 +; LA32F-NEXT: move $s4, $a1 +; LA32F-NEXT: lu12i.w $a1, 263256 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s1 +; LA32F-NEXT: move $a3, $s2 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: st.w $s3, $s0, 0 +; LA32F-NEXT: st.w $s4, $s0, 4 +; LA32F-NEXT: st.w $a0, $fp, 0 +; LA32F-NEXT: st.w $a1, $fp, 4 +; LA32F-NEXT: move $a0, $s1 +; LA32F-NEXT: move $a1, $s2 +; LA32F-NEXT: ld.w $s4, $sp, 4 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s3, $sp, 8 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s2, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s1, $sp, 16 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 32 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f64: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -32 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 28 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $fp, $sp, 24 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s0, $sp, 20 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s1, $sp, 16 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s2, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s3, $sp, 8 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s4, $sp, 4 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: move $fp, $a3 +; LA32F-FRECIPE-NEXT: move $s0, $a2 +; LA32F-FRECIPE-NEXT: bl %plt(sqrt) +; LA32F-FRECIPE-NEXT: move $s1, $a0 +; LA32F-FRECIPE-NEXT: move $s2, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263248 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s1 +; LA32F-FRECIPE-NEXT: move $a3, $s2 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: move $s3, $a0 +; LA32F-FRECIPE-NEXT: move $s4, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263256 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s1 +; LA32F-FRECIPE-NEXT: move $a3, $s2 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: st.w $s3, $s0, 0 +; LA32F-FRECIPE-NEXT: st.w $s4, $s0, 4 +; LA32F-FRECIPE-NEXT: st.w $a0, $fp, 0 +; LA32F-FRECIPE-NEXT: st.w $a1, $fp, 4 +; LA32F-FRECIPE-NEXT: move $a0, $s1 +; LA32F-FRECIPE-NEXT: move $a1, $s2 +; LA32F-FRECIPE-NEXT: ld.w $s4, $sp, 4 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s3, $sp, 8 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s2, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s1, $sp, 16 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s0, $sp, 20 # 4-byte Folded Reload +; 
LA32F-FRECIPE-NEXT: ld.w $fp, $sp, 24 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 28 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 32 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_order_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) +; LA64D-NEXT: fld.d $fa1, $a2, %pc_lo12(.LCPI3_0) +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) +; LA64D-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI3_1) +; LA64D-NEXT: fsqrt.d $fa0, $fa0 +; LA64D-NEXT: fdiv.d $fa1, $fa1, $fa0 +; LA64D-NEXT: fdiv.d $fa2, $fa2, $fa0 +; LA64D-NEXT: fst.d $fa1, $a0, 0 +; LA64D-NEXT: fst.d $fa2, $a1, 0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a2, %pc_lo12(.LCPI3_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_2) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a2, %pc_lo12(.LCPI3_2) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI3_3) +; LA64D-FRECIPE-NEXT: fld.d $fa4, $a2, %pc_lo12(.LCPI3_3) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fst.d $fa2, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa1, $a1, 0 +; LA64D-FRECIPE-NEXT: ret + %sqrt = tail call fast double @llvm.sqrt.f64(double %x) + %sqrt_fast = fdiv fast double %x, %sqrt + %r1 = fdiv fast double 42.0, %sqrt + %r2 = fdiv fast double 43.0, %sqrt + store double %r1, ptr %p1, align 8 + store double %r2, ptr %p2, align 8 + ret double %sqrt_fast +} + +define double @sqrt_simplify_before_recip_4_uses_f64(double %x, ptr %p1, ptr %p2, ptr %p3) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_4_uses_f64: +; LA32F: # %bb.0: +; LA32F-NEXT: addi.w $sp, $sp, -48 +; LA32F-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32F-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill +; LA32F-NEXT: st.w $s7, $sp, 8 # 4-byte Folded Spill +; LA32F-NEXT: move $fp, $a4 +; LA32F-NEXT: move $s0, $a3 +; LA32F-NEXT: move $s1, $a2 +; LA32F-NEXT: bl %plt(sqrt) +; LA32F-NEXT: move $s2, $a0 +; LA32F-NEXT: move $s3, $a1 +; LA32F-NEXT: lu12i.w $a1, 261888 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s2 +; LA32F-NEXT: move $a3, $s3 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: move $s4, $a0 +; LA32F-NEXT: move $s5, $a1 +; LA32F-NEXT: lu12i.w $a1, 263248 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s2 +; LA32F-NEXT: move $a3, $s3 +; LA32F-NEXT: bl 
%plt(__divdf3) +; LA32F-NEXT: move $s6, $a0 +; LA32F-NEXT: move $s7, $a1 +; LA32F-NEXT: lu12i.w $a1, 263256 +; LA32F-NEXT: move $a0, $zero +; LA32F-NEXT: move $a2, $s2 +; LA32F-NEXT: move $a3, $s3 +; LA32F-NEXT: bl %plt(__divdf3) +; LA32F-NEXT: st.w $s4, $s1, 0 +; LA32F-NEXT: st.w $s5, $s1, 4 +; LA32F-NEXT: st.w $s6, $s0, 0 +; LA32F-NEXT: st.w $s7, $s0, 4 +; LA32F-NEXT: st.w $a0, $fp, 0 +; LA32F-NEXT: st.w $a1, $fp, 4 +; LA32F-NEXT: move $a0, $s2 +; LA32F-NEXT: move $a1, $s3 +; LA32F-NEXT: ld.w $s7, $sp, 8 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s1, $sp, 32 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32F-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32F-NEXT: addi.w $sp, $sp, 48 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f64: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, -48 +; LA32F-FRECIPE-NEXT: st.w $ra, $sp, 44 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $fp, $sp, 40 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s0, $sp, 36 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s1, $sp, 32 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s2, $sp, 28 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s3, $sp, 24 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s4, $sp, 20 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s5, $sp, 16 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s6, $sp, 12 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: st.w $s7, $sp, 8 # 4-byte Folded Spill +; LA32F-FRECIPE-NEXT: move $fp, $a4 +; LA32F-FRECIPE-NEXT: move $s0, $a3 +; LA32F-FRECIPE-NEXT: move $s1, $a2 +; LA32F-FRECIPE-NEXT: bl %plt(sqrt) +; LA32F-FRECIPE-NEXT: move $s2, $a0 +; LA32F-FRECIPE-NEXT: move $s3, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 261888 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s2 +; LA32F-FRECIPE-NEXT: move $a3, $s3 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: move $s4, $a0 +; LA32F-FRECIPE-NEXT: move $s5, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263248 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s2 +; LA32F-FRECIPE-NEXT: move $a3, $s3 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: move $s6, $a0 +; LA32F-FRECIPE-NEXT: move $s7, $a1 +; LA32F-FRECIPE-NEXT: lu12i.w $a1, 263256 +; LA32F-FRECIPE-NEXT: move $a0, $zero +; LA32F-FRECIPE-NEXT: move $a2, $s2 +; LA32F-FRECIPE-NEXT: move $a3, $s3 +; LA32F-FRECIPE-NEXT: bl %plt(__divdf3) +; LA32F-FRECIPE-NEXT: st.w $s4, $s1, 0 +; LA32F-FRECIPE-NEXT: st.w $s5, $s1, 4 +; LA32F-FRECIPE-NEXT: st.w $s6, $s0, 0 +; LA32F-FRECIPE-NEXT: st.w $s7, $s0, 4 +; LA32F-FRECIPE-NEXT: st.w $a0, $fp, 0 +; LA32F-FRECIPE-NEXT: st.w $a1, $fp, 4 +; LA32F-FRECIPE-NEXT: move $a0, $s2 +; LA32F-FRECIPE-NEXT: move $a1, $s3 +; LA32F-FRECIPE-NEXT: ld.w $s7, $sp, 8 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s6, $sp, 12 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s5, $sp, 16 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s4, $sp, 20 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s3, $sp, 24 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s2, $sp, 28 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s1, $sp, 
32 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $s0, $sp, 36 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $fp, $sp, 40 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: ld.w $ra, $sp, 44 # 4-byte Folded Reload +; LA32F-FRECIPE-NEXT: addi.w $sp, $sp, 48 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_4_uses_f64: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0) +; LA64D-NEXT: fld.d $fa2, $a3, %pc_lo12(.LCPI4_0) +; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1) +; LA64D-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_1) +; LA64D-NEXT: fsqrt.d $fa1, $fa0 +; LA64D-NEXT: frsqrt.d $fa0, $fa0 +; LA64D-NEXT: fdiv.d $fa2, $fa2, $fa1 +; LA64D-NEXT: fdiv.d $fa3, $fa3, $fa1 +; LA64D-NEXT: fst.d $fa0, $a0, 0 +; LA64D-NEXT: fst.d $fa2, $a1, 0 +; LA64D-NEXT: fst.d $fa3, $a2, 0 +; LA64D-NEXT: fmov.d $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f64: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.d $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_0) +; LA64D-FRECIPE-NEXT: fld.d $fa2, $a3, %pc_lo12(.LCPI4_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_1) +; LA64D-FRECIPE-NEXT: fld.d $fa3, $a3, %pc_lo12(.LCPI4_1) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa4, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.d $fa2, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_2) +; LA64D-FRECIPE-NEXT: fld.d $fa4, $a3, %pc_lo12(.LCPI4_2) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI4_3) +; LA64D-FRECIPE-NEXT: fld.d $fa5, $a3, %pc_lo12(.LCPI4_3) +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.d $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.d $fa2, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.d $fa3, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.d $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.d $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: fst.d $fa3, $a2, 0 +; LA64D-FRECIPE-NEXT: ret + %sqrt = tail call fast double @llvm.sqrt.f64(double %x) + %rsqrt = fdiv fast double 1.0, %sqrt + %r1 = fdiv fast double 42.0, %sqrt + %r2 = fdiv fast double 43.0, %sqrt + %sqrt_fast = fdiv fast double %x, %sqrt + store double %rsqrt, ptr %p1, align 8 + store double %r1, ptr %p2, align 8 + store double %r2, ptr %p3, align 8 + ret double %sqrt_fast +} + +define float @sqrt_simplify_before_recip_3_uses_f32(float %x, ptr %p1, ptr %p2) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_f32: +; LA32F: # %bb.0: +; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) +; LA32F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI5_0) +; LA32F-NEXT: fsqrt.s $fa1, $fa0 +; LA32F-NEXT: frsqrt.s $fa0, $fa0 +; LA32F-NEXT: fdiv.s $fa2, $fa2, $fa1 +; LA32F-NEXT: fst.s $fa0, $a0, 0 +; LA32F-NEXT: fst.s $fa2, $a1, 0 +; LA32F-NEXT: fmov.s $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f32: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI5_0) +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_1) +; LA32F-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI5_1) +; LA32F-FRECIPE-NEXT: pcalau12i $a2, 
%pc_hi20(.LCPI5_2) +; LA32F-FRECIPE-NEXT: fld.s $fa5, $a2, %pc_lo12(.LCPI5_2) +; LA32F-FRECIPE-NEXT: fmadd.s $fa2, $fa2, $fa1, $fa3 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa5 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA32F-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_f32: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) +; LA64D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI5_0) +; LA64D-NEXT: fsqrt.s $fa1, $fa0 +; LA64D-NEXT: frsqrt.s $fa0, $fa0 +; LA64D-NEXT: fdiv.s $fa2, $fa2, $fa1 +; LA64D-NEXT: fst.s $fa0, $a0, 0 +; LA64D-NEXT: fst.s $fa2, $a1, 0 +; LA64D-NEXT: fmov.s $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_f32: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_0) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI5_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_1) +; LA64D-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI5_1) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI5_2) +; LA64D-FRECIPE-NEXT: fld.s $fa5, $a2, %pc_lo12(.LCPI5_2) +; LA64D-FRECIPE-NEXT: fmadd.s $fa2, $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: ret +; + %sqrt = tail call fast float @llvm.sqrt.f32(float %x) + %rsqrt = fdiv fast float 1.0, %sqrt + %r = fdiv fast float 42.0, %sqrt + %sqrt_fast = fdiv fast float %x, %sqrt + store float %rsqrt, ptr %p1, align 8 + store float %r, ptr %p2, align 8 + ret float %sqrt_fast +} + +define float @sqrt_simplify_before_recip_4_uses_f32(float %x, ptr %p1, ptr %p2, ptr %p3) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_4_uses_f32: +; LA32F: # %bb.0: +; LA32F-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) +; LA32F-NEXT: fld.s $fa2, $a3, %pc_lo12(.LCPI6_0) +; LA32F-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) +; LA32F-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1) +; LA32F-NEXT: fsqrt.s $fa1, $fa0 +; LA32F-NEXT: frsqrt.s $fa0, $fa0 +; LA32F-NEXT: fdiv.s $fa2, $fa2, $fa1 +; LA32F-NEXT: fdiv.s $fa3, $fa3, $fa1 +; LA32F-NEXT: fst.s $fa0, $a0, 0 +; LA32F-NEXT: fst.s $fa2, $a1, 0 +; LA32F-NEXT: fst.s $fa3, $a2, 0 +; LA32F-NEXT: fmov.s $fa0, $fa1 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f32: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) +; LA32F-FRECIPE-NEXT: fld.s $fa1, $a3, %pc_lo12(.LCPI6_0) +; LA32F-FRECIPE-NEXT: frsqrte.s $fa2, $fa0 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA32F-FRECIPE-NEXT: fmadd.s $fa1, $fa3, $fa2, $fa1 +; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1) +; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_2) +; LA32F-FRECIPE-NEXT: fld.s $fa4, $a3, %pc_lo12(.LCPI6_2) +; LA32F-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_3) +; LA32F-FRECIPE-NEXT: fld.s $fa5, $a3, %pc_lo12(.LCPI6_3) +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa2, $fa3 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, 
$fa2, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa4 +; LA32F-FRECIPE-NEXT: fmul.s $fa3, $fa1, $fa5 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA32F-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA32F-FRECIPE-NEXT: fst.s $fa3, $a2, 0 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_4_uses_f32: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) +; LA64D-NEXT: fld.s $fa2, $a3, %pc_lo12(.LCPI6_0) +; LA64D-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) +; LA64D-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1) +; LA64D-NEXT: fsqrt.s $fa1, $fa0 +; LA64D-NEXT: frsqrt.s $fa0, $fa0 +; LA64D-NEXT: fdiv.s $fa2, $fa2, $fa1 +; LA64D-NEXT: fdiv.s $fa3, $fa3, $fa1 +; LA64D-NEXT: fst.s $fa0, $a0, 0 +; LA64D-NEXT: fst.s $fa2, $a1, 0 +; LA64D-NEXT: fst.s $fa3, $a2, 0 +; LA64D-NEXT: fmov.s $fa0, $fa1 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_4_uses_f32: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_0) +; LA64D-FRECIPE-NEXT: fld.s $fa1, $a3, %pc_lo12(.LCPI6_0) +; LA64D-FRECIPE-NEXT: frsqrte.s $fa2, $fa0 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa3, $fa0, $fa2 +; LA64D-FRECIPE-NEXT: fmadd.s $fa1, $fa3, $fa2, $fa1 +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_1) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a3, %pc_lo12(.LCPI6_1) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_2) +; LA64D-FRECIPE-NEXT: fld.s $fa4, $a3, %pc_lo12(.LCPI6_2) +; LA64D-FRECIPE-NEXT: pcalau12i $a3, %pc_hi20(.LCPI6_3) +; LA64D-FRECIPE-NEXT: fld.s $fa5, $a3, %pc_lo12(.LCPI6_3) +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa2, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa2, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fmul.s $fa3, $fa1, $fa5 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fst.s $fa1, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa2, $a1, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa3, $a2, 0 +; LA64D-FRECIPE-NEXT: ret +; + %sqrt = tail call fast float @llvm.sqrt.f32(float %x) + %rsqrt = fdiv fast float 1.0, %sqrt + %r1 = fdiv fast float 42.0, %sqrt + %r2 = fdiv fast float 43.0, %sqrt + %sqrt_fast = fdiv fast float %x, %sqrt + store float %rsqrt, ptr %p1, align 8 + store float %r1, ptr %p2, align 8 + store float %r2, ptr %p3, align 8 + ret float %sqrt_fast +} + +define float @sqrt_simplify_before_recip_3_uses_order_f32(float %x, ptr %p1, ptr %p2) nounwind { +; LA32F-LABEL: sqrt_simplify_before_recip_3_uses_order_f32: +; LA32F: # %bb.0: +; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA32F-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI7_0) +; LA32F-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) +; LA32F-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_1) +; LA32F-NEXT: fsqrt.s $fa0, $fa0 +; LA32F-NEXT: fdiv.s $fa1, $fa1, $fa0 +; LA32F-NEXT: fdiv.s $fa2, $fa2, $fa0 +; LA32F-NEXT: fst.s $fa1, $a0, 0 +; LA32F-NEXT: fst.s $fa2, $a1, 0 +; LA32F-NEXT: ret +; +; LA32F-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f32: +; LA32F-FRECIPE: # %bb.0: +; LA32F-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA32F-FRECIPE-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_0) +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_1) +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa4, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmadd.s $fa2, $fa4, $fa1, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA32F-FRECIPE-NEXT: pcalau12i $a2, 
%pc_hi20(.LCPI7_2) +; LA32F-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_2) +; LA32F-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_3) +; LA32F-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI7_3) +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA32F-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA32F-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa3 +; LA32F-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA32F-FRECIPE-NEXT: fst.s $fa2, $a0, 0 +; LA32F-FRECIPE-NEXT: fst.s $fa1, $a1, 0 +; LA32F-FRECIPE-NEXT: ret +; +; LA64D-LABEL: sqrt_simplify_before_recip_3_uses_order_f32: +; LA64D: # %bb.0: +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA64D-NEXT: fld.s $fa1, $a2, %pc_lo12(.LCPI7_0) +; LA64D-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) +; LA64D-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_1) +; LA64D-NEXT: fsqrt.s $fa0, $fa0 +; LA64D-NEXT: fdiv.s $fa1, $fa1, $fa0 +; LA64D-NEXT: fdiv.s $fa2, $fa2, $fa0 +; LA64D-NEXT: fst.s $fa1, $a0, 0 +; LA64D-NEXT: fst.s $fa2, $a1, 0 +; LA64D-NEXT: ret +; +; LA64D-FRECIPE-LABEL: sqrt_simplify_before_recip_3_uses_order_f32: +; LA64D-FRECIPE: # %bb.0: +; LA64D-FRECIPE-NEXT: frsqrte.s $fa1, $fa0 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_0) +; LA64D-FRECIPE-NEXT: fld.s $fa2, $a2, %pc_lo12(.LCPI7_0) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_1) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_1) +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa4, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmadd.s $fa2, $fa4, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_2) +; LA64D-FRECIPE-NEXT: fld.s $fa3, $a2, %pc_lo12(.LCPI7_2) +; LA64D-FRECIPE-NEXT: pcalau12i $a2, %pc_hi20(.LCPI7_3) +; LA64D-FRECIPE-NEXT: fld.s $fa4, $a2, %pc_lo12(.LCPI7_3) +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa2 +; LA64D-FRECIPE-NEXT: fmul.s $fa0, $fa0, $fa1 +; LA64D-FRECIPE-NEXT: fmul.s $fa2, $fa1, $fa3 +; LA64D-FRECIPE-NEXT: fmul.s $fa1, $fa1, $fa4 +; LA64D-FRECIPE-NEXT: fst.s $fa2, $a0, 0 +; LA64D-FRECIPE-NEXT: fst.s $fa1, $a1, 0 +; LA64D-FRECIPE-NEXT: ret +; + %sqrt = tail call fast float @llvm.sqrt.f32(float %x) + %sqrt_fast = fdiv fast float %x, %sqrt + %r1 = fdiv fast float 42.0, %sqrt + %r2 = fdiv fast float 43.0, %sqrt + store float %r1, ptr %p1, align 8 + store float %r2, ptr %p2, align 8 + ret float %sqrt_fast +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll new file mode 100644 index 000000000000..769d9ef81faf --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fdiv-reciprocal-estimate.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s + +define void @fdiv_v8f32(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v8f32: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvld $xr1, $a2, 0 +; FAULT-NEXT: xvfdiv.s $xr0, $xr0, $xr1 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: xvfrecipe.s $xr2, $xr0 +; CHECK-NEXT: xvfmul.s $xr3, $xr1, $xr2 +; CHECK-NEXT: xvfnmsub.s $xr0, $xr0, $xr3, $xr1 +; CHECK-NEXT: xvfmadd.s $xr0, $xr2, $xr0, $xr3 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = 
load <8 x float>, ptr %a0 + %v1 = load <8 x float>, ptr %a1 + %v2 = fdiv fast <8 x float> %v0, %v1 + store <8 x float> %v2, ptr %res + ret void +} + +define void @fdiv_v4f64(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v4f64: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvld $xr1, $a2, 0 +; FAULT-NEXT: xvfdiv.d $xr0, $xr0, $xr1 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a2, 0 +; CHECK-NEXT: xvld $xr1, $a1, 0 +; CHECK-NEXT: lu52i.d $a1, $zero, -1025 +; CHECK-NEXT: xvreplgr2vr.d $xr2, $a1 +; CHECK-NEXT: xvfrecipe.d $xr3, $xr0 +; CHECK-NEXT: xvfmadd.d $xr2, $xr0, $xr3, $xr2 +; CHECK-NEXT: xvfnmsub.d $xr2, $xr2, $xr3, $xr3 +; CHECK-NEXT: xvfmul.d $xr3, $xr1, $xr2 +; CHECK-NEXT: xvfnmsub.d $xr0, $xr0, $xr3, $xr1 +; CHECK-NEXT: xvfmadd.d $xr0, $xr2, $xr0, $xr3 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %v1 = load <4 x double>, ptr %a1 + %v2 = fdiv fast <4 x double> %v0, %v1 + store <4 x double> %v2, ptr %res + ret void +} + +;; 1.0 / vec +define void @one_fdiv_v8f32(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v8f32: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrecip.s $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v8f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrecipe.s $xr1, $xr0 +; CHECK-NEXT: lu12i.w $a1, -264192 +; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 +; CHECK-NEXT: xvfmadd.s $xr0, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfnmsub.s $xr0, $xr0, $xr1, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0 + %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %v0 + store <8 x float> %div, ptr %res + ret void +} + +define void @one_fdiv_v4f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v4f64: +; FAULT: # %bb.0: +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrecip.d $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrecipe.d $xr1, $xr0 +; CHECK-NEXT: lu52i.d $a1, $zero, 1023 +; CHECK-NEXT: xvreplgr2vr.d $xr2, $a1 +; CHECK-NEXT: xvfnmsub.d $xr3, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfmadd.d $xr1, $xr1, $xr3, $xr1 +; CHECK-NEXT: xvfnmsub.d $xr0, $xr0, $xr1, $xr2 +; CHECK-NEXT: xvfmadd.d $xr0, $xr1, $xr0, $xr1 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0 + %div = fdiv fast <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %v0 + store <4 x double> %div, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll new file mode 100644 index 000000000000..48fd12697417 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lasx/fsqrt-reciprocal-estimate.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lasx,+frecipe < %s | FileCheck %s + +;; 1.0 / (fsqrt vec) +define void @one_div_sqrt_v8f32(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_div_sqrt_v8f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrsqrt.s $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_div_sqrt_v8f32: +; CHECK: # %bb.0: # 
%entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrsqrte.s $xr1, $xr0 +; CHECK-NEXT: xvfmul.s $xr1, $xr0, $xr1 +; CHECK-NEXT: xvfmul.s $xr0, $xr0, $xr1 +; CHECK-NEXT: lu12i.w $a1, -261120 +; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 +; CHECK-NEXT: xvfmadd.s $xr0, $xr0, $xr1, $xr2 +; CHECK-NEXT: lu12i.w $a1, -266240 +; CHECK-NEXT: xvreplgr2vr.w $xr2, $a1 +; CHECK-NEXT: xvfmul.s $xr1, $xr1, $xr2 +; CHECK-NEXT: xvfmul.s $xr0, $xr1, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <8 x float>, ptr %a0, align 16 + %sqrt = call fast <8 x float> @llvm.sqrt.v8f32 (<8 x float> %v0) + %div = fdiv fast <8 x float> <float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt + store <8 x float> %div, ptr %res, align 16 + ret void +} + +define void @one_div_sqrt_v4f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_div_sqrt_v4f64: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: xvld $xr0, $a1, 0 +; FAULT-NEXT: xvfrsqrt.d $xr0, $xr0 +; FAULT-NEXT: xvst $xr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_div_sqrt_v4f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: xvld $xr0, $a1, 0 +; CHECK-NEXT: xvfrsqrte.d $xr1, $xr0 +; CHECK-NEXT: xvfmul.d $xr1, $xr0, $xr1 +; CHECK-NEXT: xvfmul.d $xr2, $xr0, $xr1 +; CHECK-NEXT: ori $a1, $zero, 0 +; CHECK-NEXT: lu32i.d $a1, -524288 +; CHECK-NEXT: lu52i.d $a1, $a1, -1024 +; CHECK-NEXT: xvreplgr2vr.d $xr3, $a1 +; CHECK-NEXT: xvfmadd.d $xr2, $xr2, $xr1, $xr3 +; CHECK-NEXT: lu52i.d $a1, $zero, -1026 +; CHECK-NEXT: xvreplgr2vr.d $xr4, $a1 +; CHECK-NEXT: xvfmul.d $xr1, $xr1, $xr4 +; CHECK-NEXT: xvfmul.d $xr1, $xr1, $xr2 +; CHECK-NEXT: xvfmul.d $xr0, $xr0, $xr1 +; CHECK-NEXT: xvfmadd.d $xr0, $xr0, $xr1, $xr3 +; CHECK-NEXT: xvfmul.d $xr1, $xr1, $xr4 +; CHECK-NEXT: xvfmul.d $xr0, $xr1, $xr0 +; CHECK-NEXT: xvst $xr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x double>, ptr %a0, align 16 + %sqrt = call fast <4 x double> @llvm.sqrt.v4f64 (<4 x double> %v0) + %div = fdiv fast <4 x double> <double 1.0, double 1.0, double 1.0, double 1.0>, %sqrt + store <4 x double> %div, ptr %res, align 16 + ret void +} + +declare <8 x float> @llvm.sqrt.v8f32(<8 x float>) +declare <4 x double> @llvm.sqrt.v4f64(<4 x double>) diff --git a/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll new file mode 100644 index 000000000000..21dbbf310ad8 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fdiv-reciprocal-estimate.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 4 +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s + +define void @fdiv_v4f32(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v4f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vld $vr1, $a2, 0 +; FAULT-NEXT: vfdiv.s $vr0, $vr0, $vr1 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: vfrecipe.s $vr2, $vr0 +; CHECK-NEXT: vfmul.s $vr3, $vr1, $vr2 +; CHECK-NEXT: vfnmsub.s $vr0, $vr0, $vr3, $vr1 +; CHECK-NEXT: vfmadd.s $vr0, $vr2, $vr0, $vr3 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %v1 = load <4 x float>, ptr %a1 + %v2 = fdiv fast <4 x float> %v0, %v1 + store <4 x float> %v2, ptr %res + ret void +} + +define void @fdiv_v2f64(ptr %res, ptr %a0, ptr %a1) nounwind { +; FAULT-LABEL: fdiv_v2f64: +; FAULT: # %bb.0: # 
%entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vld $vr1, $a2, 0 +; FAULT-NEXT: vfdiv.d $vr0, $vr0, $vr1 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: fdiv_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a2, 0 +; CHECK-NEXT: vld $vr1, $a1, 0 +; CHECK-NEXT: lu52i.d $a1, $zero, -1025 +; CHECK-NEXT: vreplgr2vr.d $vr2, $a1 +; CHECK-NEXT: vfrecipe.d $vr3, $vr0 +; CHECK-NEXT: vfmadd.d $vr2, $vr0, $vr3, $vr2 +; CHECK-NEXT: vfnmsub.d $vr2, $vr2, $vr3, $vr3 +; CHECK-NEXT: vfmul.d $vr3, $vr1, $vr2 +; CHECK-NEXT: vfnmsub.d $vr0, $vr0, $vr3, $vr1 +; CHECK-NEXT: vfmadd.d $vr0, $vr2, $vr0, $vr3 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %v1 = load <2 x double>, ptr %a1 + %v2 = fdiv fast <2 x double> %v0, %v1 + store <2 x double> %v2, ptr %res + ret void +} + +;; 1.0 / vec +define void @one_fdiv_v4f32(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v4f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vfrecip.s $vr0, $vr0 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrecipe.s $vr1, $vr0 +; CHECK-NEXT: lu12i.w $a1, -264192 +; CHECK-NEXT: vreplgr2vr.w $vr2, $a1 +; CHECK-NEXT: vfmadd.s $vr0, $vr0, $vr1, $vr2 +; CHECK-NEXT: vfnmsub.s $vr0, $vr0, $vr1, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0 + %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %v0 + store <4 x float> %div, ptr %res + ret void +} + +define void @one_fdiv_v2f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_fdiv_v2f64: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vfrecip.d $vr0, $vr0 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_fdiv_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrecipe.d $vr1, $vr0 +; CHECK-NEXT: lu52i.d $a1, $zero, 1023 +; CHECK-NEXT: vreplgr2vr.d $vr2, $a1 +; CHECK-NEXT: vfnmsub.d $vr3, $vr0, $vr1, $vr2 +; CHECK-NEXT: vfmadd.d $vr1, $vr1, $vr3, $vr1 +; CHECK-NEXT: vfnmsub.d $vr0, $vr0, $vr1, $vr2 +; CHECK-NEXT: vfmadd.d $vr0, $vr1, $vr0, $vr1 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0 + %div = fdiv fast <2 x double> <double 1.0, double 1.0>, %v0 + store <2 x double> %div, ptr %res + ret void +} diff --git a/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll b/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll new file mode 100644 index 000000000000..912d06242f7d --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/lsx/fsqrt-reciprocal-estimate.ll @@ -0,0 +1,75 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,-frecipe < %s | FileCheck %s --check-prefix=FAULT +; RUN: llc --mtriple=loongarch64 --mattr=+lsx,+frecipe < %s | FileCheck %s + +;; 1.0 / (fsqrt vec) +define void @one_div_sqrt_v4f32(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_div_sqrt_v4f32: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vfrsqrt.s $vr0, $vr0 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_div_sqrt_v4f32: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrsqrte.s $vr1, $vr0 +; CHECK-NEXT: vfmul.s $vr1, $vr0, $vr1 +; CHECK-NEXT: vfmul.s $vr0, $vr0, $vr1 +; CHECK-NEXT: lu12i.w $a1, -261120 +; CHECK-NEXT: vreplgr2vr.w $vr2, $a1 +; CHECK-NEXT: vfmadd.s $vr0, $vr0, $vr1, $vr2 +; 
CHECK-NEXT: lu12i.w $a1, -266240 +; CHECK-NEXT: vreplgr2vr.w $vr2, $a1 +; CHECK-NEXT: vfmul.s $vr1, $vr1, $vr2 +; CHECK-NEXT: vfmul.s $vr0, $vr1, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <4 x float>, ptr %a0, align 16 + %sqrt = call fast <4 x float> @llvm.sqrt.v4f32 (<4 x float> %v0) + %div = fdiv fast <4 x float> <float 1.0, float 1.0, float 1.0, float 1.0>, %sqrt + store <4 x float> %div, ptr %res, align 16 + ret void +} + +define void @one_div_sqrt_v2f64(ptr %res, ptr %a0) nounwind { +; FAULT-LABEL: one_div_sqrt_v2f64: +; FAULT: # %bb.0: # %entry +; FAULT-NEXT: vld $vr0, $a1, 0 +; FAULT-NEXT: vfrsqrt.d $vr0, $vr0 +; FAULT-NEXT: vst $vr0, $a0, 0 +; FAULT-NEXT: ret +; +; CHECK-LABEL: one_div_sqrt_v2f64: +; CHECK: # %bb.0: # %entry +; CHECK-NEXT: vld $vr0, $a1, 0 +; CHECK-NEXT: vfrsqrte.d $vr1, $vr0 +; CHECK-NEXT: vfmul.d $vr1, $vr0, $vr1 +; CHECK-NEXT: vfmul.d $vr2, $vr0, $vr1 +; CHECK-NEXT: ori $a1, $zero, 0 +; CHECK-NEXT: lu32i.d $a1, -524288 +; CHECK-NEXT: lu52i.d $a1, $a1, -1024 +; CHECK-NEXT: vreplgr2vr.d $vr3, $a1 +; CHECK-NEXT: vfmadd.d $vr2, $vr2, $vr1, $vr3 +; CHECK-NEXT: lu52i.d $a1, $zero, -1026 +; CHECK-NEXT: vreplgr2vr.d $vr4, $a1 +; CHECK-NEXT: vfmul.d $vr1, $vr1, $vr4 +; CHECK-NEXT: vfmul.d $vr1, $vr1, $vr2 +; CHECK-NEXT: vfmul.d $vr0, $vr0, $vr1 +; CHECK-NEXT: vfmadd.d $vr0, $vr0, $vr1, $vr3 +; CHECK-NEXT: vfmul.d $vr1, $vr1, $vr4 +; CHECK-NEXT: vfmul.d $vr0, $vr1, $vr0 +; CHECK-NEXT: vst $vr0, $a0, 0 +; CHECK-NEXT: ret +entry: + %v0 = load <2 x double>, ptr %a0, align 16 + %sqrt = call fast <2 x double> @llvm.sqrt.v2f64 (<2 x double> %v0) + %div = fdiv fast <2 x double> <double 1.0, double 1.0>, %sqrt + store <2 x double> %div, ptr %res, align 16 + ret void +} + +declare <4 x float> @llvm.sqrt.v4f32(<4 x float>) +declare <2 x double> @llvm.sqrt.v2f64(<2 x double>) -- GitLab From 69a798a996e0cd9d521db38167cadf841d629d38 Mon Sep 17 00:00:00 2001 From: goldsteinn <35538541+goldsteinn@users.noreply.github.com> Date: Thu, 17 Oct 2024 21:28:47 -0400 Subject: [PATCH 313/329] Reapply "[Inliner] Propagate more attributes to params when inlining (#91101)" (2nd Attempt) (#112749) The root cause of the bug was code hanging onto the `range` attr after changing BitWidth. This was fixed in PR #112633. 
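As a minimal illustration (hypothetical @top/@mid/@leaf, not functions from the tests in this patch), inlining @mid into @top now forwards the exact argument's callsite attributes onto the inner call:

  declare void @leaf(ptr)

  define void @mid(ptr %p) {
    call void @leaf(ptr %p)          ; forwards %p exactly
    ret void
  }

  define void @top(ptr %p) {
    call void @mid(ptr nonnull align 8 %p)
    ret void
  }

  ; After inlining @mid, the inner call becomes:
  ;   call void @leaf(ptr nonnull align 8 %p)

Byval parameters and immediate-constant arguments are deliberately skipped, as the comments in the diff below explain.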
--- .../test/CodeGen/attr-counted-by-pr88931.cpp | 2 +- clang/test/OpenMP/bug57757.cpp | 2 +- llvm/include/llvm/IR/Attributes.h | 7 + llvm/lib/IR/Attributes.cpp | 15 ++ llvm/lib/Transforms/Utils/InlineFunction.cpp | 90 ++++++++-- .../Inline/access-attributes-prop.ll | 164 +++++++++++++++++- .../Inline/assumptions-from-callsite-attrs.ll | 2 +- llvm/test/Transforms/Inline/byval.ll | 4 +- llvm/test/Transforms/PhaseOrdering/pr95152.ll | 2 +- 9 files changed, 259 insertions(+), 29 deletions(-) diff --git a/clang/test/CodeGen/attr-counted-by-pr88931.cpp b/clang/test/CodeGen/attr-counted-by-pr88931.cpp index 2a8cc1d07e50..6d0c46bbbe8f 100644 --- a/clang/test/CodeGen/attr-counted-by-pr88931.cpp +++ b/clang/test/CodeGen/attr-counted-by-pr88931.cpp @@ -13,7 +13,7 @@ void init(void * __attribute__((pass_dynamic_object_size(0)))); // CHECK-LABEL: define dso_local void @_ZN3foo3barC1Ev( // CHECK-SAME: ptr noundef nonnull align 4 dereferenceable(1) [[THIS:%.*]]) unnamed_addr #[[ATTR0:[0-9]+]] align 2 { // CHECK-NEXT: entry: -// CHECK-NEXT: tail call void @_Z4initPvU25pass_dynamic_object_size0(ptr noundef nonnull [[THIS]], i64 noundef -1) #[[ATTR2:[0-9]+]] +// CHECK-NEXT: tail call void @_Z4initPvU25pass_dynamic_object_size0(ptr noundef nonnull align 4 dereferenceable(1) [[THIS]], i64 noundef -1) #[[ATTR2:[0-9]+]] // CHECK-NEXT: ret void // foo::bar::bar() { diff --git a/clang/test/OpenMP/bug57757.cpp b/clang/test/OpenMP/bug57757.cpp index 240b22a30671..eabf233dde24 100644 --- a/clang/test/OpenMP/bug57757.cpp +++ b/clang/test/OpenMP/bug57757.cpp @@ -39,7 +39,7 @@ void foo() { // CHECK-NEXT: ] // CHECK: .untied.jmp..i: // CHECK-NEXT: store i32 1, ptr [[TMP2]], align 4, !tbaa [[TBAA16]], !alias.scope [[META13]], !noalias [[META17]] -// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr [[TMP1]]), !noalias [[META13]] +// CHECK-NEXT: [[TMP4:%.*]] = tail call i32 @__kmpc_omp_task(ptr nonnull @[[GLOB1]], i32 [[TMP0]], ptr nonnull [[TMP1]]), !noalias [[META13]] // CHECK-NEXT: br label [[DOTOMP_OUTLINED__EXIT]] // CHECK: .untied.next..i: // CHECK-NEXT: [[TMP5:%.*]] = getelementptr inbounds nuw i8, ptr [[TMP1]], i64 40 diff --git a/llvm/include/llvm/IR/Attributes.h b/llvm/include/llvm/IR/Attributes.h index feeb3a9ddba9..2755ced404dd 100644 --- a/llvm/include/llvm/IR/Attributes.h +++ b/llvm/include/llvm/IR/Attributes.h @@ -947,6 +947,9 @@ public: /// arg. uint64_t getParamDereferenceableOrNullBytes(unsigned ArgNo) const; + /// Get range (or std::nullopt if unknown) of an arg. + std::optional<ConstantRange> getParamRange(unsigned ArgNo) const; + /// Get the disallowed floating-point classes of the return value. FPClassTest getRetNoFPClass() const; @@ -1123,6 +1126,10 @@ public: /// invalid if the Kind is not present in the builder. Attribute getAttribute(StringRef Kind) const; + /// Retrieve the range if the attribute exists (std::nullopt is returned + /// otherwise). + std::optional<ConstantRange> getRange() const; + /// Return raw (possibly packed/encoded) value of integer attribute or /// std::nullopt if not set. 
std::optional<uint64_t> getRawIntAttr(Attribute::AttrKind Kind) const; diff --git a/llvm/lib/IR/Attributes.cpp b/llvm/lib/IR/Attributes.cpp index 223c917766a4..e9daa01b899e 100644 --- a/llvm/lib/IR/Attributes.cpp +++ b/llvm/lib/IR/Attributes.cpp @@ -1931,6 +1931,14 @@ AttributeList::getParamDereferenceableOrNullBytes(unsigned Index) const { return getParamAttrs(Index).getDereferenceableOrNullBytes(); } +std::optional<ConstantRange> +AttributeList::getParamRange(unsigned ArgNo) const { + auto RangeAttr = getParamAttrs(ArgNo).getAttribute(Attribute::Range); + if (RangeAttr.isValid()) + return RangeAttr.getRange(); + return std::nullopt; +} + FPClassTest AttributeList::getRetNoFPClass() const { return getRetAttrs().getNoFPClass(); } @@ -2277,6 +2285,13 @@ Attribute AttrBuilder::getAttribute(StringRef A) const { return {}; } +std::optional<ConstantRange> AttrBuilder::getRange() const { + const Attribute RangeAttr = getAttribute(Attribute::Range); + if (RangeAttr.isValid()) + return RangeAttr.getRange(); + return std::nullopt; +} + bool AttrBuilder::contains(Attribute::AttrKind A) const { return getAttribute(A).isValid(); } diff --git a/llvm/lib/Transforms/Utils/InlineFunction.cpp b/llvm/lib/Transforms/Utils/InlineFunction.cpp index a0a93dc0dab5..4ad426285ce2 100644 --- a/llvm/lib/Transforms/Utils/InlineFunction.cpp +++ b/llvm/lib/Transforms/Utils/InlineFunction.cpp @@ -34,6 +34,7 @@ #include "llvm/Analysis/VectorUtils.h" #include "llvm/IR/Argument.h" #include "llvm/IR/AttributeMask.h" +#include "llvm/IR/Attributes.h" #include "llvm/IR/BasicBlock.h" #include "llvm/IR/CFG.h" #include "llvm/IR/Constant.h" @@ -59,6 +60,7 @@ #include "llvm/IR/MDBuilder.h" #include "llvm/IR/Metadata.h" #include "llvm/IR/Module.h" +#include "llvm/IR/PatternMatch.h" #include "llvm/IR/ProfDataUtils.h" #include "llvm/IR/Type.h" #include "llvm/IR/User.h" @@ -1358,18 +1360,36 @@ static void AddParamAndFnBasicAttributes(const CallBase &CB, auto &Context = CalledFunction->getContext(); // Collect valid attributes for all params. - SmallVector<AttrBuilder> ValidParamAttrs; + SmallVector<AttrBuilder> ValidObjParamAttrs, ValidExactParamAttrs; bool HasAttrToPropagate = false; + // Attributes we can only propagate if the exact parameter is forwarded. + // We can propagate both poison generating and UB generating attributes + // without any extra checks. The only attribute that is tricky to propagate + // is `noundef` (skipped for now) as that can create new UB where previous + // behavior was just using a poison value. + static const Attribute::AttrKind ExactAttrsToPropagate[] = { + Attribute::Dereferenceable, Attribute::DereferenceableOrNull, + Attribute::NonNull, Attribute::Alignment, Attribute::Range}; + for (unsigned I = 0, E = CB.arg_size(); I < E; ++I) { - ValidParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); + ValidObjParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); + ValidExactParamAttrs.emplace_back(AttrBuilder{CB.getContext()}); // Access attributes can be propagated to any param with the same underlying // object as the argument. 
if (CB.paramHasAttr(I, Attribute::ReadNone)) - ValidParamAttrs.back().addAttribute(Attribute::ReadNone); + ValidObjParamAttrs.back().addAttribute(Attribute::ReadNone); if (CB.paramHasAttr(I, Attribute::ReadOnly)) - ValidParamAttrs.back().addAttribute(Attribute::ReadOnly); - HasAttrToPropagate |= ValidParamAttrs.back().hasAttributes(); + ValidObjParamAttrs.back().addAttribute(Attribute::ReadOnly); + + for (Attribute::AttrKind AK : ExactAttrsToPropagate) { + Attribute Attr = CB.getParamAttr(I, AK); + if (Attr.isValid()) + ValidExactParamAttrs.back().addAttribute(Attr); + } + + HasAttrToPropagate |= ValidObjParamAttrs.back().hasAttributes(); + HasAttrToPropagate |= ValidExactParamAttrs.back().hasAttributes(); } // Won't be able to propagate anything. @@ -1391,22 +1411,60 @@ static void AddParamAndFnBasicAttributes(const CallBase &CB, AttributeList AL = NewInnerCB->getAttributes(); for (unsigned I = 0, E = InnerCB->arg_size(); I < E; ++I) { - // Check if the underlying value for the parameter is an argument. - const Value *UnderlyingV = - getUnderlyingObject(InnerCB->getArgOperand(I)); - const Argument *Arg = dyn_cast<Argument>(UnderlyingV); - if (!Arg) + // It's unsound or requires special handling to propagate + // attributes to byval arguments. Even if CalledFunction + // doesn't e.g. write to the argument (readonly), the call to + // NewInnerCB may write to its by-value copy. + if (NewInnerCB->paramHasAttr(I, Attribute::ByVal)) continue; - if (NewInnerCB->paramHasAttr(I, Attribute::ByVal)) - // It's unsound to propagate memory attributes to byval arguments. - // Even if CalledFunction doesn't e.g. write to the argument, - // the call to NewInnerCB may write to its by-value copy. + // Don't bother propagating attrs to constants. + if (match(NewInnerCB->getArgOperand(I), + llvm::PatternMatch::m_ImmConstant())) continue; - unsigned ArgNo = Arg->getArgNo(); + // Check if the underlying value for the parameter is an argument. + const Argument *Arg = dyn_cast<Argument>(InnerCB->getArgOperand(I)); + unsigned ArgNo; + if (Arg) { + ArgNo = Arg->getArgNo(); + // For dereferenceable, dereferenceable_or_null, align, etc... + // we don't want to propagate if the existing param has the same + // attribute with "better" constraints. So remove from the + // new AL if the region of the existing param is larger than + // what we can propagate. + AttrBuilder NewAB{ + Context, AttributeSet::get(Context, ValidExactParamAttrs[ArgNo])}; + if (AL.getParamDereferenceableBytes(I) > + NewAB.getDereferenceableBytes()) + NewAB.removeAttribute(Attribute::Dereferenceable); + if (AL.getParamDereferenceableOrNullBytes(I) > + NewAB.getDereferenceableOrNullBytes()) + NewAB.removeAttribute(Attribute::DereferenceableOrNull); + if (AL.getParamAlignment(I).valueOrOne() > + NewAB.getAlignment().valueOrOne()) + NewAB.removeAttribute(Attribute::Alignment); + if (auto ExistingRange = AL.getParamRange(I)) { + if (auto NewRange = NewAB.getRange()) { + ConstantRange CombinedRange = + ExistingRange->intersectWith(*NewRange); + NewAB.removeAttribute(Attribute::Range); + NewAB.addRangeAttr(CombinedRange); + } + } + AL = AL.addParamAttributes(Context, I, NewAB); + } else { + // Check if the underlying value for the parameter is an argument. + const Value *UnderlyingV = + getUnderlyingObject(InnerCB->getArgOperand(I)); + Arg = dyn_cast<Argument>(UnderlyingV); + if (!Arg) + continue; + ArgNo = Arg->getArgNo(); + } + + // If so, propagate its access attributes. 
- AL = AL.addParamAttributes(Context, I, ValidParamAttrs[ArgNo]); + AL = AL.addParamAttributes(Context, I, ValidObjParamAttrs[ArgNo]); // We can have conflicting attributes from the inner callsite and // to-be-inlined callsite. In that case, choose the most // restrictive. diff --git a/llvm/test/Transforms/Inline/access-attributes-prop.ll b/llvm/test/Transforms/Inline/access-attributes-prop.ll index 5051c92345ec..5bf845d5ba94 100644 --- a/llvm/test/Transforms/Inline/access-attributes-prop.ll +++ b/llvm/test/Transforms/Inline/access-attributes-prop.ll @@ -47,7 +47,6 @@ define dso_local void @foo3_writable(ptr %p) { ret void } - define dso_local void @foo1_bar_aligned64_deref512(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@foo1_bar_aligned64_deref512 ; CHECK-SAME: (ptr [[P:%.*]]) { @@ -306,7 +305,7 @@ define void @prop_param_callbase_def_1x_partial_3(ptr %p, ptr %p2) { define void @prop_deref(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_deref ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr [[P]]) +; CHECK-NEXT: call void @bar1(ptr dereferenceable(16) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1(ptr dereferenceable(16) %p) @@ -316,7 +315,7 @@ define void @prop_deref(ptr %p) { define void @prop_deref_or_null(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_deref_or_null ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr [[P]]) +; CHECK-NEXT: call void @bar1(ptr dereferenceable_or_null(256) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1(ptr dereferenceable_or_null(256) %p) @@ -326,13 +325,23 @@ define void @prop_deref_or_null(ptr %p) { define void @prop_param_nonnull_and_align(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_nonnull_and_align ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr [[P]]) +; CHECK-NEXT: call void @bar1(ptr nonnull align 32 [[P]]) ; CHECK-NEXT: ret void ; call void @foo1(ptr nonnull align 32 %p) ret void } +define void @prop_param_nofree_and_align(ptr %p) { +; CHECK-LABEL: define {{[^@]+}}@prop_param_nofree_and_align +; CHECK-SAME: (ptr [[P:%.*]]) { +; CHECK-NEXT: call void @bar1(ptr align 32 [[P]]) +; CHECK-NEXT: ret void +; + call void @foo1(ptr nofree align 32 %p) + ret void +} + define void @prop_param_deref_align_no_update(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_deref_align_no_update ; CHECK-SAME: (ptr [[P:%.*]]) { @@ -346,7 +355,7 @@ define void @prop_param_deref_align_no_update(ptr %p) { define void @prop_param_deref_align_update(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_deref_align_update ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr align 64 dereferenceable(512) [[P]]) +; CHECK-NEXT: call void @bar1(ptr align 128 dereferenceable(1024) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1_bar_aligned64_deref512(ptr align 128 dereferenceable(1024) %p) @@ -356,7 +365,7 @@ define void @prop_param_deref_align_update(ptr %p) { define void @prop_param_deref_or_null_update(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_param_deref_or_null_update ; CHECK-SAME: (ptr [[P:%.*]]) { -; CHECK-NEXT: call void @bar1(ptr align 512 dereferenceable_or_null(512) [[P]]) +; CHECK-NEXT: call void @bar1(ptr align 512 dereferenceable_or_null(1024) [[P]]) ; CHECK-NEXT: ret void ; call void @foo1_bar_aligned512_deref_or_null512(ptr dereferenceable_or_null(1024) %p) @@ -539,7 +548,6 @@ define void @prop_no_conflict_writable(ptr %p) { ret void } - define void @prop_no_conflict_writable2(ptr %p) { ; CHECK-LABEL: define {{[^@]+}}@prop_no_conflict_writable2 ; CHECK-SAME: (ptr [[P:%.*]]) { @@ -600,3 
+608,145 @@ define void @prop_byval_readonly2(ptr %p) { call void @foo_byval_readonly2(ptr %p) ret void } + +declare void @bar5(i32) + +define dso_local void @foo4_range_0_10(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@foo4_range_0_10 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 10) [[V]]) +; CHECK-NEXT: ret void +; + call void @bar5(i32 range(i32 0, 10) %v) + ret void +} + +define dso_local void @foo4_range_10_40(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@foo4_range_10_40 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 10, 40) [[V]]) +; CHECK-NEXT: ret void +; + call void @bar5(i32 range(i32 10, 40) %v) + ret void +} + +define dso_local void @foo4_2_range_0_10(i32 range(i32 0, 10) %v) { +; CHECK-LABEL: define {{[^@]+}}@foo4_2_range_0_10 +; CHECK-SAME: (i32 range(i32 0, 10) [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 [[V]]) +; CHECK-NEXT: ret void +; + call void @bar5(i32 %v) + ret void +} + +define dso_local void @foo4(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@foo4 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 [[V]]) +; CHECK-NEXT: ret void +; + call void @bar5(i32 %v) + ret void +} + +define void @prop_range_empty_intersect(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_empty_intersect +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 0) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 11, 50) %v) + ret void +} + +define void @prop_range_empty(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_empty +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 1, 0) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4(i32 range(i32 1, 0) %v) + ret void +} + +define void @prop_range_empty_with_intersect(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_empty_with_intersect +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 1, 10) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 1, 0) %v) + ret void +} + +define void @prop_range_intersect1(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect1 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 9) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 0, 9) %v) + ret void +} + +define void @prop_range_intersect2(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect2 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 1, 9) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 1, 9) %v) + ret void +} + +define void @prop_range_intersect3(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect3 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 11) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_2_range_0_10(i32 range(i32 0, 11) %v) + ret void +} + +define void @prop_range_intersect4(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect4 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 0, 5) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_0_10(i32 range(i32 40, 5) %v) + ret void +} + +define void @prop_range_intersect5(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_intersect5 +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 10, 40) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_10_40(i32 range(i32 30, 20) %v) + ret void +} + +define void @prop_range_keep(i32 
%v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_keep +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 10, 40) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4_range_10_40(i32 %v) + ret void +} + +define void @prop_range_direct(i32 %v) { +; CHECK-LABEL: define {{[^@]+}}@prop_range_direct +; CHECK-SAME: (i32 [[V:%.*]]) { +; CHECK-NEXT: call void @bar5(i32 range(i32 1, 11) [[V]]) +; CHECK-NEXT: ret void +; + call void @foo4(i32 range(i32 1, 11) %v) + ret void +} diff --git a/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll b/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll index 1a219a22019c..c0943f4aefb8 100644 --- a/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll +++ b/llvm/test/Transforms/Inline/assumptions-from-callsite-attrs.ll @@ -8,7 +8,7 @@ declare void @h(ptr %p, ptr %q, ptr %z) define void @f(ptr %p, ptr %q, ptr %z) { ; CHECK-LABEL: define void @f ; CHECK-SAME: (ptr [[P:%.*]], ptr [[Q:%.*]], ptr [[Z:%.*]]) { -; CHECK-NEXT: call void @h(ptr [[P]], ptr [[Q]], ptr [[Z]]) +; CHECK-NEXT: call void @h(ptr nonnull [[P]], ptr [[Q]], ptr nonnull [[Z]]) ; CHECK-NEXT: ret void ; call void @g(ptr nonnull %p, ptr %q, ptr nonnull %z) diff --git a/llvm/test/Transforms/Inline/byval.ll b/llvm/test/Transforms/Inline/byval.ll index dd5be40b90a8..1a70da8472cb 100644 --- a/llvm/test/Transforms/Inline/byval.ll +++ b/llvm/test/Transforms/Inline/byval.ll @@ -106,7 +106,7 @@ define void @test3() nounwind { ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_SS]], align 1 ; CHECK-NEXT: call void @llvm.lifetime.start.p0(i64 12, ptr [[S1]]) ; CHECK-NEXT: call void @llvm.memcpy.p0.p0.i64(ptr align 1 [[S1]], ptr align 1 [[S]], i64 12, i1 false) -; CHECK-NEXT: call void @g3(ptr [[S1]]) #[[ATTR0]] +; CHECK-NEXT: call void @g3(ptr align 64 [[S1]]) #[[ATTR0]] ; CHECK-NEXT: call void @llvm.lifetime.end.p0(i64 12, ptr [[S1]]) ; CHECK-NEXT: ret void ; @@ -131,7 +131,7 @@ define i32 @test4() nounwind { ; CHECK-SAME: ) #[[ATTR0]] { ; CHECK-NEXT: entry: ; CHECK-NEXT: [[S:%.*]] = alloca [[STRUCT_SS:%.*]], align 64 -; CHECK-NEXT: call void @g3(ptr [[S]]) #[[ATTR0]] +; CHECK-NEXT: call void @g3(ptr align 64 [[S]]) #[[ATTR0]] ; CHECK-NEXT: ret i32 4 ; entry: diff --git a/llvm/test/Transforms/PhaseOrdering/pr95152.ll b/llvm/test/Transforms/PhaseOrdering/pr95152.ll index 16610c439f4c..fff94673a1a5 100644 --- a/llvm/test/Transforms/PhaseOrdering/pr95152.ll +++ b/llvm/test/Transforms/PhaseOrdering/pr95152.ll @@ -47,7 +47,7 @@ define void @f(ptr dead_on_unwind noalias %p) { ; CHECK-LABEL: define void @f( ; CHECK-SAME: ptr dead_on_unwind noalias [[P:%.*]]) local_unnamed_addr { ; CHECK-NEXT: store i64 3, ptr [[P]], align 4 -; CHECK-NEXT: tail call void @j(ptr nonnull [[P]]) +; CHECK-NEXT: tail call void @j(ptr nonnull align 8 dereferenceable(8) [[P]]) ; CHECK-NEXT: store i64 43, ptr [[P]], align 4 ; CHECK-NEXT: ret void ; -- GitLab From c3bbc3a57d439a039d2ea49d9b7e0f6e1c87219d Mon Sep 17 00:00:00 2001 From: sinan Date: Fri, 18 Oct 2024 09:46:41 +0800 Subject: [PATCH 314/329] [BOLT] Fix logs with no hex conversion (#112650) Add `utohexstr` to ensure that offsets/addresses are correctly formatted as hexadecimal values. 
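For illustration, a standalone sketch of the logging pattern being fixed (the variable name and value are made up; the real call sites are in the diff below). Streaming an integer into a raw_ostream prints it in decimal, so a "0x" prefix without an explicit conversion is misleading:

  #include "llvm/ADT/Twine.h"
  #include "llvm/Support/raw_ostream.h"
  using namespace llvm;

  int main() {
    uint64_t Offset = 4096;
    // Before the fix: decimal digits after a hex prefix -> "offset 0x4096".
    errs() << "offset 0x" << Offset << "\n";
    // After the fix: explicit hex conversion -> "offset 0x1000".
    errs() << "offset 0x" << Twine::utohexstr(Offset) << "\n";
    return 0;
  }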
---
 bolt/include/bolt/Core/DIEBuilder.h | 2 +-
 bolt/lib/Core/BinaryContext.cpp     | 4 ++--
 bolt/lib/Rewrite/DWARFRewriter.cpp  | 2 +-
 3 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/bolt/include/bolt/Core/DIEBuilder.h b/bolt/include/bolt/Core/DIEBuilder.h
index e5b057ea1e42..d1acba0f26c7 100644
--- a/bolt/include/bolt/Core/DIEBuilder.h
+++ b/bolt/include/bolt/Core/DIEBuilder.h
@@ -314,7 +314,7 @@ public:
     BC.errs()
         << "BOLT-ERROR: unable to find TypeUnit for Type Unit at offset 0x"
-        << DU.getOffset() << "\n";
+        << Twine::utohexstr(DU.getOffset()) << "\n";
     return nullptr;
   }
 
diff --git a/bolt/lib/Core/BinaryContext.cpp b/bolt/lib/Core/BinaryContext.cpp
index 1347047e1b70..f246750209d6 100644
--- a/bolt/lib/Core/BinaryContext.cpp
+++ b/bolt/lib/Core/BinaryContext.cpp
@@ -1294,8 +1294,8 @@ bool BinaryContext::handleAArch64Veneer(uint64_t Address, bool MatchOnly) {
     Veneer->getOrCreateLocalLabel(Address);
     Veneer->setMaxSize(TotalSize);
     Veneer->updateState(BinaryFunction::State::Disassembled);
-    LLVM_DEBUG(dbgs() << "BOLT-DEBUG: handling veneer function at 0x" << Address
-                      << "\n");
+    LLVM_DEBUG(dbgs() << "BOLT-DEBUG: handling veneer function at 0x"
+                      << Twine::utohexstr(Address) << "\n");
     return true;
   };
 
diff --git a/bolt/lib/Rewrite/DWARFRewriter.cpp b/bolt/lib/Rewrite/DWARFRewriter.cpp
index f9cb1b3895e7..1b5ba8b49d36 100644
--- a/bolt/lib/Rewrite/DWARFRewriter.cpp
+++ b/bolt/lib/Rewrite/DWARFRewriter.cpp
@@ -1362,7 +1362,7 @@ void DWARFRewriter::updateDWARFObjectAddressRanges(
         Die.getTag() == dwarf::DW_TAG_compile_unit)) {
       if (opts::Verbosity >= 1)
         errs() << "BOLT-WARNING: cannot update ranges for DIE in Unit offset 0x"
-               << Unit.getOffset() << '\n';
+               << Twine::utohexstr(Unit.getOffset()) << '\n';
     }
   }
 
-- 
GitLab


From 70865c448ca9ebca08a77264e748ac4343789675 Mon Sep 17 00:00:00 2001
From: Longsheng Mou
Date: Fri, 18 Oct 2024 09:55:41 +0800
Subject: [PATCH 315/329] [mlir][transforms] Add `signalPassFailure` in RemoveDeadValues (#112199)

This PR adds `signalPassFailure` in RemoveDeadValues to ensure that a
pass pipeline stops when this pass fails. Fixes #111757.

---
 mlir/lib/Transforms/RemoveDeadValues.cpp | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/mlir/lib/Transforms/RemoveDeadValues.cpp b/mlir/lib/Transforms/RemoveDeadValues.cpp
index 3de4fb75ed83..7e45f18b660b 100644
--- a/mlir/lib/Transforms/RemoveDeadValues.cpp
+++ b/mlir/lib/Transforms/RemoveDeadValues.cpp
@@ -589,7 +589,7 @@ void RemoveDeadValues::runOnOperation() {
   });
 
   if (acceptableIR.wasInterrupted())
-    return;
+    return signalPassFailure();
 
   module->walk([&](Operation *op) {
     if (auto funcOp = dyn_cast<FunctionOpInterface>(op)) {
-- 
GitLab


From 44b020a3818a01b77415ce12629b020b641af2ea Mon Sep 17 00:00:00 2001
From: Keith Packard
Date: Thu, 17 Oct 2024 19:06:47 -0700
Subject: [PATCH 316/329] [PowerPC][ISelLowering] Support -mstack-protector-guard=tls (#110928)

Add support for using a thread-local variable with a specified offset
for holding the stack guard canary value. This supports both 32- and
64-bit PowerPC targets.

This mirrors changes from #108942 but targeting PowerPC instead of
RISCV. Because both of these PRs modify the same driver functions, this
series is stacked on top of the RISC-V one.

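An illustrative invocation (the source file and offset are example
values; the guard register is fixed per target, `r13` for 64-bit and
`r2` for 32-bit):

```
clang --target=powerpc64le-unknown-linux-gnu -fstack-protector-all \
  -mstack-protector-guard=tls -mstack-protector-guard-reg=r13 \
  -mstack-protector-guard-offset=52 guard.c
```
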
--------- Signed-off-by: Keith Packard --- clang/lib/Driver/ToolChains/Clang.cpp | 18 ++- clang/test/CodeGen/stack-protector-guard.c | 16 +++ clang/test/Driver/stack-protector-guard.c | 57 +++++++- llvm/include/llvm/CodeGen/TargetLowering.h | 4 +- llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp | 4 +- .../SelectionDAG/SelectionDAGBuilder.cpp | 7 +- .../Target/AArch64/AArch64ISelLowering.cpp | 4 +- llvm/lib/Target/AArch64/AArch64ISelLowering.h | 2 +- llvm/lib/Target/ARM/ARMISelLowering.cpp | 2 +- llvm/lib/Target/ARM/ARMISelLowering.h | 2 +- llvm/lib/Target/PowerPC/PPCISelLowering.cpp | 8 +- llvm/lib/Target/PowerPC/PPCISelLowering.h | 2 +- llvm/lib/Target/PowerPC/PPCInstrInfo.cpp | 14 +- llvm/lib/Target/Sparc/SparcISelLowering.cpp | 4 +- llvm/lib/Target/Sparc/SparcISelLowering.h | 2 +- llvm/lib/Target/SystemZ/SystemZISelLowering.h | 4 +- llvm/lib/Target/X86/X86ISelLowering.cpp | 2 +- llvm/lib/Target/X86/X86ISelLowering.h | 2 +- .../CodeGen/PowerPC/stack-guard-global.ll | 122 ++++++++++++++++++ llvm/test/CodeGen/PowerPC/stack-guard-tls.ll | 114 ++++++++++++++++ 20 files changed, 354 insertions(+), 36 deletions(-) create mode 100644 llvm/test/CodeGen/PowerPC/stack-guard-global.ll create mode 100644 llvm/test/CodeGen/PowerPC/stack-guard-tls.ll diff --git a/clang/lib/Driver/ToolChains/Clang.cpp b/clang/lib/Driver/ToolChains/Clang.cpp index 3fc39296f442..192eb608de43 100644 --- a/clang/lib/Driver/ToolChains/Clang.cpp +++ b/clang/lib/Driver/ToolChains/Clang.cpp @@ -3595,7 +3595,7 @@ static void RenderSSPOptions(const Driver &D, const ToolChain &TC, StringRef Value = A->getValue(); if (!EffectiveTriple.isX86() && !EffectiveTriple.isAArch64() && !EffectiveTriple.isARM() && !EffectiveTriple.isThumb() && - !EffectiveTriple.isRISCV()) + !EffectiveTriple.isRISCV() && !EffectiveTriple.isPPC()) D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getAsString(Args) << TripleStr; if ((EffectiveTriple.isX86() || EffectiveTriple.isARM() || @@ -3635,7 +3635,7 @@ static void RenderSSPOptions(const Driver &D, const ToolChain &TC, << A->getOption().getName() << Value << "sysreg global"; return; } - if (EffectiveTriple.isRISCV()) { + if (EffectiveTriple.isRISCV() || EffectiveTriple.isPPC()) { if (Value != "tls" && Value != "global") { D.Diag(diag::err_drv_invalid_value_with_suggestion) << A->getOption().getName() << Value << "tls global"; @@ -3656,7 +3656,7 @@ static void RenderSSPOptions(const Driver &D, const ToolChain &TC, StringRef Value = A->getValue(); if (!EffectiveTriple.isX86() && !EffectiveTriple.isAArch64() && !EffectiveTriple.isARM() && !EffectiveTriple.isThumb() && - !EffectiveTriple.isRISCV()) + !EffectiveTriple.isRISCV() && !EffectiveTriple.isPPC()) D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getAsString(Args) << TripleStr; int Offset; @@ -3676,7 +3676,7 @@ static void RenderSSPOptions(const Driver &D, const ToolChain &TC, if (Arg *A = Args.getLastArg(options::OPT_mstack_protector_guard_reg_EQ)) { StringRef Value = A->getValue(); if (!EffectiveTriple.isX86() && !EffectiveTriple.isAArch64() && - !EffectiveTriple.isRISCV()) + !EffectiveTriple.isRISCV() && !EffectiveTriple.isPPC()) D.Diag(diag::err_drv_unsupported_opt_for_target) << A->getAsString(Args) << TripleStr; if (EffectiveTriple.isX86() && (Value != "fs" && Value != "gs")) { @@ -3693,6 +3693,16 @@ static void RenderSSPOptions(const Driver &D, const ToolChain &TC, << A->getOption().getName() << Value << "tp"; return; } + if (EffectiveTriple.isPPC64() && Value != "r13") { + D.Diag(diag::err_drv_invalid_value_with_suggestion) + << 
A->getOption().getName() << Value << "r13"; + return; + } + if (EffectiveTriple.isPPC32() && Value != "r2") { + D.Diag(diag::err_drv_invalid_value_with_suggestion) + << A->getOption().getName() << Value << "r2"; + return; + } A->render(Args, CmdArgs); } diff --git a/clang/test/CodeGen/stack-protector-guard.c b/clang/test/CodeGen/stack-protector-guard.c index 4777367c94e7..82616ae800c4 100644 --- a/clang/test/CodeGen/stack-protector-guard.c +++ b/clang/test/CodeGen/stack-protector-guard.c @@ -12,6 +12,12 @@ // RUN: %clang_cc1 -mstack-protector-guard=tls -triple riscv64-unknown-elf \ // RUN: -mstack-protector-guard-offset=44 -mstack-protector-guard-reg=tp \ // RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=RISCV +// RUN: %clang_cc1 -mstack-protector-guard=tls -triple powerpc64-unknown-elf \ +// RUN: -mstack-protector-guard-offset=52 -mstack-protector-guard-reg=r13 \ +// RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=POWERPC64 +// RUN: %clang_cc1 -mstack-protector-guard=tls -triple ppc32-unknown-elf \ +// RUN: -mstack-protector-guard-offset=16 -mstack-protector-guard-reg=r2 \ +// RUN: -emit-llvm %s -o - | FileCheck %s --check-prefix=POWERPC32 void foo(int*); void bar(int x) { int baz[x]; @@ -31,3 +37,13 @@ void bar(int x) { // RISCV: [[ATTR1]] = !{i32 1, !"stack-protector-guard", !"tls"} // RISCV: [[ATTR2]] = !{i32 1, !"stack-protector-guard-reg", !"tp"} // RISCV: [[ATTR3]] = !{i32 1, !"stack-protector-guard-offset", i32 44} + +// POWERPC64: !llvm.module.flags = !{{{.*}}[[ATTR1:![0-9]+]], [[ATTR2:![0-9]+]], [[ATTR3:![0-9]+]], [[ATTR4:![0-9]+]]} +// POWERPC64: [[ATTR2]] = !{i32 1, !"stack-protector-guard", !"tls"} +// POWERPC64: [[ATTR3]] = !{i32 1, !"stack-protector-guard-reg", !"r13"} +// POWERPC64: [[ATTR4]] = !{i32 1, !"stack-protector-guard-offset", i32 52} + +// POWERPC32: !llvm.module.flags = !{{{.*}}[[ATTR1:![0-9]+]], [[ATTR2:![0-9]+]], [[ATTR3:![0-9]+]], [[ATTR4:![0-9]+]]} +// POWERPC32: [[ATTR2]] = !{i32 1, !"stack-protector-guard", !"tls"} +// POWERPC32: [[ATTR3]] = !{i32 1, !"stack-protector-guard-reg", !"r2"} +// POWERPC32: [[ATTR4]] = !{i32 1, !"stack-protector-guard-offset", i32 16} diff --git a/clang/test/Driver/stack-protector-guard.c b/clang/test/Driver/stack-protector-guard.c index d8475a70e370..666c83079e51 100644 --- a/clang/test/Driver/stack-protector-guard.c +++ b/clang/test/Driver/stack-protector-guard.c @@ -17,15 +17,15 @@ // RUN: FileCheck -check-prefix=CHECK-SYM %s // Invalid arch -// RUN: not %clang -target powerpc64le-linux-gnu -mstack-protector-guard=tls %s 2>&1 | \ +// RUN: not %clang -target mipsel-linux-gnu -mstack-protector-guard=tls %s 2>&1 | \ // RUN: FileCheck -check-prefix=INVALID-ARCH %s // INVALID-ARCH: unsupported option '-mstack-protector-guard=tls' for target -// RUN: not %clang -target powerpc64le-linux-gnu -mstack-protector-guard-reg=fs %s 2>&1 | \ +// RUN: not %clang -target mipsel-linux-gnu -mstack-protector-guard-reg=fs %s 2>&1 | \ // RUN: FileCheck -check-prefix=INVALID-ARCH2 %s // INVALID-ARCH2: unsupported option '-mstack-protector-guard-reg=fs' for target -// RUN: not %clang -target powerpc64le-linux-gnu -mstack-protector-guard-offset=10 %s 2>&1 | \ +// RUN: not %clang -target mipsel-linux-gnu -mstack-protector-guard-offset=10 %s 2>&1 | \ // RUN: FileCheck -check-prefix=INVALID-ARCH3 %s // INVALID-ARCH3: unsupported option '-mstack-protector-guard-offset=10' for target @@ -104,3 +104,54 @@ // RUN: FileCheck -check-prefix=INVALID-REG-RISCV %s // INVALID-REG-RISCV: error: invalid value 'sp' in 'mstack-protector-guard-reg=', 
expected one of: tp + +// RUN: %clang -### -target powerpc64-unknown-elf -mstack-protector-guard=tls -mstack-protector-guard-offset=24 -mstack-protector-guard-reg=r13 %s 2>&1 | \ +// RUN: FileCheck -v -check-prefix=CHECK-TLS-POWERPC64 %s +// RUN: %clang -### -target powerpc64-unknown-linux-gnu -mstack-protector-guard=global %s 2>&1 | \ +// RUN: FileCheck -check-prefix=CHECK-GLOBAL %s + +// RUN: not %clang -target powerpc64-unknown-linux-gnu -mstack-protector-guard=tls %s 2>&1 | \ +// RUN: FileCheck -check-prefix=MISSING-OFFSET %s + +// RUN: not %clang -target powerpc64-unknown-elf -mstack-protector-guard=sysreg %s 2>&1 | \ +// RUN: FileCheck -check-prefix=INVALID-VALUE2 %s + +// RUN: not %clang -target powerpc64-unknown-elf -mstack-protector-guard=tls \ +// RUN: -mstack-protector-guard-offset=20 -mstack-protector-guard-reg=r12 %s 2>&1 | \ +// RUN: FileCheck -check-prefix=INVALID-REG-POWERPC64 %s + +// CHECK-TLS-POWERPC64: "-cc1" {{.*}}"-mstack-protector-guard=tls" "-mstack-protector-guard-offset=24" "-mstack-protector-guard-reg=r13" +// INVALID-REG-POWERPC64: error: invalid value 'r12' in 'mstack-protector-guard-reg=', expected one of: r13 + +// RUN: %clang -### -target powerpc64le-unknown-elf -mstack-protector-guard=tls -mstack-protector-guard-offset=24 -mstack-protector-guard-reg=r13 %s 2>&1 | \ +// RUN: FileCheck -v -check-prefix=CHECK-TLS-POWERPC64 %s +// RUN: %clang -### -target powerpc64le-unknown-elf -mstack-protector-guard=global %s 2>&1 | \ +// RUN: FileCheck -check-prefix=CHECK-GLOBAL %s + +// RUN: not %clang -target powerpc64le-unknown-elf -mstack-protector-guard=tls %s 2>&1 | \ +// RUN: FileCheck -check-prefix=MISSING-OFFSET %s + +// RUN: not %clang -target powerpc64le-unknown-elf -mstack-protector-guard=sysreg %s 2>&1 | \ +// RUN: FileCheck -check-prefix=INVALID-VALUE2 %s + +// RUN: not %clang -target powerpc64le-unknown-elf -mstack-protector-guard=tls \ +// RUN: -mstack-protector-guard-offset=20 -mstack-protector-guard-reg=r12 %s 2>&1 | \ +// RUN: FileCheck -check-prefix=INVALID-REG-POWERPC64 %s + +// RUN: %clang -### -target ppc32-unknown-elf -mstack-protector-guard=tls -mstack-protector-guard-offset=24 -mstack-protector-guard-reg=r2 %s 2>&1 | \ +// RUN: FileCheck -v -check-prefix=CHECK-TLS-POWERPC32 %s +// RUN: %clang -### -target ppc32-unknown-elf -mstack-protector-guard=global %s 2>&1 | \ +// RUN: FileCheck -check-prefix=CHECK-GLOBAL %s + +// RUN: not %clang -target ppc32-unknown-elf -mstack-protector-guard=tls %s 2>&1 | \ +// RUN: FileCheck -check-prefix=MISSING-OFFSET %s + +// RUN: not %clang -target ppc32-unknown-elf -mstack-protector-guard=sysreg %s 2>&1 | \ +// RUN: FileCheck -check-prefix=INVALID-VALUE2 %s + +// RUN: not %clang -target ppc32-unknown-elf -mstack-protector-guard=tls \ +// RUN: -mstack-protector-guard-offset=20 -mstack-protector-guard-reg=r3 %s 2>&1 | \ +// RUN: FileCheck -check-prefix=INVALID-REG-POWERPC32 %s + +// CHECK-TLS-POWERPC32: "-cc1" {{.*}}"-mstack-protector-guard=tls" "-mstack-protector-guard-offset=24" "-mstack-protector-guard-reg=r2" +// INVALID-REG-POWERPC32: error: invalid value 'r3' in 'mstack-protector-guard-reg=', expected one of: r2 diff --git a/llvm/include/llvm/CodeGen/TargetLowering.h b/llvm/include/llvm/CodeGen/TargetLowering.h index 5ab31a687ec5..61615cb0f7b3 100644 --- a/llvm/include/llvm/CodeGen/TargetLowering.h +++ b/llvm/include/llvm/CodeGen/TargetLowering.h @@ -5567,9 +5567,7 @@ public: /// If this function returns true, SelectionDAGBuilder emits a /// LOAD_STACK_GUARD node when it is lowering Intrinsic::stackprotector. 
- virtual bool useLoadStackGuardNode() const { - return false; - } + virtual bool useLoadStackGuardNode(const Module &M) const { return false; } virtual SDValue emitStackGuardXorFP(SelectionDAG &DAG, SDValue Val, const SDLoc &DL) const { diff --git a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp index d0464670b292..563a82644134 100644 --- a/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp +++ b/llvm/lib/CodeGen/GlobalISel/IRTranslator.cpp @@ -2378,7 +2378,7 @@ bool IRTranslator::translateKnownIntrinsic(const CallInst &CI, Intrinsic::ID ID, case Intrinsic::stackprotector: { LLT PtrTy = getLLTForType(*CI.getArgOperand(0)->getType(), *DL); Register GuardVal; - if (TLI->useLoadStackGuardNode()) { + if (TLI->useLoadStackGuardNode(*CI.getModule())) { GuardVal = MRI->createGenericVirtualRegister(PtrTy); getStackGuard(GuardVal, MIRBuilder); } else @@ -3869,7 +3869,7 @@ bool IRTranslator::emitSPDescriptorParent(StackProtectorDescriptor &SPD, // If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD. // Otherwise, emit a volatile load to retrieve the stack guard value. - if (TLI->useLoadStackGuardNode()) { + if (TLI->useLoadStackGuardNode(*ParentBB->getBasicBlock()->getModule())) { Guard = MRI->createGenericVirtualRegister(LLT::scalar(PtrTy.getSizeInBits())); getStackGuard(Guard, *CurBuilder); diff --git a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp index 3e13364cf28a..845055374307 100644 --- a/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp +++ b/llvm/lib/CodeGen/SelectionDAG/SelectionDAGBuilder.cpp @@ -3138,7 +3138,7 @@ void SelectionDAGBuilder::visitSPDescriptorParent(StackProtectorDescriptor &SPD, // If useLoadStackGuardNode returns true, generate LOAD_STACK_GUARD. // Otherwise, emit a volatile load to retrieve the stack guard value. SDValue Chain = DAG.getEntryNode(); - if (TLI.useLoadStackGuardNode()) { + if (TLI.useLoadStackGuardNode(M)) { Guard = getLoadStackGuard(DAG, dl, Chain); } else { const Value *IRGuard = TLI.getSDagStackGuard(M); @@ -7349,7 +7349,7 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, const Module &M = *MF.getFunction().getParent(); EVT PtrTy = TLI.getValueType(DAG.getDataLayout(), I.getType()); SDValue Chain = getRoot(); - if (TLI.useLoadStackGuardNode()) { + if (TLI.useLoadStackGuardNode(M)) { Res = getLoadStackGuard(DAG, sdl, Chain); Res = DAG.getPtrExtOrTrunc(Res, sdl, PtrTy); } else { @@ -7369,9 +7369,10 @@ void SelectionDAGBuilder::visitIntrinsicCall(const CallInst &I, // Emit code into the DAG to store the stack guard onto the stack. MachineFunction &MF = DAG.getMachineFunction(); MachineFrameInfo &MFI = MF.getFrameInfo(); + const Module &M = *MF.getFunction().getParent(); SDValue Src, Chain = getRoot(); - if (TLI.useLoadStackGuardNode()) + if (TLI.useLoadStackGuardNode(M)) Src = getLoadStackGuard(DAG, sdl, Chain); else Src = getValue(I.getArgOperand(0)); // The guard's value. 
diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp index b5657584016e..a44a73eb2c0f 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.cpp @@ -26982,9 +26982,9 @@ void AArch64TargetLowering::ReplaceNodeResults( } } -bool AArch64TargetLowering::useLoadStackGuardNode() const { +bool AArch64TargetLowering::useLoadStackGuardNode(const Module &M) const { if (Subtarget->isTargetAndroid() || Subtarget->isTargetFuchsia()) - return TargetLowering::useLoadStackGuardNode(); + return TargetLowering::useLoadStackGuardNode(M); return true; } diff --git a/llvm/lib/Target/AArch64/AArch64ISelLowering.h b/llvm/lib/Target/AArch64/AArch64ISelLowering.h index cf2ae5fd027c..217e971568a9 100644 --- a/llvm/lib/Target/AArch64/AArch64ISelLowering.h +++ b/llvm/lib/Target/AArch64/AArch64ISelLowering.h @@ -811,7 +811,7 @@ public: TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; - bool useLoadStackGuardNode() const override; + bool useLoadStackGuardNode(const Module &M) const override; TargetLoweringBase::LegalizeTypeAction getPreferredVectorAction(MVT VT) const override; diff --git a/llvm/lib/Target/ARM/ARMISelLowering.cpp b/llvm/lib/Target/ARM/ARMISelLowering.cpp index 64c0500191e4..5d679a1a916d 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.cpp +++ b/llvm/lib/Target/ARM/ARMISelLowering.cpp @@ -21305,7 +21305,7 @@ bool ARMTargetLowering::shouldInsertFencesForAtomic( return InsertFencesForAtomic; } -bool ARMTargetLowering::useLoadStackGuardNode() const { +bool ARMTargetLowering::useLoadStackGuardNode(const Module &M) const { // ROPI/RWPI are not supported currently. return !Subtarget->isROPI() && !Subtarget->isRWPI(); } diff --git a/llvm/lib/Target/ARM/ARMISelLowering.h b/llvm/lib/Target/ARM/ARMISelLowering.h index 316f7d3b9bce..ef651bc3d84c 100644 --- a/llvm/lib/Target/ARM/ARMISelLowering.h +++ b/llvm/lib/Target/ARM/ARMISelLowering.h @@ -675,7 +675,7 @@ class VectorType; TargetLoweringBase::AtomicExpansionKind shouldExpandAtomicCmpXchgInIR(AtomicCmpXchgInst *AI) const override; - bool useLoadStackGuardNode() const override; + bool useLoadStackGuardNode(const Module &M) const override; void insertSSPDeclarations(Module &M) const override; Value *getSDagStackGuard(const Module &M) const override; diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp index 7b07f6b6d151..5d6c7c729a76 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.cpp @@ -17884,10 +17884,10 @@ SDValue PPCTargetLowering::getNegatedExpression(SDValue Op, SelectionDAG &DAG, } // Override to enable LOAD_STACK_GUARD lowering on Linux. 
-bool PPCTargetLowering::useLoadStackGuardNode() const { - if (!Subtarget.isTargetLinux()) - return TargetLowering::useLoadStackGuardNode(); - return true; +bool PPCTargetLowering::useLoadStackGuardNode(const Module &M) const { + if (M.getStackProtectorGuard() == "tls" || Subtarget.isTargetLinux()) + return true; + return TargetLowering::useLoadStackGuardNode(M); } // Override to disable global variable loading on Linux and insert AIX canary diff --git a/llvm/lib/Target/PowerPC/PPCISelLowering.h b/llvm/lib/Target/PowerPC/PPCISelLowering.h index 8907c3c5a81c..8c7961e641c3 100644 --- a/llvm/lib/Target/PowerPC/PPCISelLowering.h +++ b/llvm/lib/Target/PowerPC/PPCISelLowering.h @@ -1137,7 +1137,7 @@ namespace llvm { getExceptionSelectorRegister(const Constant *PersonalityFn) const override; /// Override to support customized stack guard loading. - bool useLoadStackGuardNode() const override; + bool useLoadStackGuardNode(const Module &M) const override; void insertSSPDeclarations(Module &M) const override; Value *getSDagStackGuard(const Module &M) const override; diff --git a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp index 48833e8f8806..bc2a1b295b43 100644 --- a/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp +++ b/llvm/lib/Target/PowerPC/PPCInstrInfo.cpp @@ -35,6 +35,7 @@ #include "llvm/CodeGen/ScheduleDAG.h" #include "llvm/CodeGen/SlotIndexes.h" #include "llvm/CodeGen/StackMaps.h" +#include "llvm/IR/Module.h" #include "llvm/MC/MCAsmInfo.h" #include "llvm/MC/MCInst.h" #include "llvm/MC/TargetRegistry.h" @@ -3107,9 +3108,16 @@ bool PPCInstrInfo::expandPostRAPseudo(MachineInstr &MI) const { return true; } case TargetOpcode::LOAD_STACK_GUARD: { - assert(Subtarget.isTargetLinux() && - "Only Linux target is expected to contain LOAD_STACK_GUARD"); - const int64_t Offset = Subtarget.isPPC64() ? -0x7010 : -0x7008; + auto M = MBB.getParent()->getFunction().getParent(); + assert( + (Subtarget.isTargetLinux() || M->getStackProtectorGuard() == "tls") && + "Only Linux target or tls mode are expected to contain " + "LOAD_STACK_GUARD"); + int64_t Offset; + if (M->getStackProtectorGuard() == "tls") + Offset = M->getStackProtectorGuardOffset(); + else + Offset = Subtarget.isPPC64() ? -0x7010 : -0x7008; const unsigned Reg = Subtarget.isPPC64() ? PPC::X13 : PPC::R2; MI.setDesc(get(Subtarget.isPPC64() ? PPC::LD : PPC::LWZ)); MachineInstrBuilder(*MI.getParent()->getParent(), MI) diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.cpp b/llvm/lib/Target/Sparc/SparcISelLowering.cpp index 42b8248006d1..de4986ef1e89 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcISelLowering.cpp @@ -3548,9 +3548,9 @@ void SparcTargetLowering::ReplaceNodeResults(SDNode *N, } // Override to enable LOAD_STACK_GUARD lowering on Linux. -bool SparcTargetLowering::useLoadStackGuardNode() const { +bool SparcTargetLowering::useLoadStackGuardNode(const Module &M) const { if (!Subtarget->isTargetLinux()) - return TargetLowering::useLoadStackGuardNode(); + return TargetLowering::useLoadStackGuardNode(M); return true; } diff --git a/llvm/lib/Target/Sparc/SparcISelLowering.h b/llvm/lib/Target/Sparc/SparcISelLowering.h index 15d09bc93097..cc672074a4be 100644 --- a/llvm/lib/Target/Sparc/SparcISelLowering.h +++ b/llvm/lib/Target/Sparc/SparcISelLowering.h @@ -119,7 +119,7 @@ namespace llvm { } /// Override to support customized stack guard loading. 
- bool useLoadStackGuardNode() const override; + bool useLoadStackGuardNode(const Module &M) const override; void insertSSPDeclarations(Module &M) const override; /// getSetCCResultType - Return the ISD::SETCC ValueType diff --git a/llvm/lib/Target/SystemZ/SystemZISelLowering.h b/llvm/lib/Target/SystemZ/SystemZISelLowering.h index 2b065245c16f..3c06c1fdf2b1 100644 --- a/llvm/lib/Target/SystemZ/SystemZISelLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZISelLowering.h @@ -570,9 +570,7 @@ public: getExceptionSelectorRegister(const Constant *PersonalityFn) const override; /// Override to support customized stack guard loading. - bool useLoadStackGuardNode() const override { - return true; - } + bool useLoadStackGuardNode(const Module &M) const override { return true; } void insertSSPDeclarations(Module &M) const override { } diff --git a/llvm/lib/Target/X86/X86ISelLowering.cpp b/llvm/lib/Target/X86/X86ISelLowering.cpp index db633d10edc4..102789a3e952 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.cpp +++ b/llvm/lib/Target/X86/X86ISelLowering.cpp @@ -2682,7 +2682,7 @@ X86TargetLowering::X86TargetLowering(const X86TargetMachine &TM, } // This has so far only been implemented for 64-bit MachO. -bool X86TargetLowering::useLoadStackGuardNode() const { +bool X86TargetLowering::useLoadStackGuardNode(const Module &M) const { return Subtarget.isTargetMachO() && Subtarget.is64Bit(); } diff --git a/llvm/lib/Target/X86/X86ISelLowering.h b/llvm/lib/Target/X86/X86ISelLowering.h index 3b1bd0ad9a26..14ada1721fd4 100644 --- a/llvm/lib/Target/X86/X86ISelLowering.h +++ b/llvm/lib/Target/X86/X86ISelLowering.h @@ -1568,7 +1568,7 @@ namespace llvm { /// returns the address of that location. Otherwise, returns nullptr. Value *getIRStackGuard(IRBuilderBase &IRB) const override; - bool useLoadStackGuardNode() const override; + bool useLoadStackGuardNode(const Module &M) const override; bool useStackGuardXorFP() const override; void insertSSPDeclarations(Module &M) const override; Value *getSDagStackGuard(const Module &M) const override; diff --git a/llvm/test/CodeGen/PowerPC/stack-guard-global.ll b/llvm/test/CodeGen/PowerPC/stack-guard-global.ll new file mode 100644 index 000000000000..022a62a4b091 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/stack-guard-global.ll @@ -0,0 +1,122 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=powerpc64 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=BE64 +; RUN: llc -mtriple=powerpc64le -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=LE64 +; RUN: llc -mtriple=ppc32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=LE32 + +define void @foo(i64 %t) sspstrong nounwind { +; BE64-LABEL: foo: +; BE64: # %bb.0: +; BE64-NEXT: mflr 0 +; BE64-NEXT: std 31, -8(1) +; BE64-NEXT: stdu 1, -144(1) +; BE64-NEXT: mr 31, 1 +; BE64-NEXT: std 0, 160(1) +; BE64-NEXT: std 30, 128(31) # 8-byte Folded Spill +; BE64-NEXT: addis 30, 2, __stack_chk_guard@toc@ha +; BE64-NEXT: sldi 3, 3, 2 +; BE64-NEXT: ld 4, __stack_chk_guard@toc@l(30) +; BE64-NEXT: addi 3, 3, 15 +; BE64-NEXT: rldicr 3, 3, 0, 59 +; BE64-NEXT: neg 3, 3 +; BE64-NEXT: std 4, 120(31) +; BE64-NEXT: addi 4, 31, 144 +; BE64-NEXT: stdux 4, 1, 3 +; BE64-NEXT: addi 3, 1, 112 +; BE64-NEXT: bl baz +; BE64-NEXT: nop +; BE64-NEXT: ld 3, __stack_chk_guard@toc@l(30) +; BE64-NEXT: ld 4, 120(31) +; BE64-NEXT: cmpld 3, 4 +; BE64-NEXT: bne 0, .LBB0_2 +; BE64-NEXT: # %bb.1: +; BE64-NEXT: ld 30, 128(31) # 8-byte Folded Reload +; 
BE64-NEXT: ld 1, 0(1) +; BE64-NEXT: ld 0, 16(1) +; BE64-NEXT: ld 31, -8(1) +; BE64-NEXT: mtlr 0 +; BE64-NEXT: blr +; BE64-NEXT: .LBB0_2: +; BE64-NEXT: bl __stack_chk_fail +; BE64-NEXT: nop +; +; LE64-LABEL: foo: +; LE64: # %bb.0: +; LE64-NEXT: mflr 0 +; LE64-NEXT: std 31, -8(1) +; LE64-NEXT: stdu 1, -64(1) +; LE64-NEXT: mr 31, 1 +; LE64-NEXT: sldi 3, 3, 2 +; LE64-NEXT: std 0, 80(1) +; LE64-NEXT: std 30, 48(31) # 8-byte Folded Spill +; LE64-NEXT: addis 30, 2, __stack_chk_guard@toc@ha +; LE64-NEXT: addi 3, 3, 15 +; LE64-NEXT: ld 4, __stack_chk_guard@toc@l(30) +; LE64-NEXT: rldicr 3, 3, 0, 59 +; LE64-NEXT: neg 3, 3 +; LE64-NEXT: std 4, 40(31) +; LE64-NEXT: addi 4, 31, 64 +; LE64-NEXT: stdux 4, 1, 3 +; LE64-NEXT: addi 3, 1, 32 +; LE64-NEXT: bl baz +; LE64-NEXT: nop +; LE64-NEXT: ld 3, __stack_chk_guard@toc@l(30) +; LE64-NEXT: ld 4, 40(31) +; LE64-NEXT: cmpld 3, 4 +; LE64-NEXT: bne 0, .LBB0_2 +; LE64-NEXT: # %bb.1: +; LE64-NEXT: ld 30, 48(31) # 8-byte Folded Reload +; LE64-NEXT: ld 1, 0(1) +; LE64-NEXT: ld 0, 16(1) +; LE64-NEXT: ld 31, -8(1) +; LE64-NEXT: mtlr 0 +; LE64-NEXT: blr +; LE64-NEXT: .LBB0_2: +; LE64-NEXT: bl __stack_chk_fail +; LE64-NEXT: nop +; +; LE32-LABEL: foo: +; LE32: # %bb.0: +; LE32-NEXT: mflr 0 +; LE32-NEXT: stwu 1, -32(1) +; LE32-NEXT: stw 31, 28(1) +; LE32-NEXT: mr 31, 1 +; LE32-NEXT: stw 0, 36(1) +; LE32-NEXT: slwi 4, 4, 2 +; LE32-NEXT: stw 30, 24(31) # 4-byte Folded Spill +; LE32-NEXT: lis 30, __stack_chk_guard@ha +; LE32-NEXT: lwz 3, __stack_chk_guard@l(30) +; LE32-NEXT: addi 4, 4, 15 +; LE32-NEXT: rlwinm 4, 4, 0, 0, 27 +; LE32-NEXT: neg 4, 4 +; LE32-NEXT: stw 3, 20(31) +; LE32-NEXT: addi 3, 31, 32 +; LE32-NEXT: stwux 3, 1, 4 +; LE32-NEXT: addi 3, 1, 16 +; LE32-NEXT: bl baz +; LE32-NEXT: lwz 3, __stack_chk_guard@l(30) +; LE32-NEXT: lwz 4, 20(31) +; LE32-NEXT: cmplw 3, 4 +; LE32-NEXT: bne 0, .LBB0_2 +; LE32-NEXT: # %bb.1: +; LE32-NEXT: lwz 30, 24(31) # 4-byte Folded Reload +; LE32-NEXT: lwz 31, 0(1) +; LE32-NEXT: lwz 0, -4(31) +; LE32-NEXT: mr 1, 31 +; LE32-NEXT: mr 31, 0 +; LE32-NEXT: lwz 0, 4(1) +; LE32-NEXT: mtlr 0 +; LE32-NEXT: blr +; LE32-NEXT: .LBB0_2: +; LE32-NEXT: bl __stack_chk_fail + %vla = alloca i32, i64 %t, align 4 + call void @baz(ptr %vla) + ret void +} + +declare void @baz(ptr) + +!llvm.module.flags = !{!1} +!1 = !{i32 2, !"stack-protector-guard", !"global"} diff --git a/llvm/test/CodeGen/PowerPC/stack-guard-tls.ll b/llvm/test/CodeGen/PowerPC/stack-guard-tls.ll new file mode 100644 index 000000000000..de0becc03730 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/stack-guard-tls.ll @@ -0,0 +1,114 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc -mtriple=powerpc64 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=BE64 +; RUN: llc -mtriple=powerpc64le -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=LE64 +; RUN: llc -mtriple=ppc32 -verify-machineinstrs < %s \ +; RUN: | FileCheck %s --check-prefixes=LE32 + +define void @foo(i64 %t) sspstrong nounwind { +; BE64-LABEL: foo: +; BE64: # %bb.0: +; BE64-NEXT: mflr 0 +; BE64-NEXT: std 31, -8(1) +; BE64-NEXT: stdu 1, -144(1) +; BE64-NEXT: ld 4, 500(13) +; BE64-NEXT: sldi 3, 3, 2 +; BE64-NEXT: mr 31, 1 +; BE64-NEXT: addi 3, 3, 15 +; BE64-NEXT: rldicr 3, 3, 0, 59 +; BE64-NEXT: std 0, 160(1) +; BE64-NEXT: neg 3, 3 +; BE64-NEXT: std 4, 128(31) +; BE64-NEXT: addi 4, 31, 144 +; BE64-NEXT: stdux 4, 1, 3 +; BE64-NEXT: addi 3, 1, 112 +; BE64-NEXT: bl baz +; BE64-NEXT: nop +; BE64-NEXT: ld 3, 128(31) +; BE64-NEXT: ld 4, 
500(13)
+; BE64-NEXT:    cmpld 4, 3
+; BE64-NEXT:    bne 0, .LBB0_2
+; BE64-NEXT:  # %bb.1:
+; BE64-NEXT:    ld 1, 0(1)
+; BE64-NEXT:    ld 0, 16(1)
+; BE64-NEXT:    ld 31, -8(1)
+; BE64-NEXT:    mtlr 0
+; BE64-NEXT:    blr
+; BE64-NEXT:  .LBB0_2:
+; BE64-NEXT:    bl __stack_chk_fail
+; BE64-NEXT:    nop
+;
+; LE64-LABEL: foo:
+; LE64:       # %bb.0:
+; LE64-NEXT:    mflr 0
+; LE64-NEXT:    std 31, -8(1)
+; LE64-NEXT:    stdu 1, -64(1)
+; LE64-NEXT:    sldi 3, 3, 2
+; LE64-NEXT:    ld 4, 500(13)
+; LE64-NEXT:    std 0, 80(1)
+; LE64-NEXT:    addi 3, 3, 15
+; LE64-NEXT:    mr 31, 1
+; LE64-NEXT:    std 4, 48(31)
+; LE64-NEXT:    addi 4, 31, 64
+; LE64-NEXT:    rldicr 3, 3, 0, 59
+; LE64-NEXT:    neg 3, 3
+; LE64-NEXT:    stdux 4, 1, 3
+; LE64-NEXT:    addi 3, 1, 32
+; LE64-NEXT:    bl baz
+; LE64-NEXT:    nop
+; LE64-NEXT:    ld 3, 48(31)
+; LE64-NEXT:    ld 4, 500(13)
+; LE64-NEXT:    cmpld 4, 3
+; LE64-NEXT:    bne 0, .LBB0_2
+; LE64-NEXT:  # %bb.1:
+; LE64-NEXT:    ld 1, 0(1)
+; LE64-NEXT:    ld 0, 16(1)
+; LE64-NEXT:    ld 31, -8(1)
+; LE64-NEXT:    mtlr 0
+; LE64-NEXT:    blr
+; LE64-NEXT:  .LBB0_2:
+; LE64-NEXT:    bl __stack_chk_fail
+; LE64-NEXT:    nop
+;
+; LE32-LABEL: foo:
+; LE32:       # %bb.0:
+; LE32-NEXT:    mflr 0
+; LE32-NEXT:    stwu 1, -32(1)
+; LE32-NEXT:    lwz 3, 500(2)
+; LE32-NEXT:    slwi 4, 4, 2
+; LE32-NEXT:    addi 4, 4, 15
+; LE32-NEXT:    stw 31, 28(1)
+; LE32-NEXT:    mr 31, 1
+; LE32-NEXT:    rlwinm 4, 4, 0, 0, 27
+; LE32-NEXT:    stw 0, 36(1)
+; LE32-NEXT:    neg 4, 4
+; LE32-NEXT:    stw 3, 24(31)
+; LE32-NEXT:    addi 3, 31, 32
+; LE32-NEXT:    stwux 3, 1, 4
+; LE32-NEXT:    addi 3, 1, 16
+; LE32-NEXT:    bl baz
+; LE32-NEXT:    lwz 3, 24(31)
+; LE32-NEXT:    lwz 4, 500(2)
+; LE32-NEXT:    cmplw 4, 3
+; LE32-NEXT:    bne 0, .LBB0_2
+; LE32-NEXT:  # %bb.1:
+; LE32-NEXT:    lwz 31, 0(1)
+; LE32-NEXT:    lwz 0, -4(31)
+; LE32-NEXT:    mr 1, 31
+; LE32-NEXT:    mr 31, 0
+; LE32-NEXT:    lwz 0, 4(1)
+; LE32-NEXT:    mtlr 0
+; LE32-NEXT:    blr
+; LE32-NEXT:  .LBB0_2:
+; LE32-NEXT:    bl __stack_chk_fail
+  %vla = alloca i32, i64 %t, align 4
+  call void @baz(ptr %vla)
+  ret void
+}
+
+declare void @baz(ptr)
+
+!llvm.module.flags = !{!1, !2}
+!1 = !{i32 2, !"stack-protector-guard", !"tls"}
+!2 = !{i32 2, !"stack-protector-guard-offset", i32 500}
-- 
GitLab


From bbccc521c6a0de151c4d7a34e7f78ae47f3a3298 Mon Sep 17 00:00:00 2001
From: Vitaly Buka
Date: Thu, 17 Oct 2024 19:23:08 -0700
Subject: [PATCH 317/329] [lsan] Disable test with barriers on Darwin (#112810)

---
 compiler-rt/test/lsan/TestCases/print_threads.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/compiler-rt/test/lsan/TestCases/print_threads.c b/compiler-rt/test/lsan/TestCases/print_threads.c
index b3072da93fab..a9389412af1c 100644
--- a/compiler-rt/test/lsan/TestCases/print_threads.c
+++ b/compiler-rt/test/lsan/TestCases/print_threads.c
@@ -2,6 +2,9 @@
 
 // XFAIL: hwasan
 
+// No pthread barriers on Darwin.
+// UNSUPPORTED: darwin
+
 #include
 #include
 #include
-- 
GitLab


From 9c6f85f57a74278e4833f3da2606d80e7577d6d5 Mon Sep 17 00:00:00 2001
From: Jason Molenda
Date: Thu, 17 Oct 2024 19:46:08 -0700
Subject: [PATCH 318/329] [lldb][NFC] fix two small typos in aarch64-linux.md

---
 lldb/docs/use/aarch64-linux.md | 8 ++++----
 1 file changed, 4 insertions(+), 4 deletions(-)

diff --git a/lldb/docs/use/aarch64-linux.md b/lldb/docs/use/aarch64-linux.md
index 803f56d16f98..70432f57857a 100644
--- a/lldb/docs/use/aarch64-linux.md
+++ b/lldb/docs/use/aarch64-linux.md
@@ -160,7 +160,7 @@ Kernel does.
 ### Visibility of an Inactive ZA Register
 
 LLDB does not handle registers that can come and go at runtime (SVE changes
-size but it does not dissappear). 
Therefore when `za` is not enabled, LLDB +size but it does not disappear). Therefore when `za` is not enabled, LLDB will return a block of 0s instead. This block will match the expected size of `za`: ``` @@ -183,9 +183,9 @@ If you want to know whether `za` is active or not, refer to bit 2 of the As for SVE, LLDB does not know how the debugee will use `za`, and therefore does not know how it would be best to display it. At any time any given -instrucion could interpret its contents as many kinds and sizes of data. +instruction could interpret its contents as many kinds and sizes of data. -So LLDB will default to showing `za` as one large vector of individual bytes. +So LLDB will default to showing `za` as one large vector of individual bytes. You can override this with a format option (see the SVE example above). ### Expression Evaluation @@ -228,4 +228,4 @@ bytes. ### Expression Evaluation `zt0`'s value and whether it is active or not will be saved prior to -expression evaluation and restored afterwards. \ No newline at end of file +expression evaluation and restored afterwards. -- GitLab From 67f576f31d661897c5da302b8611decb7e0f9237 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Thu, 17 Oct 2024 19:56:09 -0700 Subject: [PATCH 319/329] [clang-format] Handle template opener/closer in braced list (#112494) Fixes #112487. --- clang/lib/Format/UnwrappedLineParser.cpp | 5 +++++ clang/unittests/Format/TokenAnnotatorTest.cpp | 6 ++++++ 2 files changed, 11 insertions(+) diff --git a/clang/lib/Format/UnwrappedLineParser.cpp b/clang/lib/Format/UnwrappedLineParser.cpp index c9625c39e527..bda9850670ab 100644 --- a/clang/lib/Format/UnwrappedLineParser.cpp +++ b/clang/lib/Format/UnwrappedLineParser.cpp @@ -2504,6 +2504,11 @@ bool UnwrappedLineParser::parseBracedList(bool IsAngleBracket, bool IsEnum) { // Assume there are no blocks inside a braced init list apart // from the ones we explicitly parse out (like lambdas). FormatTok->setBlockKind(BK_BracedInit); + if (!IsAngleBracket) { + auto *Prev = FormatTok->Previous; + if (Prev && Prev->is(tok::greater)) + Prev->setFinalizedType(TT_TemplateCloser); + } nextToken(); parseBracedList(); break; diff --git a/clang/unittests/Format/TokenAnnotatorTest.cpp b/clang/unittests/Format/TokenAnnotatorTest.cpp index 00776dac28a1..60deae0c9b11 100644 --- a/clang/unittests/Format/TokenAnnotatorTest.cpp +++ b/clang/unittests/Format/TokenAnnotatorTest.cpp @@ -3554,6 +3554,12 @@ TEST_F(TokenAnnotatorTest, TemplateInstantiation) { ASSERT_EQ(Tokens.size(), 21u) << Tokens; EXPECT_TOKEN(Tokens[4], tok::less, TT_TemplateOpener); EXPECT_TOKEN(Tokens[16], tok::greater, TT_TemplateCloser); + + Tokens = + annotate("auto x{std::conditional_t{}};"); + ASSERT_EQ(Tokens.size(), 24u) << Tokens; + EXPECT_TOKEN(Tokens[6], tok::less, TT_TemplateOpener); + EXPECT_TOKEN(Tokens[18], tok::greater, TT_TemplateCloser); } } // namespace -- GitLab From d989c2410eb883f464c3efa472ed026dc5fd9f88 Mon Sep 17 00:00:00 2001 From: Owen Pan Date: Thu, 17 Oct 2024 20:21:42 -0700 Subject: [PATCH 320/329] [clang-format] Add RemoveEmptyLinesInUnwrappedLines option (#112325) Fixes #111340. 
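A minimal `.clang-format` sketch enabling the new option:

```
BasedOnStyle: LLVM
RemoveEmptyLinesInUnwrappedLines: true
```

With this set, blank lines that fall inside a single unwrapped line (for
example between `int c` and `= a + b;`) are removed, as the new
documentation examples show.
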
--- clang/docs/ClangFormatStyleOptions.rst | 25 +++++++ clang/docs/ReleaseNotes.rst | 6 +- clang/include/clang/Format/Format.h | 25 +++++++ clang/lib/Format/Format.cpp | 3 + clang/lib/Format/TokenAnnotator.cpp | 4 +- clang/unittests/Format/ConfigParseTest.cpp | 1 + clang/unittests/Format/FormatTest.cpp | 77 ++++++++++++++++++++++ 7 files changed, 138 insertions(+), 3 deletions(-) diff --git a/clang/docs/ClangFormatStyleOptions.rst b/clang/docs/ClangFormatStyleOptions.rst index 8add0a53e5be..f36a5472b7e1 100644 --- a/clang/docs/ClangFormatStyleOptions.rst +++ b/clang/docs/ClangFormatStyleOptions.rst @@ -5505,6 +5505,31 @@ the configuration (without a prefix: ``Auto``). } } +.. _RemoveEmptyLinesInUnwrappedLines: + +**RemoveEmptyLinesInUnwrappedLines** (``Boolean``) :versionbadge:`clang-format 20` :ref:`¶ ` + Remove empty lines within unwrapped lines. + + .. code-block:: c++ + + false: true: + + int c vs. int c = a + b; + + = a + b; + + enum : unsigned vs. enum : unsigned { + AA = 0, + { BB + AA = 0, } myEnum; + BB + } myEnum; + + while ( vs. while (true) { + } + true) { + } + .. _RemoveParentheses: **RemoveParentheses** (``RemoveParenthesesStyle``) :versionbadge:`clang-format 17` :ref:`¶ ` diff --git a/clang/docs/ReleaseNotes.rst b/clang/docs/ReleaseNotes.rst index 1da8c82d52e6..a65bd6f38290 100644 --- a/clang/docs/ReleaseNotes.rst +++ b/clang/docs/ReleaseNotes.rst @@ -699,8 +699,10 @@ clang-format - Adds ``BreakBinaryOperations`` option. - Adds ``TemplateNames`` option. - Adds ``AlignFunctionDeclarations`` option to ``AlignConsecutiveDeclarations``. -- Adds ``IndentOnly`` suboption to ``ReflowComments`` to fix the indentation of multi-line comments - without touching their contents, renames ``false`` to ``Never``, and ``true`` to ``Always``. +- Adds ``IndentOnly`` suboption to ``ReflowComments`` to fix the indentation of + multi-line comments without touching their contents, renames ``false`` to + ``Never``, and ``true`` to ``Always``. +- Adds ``RemoveEmptyLinesInUnwrappedLines`` option. libclang -------- diff --git a/clang/include/clang/Format/Format.h b/clang/include/clang/Format/Format.h index a0762b088b68..debba1c78228 100644 --- a/clang/include/clang/Format/Format.h +++ b/clang/include/clang/Format/Format.h @@ -3938,6 +3938,29 @@ struct FormatStyle { /// \version 14 bool RemoveBracesLLVM; + /// Remove empty lines within unwrapped lines. + /// \code + /// false: true: + /// + /// int c vs. int c = a + b; + /// + /// = a + b; + /// + /// enum : unsigned vs. enum : unsigned { + /// AA = 0, + /// { BB + /// AA = 0, } myEnum; + /// BB + /// } myEnum; + /// + /// while ( vs. while (true) { + /// } + /// true) { + /// } + /// \endcode + /// \version 20 + bool RemoveEmptyLinesInUnwrappedLines; + /// Types of redundant parentheses to remove. enum RemoveParenthesesStyle : int8_t { /// Do not remove parentheses. 
@@ -5232,6 +5255,8 @@ struct FormatStyle { RawStringFormats == R.RawStringFormats && ReferenceAlignment == R.ReferenceAlignment && RemoveBracesLLVM == R.RemoveBracesLLVM && + RemoveEmptyLinesInUnwrappedLines == + R.RemoveEmptyLinesInUnwrappedLines && RemoveParentheses == R.RemoveParentheses && RemoveSemicolon == R.RemoveSemicolon && RequiresClausePosition == R.RequiresClausePosition && diff --git a/clang/lib/Format/Format.cpp b/clang/lib/Format/Format.cpp index 148270795c56..c612960ff37a 100644 --- a/clang/lib/Format/Format.cpp +++ b/clang/lib/Format/Format.cpp @@ -1104,6 +1104,8 @@ template <> struct MappingTraits { IO.mapOptional("ReferenceAlignment", Style.ReferenceAlignment); IO.mapOptional("ReflowComments", Style.ReflowComments); IO.mapOptional("RemoveBracesLLVM", Style.RemoveBracesLLVM); + IO.mapOptional("RemoveEmptyLinesInUnwrappedLines", + Style.RemoveEmptyLinesInUnwrappedLines); IO.mapOptional("RemoveParentheses", Style.RemoveParentheses); IO.mapOptional("RemoveSemicolon", Style.RemoveSemicolon); IO.mapOptional("RequiresClausePosition", Style.RequiresClausePosition); @@ -1582,6 +1584,7 @@ FormatStyle getLLVMStyle(FormatStyle::LanguageKind Language) { LLVMStyle.ReferenceAlignment = FormatStyle::RAS_Pointer; LLVMStyle.ReflowComments = FormatStyle::RCS_Always; LLVMStyle.RemoveBracesLLVM = false; + LLVMStyle.RemoveEmptyLinesInUnwrappedLines = false; LLVMStyle.RemoveParentheses = FormatStyle::RPS_Leave; LLVMStyle.RemoveSemicolon = false; LLVMStyle.RequiresClausePosition = FormatStyle::RCPS_OwnLine; diff --git a/clang/lib/Format/TokenAnnotator.cpp b/clang/lib/Format/TokenAnnotator.cpp index fcefaa7bb298..13037b6d0060 100644 --- a/clang/lib/Format/TokenAnnotator.cpp +++ b/clang/lib/Format/TokenAnnotator.cpp @@ -5509,8 +5509,10 @@ static bool isAllmanLambdaBrace(const FormatToken &Tok) { bool TokenAnnotator::mustBreakBefore(const AnnotatedLine &Line, const FormatToken &Right) const { const FormatToken &Left = *Right.Previous; - if (Right.NewlinesBefore > 1 && Style.MaxEmptyLinesToKeep > 0) + if (Right.NewlinesBefore > 1 && Style.MaxEmptyLinesToKeep > 0 && + (!Style.RemoveEmptyLinesInUnwrappedLines || &Right == Line.First)) { return true; + } if (Style.BreakFunctionDefinitionParameters && Line.MightBeFunctionDecl && Line.mightBeFunctionDefinition() && Left.MightBeFunctionDeclParen && diff --git a/clang/unittests/Format/ConfigParseTest.cpp b/clang/unittests/Format/ConfigParseTest.cpp index 318f08c04759..9e8529050ed8 100644 --- a/clang/unittests/Format/ConfigParseTest.cpp +++ b/clang/unittests/Format/ConfigParseTest.cpp @@ -184,6 +184,7 @@ TEST(ConfigParseTest, ParsesConfigurationBools) { CHECK_PARSE_BOOL(ObjCSpaceBeforeProtocolList); CHECK_PARSE_BOOL(Cpp11BracedListStyle); CHECK_PARSE_BOOL(RemoveBracesLLVM); + CHECK_PARSE_BOOL(RemoveEmptyLinesInUnwrappedLines); CHECK_PARSE_BOOL(RemoveSemicolon); CHECK_PARSE_BOOL(SkipMacroDefinitionBody); CHECK_PARSE_BOOL(SpacesInSquareBrackets); diff --git a/clang/unittests/Format/FormatTest.cpp b/clang/unittests/Format/FormatTest.cpp index 43513f18321b..8f4c92148ada 100644 --- a/clang/unittests/Format/FormatTest.cpp +++ b/clang/unittests/Format/FormatTest.cpp @@ -28135,6 +28135,83 @@ TEST_F(FormatTest, BreakBinaryOperations) { Style); } +TEST_F(FormatTest, RemovesEmptyLinesInUnwrappedLines) { + auto Style = getLLVMStyle(); + Style.RemoveEmptyLinesInUnwrappedLines = true; + + verifyFormat("int c = a + b;", + "int c\n" + "\n" + " = a + b;", + Style); + + verifyFormat("enum : unsigned { AA = 0, BB } myEnum;", + "enum : unsigned\n" + "\n" + "{\n" + " AA = 
0,\n"
+               "  BB\n"
+               "} myEnum;",
+               Style);
+
+  verifyFormat("class B : public E {\n"
+               "private:\n"
+               "};",
+               "class B : public E\n"
+               "\n"
+               "{\n"
+               "private:\n"
+               "};",
+               Style);
+
+  verifyFormat(
+      "struct AAAAAAAAAAAAAAA test[3] = {{56, 23, \"hello\"}, {7, 5, \"!!\"}};",
+      "struct AAAAAAAAAAAAAAA test[3] = {{56,\n"
+      "\n"
+      "                                   23, \"hello\"},\n"
+      "                                  {7, 5, \"!!\"}};",
+      Style);
+
+  verifyFormat("int myFunction(int aaaaaaaaaaaaa, int ccccccccccccc, int d);",
+               "int myFunction(\n"
+               "\n"
+               "    int aaaaaaaaaaaaa,\n"
+               "\n"
+               "    int ccccccccccccc, int d);",
+               Style);
+
+  verifyFormat("switch (e) {\n"
+               "case 1:\n"
+               "  return e;\n"
+               "case 2:\n"
+               "  return 2;\n"
+               "}",
+               "switch (\n"
+               "\n"
+               "    e) {\n"
+               "case 1:\n"
+               "  return e;\n"
+               "case 2:\n"
+               "  return 2;\n"
+               "}",
+               Style);
+
+  verifyFormat("while (true) {\n"
+               "}",
+               "while (\n"
+               "\n"
+               "    true) {\n"
+               "}",
+               Style);
+
+  verifyFormat("void loooonFunctionIsVeryLongButNotAsLongAsJavaTypeNames(\n"
+               "    std::map *outputMap);",
+               "void loooonFunctionIsVeryLongButNotAsLongAsJavaTypeNames\n"
+               "\n"
+               "    (std::map *outputMap);",
+               Style);
+}
+
 } // namespace
 } // namespace test
 } // namespace format
-- 
GitLab


From 252645528eefee9319f99172c2470aea0dcc31cf Mon Sep 17 00:00:00 2001
From: Thirumalai Shaktivel <74826228+Thirumalai-Shaktivel@users.noreply.github.com>
Date: Fri, 18 Oct 2024 10:11:59 +0530
Subject: [PATCH 321/329] [Flang][OpenMP] Add semantic checks for Workshare construct (#111358)

Add missing semantic checks for the Workshare construct:

OpenMP 5.2: 11.4 Workshare Construct
- The construct must not contain any user-defined function calls unless
  either the function is pure and elemental or the function call is
  contained inside a parallel construct that is nested inside the
  workshare construct. (Flang previously rejected only non-ELEMENTAL
  functions; the check now also rejects IMPURE functions, including
  impure elemental ones. See the sketch below.)
- At most one NOWAIT clause can appear on the Workshare construct.
- Add tests for the same.

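A short sketch of code the new checks reject (hypothetical names; the
added tests exercise the exact diagnostics):

```fortran
!$omp workshare
  x = impure_fn(y)  ! error: user-defined IMPURE function in WORKSHARE
!$omp end workshare nowait nowait  ! error: at most one NOWAIT clause
```
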
---
 flang/lib/Semantics/check-omp-structure.cpp | 22 ++++++++++++++++-----
 flang/test/Semantics/OpenMP/workshare02.f90 | 18 +++++++++++++++++
 llvm/include/llvm/Frontend/OpenMP/OMP.td    |  2 +-
 3 files changed, 36 insertions(+), 6 deletions(-)

diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index bdb8a7249f1a..3db252e5fc8e 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -68,11 +68,23 @@ public:
     if (const auto *e{GetExpr(context_, expr)}) {
       for (const Symbol &symbol : evaluate::CollectSymbols(*e)) {
         const Symbol &root{GetAssociationRoot(symbol)};
-        if (IsFunction(root) && !IsElementalProcedure(root)) {
-          context_.Say(expr.source,
-              "User defined non-ELEMENTAL function "
-              "'%s' is not allowed in a WORKSHARE construct"_err_en_US,
-              root.name());
+        if (IsFunction(root)) {
+          std::string attrs{""};
+          if (!IsElementalProcedure(root)) {
+            attrs = " non-ELEMENTAL";
+          }
+          if (root.attrs().test(Attr::IMPURE)) {
+            if (attrs != "") {
+              attrs = "," + attrs;
+            }
+            attrs = " IMPURE" + attrs;
+          }
+          if (attrs != "") {
+            context_.Say(expr.source,
+                "User defined%s function '%s' is not allowed in a "
+                "WORKSHARE construct"_err_en_US,
+                attrs, root.name());
+          }
         }
       }
     }
diff --git a/flang/test/Semantics/OpenMP/workshare02.f90 b/flang/test/Semantics/OpenMP/workshare02.f90
index 11f33d63a3eb..dddaa354fff9 100644
--- a/flang/test/Semantics/OpenMP/workshare02.f90
+++ b/flang/test/Semantics/OpenMP/workshare02.f90
@@ -9,6 +9,14 @@ module my_mod
   integer function my_func()
     my_func = 10
   end function my_func
+
+  impure integer function impure_my_func()
+    impure_my_func = 20
+  end function impure_my_func
+
+  impure elemental integer function impure_ele_my_func()
+    impure_ele_my_func = 20
+  end function impure_ele_my_func
 end module my_mod
 
 subroutine workshare(aa, bb, cc, dd, ee, ff, n)
@@ -61,6 +69,16 @@ subroutine workshare(aa, bb, cc, dd, ee, ff, n)
     j = j - my_func()
   !$omp end atomic
 
+  !ERROR: User defined IMPURE, non-ELEMENTAL function 'impure_my_func' is not allowed in a WORKSHARE construct
+  cc = impure_my_func()
+  !ERROR: User defined IMPURE function 'impure_ele_my_func' is not allowed in a WORKSHARE construct
+  aa(1) = impure_ele_my_func()
   !$omp end workshare
+
+  !$omp workshare
+  j = j + 1
+  !ERROR: At most one NOWAIT clause can appear on the END WORKSHARE directive
+  !$omp end workshare nowait nowait
+
 
 end subroutine workshare
diff --git a/llvm/include/llvm/Frontend/OpenMP/OMP.td b/llvm/include/llvm/Frontend/OpenMP/OMP.td
index f2f09812a869..f784c37cbe95 100644
--- a/llvm/include/llvm/Frontend/OpenMP/OMP.td
+++ b/llvm/include/llvm/Frontend/OpenMP/OMP.td
@@ -1170,7 +1170,7 @@ def OMP_Workshare : Directive<"workshare"> {
   let category = CA_Executable;
 }
 def OMP_EndWorkshare : Directive<"end workshare"> {
-  let allowedClauses = [
+  let allowedOnceClauses = [
    VersionedClause<OMPC_NoWait>,
  ];
  let leafConstructs = OMP_Workshare.leafConstructs;
-- 
GitLab


From b3403100673dbc61ed26b5500ed74106bca908d3 Mon Sep 17 00:00:00 2001
From: Thirumalai Shaktivel <74826228+Thirumalai-Shaktivel@users.noreply.github.com>
Date: Fri, 18 Oct 2024 10:12:50 +0530
Subject: [PATCH 322/329] [NFC][Flang][Test] Add some missing tests (#110468)

- At most one Collapse clause in SIMD construct
- A DO loop must follow the SIMD directive

---
 flang/test/Semantics/OpenMP/do-collapse.f90      | 8 +++++++-
 flang/test/Semantics/OpenMP/loop-association.f90 | 6 ++++++
 2 files changed, 13 insertions(+), 1 deletion(-)

diff --git 
a/flang/test/Semantics/OpenMP/do-collapse.f90 b/flang/test/Semantics/OpenMP/do-collapse.f90
index 4f2512937ace..480bd45b79b8 100644
--- a/flang/test/Semantics/OpenMP/do-collapse.f90
+++ b/flang/test/Semantics/OpenMP/do-collapse.f90
@@ -30,5 +30,11 @@ program omp_doCollapse
     do
     end do
   end do
 
-end program omp_doCollapse
+  !ERROR: At most one COLLAPSE clause can appear on the SIMD directive
+  !$omp simd collapse(2) collapse(1)
+  do i = 1, 4
+    j = j + i + 1
+  end do
+  !$omp end simd
+end program omp_doCollapse
diff --git a/flang/test/Semantics/OpenMP/loop-association.f90 b/flang/test/Semantics/OpenMP/loop-association.f90
index d2167663c5dd..9fac508e6128 100644
--- a/flang/test/Semantics/OpenMP/loop-association.f90
+++ b/flang/test/Semantics/OpenMP/loop-association.f90
@@ -131,4 +131,10 @@
   !$omp end parallel do simd
   !ERROR: The END PARALLEL DO SIMD directive must follow the DO loop associated with the loop construct
   !$omp end parallel do simd
+
+  !ERROR: A DO loop must follow the SIMD directive
+  !$omp simd
+  a = i + 1
+  !ERROR: The END SIMD directive must follow the DO loop associated with the loop construct
+  !$omp end simd
 end
-- 
GitLab


From e6321d94dee1c7f611bc08dacd3a851e3299fc16 Mon Sep 17 00:00:00 2001
From: Thirumalai Shaktivel <74826228+Thirumalai-Shaktivel@users.noreply.github.com>
Date: Fri, 18 Oct 2024 10:13:49 +0530
Subject: [PATCH 323/329] [Flang][Semantics] Add a semantic check for simd construct (#109089)

Add the missing semantic check for the SAFELEN clause on a SIMD
construct that also has an ORDER(CONCURRENT) clause.

---
 flang/lib/Semantics/check-omp-structure.cpp       | 15 +++++++++++++++
 flang/test/Semantics/OpenMP/clause-validity01.f90 |  6 ++++++
 2 files changed, 21 insertions(+)

diff --git a/flang/lib/Semantics/check-omp-structure.cpp b/flang/lib/Semantics/check-omp-structure.cpp
index 3db252e5fc8e..473ed2be3dbc 100644
--- a/flang/lib/Semantics/check-omp-structure.cpp
+++ b/flang/lib/Semantics/check-omp-structure.cpp
@@ -2285,6 +2285,21 @@ void OmpStructureChecker::Leave(const parser::OmpClauseList &) {
       }
     }
   }
+
+  // 2.11.5 Simd construct restriction (OpenMP 5.1)
+  if (auto *sl_clause{FindClause(llvm::omp::Clause::OMPC_safelen)}) {
+    if (auto *o_clause{FindClause(llvm::omp::Clause::OMPC_order)}) {
+      const auto &orderClause{
+          std::get<parser::OmpClause::Order>(o_clause->u)};
+      if (std::get<parser::OmpOrderClause::Type>(orderClause.v.t) ==
+          parser::OmpOrderClause::Type::Concurrent) {
+        context_.Say(sl_clause->source,
+            "The `SAFELEN` clause cannot appear in the `SIMD` directive "
+            "with `ORDER(CONCURRENT)` clause"_err_en_US);
+      }
+    }
+  }
+
   // Sema checks related to presence of multiple list items within the same
   // clause
   CheckMultListItems();
diff --git a/flang/test/Semantics/OpenMP/clause-validity01.f90 b/flang/test/Semantics/OpenMP/clause-validity01.f90
index 24540492e732..1a7a57b124e9 100644
--- a/flang/test/Semantics/OpenMP/clause-validity01.f90
+++ b/flang/test/Semantics/OpenMP/clause-validity01.f90
@@ -390,6 +390,12 @@ use omp_lib
   enddo
   !$omp end parallel
 
+  !ERROR: The `SAFELEN` clause cannot appear in the `SIMD` directive with `ORDER(CONCURRENT)` clause
+  !$omp simd order(concurrent) safelen(1+2)
+  do i = 1, N
+     a = 3.14
+  enddo
+
 ! 2.11.1 parallel-do-clause -> parallel-clause |
 !                              do-clause
-- 
GitLab


From b49701085414838deb0213b9f10b68d9d3af6b0a Mon Sep 17 00:00:00 2001
From: Florian Hahn
Date: Fri, 18 Oct 2024 05:52:34 +0100
Subject: [PATCH 324/329] [VPlan] Use VPInstruction::Name when assigning names
 (NFCI).

This slightly improves the printing of VPInstructions. NFC except debug
output.

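For illustration (recipe names depend on the transform that created
them), a debug dump that previously used a numbered slot now prints the
VPInstruction's symbolic name:

```
; before: EMIT vp<%4> = add vp<%2>, vp<%0>
; after:  EMIT vp<%index.next> = add vp<%2>, vp<%0>
```
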
---
 llvm/lib/Transforms/Vectorize/VPlan.cpp        | 14 ++++++++++----
 llvm/lib/Transforms/Vectorize/VPlan.h          |  3 +++
 .../AArch64/sve-tail-folding-forced.ll         |  8 ++++----
 .../RISCV/vplan-vp-intrinsics-reduction.ll     |  8 ++++----
 .../LoopVectorize/RISCV/vplan-vp-intrinsics.ll |  8 ++++----
 .../RISCV/vplan-vp-select-intrinsics.ll        |  4 ++--
 .../LoopVectorize/X86/vplan-vp-intrinsics.ll   |  4 ++--
 .../first-order-recurrence-chains-vplan.ll     | 18 +++++++++---------
 .../vplan-unused-interleave-group.ll           |  6 +++---
 .../Transforms/Vectorize/VPlanHCFGTest.cpp     |  4 ++--
 10 files changed, 43 insertions(+), 34 deletions(-)

diff --git a/llvm/lib/Transforms/Vectorize/VPlan.cpp b/llvm/lib/Transforms/Vectorize/VPlan.cpp
index 7ded51d9e3ab..c1b97791331b 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.cpp
+++ b/llvm/lib/Transforms/Vectorize/VPlan.cpp
@@ -1555,7 +1555,8 @@ VPInterleavedAccessInfo::VPInterleavedAccessInfo(VPlan &Plan,
 void VPSlotTracker::assignName(const VPValue *V) {
   assert(!VPValue2Name.contains(V) && "VPValue already has a name!");
   auto *UV = V->getUnderlyingValue();
-  if (!UV) {
+  auto *VPI = dyn_cast_or_null<VPInstruction>(V->getDefiningRecipe());
+  if (!UV && !(VPI && !VPI->getName().empty())) {
     VPValue2Name[V] = (Twine("vp<%") + Twine(NextSlot) + ">").str();
     NextSlot++;
     return;
@@ -1564,10 +1565,15 @@ void VPSlotTracker::assignName(const VPValue *V) {
   // Use the name of the underlying Value, wrapped in "ir<>", and versioned by
   // appending ".Number" to the name if there are multiple uses.
   std::string Name;
-  raw_string_ostream S(Name);
-  UV->printAsOperand(S, false);
+  if (UV) {
+    raw_string_ostream S(Name);
+    UV->printAsOperand(S, false);
+  } else
+    Name = VPI->getName();
+
   assert(!Name.empty() && "Name cannot be empty.");
-  std::string BaseName = (Twine("ir<") + Name + Twine(">")).str();
+  StringRef Prefix = UV ? "ir<" : "vp<%";
+  std::string BaseName = (Twine(Prefix) + Name + Twine(">")).str();
 
   // First assign the base name for V.
   const auto &[A, _] = VPValue2Name.insert({V, BaseName});
diff --git a/llvm/lib/Transforms/Vectorize/VPlan.h b/llvm/lib/Transforms/Vectorize/VPlan.h
index fd97dda6dc1b..59a084401cc9 100644
--- a/llvm/lib/Transforms/Vectorize/VPlan.h
+++ b/llvm/lib/Transforms/Vectorize/VPlan.h
@@ -1414,6 +1414,9 @@ public:
   /// Returns true if this VPInstruction's operands are single scalars and the
   /// result is also a single scalar.
   bool isSingleScalar() const;
+
+  /// Returns the symbolic name assigned to the VPInstruction.
+  StringRef getName() const { return Name; }
 };
 
 /// A recipe to wrap on original IR instruction not to be modified during
diff --git a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
index 6a7263d64985..0b3f28e8db5c 100644
--- a/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
+++ b/llvm/test/Transforms/LoopVectorize/AArch64/sve-tail-folding-forced.ll
@@ -19,19 +19,19 @@ target triple = "aarch64-unknown-linux-gnu"
 ; VPLANS-EMPTY:
 ; VPLANS-NEXT: vector.ph:
 ; VPLANS-NEXT:   EMIT vp<[[NEWTC:%[0-9]+]]> = TC > VF ? 
TC - VF : 0 vp<[[TC]]> -; VPLANS-NEXT: EMIT vp<[[VF:%[0-9]+]]> = VF * Part + ir<0> -; VPLANS-NEXT: EMIT vp<[[LANEMASK_ENTRY:%[0-9]+]]> = active lane mask vp<[[VF]]>, vp<[[TC]]> +; VPLANS-NEXT: EMIT vp<[[VF:%.+]]> = VF * Part + ir<0> +; VPLANS-NEXT: EMIT vp<[[LANEMASK_ENTRY:%.+]]> = active lane mask vp<[[VF]]>, vp<[[TC]]> ; VPLANS-NEXT: Successor(s): vector loop ; VPLANS-EMPTY: ; VPLANS-NEXT: vector loop: { ; VPLANS-NEXT: vector.body: ; VPLANS-NEXT: EMIT vp<[[INDV:%[0-9]+]]> = CANONICAL-INDUCTION -; VPLANS-NEXT: ACTIVE-LANE-MASK-PHI vp<[[LANEMASK_PHI:%[0-9]+]]> = phi vp<[[LANEMASK_ENTRY]]>, vp<[[LANEMASK_LOOP:%[0-9]+]]> +; VPLANS-NEXT: ACTIVE-LANE-MASK-PHI vp<[[LANEMASK_PHI:%[0-9]+]]> = phi vp<[[LANEMASK_ENTRY]]>, vp<[[LANEMASK_LOOP:%.+]]> ; VPLANS-NEXT: vp<[[STEP:%[0-9]+]]> = SCALAR-STEPS vp<[[INDV]]>, ir<1> ; VPLANS-NEXT: CLONE ir<%gep> = getelementptr ir<%ptr>, vp<[[STEP]]> ; VPLANS-NEXT: vp<[[VEC_PTR:%[0-9]+]]> = vector-pointer ir<%gep> ; VPLANS-NEXT: WIDEN store vp<[[VEC_PTR]]>, ir<%val>, vp<[[LANEMASK_PHI]]> -; VPLANS-NEXT: EMIT vp<[[INDV_UPDATE:%[0-9]+]]> = add vp<[[INDV]]>, vp<[[VFxUF]]> +; VPLANS-NEXT: EMIT vp<[[INDV_UPDATE:%.+]]> = add vp<[[INDV]]>, vp<[[VFxUF]]> ; VPLANS-NEXT: EMIT vp<[[INC:%[0-9]+]]> = VF * Part + vp<[[INDV]]> ; VPLANS-NEXT: EMIT vp<[[LANEMASK_LOOP]]> = active lane mask vp<[[INC]]>, vp<[[NEWTC]]> ; VPLANS-NEXT: EMIT vp<[[NOT:%[0-9]+]]> = not vp<[[LANEMASK_LOOP]]> diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll index 90c209cf3f51..1326751a847d 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics-reduction.ll @@ -37,7 +37,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: vector loop: { ; IF-EVL-INLOOP-NEXT: vector.body: ; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-INLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]> +; IF-EVL-INLOOP-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-INLOOP-NEXT: WIDEN-REDUCTION-PHI ir<[[RDX_PHI:%.+]]> = phi ir<%start>, ir<[[RDX_NEXT:%.+]]> ; IF-EVL-INLOOP-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%n>, vp<[[EVL_PHI]]> ; IF-EVL-INLOOP-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> @@ -48,7 +48,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; IF-EVL-INLOOP-NEXT: REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + vp.reduce.add (ir<[[LD1]]>, vp<[[EVL]]>) ; IF-EVL-INLOOP-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 ; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> ; IF-EVL-INLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> ; IF-EVL-INLOOP-NEXT: No successors ; IF-EVL-INLOOP-NEXT: } @@ -86,7 +86,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-OUTLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> ; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]> ; NO-VP-OUTLOOP-NEXT: WIDEN ir<[[ADD:%.+]]> = add ir<[[LD1]]>, ir<[[RDX_PHI]]> -; NO-VP-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-OUTLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = 
add nuw vp<[[IV]]>, vp<[[VFUF]]> ; NO-VP-OUTLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> ; NO-VP-OUTLOOP-NEXT: No successors ; NO-VP-OUTLOOP-NEXT: } @@ -125,7 +125,7 @@ define i32 @reduction(ptr %a, i64 %n, i32 %start) { ; NO-VP-INLOOP-NEXT: vp<[[PTR1:%[0-9]+]]> = vector-pointer ir<[[GEP1]]> ; NO-VP-INLOOP-NEXT: WIDEN ir<[[LD1:%.+]]> = load vp<[[PTR1]]> ; NO-VP-INLOOP-NEXT: REDUCE ir<[[ADD:%.+]]> = ir<[[RDX_PHI]]> + reduce.add (ir<[[LD1]]>) -; NO-VP-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-INLOOP-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> ; NO-VP-INLOOP-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> ; NO-VP-INLOOP-NEXT: No successors ; NO-VP-INLOOP-NEXT: } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll index c14a8bce8f48..706b6f888298 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-intrinsics.ll @@ -22,7 +22,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: vector loop: { ; IF-EVL-NEXT: vector.body: ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION -; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%[0-9]+]]> +; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEXT:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> ; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> @@ -38,7 +38,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]> ; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 ; IF-EVL-NEXT: EMIT vp<[[IV_NEXT]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> ; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> ; IF-EVL-NEXT: No successors ; IF-EVL-NEXT: } @@ -65,7 +65,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; NO-VP-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> ; NO-VP-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]> -; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> ; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; NO-VP-NEXT: No successors ; NO-VP-NEXT: } @@ -110,7 +110,7 @@ define void @safe_dep(ptr %p) { ; CHECK-NEXT: CLONE ir<[[GEP2:%.+]]> = getelementptr ir<%p>, ir<[[OFFSET]]> ; CHECK-NEXT: vp<[[PTR2:%[0-9]+]]> = vector-pointer ir<[[GEP2]]> ; CHECK-NEXT: WIDEN store vp<[[PTR2]]>, ir<[[V]]> -; CHECK-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; CHECK-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> ; CHECK-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; CHECK-NEXT: No successors ; CHECK-NEXT: } diff --git a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll index 
c26ab2017280..6d6cfb5e9d18 100644 --- a/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/RISCV/vplan-vp-select-intrinsics.ll @@ -17,7 +17,7 @@ ; IF-EVL: vector loop: { ; IF-EVL-NEXT: vector.body: ; IF-EVL-NEXT: EMIT vp<[[IV:%[0-9]+]]> = CANONICAL-INDUCTION - ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEX:%[0-9]+]]> + ; IF-EVL-NEXT: EXPLICIT-VECTOR-LENGTH-BASED-IV-PHI vp<[[EVL_PHI:%[0-9]+]]> = phi ir<0>, vp<[[IV_NEX:%.+]]> ; IF-EVL-NEXT: EMIT vp<[[AVL:%.+]]> = sub ir<%N>, vp<[[EVL_PHI]]> ; IF-EVL-NEXT: EMIT vp<[[EVL:%.+]]> = EXPLICIT-VECTOR-LENGTH vp<[[AVL]]> ; IF-EVL-NEXT: vp<[[ST:%[0-9]+]]> = SCALAR-STEPS vp<[[EVL_PHI]]>, ir<1> @@ -36,7 +36,7 @@ ; IF-EVL-NEXT: WIDEN vp.store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[EVL]]> ; IF-EVL-NEXT: SCALAR-CAST vp<[[CAST:%[0-9]+]]> = zext vp<[[EVL]]> to i64 ; IF-EVL-NEXT: EMIT vp<[[IV_NEX]]> = add vp<[[CAST]]>, vp<[[EVL_PHI]]> - ; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> + ; IF-EVL-NEXT: EMIT vp<[[IV_NEXT_EXIT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> ; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT_EXIT]]>, vp<[[VTC]]> ; IF-EVL-NEXT: No successors ; IF-EVL-NEXT: } diff --git a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll index 9b49d44141db..1af03e740ef1 100644 --- a/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll +++ b/llvm/test/Transforms/LoopVectorize/X86/vplan-vp-intrinsics.ll @@ -36,7 +36,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; IF-EVL-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; IF-EVL-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> ; IF-EVL-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]>, vp<[[MASK]]> -; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add vp<[[IV]]>, vp<[[VFUF]]> +; IF-EVL-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add vp<[[IV]]>, vp<[[VFUF]]> ; IF-EVL-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; IF-EVL-NEXT: No successors ; IF-EVL-NEXT: } @@ -63,7 +63,7 @@ define void @foo(ptr noalias %a, ptr noalias %b, ptr noalias %c, i64 %N) { ; NO-VP-NEXT: CLONE ir<[[GEP3:%.+]]> = getelementptr inbounds ir<%a>, vp<[[ST]]> ; NO-VP-NEXT: vp<[[PTR3:%[0-9]+]]> = vector-pointer ir<[[GEP3]]> ; NO-VP-NEXT: WIDEN store vp<[[PTR3]]>, ir<[[ADD]]> -; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%[0-9]+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> +; NO-VP-NEXT: EMIT vp<[[IV_NEXT:%.+]]> = add nuw vp<[[IV]]>, vp<[[VFUF]]> ; NO-VP-NEXT: EMIT branch-on-count vp<[[IV_NEXT]]>, vp<[[VTC]]> ; NO-VP-NEXT: No successors ; NO-VP-NEXT: } diff --git a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll index 5e4ea2c0bfc5..9de675b28530 100644 --- a/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll +++ b/llvm/test/Transforms/LoopVectorize/first-order-recurrence-chains-vplan.ll @@ -34,7 +34,7 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%for.1.next>, ir<1> -; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]> = extract-from-end vp<[[FOR1_SPLICE]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]>.1 = extract-from-end vp<[[FOR1_SPLICE]]>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> 
; CHECK-NEXT: Successor(s): ir-bb, scalar.ph @@ -44,11 +44,11 @@ define void @test_chained_first_order_recurrences_1(ptr %ptr) { ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22> -; CHECK-NEXT: EMIT vp<[[RESUME_2_P:%.*]]> = resume-phi vp<[[RESUME_2]]>, ir<33> +; CHECK-NEXT: EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33> ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_1_P]]> -; CHECK-NEXT: Live-out i16 %for.2 = vp<[[RESUME_2_P]]> +; CHECK-NEXT: Live-out i16 %for.2 = vp<[[RESUME_2_P]]>.1 ; CHECK-NEXT: } ; entry: @@ -105,8 +105,8 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-EMPTY: ; CHECK-NEXT: middle.block: ; CHECK-NEXT: EMIT vp<[[RESUME_1:%.+]]> = extract-from-end ir<%for.1.next>, ir<1> -; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]> = extract-from-end vp<[[FOR1_SPLICE]]>, ir<1> -; CHECK-NEXT: EMIT vp<[[RESUME_3:%.+]]> = extract-from-end vp<[[FOR2_SPLICE]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[RESUME_2:%.+]]>.1 = extract-from-end vp<[[FOR1_SPLICE]]>, ir<1> +; CHECK-NEXT: EMIT vp<[[RESUME_3:%.+]]>.2 = extract-from-end vp<[[FOR2_SPLICE]]>, ir<1> ; CHECK-NEXT: EMIT vp<[[CMP:%.+]]> = icmp eq ir<1000>, vp<[[VTC]]> ; CHECK-NEXT: EMIT branch-on-cond vp<[[CMP]]> ; CHECK-NEXT: Successor(s): ir-bb, scalar.ph @@ -116,13 +116,13 @@ define void @test_chained_first_order_recurrences_3(ptr %ptr) { ; CHECK-EMPTY: ; CHECK-NEXT: scalar.ph ; CHECK-NEXT: EMIT vp<[[RESUME_1_P:%.*]]> = resume-phi vp<[[RESUME_1]]>, ir<22> -; CHECK-NEXT: EMIT vp<[[RESUME_2_P:%.*]]> = resume-phi vp<[[RESUME_2]]>, ir<33> -; CHECK-NEXT: EMIT vp<[[RESUME_3_P:%.*]]> = resume-phi vp<[[RESUME_3]]>, ir<33> +; CHECK-NEXT: EMIT vp<[[RESUME_2_P:%.*]]>.1 = resume-phi vp<[[RESUME_2]]>.1, ir<33> +; CHECK-NEXT: EMIT vp<[[RESUME_3_P:%.*]]>.2 = resume-phi vp<[[RESUME_3]]>.2, ir<33> ; CHECK-NEXT: No successors ; CHECK-EMPTY: ; CHECK-NEXT: Live-out i16 %for.1 = vp<[[RESUME_1_P]]> -; CHECK-NEXT: Live-out i16 %for.2 = vp<[[RESUME_2_P]]> -; CHECK-NEXT: Live-out i16 %for.3 = vp<[[RESUME_3_P]]> +; CHECK-NEXT: Live-out i16 %for.2 = vp<[[RESUME_2_P]]>.1 +; CHECK-NEXT: Live-out i16 %for.3 = vp<[[RESUME_3_P]]>.2 ; CHECK-NEXT: } ; entry: diff --git a/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll b/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll index 5ea27994b356..27d81de260d3 100644 --- a/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll +++ b/llvm/test/Transforms/LoopVectorize/vplan-unused-interleave-group.ll @@ -18,9 +18,9 @@ define void @test_unused_interleave(ptr %src, i32 %length) { ; CHECK-EMPTY: ; CHECK-NEXT: vector loop: { ; CHECK-NEXT: vector.body: -; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%3> -; CHECK-NEXT: EMIT vp<%3> = add nuw vp<%2>, vp<%0> -; CHECK-NEXT: EMIT branch-on-count vp<%3>, vp<%1> +; CHECK-NEXT: EMIT vp<%2> = CANONICAL-INDUCTION ir<0>, vp<%index.next> +; CHECK-NEXT: EMIT vp<%index.next> = add nuw vp<%2>, vp<%0> +; CHECK-NEXT: EMIT branch-on-count vp<%index.next>, vp<%1> ; CHECK-NEXT: No successors ; CHECK-NEXT: } entry: diff --git a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp index 4926afbfc6d8..00a3c737c0e4 100644 --- a/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp +++ b/llvm/unittests/Transforms/Vectorize/VPlanHCFGTest.cpp @@ -133,8 +133,8 @@ compound=true N2 -> N4 [ label="" ltail=cluster_N3] N4 [label = "middle.block:\l" + - " EMIT 
vp\<%1\> = icmp eq ir\<%N\>, vp\<%0\>\l" + - " EMIT branch-on-cond vp\<%1\>\l" + + " EMIT vp\<%cmp.n\> = icmp eq ir\<%N\>, vp\<%0\>\l" + + " EMIT branch-on-cond vp\<%cmp.n\>\l" + "Successor(s): ir-bb\, scalar.ph\l" ] N4 -> N5 [ label="T"] -- GitLab From ad4a582fd938c933e784f0052bd773676b37b690 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Alex=20R=C3=B8nne=20Petersen?= Date: Fri, 18 Oct 2024 07:35:42 +0200 Subject: [PATCH 325/329] [llvm] Consistently respect `naked` fn attribute in `TargetFrameLowering::hasFP()` (#106014) Some targets (e.g. PPC and Hexagon) already did this. I think it's best to do this consistently so that frontend authors don't run into inconsistent results when they emit `naked` functions. For example, in Zig, we had to change our emit code to also set `frame-pointer=none` to get reliable results across targets. Note: I don't have commit access. --- .../llvm/CodeGen/TargetFrameLowering.h | 9 +- .../Target/AArch64/AArch64FrameLowering.cpp | 6 +- .../lib/Target/AArch64/AArch64FrameLowering.h | 4 +- llvm/lib/Target/AMDGPU/R600FrameLowering.h | 5 +- llvm/lib/Target/AMDGPU/SIFrameLowering.cpp | 2 +- llvm/lib/Target/AMDGPU/SIFrameLowering.h | 5 +- llvm/lib/Target/ARC/ARCFrameLowering.cpp | 2 +- llvm/lib/Target/ARC/ARCFrameLowering.h | 5 +- llvm/lib/Target/ARM/ARMFrameLowering.cpp | 8 +- llvm/lib/Target/ARM/ARMFrameLowering.h | 4 +- llvm/lib/Target/AVR/AVRFrameLowering.cpp | 2 +- llvm/lib/Target/AVR/AVRFrameLowering.h | 4 +- llvm/lib/Target/BPF/BPFFrameLowering.cpp | 4 +- llvm/lib/Target/BPF/BPFFrameLowering.h | 4 +- llvm/lib/Target/CSKY/CSKYFrameLowering.cpp | 2 +- llvm/lib/Target/CSKY/CSKYFrameLowering.h | 4 +- .../lib/Target/DirectX/DirectXFrameLowering.h | 3 +- .../Target/Hexagon/HexagonFrameLowering.cpp | 5 +- .../lib/Target/Hexagon/HexagonFrameLowering.h | 4 +- llvm/lib/Target/Lanai/LanaiFrameLowering.h | 5 +- .../LoongArch/LoongArchFrameLowering.cpp | 2 +- .../Target/LoongArch/LoongArchFrameLowering.h | 4 +- llvm/lib/Target/M68k/M68kFrameLowering.cpp | 2 +- llvm/lib/Target/M68k/M68kFrameLowering.h | 13 +-- .../lib/Target/MSP430/MSP430FrameLowering.cpp | 2 +- llvm/lib/Target/MSP430/MSP430FrameLowering.h | 2 +- llvm/lib/Target/Mips/MipsFrameLowering.cpp | 10 +-- llvm/lib/Target/Mips/MipsFrameLowering.h | 4 +- llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp | 4 +- llvm/lib/Target/NVPTX/NVPTXFrameLowering.h | 4 +- llvm/lib/Target/PowerPC/PPCFrameLowering.cpp | 6 +- llvm/lib/Target/PowerPC/PPCFrameLowering.h | 4 +- llvm/lib/Target/RISCV/RISCVFrameLowering.cpp | 2 +- llvm/lib/Target/RISCV/RISCVFrameLowering.h | 4 +- llvm/lib/Target/SPIRV/SPIRVFrameLowering.h | 3 +- llvm/lib/Target/Sparc/SparcFrameLowering.cpp | 8 +- llvm/lib/Target/Sparc/SparcFrameLowering.h | 4 +- .../Target/SystemZ/SystemZFrameLowering.cpp | 9 +- .../lib/Target/SystemZ/SystemZFrameLowering.h | 9 +- llvm/lib/Target/VE/VEFrameLowering.cpp | 8 +- llvm/lib/Target/VE/VEFrameLowering.h | 3 +- .../WebAssembly/WebAssemblyFrameLowering.cpp | 2 +- .../WebAssembly/WebAssemblyFrameLowering.h | 4 +- llvm/lib/Target/X86/X86FrameLowering.cpp | 8 +- llvm/lib/Target/X86/X86FrameLowering.h | 4 +- llvm/lib/Target/XCore/XCoreFrameLowering.cpp | 2 +- llvm/lib/Target/XCore/XCoreFrameLowering.h | 5 +- .../lib/Target/Xtensa/XtensaFrameLowering.cpp | 2 +- llvm/lib/Target/Xtensa/XtensaFrameLowering.h | 5 +- .../AArch64/naked-fn-with-frame-pointer.ll | 39 +++++++++ .../AMDGPU/naked-fn-with-frame-pointer.ll | 42 +++++++++ .../ARM/naked-fn-with-frame-pointer.ll | 55 ++++++++++++ .../AVR/naked-fn-with-frame-pointer.ll | 20 +++++ 
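
In essence, the refactor is the classic non-virtual-interface pattern: hasFP() stops being virtual, performs the `naked` check once in the base class, and defers the target-specific policy to a new protected hook, hasFPImpl(), which each target overrides instead. Below is a minimal, self-contained C++ sketch of that shape (MachineFunction and the attribute query are stubbed out for illustration; the real signatures are in the TargetFrameLowering.h hunk further down).

#include <iostream>

// Stand-in for llvm::MachineFunction; the real check is
// MF.getFunction().hasFnAttribute(Attribute::Naked).
struct MachineFunction {
  bool IsNaked;
};

class TargetFrameLowering {
public:
  virtual ~TargetFrameLowering() = default;

  // Non-virtual entry point: every target now gets the naked check for
  // free, so frontends no longer have to pair `naked` with
  // "frame-pointer"="none" to get consistent results.
  bool hasFP(const MachineFunction &MF) const {
    return !MF.IsNaked && hasFPImpl(MF);
  }

protected:
  // Targets override only their own frame-pointer policy.
  virtual bool hasFPImpl(const MachineFunction &MF) const = 0;
};

// Example target whose policy would otherwise always request a frame pointer.
class ExampleFrameLowering : public TargetFrameLowering {
protected:
  bool hasFPImpl(const MachineFunction &) const override { return true; }
};

int main() {
  ExampleFrameLowering FL;
  std::cout << FL.hasFP({/*IsNaked=*/false}) << '\n'; // 1: target policy wins
  std::cout << FL.hasFP({/*IsNaked=*/true}) << '\n';  // 0: naked short-circuits
}

Hoisting the check into the base class is what removes the per-target inconsistency: Hexagon's now-redundant copy of the naked check is deleted in its hunk below, and targets that never had the check (the majority) pick it up automatically.
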
.../BPF/naked-fn-with-frame-pointer.ll | 41 +++++++++ .../CSKY/naked-fn-with-frame-pointer.ll | 41 +++++++++ .../Hexagon/naked-fn-with-frame-pointer.ll | 30 +++++++ .../Lanai/naked-fn-with-frame-pointer.ll | 35 ++++++++ .../LoongArch/naked-fn-with-frame-pointer.ll | 45 ++++++++++ .../M68k/naked-fn-with-frame-pointer.ll | 26 ++++++ .../MSP430/naked-fn-with-frame-pointer.ll | 27 ++++++ .../Mips/naked-fn-with-frame-pointer.ll | 87 +++++++++++++++++++ .../NVPTX/naked-fn-with-frame-pointer.ll | 73 ++++++++++++++++ .../PowerPC/naked-fn-with-frame-pointer.ll | 87 +++++++++++++++++++ .../RISCV/naked-fn-with-frame-pointer.ll | 45 ++++++++++ .../SPARC/naked-fn-with-frame-pointer.ll | 45 ++++++++++ .../SystemZ/naked-fn-with-frame-pointer.ll | 28 ++++++ .../CodeGen/VE/naked-fn-with-frame-pointer.ll | 41 +++++++++ .../naked-fn-with-frame-pointer.ll | 37 ++++++++ .../X86/naked-fn-with-frame-pointer.ll | 39 +++++++++ .../XCore/naked-fn-with-frame-pointer.ll | 31 +++++++ .../Xtensa/naked-fn-with-frame-pointer.ll | 31 +++++++ llvm/unittests/CodeGen/MFCommon.inc | 4 +- 72 files changed, 1085 insertions(+), 90 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/ARM/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/AVR/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/BPF/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/CSKY/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/Hexagon/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/Lanai/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/LoongArch/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/M68k/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/MSP430/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/Mips/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/NVPTX/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/PowerPC/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/RISCV/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/SPARC/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/SystemZ/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/VE/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/WebAssembly/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/X86/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/XCore/naked-fn-with-frame-pointer.ll create mode 100644 llvm/test/CodeGen/Xtensa/naked-fn-with-frame-pointer.ll diff --git a/llvm/include/llvm/CodeGen/TargetFrameLowering.h b/llvm/include/llvm/CodeGen/TargetFrameLowering.h index 9882d8511875..97de0197da9b 100644 --- a/llvm/include/llvm/CodeGen/TargetFrameLowering.h +++ b/llvm/include/llvm/CodeGen/TargetFrameLowering.h @@ -280,7 +280,11 @@ public: /// hasFP - Return true if the specified function should have a dedicated /// frame pointer register. For most targets this is true only if the function /// has variable sized allocas or if frame pointer elimination is disabled. - virtual bool hasFP(const MachineFunction &MF) const = 0; + /// For all targets, this is false if the function has the naked attribute + /// since there is no prologue to set up the frame pointer. 
+ bool hasFP(const MachineFunction &MF) const { + return !MF.getFunction().hasFnAttribute(Attribute::Naked) && hasFPImpl(MF); + } /// hasReservedCallFrame - Under normal circumstances, when a frame pointer is /// not required, we reserve argument space for call sites in the function @@ -477,6 +481,9 @@ public: /// targets can emit remarks based on the final frame layout. virtual void emitRemarks(const MachineFunction &MF, MachineOptimizationRemarkEmitter *ORE) const {}; + +protected: + virtual bool hasFPImpl(const MachineFunction &MF) const = 0; }; } // End llvm namespace diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp index 1b8eac7fac21..bbf2f2677954 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.cpp @@ -480,9 +480,9 @@ bool AArch64FrameLowering::canUseRedZone(const MachineFunction &MF) const { getSVEStackSize(MF) || LowerQRegCopyThroughMem); } -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register. -bool AArch64FrameLowering::hasFP(const MachineFunction &MF) const { +/// hasFPImpl - Return true if the specified function should have a dedicated +/// frame pointer register. +bool AArch64FrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); diff --git a/llvm/lib/Target/AArch64/AArch64FrameLowering.h b/llvm/lib/Target/AArch64/AArch64FrameLowering.h index c19731249620..20445e63bcb1 100644 --- a/llvm/lib/Target/AArch64/AArch64FrameLowering.h +++ b/llvm/lib/Target/AArch64/AArch64FrameLowering.h @@ -65,7 +65,6 @@ public: /// Can this function use the red zone for local allocations. bool canUseRedZone(const MachineFunction &MF) const; - bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; bool assignCalleeSavedSpillSlots(MachineFunction &MF, @@ -125,6 +124,9 @@ public: orderFrameObjects(const MachineFunction &MF, SmallVectorImpl &ObjectsToAllocate) const override; +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: /// Returns true if a homogeneous prolog or epilog code can be emitted /// for the size optimization. If so, HOM_Prolog/HOM_Epilog pseudo diff --git a/llvm/lib/Target/AMDGPU/R600FrameLowering.h b/llvm/lib/Target/AMDGPU/R600FrameLowering.h index f171bc4fea78..c4621174acab 100644 --- a/llvm/lib/Target/AMDGPU/R600FrameLowering.h +++ b/llvm/lib/Target/AMDGPU/R600FrameLowering.h @@ -27,9 +27,8 @@ public: StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, Register &FrameReg) const override; - bool hasFP(const MachineFunction &MF) const override { - return false; - } +protected: + bool hasFPImpl(const MachineFunction &MF) const override { return false; } }; } // end namespace llvm diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp index bc162b0953a7..13a2db7a87b4 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.cpp @@ -1805,7 +1805,7 @@ static bool frameTriviallyRequiresSP(const MachineFrameInfo &MFI) { // The FP for kernels is always known 0, so we never really need to setup an // explicit register for it. However, DisableFramePointerElim will force us to // use a register for it. 
-bool SIFrameLowering::hasFP(const MachineFunction &MF) const { +bool SIFrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); // For entry & chain functions we can use an immediate offset in most cases, diff --git a/llvm/lib/Target/AMDGPU/SIFrameLowering.h b/llvm/lib/Target/AMDGPU/SIFrameLowering.h index b3feb759ed81..938c75099a3b 100644 --- a/llvm/lib/Target/AMDGPU/SIFrameLowering.h +++ b/llvm/lib/Target/AMDGPU/SIFrameLowering.h @@ -66,6 +66,9 @@ public: MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: void emitEntryFunctionFlatScratchInit(MachineFunction &MF, MachineBasicBlock &MBB, @@ -82,8 +85,6 @@ private: Register ScratchWaveOffsetReg) const; public: - bool hasFP(const MachineFunction &MF) const override; - bool requiresStackPointerReference(const MachineFunction &MF) const; }; diff --git a/llvm/lib/Target/ARC/ARCFrameLowering.cpp b/llvm/lib/Target/ARC/ARCFrameLowering.cpp index 1227fae13211..472f1c13f362 100644 --- a/llvm/lib/Target/ARC/ARCFrameLowering.cpp +++ b/llvm/lib/Target/ARC/ARCFrameLowering.cpp @@ -487,7 +487,7 @@ MachineBasicBlock::iterator ARCFrameLowering::eliminateCallFramePseudoInstr( return MBB.erase(I); } -bool ARCFrameLowering::hasFP(const MachineFunction &MF) const { +bool ARCFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); bool HasFP = MF.getTarget().Options.DisableFramePointerElim(MF) || MF.getFrameInfo().hasVarSizedObjects() || diff --git a/llvm/lib/Target/ARC/ARCFrameLowering.h b/llvm/lib/Target/ARC/ARCFrameLowering.h index 9951a09842c5..089326fe3205 100644 --- a/llvm/lib/Target/ARC/ARCFrameLowering.h +++ b/llvm/lib/Target/ARC/ARCFrameLowering.h @@ -54,8 +54,6 @@ public: void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; - bool hasFP(const MachineFunction &MF) const override; - MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; @@ -64,6 +62,9 @@ public: llvm::MachineFunction &, const llvm::TargetRegisterInfo *, std::vector &) const override; +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: void adjustStackToMatchRecords(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.cpp b/llvm/lib/Target/ARM/ARMFrameLowering.cpp index 2706efa83fc3..82b6f808688e 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.cpp +++ b/llvm/lib/Target/ARM/ARMFrameLowering.cpp @@ -323,10 +323,10 @@ bool ARMFrameLowering::enableCalleeSaveSkip(const MachineFunction &MF) const { return true; } -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register. This is true if the function has variable sized allocas -/// or if frame pointer elimination is disabled. -bool ARMFrameLowering::hasFP(const MachineFunction &MF) const { +/// hasFPImpl - Return true if the specified function should have a dedicated +/// frame pointer register. This is true if the function has variable sized +/// allocas or if frame pointer elimination is disabled. 
+bool ARMFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/ARM/ARMFrameLowering.h b/llvm/lib/Target/ARM/ARMFrameLowering.h index 3c5bc00cb449..ff51f1a7af02 100644 --- a/llvm/lib/Target/ARM/ARMFrameLowering.h +++ b/llvm/lib/Target/ARM/ARMFrameLowering.h @@ -45,7 +45,6 @@ public: bool enableCalleeSaveSkip(const MachineFunction &MF) const override; - bool hasFP(const MachineFunction &MF) const override; bool isFPReserved(const MachineFunction &MF) const; bool requiresAAPCSFrameRecord(const MachineFunction &MF) const; bool hasReservedCallFrame(const MachineFunction &MF) const override; @@ -87,6 +86,9 @@ public: const SpillSlot * getCalleeSavedSpillSlots(unsigned &NumEntries) const override; +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: void emitPushInst(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef CSI, unsigned StmOpc, diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.cpp b/llvm/lib/Target/AVR/AVRFrameLowering.cpp index 64dd0338bf60..91b0f8c6b2df 100644 --- a/llvm/lib/Target/AVR/AVRFrameLowering.cpp +++ b/llvm/lib/Target/AVR/AVRFrameLowering.cpp @@ -232,7 +232,7 @@ void AVRFrameLowering::emitEpilogue(MachineFunction &MF, // // Notice that strictly this is not a frame pointer because it contains SP after // frame allocation instead of having the original SP in function entry. -bool AVRFrameLowering::hasFP(const MachineFunction &MF) const { +bool AVRFrameLowering::hasFPImpl(const MachineFunction &MF) const { const AVRMachineFunctionInfo *FuncInfo = MF.getInfo(); return (FuncInfo->getHasSpills() || FuncInfo->getHasAllocas() || diff --git a/llvm/lib/Target/AVR/AVRFrameLowering.h b/llvm/lib/Target/AVR/AVRFrameLowering.h index a550c0efbb8e..7baa5e9d62f6 100644 --- a/llvm/lib/Target/AVR/AVRFrameLowering.h +++ b/llvm/lib/Target/AVR/AVRFrameLowering.h @@ -21,7 +21,6 @@ public: public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - bool hasFP(const MachineFunction &MF) const override; bool spillCalleeSavedRegisters(MachineBasicBlock &MBB, MachineBasicBlock::iterator MI, ArrayRef CSI, @@ -38,6 +37,9 @@ public: MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/BPF/BPFFrameLowering.cpp b/llvm/lib/Target/BPF/BPFFrameLowering.cpp index 8812cfdd86da..123b99f25423 100644 --- a/llvm/lib/Target/BPF/BPFFrameLowering.cpp +++ b/llvm/lib/Target/BPF/BPFFrameLowering.cpp @@ -20,7 +20,9 @@ using namespace llvm; -bool BPFFrameLowering::hasFP(const MachineFunction &MF) const { return true; } +bool BPFFrameLowering::hasFPImpl(const MachineFunction &MF) const { + return true; +} void BPFFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const {} diff --git a/llvm/lib/Target/BPF/BPFFrameLowering.h b/llvm/lib/Target/BPF/BPFFrameLowering.h index a546351ec6cb..6beffcbe69dd 100644 --- a/llvm/lib/Target/BPF/BPFFrameLowering.h +++ b/llvm/lib/Target/BPF/BPFFrameLowering.h @@ -26,7 +26,6 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - 
bool hasFP(const MachineFunction &MF) const override; void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS) const override; @@ -35,6 +34,9 @@ public: MachineBasicBlock::iterator MI) const override { return MBB.erase(MI); } + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } #endif diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp index cedcbff1db24..c023b5a0de5a 100644 --- a/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp +++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.cpp @@ -33,7 +33,7 @@ static Register getFPReg(const CSKYSubtarget &STI) { return CSKY::R8; } // callee saved register to save the value. static Register getBPReg(const CSKYSubtarget &STI) { return CSKY::R7; } -bool CSKYFrameLowering::hasFP(const MachineFunction &MF) const { +bool CSKYFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/CSKY/CSKYFrameLowering.h b/llvm/lib/Target/CSKY/CSKYFrameLowering.h index 69bf01cf1801..0b3b287bb6a5 100644 --- a/llvm/lib/Target/CSKY/CSKYFrameLowering.h +++ b/llvm/lib/Target/CSKY/CSKYFrameLowering.h @@ -61,7 +61,6 @@ public: MutableArrayRef CSI, const TargetRegisterInfo *TRI) const override; - bool hasFP(const MachineFunction &MF) const override; bool hasBP(const MachineFunction &MF) const; bool hasReservedCallFrame(const MachineFunction &MF) const override; @@ -69,6 +68,9 @@ public: MachineBasicBlock::iterator eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator MI) const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } // namespace llvm #endif diff --git a/llvm/lib/Target/DirectX/DirectXFrameLowering.h b/llvm/lib/Target/DirectX/DirectXFrameLowering.h index 76a1450054be..85823556d555 100644 --- a/llvm/lib/Target/DirectX/DirectXFrameLowering.h +++ b/llvm/lib/Target/DirectX/DirectXFrameLowering.h @@ -29,7 +29,8 @@ public: void emitPrologue(MachineFunction &, MachineBasicBlock &) const override {} void emitEpilogue(MachineFunction &, MachineBasicBlock &) const override {} - bool hasFP(const MachineFunction &) const override { return false; } +protected: + bool hasFPImpl(const MachineFunction &) const override { return false; } }; } // namespace llvm #endif // LLVM_DIRECTX_DIRECTXFRAMELOWERING_H diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp index 7c82f5e9f9a6..48acd9da9587 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.cpp @@ -1144,10 +1144,7 @@ void HexagonFrameLowering::insertCFIInstructionsAt(MachineBasicBlock &MBB, } } -bool HexagonFrameLowering::hasFP(const MachineFunction &MF) const { - if (MF.getFunction().hasFnAttribute(Attribute::Naked)) - return false; - +bool HexagonFrameLowering::hasFPImpl(const MachineFunction &MF) const { auto &MFI = MF.getFrameInfo(); auto &HRI = *MF.getSubtarget().getRegisterInfo(); bool HasExtraAlign = HRI.hasStackRealignment(MF); diff --git a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h index 98e69dcc4b39..926aadb01f50 100644 --- a/llvm/lib/Target/Hexagon/HexagonFrameLowering.h +++ b/llvm/lib/Target/Hexagon/HexagonFrameLowering.h @@ -89,7 +89,6 @@ public: StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, Register 
&FrameReg) const override; - bool hasFP(const MachineFunction &MF) const override; const SpillSlot *getCalleeSavedSpillSlots(unsigned &NumEntries) const override { @@ -114,6 +113,9 @@ public: void insertCFIInstructions(MachineFunction &MF) const; +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: using CSIVect = std::vector; diff --git a/llvm/lib/Target/Lanai/LanaiFrameLowering.h b/llvm/lib/Target/Lanai/LanaiFrameLowering.h index 380d63df7301..9bd78d008f77 100644 --- a/llvm/lib/Target/Lanai/LanaiFrameLowering.h +++ b/llvm/lib/Target/Lanai/LanaiFrameLowering.h @@ -44,10 +44,11 @@ public: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; - bool hasFP(const MachineFunction & /*MF*/) const override { return true; } - void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const override; + +protected: + bool hasFPImpl(const MachineFunction & /*MF*/) const override { return true; } }; } // namespace llvm diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp index 4e504729b23e..1a787c63c624 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.cpp @@ -31,7 +31,7 @@ using namespace llvm; // pointer register. This is true if frame pointer elimination is // disabled, if it needs dynamic stack realignment, if the function has // variable sized allocas, or if the frame address is taken. -bool LoongArchFrameLowering::hasFP(const MachineFunction &MF) const { +bool LoongArchFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h index bc2ac02c91f8..6cbfcf665f6a 100644 --- a/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h +++ b/llvm/lib/Target/LoongArch/LoongArchFrameLowering.h @@ -49,13 +49,15 @@ public: StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, Register &FrameReg) const override; - bool hasFP(const MachineFunction &MF) const override; bool hasBP(const MachineFunction &MF) const; uint64_t getFirstSPAdjustAmount(const MachineFunction &MF) const; bool enableShrinkWrapping(const MachineFunction &MF) const override; +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: void determineFrameLayout(MachineFunction &MF) const; void adjustReg(MachineBasicBlock &MBB, MachineBasicBlock::iterator MBBI, diff --git a/llvm/lib/Target/M68k/M68kFrameLowering.cpp b/llvm/lib/Target/M68k/M68kFrameLowering.cpp index 1445bac0b92e..4245061f0ae7 100644 --- a/llvm/lib/Target/M68k/M68kFrameLowering.cpp +++ b/llvm/lib/Target/M68k/M68kFrameLowering.cpp @@ -40,7 +40,7 @@ M68kFrameLowering::M68kFrameLowering(const M68kSubtarget &STI, Align Alignment) StackPtr = TRI->getStackRegister(); } -bool M68kFrameLowering::hasFP(const MachineFunction &MF) const { +bool M68kFrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *TRI = STI.getRegisterInfo(); diff --git a/llvm/lib/Target/M68k/M68kFrameLowering.h b/llvm/lib/Target/M68k/M68kFrameLowering.h index a5349377232e..ed2bfb605ff1 100644 --- a/llvm/lib/Target/M68k/M68kFrameLowering.h +++ b/llvm/lib/Target/M68k/M68kFrameLowering.h @@ -121,12 
+121,6 @@ public: MutableArrayRef CSI, const TargetRegisterInfo *TRI) const override; - /// Return true if the specified function should have a dedicated frame - /// pointer register. This is true if the function has variable sized - /// allocas, if it needs dynamic stack realignment, if frame pointer - /// elimination is disabled, or if the frame address is taken. - bool hasFP(const MachineFunction &MF) const override; - /// Under normal circumstances, when a frame pointer is not required, we /// reserve argument space for call sites in the function immediately on /// entry to the current function. This eliminates the need for add/sub sp @@ -166,6 +160,13 @@ public: /// pointer by a constant value. void emitSPUpdate(MachineBasicBlock &MBB, MachineBasicBlock::iterator &MBBI, int64_t NumBytes, bool InEpilogue) const; + +protected: + /// Return true if the specified function should have a dedicated frame + /// pointer register. This is true if the function has variable sized + /// allocas, if it needs dynamic stack realignment, if frame pointer + /// elimination is disabled, or if the frame address is taken. + bool hasFPImpl(const MachineFunction &MF) const override; }; } // namespace llvm diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp index d0dc6dd146ef..045dedfb3853 100644 --- a/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp +++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.cpp @@ -30,7 +30,7 @@ MSP430FrameLowering::MSP430FrameLowering(const MSP430Subtarget &STI) Align(2)), STI(STI), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {} -bool MSP430FrameLowering::hasFP(const MachineFunction &MF) const { +bool MSP430FrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || diff --git a/llvm/lib/Target/MSP430/MSP430FrameLowering.h b/llvm/lib/Target/MSP430/MSP430FrameLowering.h index 5227d3e731ed..daa4eec998ee 100644 --- a/llvm/lib/Target/MSP430/MSP430FrameLowering.h +++ b/llvm/lib/Target/MSP430/MSP430FrameLowering.h @@ -24,6 +24,7 @@ class MSP430RegisterInfo; class MSP430FrameLowering : public TargetFrameLowering { protected: + bool hasFPImpl(const MachineFunction &MF) const override; public: MSP430FrameLowering(const MSP430Subtarget &STI); @@ -51,7 +52,6 @@ public: MutableArrayRef CSI, const TargetRegisterInfo *TRI) const override; - bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS = nullptr) const override; diff --git a/llvm/lib/Target/Mips/MipsFrameLowering.cpp b/llvm/lib/Target/Mips/MipsFrameLowering.cpp index 99d225f9abfe..9b3edcd61ae1 100644 --- a/llvm/lib/Target/Mips/MipsFrameLowering.cpp +++ b/llvm/lib/Target/Mips/MipsFrameLowering.cpp @@ -86,11 +86,11 @@ const MipsFrameLowering *MipsFrameLowering::create(const MipsSubtarget &ST) { return llvm::createMipsSEFrameLowering(ST); } -// hasFP - Return true if the specified function should have a dedicated frame -// pointer register. This is true if the function has variable sized allocas, -// if it needs dynamic stack realignment, if frame pointer elimination is -// disabled, or if the frame address is taken. -bool MipsFrameLowering::hasFP(const MachineFunction &MF) const { +// hasFPImpl - Return true if the specified function should have a dedicated +// frame pointer register. 
This is true if the function has variable sized +// allocas, if it needs dynamic stack realignment, if frame pointer elimination +// is disabled, or if the frame address is taken. +bool MipsFrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); const TargetRegisterInfo *TRI = STI.getRegisterInfo(); diff --git a/llvm/lib/Target/Mips/MipsFrameLowering.h b/llvm/lib/Target/Mips/MipsFrameLowering.h index 710a3d40c38e..25adc33fbf5c 100644 --- a/llvm/lib/Target/Mips/MipsFrameLowering.h +++ b/llvm/lib/Target/Mips/MipsFrameLowering.h @@ -23,6 +23,8 @@ class MipsFrameLowering : public TargetFrameLowering { protected: const MipsSubtarget &STI; + bool hasFPImpl(const MachineFunction &MF) const override; + public: explicit MipsFrameLowering(const MipsSubtarget &sti, Align Alignment) : TargetFrameLowering(StackGrowsDown, Alignment, 0, Alignment), STI(sti) { @@ -30,8 +32,6 @@ public: static const MipsFrameLowering *create(const MipsSubtarget &ST); - bool hasFP(const MachineFunction &MF) const override; - bool hasBP(const MachineFunction &MF) const; bool allocateScavengingFrameIndexesNearIncomingSP( diff --git a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp index 9abe0e3186f2..a5f6cab421fb 100644 --- a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp +++ b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.cpp @@ -27,7 +27,9 @@ using namespace llvm; NVPTXFrameLowering::NVPTXFrameLowering() : TargetFrameLowering(TargetFrameLowering::StackGrowsUp, Align(8), 0) {} -bool NVPTXFrameLowering::hasFP(const MachineFunction &MF) const { return true; } +bool NVPTXFrameLowering::hasFPImpl(const MachineFunction &MF) const { + return true; +} void NVPTXFrameLowering::emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const { diff --git a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h index a5d49ac3ab29..f8d1f978327b 100644 --- a/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h +++ b/llvm/lib/Target/NVPTX/NVPTXFrameLowering.h @@ -22,7 +22,6 @@ class NVPTXFrameLowering : public TargetFrameLowering { public: explicit NVPTXFrameLowering(); - bool hasFP(const MachineFunction &MF) const override; void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, @@ -32,6 +31,9 @@ public: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } // End llvm namespace diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp index f7188b856461..1083febc5f85 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.cpp @@ -355,9 +355,9 @@ PPCFrameLowering::determineFrameLayout(const MachineFunction &MF, return FrameSize; } -// hasFP - Return true if the specified function actually has a dedicated frame -// pointer register. -bool PPCFrameLowering::hasFP(const MachineFunction &MF) const { +// hasFPImpl - Return true if the specified function actually has a dedicated +// frame pointer register. 
+bool PPCFrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); // FIXME: This is pretty much broken by design: hasFP() might be called really // early, before the stack layout was calculated and thus hasFP() might return diff --git a/llvm/lib/Target/PowerPC/PPCFrameLowering.h b/llvm/lib/Target/PowerPC/PPCFrameLowering.h index d74c87428326..47f249862946 100644 --- a/llvm/lib/Target/PowerPC/PPCFrameLowering.h +++ b/llvm/lib/Target/PowerPC/PPCFrameLowering.h @@ -107,7 +107,6 @@ public: void inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologMBB) const override; - bool hasFP(const MachineFunction &MF) const override; bool needsFP(const MachineFunction &MF) const; void replaceFPWithRealFP(MachineFunction &MF) const; @@ -176,6 +175,9 @@ public: void updateCalleeSaves(const MachineFunction &MF, BitVector &SavedRegs) const; uint64_t getStackThreshold() const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } // End llvm namespace diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp index f388376c12c9..b49cbab1876d 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.cpp @@ -309,7 +309,7 @@ static Register getMaxPushPopReg(const MachineFunction &MF, // pointer register. This is true if frame pointer elimination is // disabled, if it needs dynamic stack realignment, if the function has // variable sized allocas, or if the frame address is taken. -bool RISCVFrameLowering::hasFP(const MachineFunction &MF) const { +bool RISCVFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/RISCV/RISCVFrameLowering.h b/llvm/lib/Target/RISCV/RISCVFrameLowering.h index d660f3ad67c9..f45fcdb0acd6 100644 --- a/llvm/lib/Target/RISCV/RISCVFrameLowering.h +++ b/llvm/lib/Target/RISCV/RISCVFrameLowering.h @@ -37,8 +37,6 @@ public: void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; - bool hasFP(const MachineFunction &MF) const override; - bool hasBP(const MachineFunction &MF) const; bool hasReservedCallFrame(const MachineFunction &MF) const override; @@ -83,6 +81,8 @@ public: protected: const RISCVSubtarget &STI; + bool hasFPImpl(const MachineFunction &MF) const override; + private: void determineFrameLayout(MachineFunction &MF) const; void adjustStackForRVV(MachineFunction &MF, MachineBasicBlock &MBB, diff --git a/llvm/lib/Target/SPIRV/SPIRVFrameLowering.h b/llvm/lib/Target/SPIRV/SPIRVFrameLowering.h index b98f8d0928e5..c7522554166a 100644 --- a/llvm/lib/Target/SPIRV/SPIRVFrameLowering.h +++ b/llvm/lib/Target/SPIRV/SPIRVFrameLowering.h @@ -33,7 +33,8 @@ public: void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override {} - bool hasFP(const MachineFunction &MF) const override { return false; } +protected: + bool hasFPImpl(const MachineFunction &MF) const override { return false; } }; } // namespace llvm #endif // LLVM_LIB_TARGET_SPIRV_SPIRVFRAMELOWERING_H diff --git a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp index 000418be9a9e..fa38c6cbb6eb 100644 --- a/llvm/lib/Target/Sparc/SparcFrameLowering.cpp +++ b/llvm/lib/Target/Sparc/SparcFrameLowering.cpp @@ -249,10 +249,10 @@ bool SparcFrameLowering::hasReservedCallFrame(const MachineFunction &MF) const { 
return !MF.getFrameInfo().hasVarSizedObjects(); } -// hasFP - Return true if the specified function should have a dedicated frame -// pointer register. This is true if the function has variable sized allocas or -// if frame pointer elimination is disabled. -bool SparcFrameLowering::hasFP(const MachineFunction &MF) const { +// hasFPImpl - Return true if the specified function should have a dedicated +// frame pointer register. This is true if the function has variable sized +// allocas or if frame pointer elimination is disabled. +bool SparcFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/Sparc/SparcFrameLowering.h b/llvm/lib/Target/Sparc/SparcFrameLowering.h index ab0ceb6591c6..803856811969 100644 --- a/llvm/lib/Target/Sparc/SparcFrameLowering.h +++ b/llvm/lib/Target/Sparc/SparcFrameLowering.h @@ -35,7 +35,6 @@ public: MachineBasicBlock::iterator I) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; - bool hasFP(const MachineFunction &MF) const override; void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const override; @@ -47,6 +46,9 @@ public: /// time). bool targetHandlesStackFrameRounding() const override { return true; } +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: // Remap input registers to output registers for leaf procedure. void remapRegsForLeafProc(MachineFunction &MF) const; diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp index 8c53b8dffc2f..8fbd05eab5f6 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.cpp @@ -832,7 +832,7 @@ void SystemZELFFrameLowering::inlineStackProbe( } } -bool SystemZELFFrameLowering::hasFP(const MachineFunction &MF) const { +bool SystemZELFFrameLowering::hasFPImpl(const MachineFunction &MF) const { return (MF.getTarget().Options.DisableFramePointerElim(MF) || MF.getFrameInfo().hasVarSizedObjects()); } @@ -1449,7 +1449,12 @@ void SystemZXPLINKFrameLowering::inlineStackProbe( fullyRecomputeLiveIns({StackExtMBB, NextMBB}); } -bool SystemZXPLINKFrameLowering::hasFP(const MachineFunction &MF) const { +bool SystemZXPLINKFrameLowering::hasFPImpl(const MachineFunction &MF) const { + // Naked functions have no stack frame pushed, so we don't have a frame + // pointer. + if (MF.getFunction().hasFnAttribute(Attribute::Naked)) + return false; + return (MF.getFrameInfo().hasVarSizedObjects()); } diff --git a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h index c4367b491f99..57fc73b78bbf 100644 --- a/llvm/lib/Target/SystemZ/SystemZFrameLowering.h +++ b/llvm/lib/Target/SystemZ/SystemZFrameLowering.h @@ -86,7 +86,6 @@ public: void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologMBB) const override; - bool hasFP(const MachineFunction &MF) const override; StackOffset getFrameIndexReference(const MachineFunction &MF, int FI, Register &FrameReg) const override; void @@ -113,6 +112,9 @@ public: // Get or create the frame index of where the old frame pointer is stored. 
int getOrCreateFramePointerSaveIndex(MachineFunction &MF) const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; class SystemZXPLINKFrameLowering : public SystemZFrameLowering { @@ -147,8 +149,6 @@ public: void inlineStackProbe(MachineFunction &MF, MachineBasicBlock &PrologMBB) const override; - bool hasFP(const MachineFunction &MF) const override; - void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; @@ -167,6 +167,9 @@ public: // Get or create the frame index of where the old frame pointer is stored. int getOrCreateFramePointerSaveIndex(MachineFunction &MF) const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } // end namespace llvm diff --git a/llvm/lib/Target/VE/VEFrameLowering.cpp b/llvm/lib/Target/VE/VEFrameLowering.cpp index 195bd4e6c3ae..10e94c28072f 100644 --- a/llvm/lib/Target/VE/VEFrameLowering.cpp +++ b/llvm/lib/Target/VE/VEFrameLowering.cpp @@ -415,10 +415,10 @@ void VEFrameLowering::emitEpilogue(MachineFunction &MF, emitEpilogueInsns(MF, MBB, MBBI, NumBytes, true); } -// hasFP - Return true if the specified function should have a dedicated frame -// pointer register. This is true if the function has variable sized allocas -// or if frame pointer elimination is disabled. -bool VEFrameLowering::hasFP(const MachineFunction &MF) const { +// hasFPImpl - Return true if the specified function should have a dedicated +// frame pointer register. This is true if the function has variable sized +// allocas or if frame pointer elimination is disabled. +bool VEFrameLowering::hasFPImpl(const MachineFunction &MF) const { const TargetRegisterInfo *RegInfo = MF.getSubtarget().getRegisterInfo(); const MachineFrameInfo &MFI = MF.getFrameInfo(); diff --git a/llvm/lib/Target/VE/VEFrameLowering.h b/llvm/lib/Target/VE/VEFrameLowering.h index 36fc8b201b64..be9cdc01d6f4 100644 --- a/llvm/lib/Target/VE/VEFrameLowering.h +++ b/llvm/lib/Target/VE/VEFrameLowering.h @@ -39,7 +39,6 @@ public: eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; - bool hasFP(const MachineFunction &MF) const override; bool hasBP(const MachineFunction &MF) const; bool hasGOT(const MachineFunction &MF) const; @@ -69,6 +68,8 @@ public: protected: const VESubtarget &STI; + bool hasFPImpl(const MachineFunction &MF) const override; + private: // Returns true if MF is a leaf procedure. bool isLeafProc(MachineFunction &MF) const; diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp index 8f3ad167ae41..f0334ccb3afc 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.cpp @@ -98,7 +98,7 @@ bool WebAssemblyFrameLowering::hasBP(const MachineFunction &MF) const { /// Return true if the specified function should have a dedicated frame pointer /// register. 
-bool WebAssemblyFrameLowering::hasFP(const MachineFunction &MF) const { +bool WebAssemblyFrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); // When we have var-sized objects, we move the stack pointer by an unknown diff --git a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h index 528b33e34bee..710d5173d64d 100644 --- a/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h +++ b/llvm/lib/Target/WebAssembly/WebAssemblyFrameLowering.h @@ -41,7 +41,6 @@ public: void emitPrologue(MachineFunction &MF, MachineBasicBlock &MBB) const override; void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override; - bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; bool isSupportedStackID(TargetStackID::Value ID) const override; DwarfFrameBase getDwarfFrameBase(const MachineFunction &MF) const override; @@ -68,6 +67,9 @@ public: static unsigned getOpcGlobGet(const MachineFunction &MF); static unsigned getOpcGlobSet(const MachineFunction &MF); +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: bool hasBP(const MachineFunction &MF) const; bool needsSPForLocalFrame(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/X86/X86FrameLowering.cpp b/llvm/lib/Target/X86/X86FrameLowering.cpp index 4f83267c999e..a35b04606e59 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.cpp +++ b/llvm/lib/Target/X86/X86FrameLowering.cpp @@ -91,10 +91,10 @@ bool X86FrameLowering::needsFrameIndexResolution( MF.getInfo()->getHasPushSequences(); } -/// hasFP - Return true if the specified function should have a dedicated frame -/// pointer register. This is true if the function has variable sized allocas -/// or if frame pointer elimination is disabled. -bool X86FrameLowering::hasFP(const MachineFunction &MF) const { +/// hasFPImpl - Return true if the specified function should have a dedicated +/// frame pointer register. This is true if the function has variable sized +/// allocas or if frame pointer elimination is disabled. +bool X86FrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); return (MF.getTarget().Options.DisableFramePointerElim(MF) || TRI->hasStackRealignment(MF) || MFI.hasVarSizedObjects() || diff --git a/llvm/lib/Target/X86/X86FrameLowering.h b/llvm/lib/Target/X86/X86FrameLowering.h index 78217911daca..02fe8ee02a7e 100644 --- a/llvm/lib/Target/X86/X86FrameLowering.h +++ b/llvm/lib/Target/X86/X86FrameLowering.h @@ -105,7 +105,6 @@ public: void spillFPBP(MachineFunction &MF) const override; - bool hasFP(const MachineFunction &MF) const override; bool hasReservedCallFrame(const MachineFunction &MF) const override; bool canSimplifyCallFramePseudos(const MachineFunction &MF) const override; bool needsFrameIndexResolution(const MachineFunction &MF) const override; @@ -201,6 +200,9 @@ public: /// frame of the top of stack function) as part of it's ABI. 
bool has128ByteRedZone(const MachineFunction& MF) const; +protected: + bool hasFPImpl(const MachineFunction &MF) const override; + private: bool isWin64Prologue(const MachineFunction &MF) const; diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp index b3753692ac2a..ec18eca82b52 100644 --- a/llvm/lib/Target/XCore/XCoreFrameLowering.cpp +++ b/llvm/lib/Target/XCore/XCoreFrameLowering.cpp @@ -215,7 +215,7 @@ XCoreFrameLowering::XCoreFrameLowering(const XCoreSubtarget &sti) // Do nothing } -bool XCoreFrameLowering::hasFP(const MachineFunction &MF) const { +bool XCoreFrameLowering::hasFPImpl(const MachineFunction &MF) const { return MF.getTarget().Options.DisableFramePointerElim(MF) || MF.getFrameInfo().hasVarSizedObjects(); } diff --git a/llvm/lib/Target/XCore/XCoreFrameLowering.h b/llvm/lib/Target/XCore/XCoreFrameLowering.h index a914d82e1989..b06a6f922cdd 100644 --- a/llvm/lib/Target/XCore/XCoreFrameLowering.h +++ b/llvm/lib/Target/XCore/XCoreFrameLowering.h @@ -46,8 +46,6 @@ namespace llvm { eliminateCallFramePseudoInstr(MachineFunction &MF, MachineBasicBlock &MBB, MachineBasicBlock::iterator I) const override; - bool hasFP(const MachineFunction &MF) const override; - void determineCalleeSaves(MachineFunction &MF, BitVector &SavedRegs, RegScavenger *RS = nullptr) const override; @@ -58,6 +56,9 @@ namespace llvm { static int stackSlotSize() { return 4; } + + protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp index e24cb7714d36..f46d386c9186 100644 --- a/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp +++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.cpp @@ -27,7 +27,7 @@ XtensaFrameLowering::XtensaFrameLowering(const XtensaSubtarget &STI) Align(4)), TII(*STI.getInstrInfo()), TRI(STI.getRegisterInfo()) {} -bool XtensaFrameLowering::hasFP(const MachineFunction &MF) const { +bool XtensaFrameLowering::hasFPImpl(const MachineFunction &MF) const { const MachineFrameInfo &MFI = MF.getFrameInfo(); return MF.getTarget().Options.DisableFramePointerElim(MF) || MFI.hasVarSizedObjects(); diff --git a/llvm/lib/Target/Xtensa/XtensaFrameLowering.h b/llvm/lib/Target/Xtensa/XtensaFrameLowering.h index 9120215af08b..3f946e1ea730 100644 --- a/llvm/lib/Target/Xtensa/XtensaFrameLowering.h +++ b/llvm/lib/Target/Xtensa/XtensaFrameLowering.h @@ -24,8 +24,6 @@ class XtensaFrameLowering : public TargetFrameLowering { public: XtensaFrameLowering(const XtensaSubtarget &STI); - bool hasFP(const MachineFunction &MF) const override; - /// emitProlog/emitEpilog - These methods insert prolog and epilog code into /// the function. 
void emitPrologue(MachineFunction &, MachineBasicBlock &) const override; @@ -50,6 +48,9 @@ public: void processFunctionBeforeFrameFinalized(MachineFunction &MF, RegScavenger *RS) const override; + +protected: + bool hasFPImpl(const MachineFunction &MF) const override; }; } // namespace llvm diff --git a/llvm/test/CodeGen/AArch64/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/AArch64/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..fb559867a2d4 --- /dev/null +++ b/llvm/test/CodeGen/AArch64/naked-fn-with-frame-pointer.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple aarch64 | FileCheck %s -check-prefixes=CHECK-LE +; RUN: llc < %s -mtriple aarch64_be | FileCheck %s -check-prefixes=CHECK-BE + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LE-LABEL: naked: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: bl main +; +; CHECK-BE-LABEL: naked: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: bl main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LE-LABEL: normal: +; CHECK-LE: // %bb.0: +; CHECK-LE-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-LE-NEXT: mov x29, sp +; CHECK-LE-NEXT: .cfi_def_cfa w29, 16 +; CHECK-LE-NEXT: .cfi_offset w30, -8 +; CHECK-LE-NEXT: .cfi_offset w29, -16 +; CHECK-LE-NEXT: bl main +; +; CHECK-BE-LABEL: normal: +; CHECK-BE: // %bb.0: +; CHECK-BE-NEXT: stp x29, x30, [sp, #-16]! // 16-byte Folded Spill +; CHECK-BE-NEXT: mov x29, sp +; CHECK-BE-NEXT: .cfi_def_cfa w29, 16 +; CHECK-BE-NEXT: .cfi_offset w30, -8 +; CHECK-BE-NEXT: .cfi_offset w29, -16 +; CHECK-BE-NEXT: bl main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..5ff2d82c1464 --- /dev/null +++ b/llvm/test/CodeGen/AMDGPU/naked-fn-with-frame-pointer.ll @@ -0,0 +1,42 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple amdgcn | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: naked$local: +; CHECK-NEXT: .type naked$local,@function +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, main@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, main@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 s[30:31], s[16:17] + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: normal$local: +; CHECK-NEXT: .type normal$local,@function +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: s_waitcnt vmcnt(0) expcnt(0) lgkmcnt(0) +; CHECK-NEXT: s_mov_b32 s16, s33 +; CHECK-NEXT: s_mov_b32 s33, s32 +; CHECK-NEXT: s_or_saveexec_b64 s[18:19], -1 +; CHECK-NEXT: buffer_store_dword v40, off, s[0:3], s33 ; 4-byte Folded Spill +; CHECK-NEXT: s_mov_b64 exec, s[18:19] +; CHECK-NEXT: s_waitcnt expcnt(0) +; CHECK-NEXT: v_writelane_b32 v40, s16, 2 +; CHECK-NEXT: s_addk_i32 s32, 0x400 +; CHECK-NEXT: v_writelane_b32 v40, s30, 0 +; CHECK-NEXT: v_writelane_b32 v40, s31, 1 +; CHECK-NEXT: s_getpc_b64 s[16:17] +; CHECK-NEXT: s_add_u32 s16, s16, main@rel32@lo+4 +; CHECK-NEXT: s_addc_u32 s17, s17, main@rel32@hi+12 +; CHECK-NEXT: s_swappc_b64 
s[30:31], s[16:17] + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/ARM/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/ARM/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..2bdc7d3e29b9 --- /dev/null +++ b/llvm/test/CodeGen/ARM/naked-fn-with-frame-pointer.ll @@ -0,0 +1,55 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple arm | FileCheck %s -check-prefixes=CHECK-ALE +; RUN: llc < %s -mtriple armeb | FileCheck %s -check-prefixes=CHECK-ABE +; RUN: llc < %s -mtriple thumb | FileCheck %s -check-prefixes=CHECK-TLE +; RUN: llc < %s -mtriple thumbeb | FileCheck %s -check-prefixes=CHECK-TBE + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-ALE-LABEL: naked: +; CHECK-ALE: @ %bb.0: +; CHECK-ALE-NEXT: bl main +; +; CHECK-ABE-LABEL: naked: +; CHECK-ABE: @ %bb.0: +; CHECK-ABE-NEXT: bl main +; +; CHECK-TLE-LABEL: naked: +; CHECK-TLE: @ %bb.0: +; CHECK-TLE-NEXT: bl main +; +; CHECK-TBE-LABEL: naked: +; CHECK-TBE: @ %bb.0: +; CHECK-TBE-NEXT: bl main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-ALE-LABEL: normal: +; CHECK-ALE: @ %bb.0: +; CHECK-ALE-NEXT: push {r11, lr} +; CHECK-ALE-NEXT: mov r11, sp +; CHECK-ALE-NEXT: bl main +; +; CHECK-ABE-LABEL: normal: +; CHECK-ABE: @ %bb.0: +; CHECK-ABE-NEXT: push {r11, lr} +; CHECK-ABE-NEXT: mov r11, sp +; CHECK-ABE-NEXT: bl main +; +; CHECK-TLE-LABEL: normal: +; CHECK-TLE: @ %bb.0: +; CHECK-TLE-NEXT: push {r7, lr} +; CHECK-TLE-NEXT: add r7, sp, #0 +; CHECK-TLE-NEXT: bl main +; +; CHECK-TBE-LABEL: normal: +; CHECK-TBE: @ %bb.0: +; CHECK-TBE-NEXT: push {r7, lr} +; CHECK-TBE-NEXT: add r7, sp, #0 +; CHECK-TBE-NEXT: bl main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/AVR/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/AVR/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..18ea60906bd0 --- /dev/null +++ b/llvm/test/CodeGen/AVR/naked-fn-with-frame-pointer.ll @@ -0,0 +1,20 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple avr | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: ; %bb.0: +; CHECK-NEXT: rcall main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: ; %bb.0: +; CHECK-NEXT: rcall main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/BPF/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/BPF/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..4e4436296f3b --- /dev/null +++ b/llvm/test/CodeGen/BPF/naked-fn-with-frame-pointer.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple bpfel | FileCheck %s -check-prefixes=CHECK-LE +; RUN: llc < %s -mtriple bpfeb | FileCheck %s -check-prefixes=CHECK-BE + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LE-LABEL: naked: +; CHECK-LE: .Lnaked$local: +; CHECK-LE-NEXT: .type .Lnaked$local,@function +; CHECK-LE-NEXT: .cfi_startproc +; CHECK-LE-NEXT: # %bb.0: +; CHECK-LE-NEXT: call main +; +; CHECK-BE-LABEL: naked: +; CHECK-BE: .Lnaked$local: +; CHECK-BE-NEXT: .type .Lnaked$local,@function +; 
CHECK-BE-NEXT: .cfi_startproc +; CHECK-BE-NEXT: # %bb.0: +; CHECK-BE-NEXT: call main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LE-LABEL: normal: +; CHECK-LE: .Lnormal$local: +; CHECK-LE-NEXT: .type .Lnormal$local,@function +; CHECK-LE-NEXT: .cfi_startproc +; CHECK-LE-NEXT: # %bb.0: +; CHECK-LE-NEXT: call main +; +; CHECK-BE-LABEL: normal: +; CHECK-BE: .Lnormal$local: +; CHECK-BE-NEXT: .type .Lnormal$local,@function +; CHECK-BE-NEXT: .cfi_startproc +; CHECK-BE-NEXT: # %bb.0: +; CHECK-BE-NEXT: call main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/CSKY/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/CSKY/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..e897127eb31c --- /dev/null +++ b/llvm/test/CodeGen/CSKY/naked-fn-with-frame-pointer.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple csky | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: # %bb.0: +; CHECK-NEXT: lrw a0, [.LCPI0_0] +; CHECK-NEXT: jsr16 a0 +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: .p2align 2, 0x0 +; CHECK-NEXT: .LCPI0_0: +; CHECK-NEXT: .long main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: # %bb.0: +; CHECK-NEXT: subi16 sp, sp, 8 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: st32.w lr, (sp, 4) # 4-byte Folded Spill +; CHECK-NEXT: st32.w l4, (sp, 0) # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset lr, -4 +; CHECK-NEXT: .cfi_offset l4, -8 +; CHECK-NEXT: mov16 l4, sp +; CHECK-NEXT: .cfi_def_cfa_register l4 +; CHECK-NEXT: subi16 sp, sp, 4 +; CHECK-NEXT: lrw a0, [.LCPI1_0] +; CHECK-NEXT: jsr16 a0 +; CHECK-NEXT: .p2align 1 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: .p2align 2, 0x0 +; CHECK-NEXT: .LCPI1_0: +; CHECK-NEXT: .long main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/Hexagon/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/Hexagon/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..c53f2d4df9b6 --- /dev/null +++ b/llvm/test/CodeGen/Hexagon/naked-fn-with-frame-pointer.ll @@ -0,0 +1,30 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple hexagon | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: { +; CHECK-NEXT: call main +; CHECK-NEXT: } + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: .cfi_startproc +; CHECK-NEXT: // %bb.0: +; CHECK-NEXT: .cfi_def_cfa r30, 8 +; CHECK-NEXT: .cfi_offset r31, -4 +; CHECK-NEXT: .cfi_offset r30, -8 +; CHECK-NEXT: { +; CHECK-NEXT: call main +; CHECK-NEXT: allocframe(r29,#0):raw +; CHECK-NEXT: } + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/Lanai/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/Lanai/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..4e148764e478 --- /dev/null +++ b/llvm/test/CodeGen/Lanai/naked-fn-with-frame-pointer.ll @@ -0,0 +1,35 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; 
RUN: llc < %s -mtriple lanai | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: .Lnaked$local: +; CHECK-NEXT: .type .Lnaked$local,@function +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: ! %bb.0: +; CHECK-NEXT: add %pc, 0x10, %rca +; CHECK-NEXT: st %rca, [--%sp] +; CHECK-NEXT: bt main +; CHECK-NEXT: nop + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: .Lnormal$local: +; CHECK-NEXT: .type .Lnormal$local,@function +; CHECK-NEXT: .cfi_startproc +; CHECK-NEXT: ! %bb.0: +; CHECK-NEXT: st %fp, [--%sp] +; CHECK-NEXT: add %sp, 0x8, %fp +; CHECK-NEXT: sub %sp, 0x8, %sp +; CHECK-NEXT: add %pc, 0x10, %rca +; CHECK-NEXT: st %rca, [--%sp] +; CHECK-NEXT: bt main +; CHECK-NEXT: nop + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/LoongArch/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/LoongArch/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..9bb449101683 --- /dev/null +++ b/llvm/test/CodeGen/LoongArch/naked-fn-with-frame-pointer.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple loongarch32 -mattr +d | FileCheck %s -check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple loongarch64 -mattr +d | FileCheck %s -check-prefixes=CHECK-64 + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-LABEL: naked: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: bl main +; +; CHECK-64-LABEL: naked: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: bl main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-LABEL: normal: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: addi.w $sp, $sp, -16 +; CHECK-32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-32-NEXT: st.w $ra, $sp, 12 # 4-byte Folded Spill +; CHECK-32-NEXT: st.w $fp, $sp, 8 # 4-byte Folded Spill +; CHECK-32-NEXT: .cfi_offset 1, -4 +; CHECK-32-NEXT: .cfi_offset 22, -8 +; CHECK-32-NEXT: addi.w $fp, $sp, 16 +; CHECK-32-NEXT: .cfi_def_cfa 22, 0 +; CHECK-32-NEXT: bl main +; +; CHECK-64-LABEL: normal: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: addi.d $sp, $sp, -16 +; CHECK-64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-64-NEXT: st.d $ra, $sp, 8 # 8-byte Folded Spill +; CHECK-64-NEXT: st.d $fp, $sp, 0 # 8-byte Folded Spill +; CHECK-64-NEXT: .cfi_offset 1, -8 +; CHECK-64-NEXT: .cfi_offset 22, -16 +; CHECK-64-NEXT: addi.d $fp, $sp, 16 +; CHECK-64-NEXT: .cfi_def_cfa 22, 0 +; CHECK-64-NEXT: bl main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/M68k/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/M68k/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..807c52c39b6e --- /dev/null +++ b/llvm/test/CodeGen/M68k/naked-fn-with-frame-pointer.ll @@ -0,0 +1,26 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple m68k | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: jsr main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: link.w %a6, #0 +; CHECK-NEXT: .cfi_def_cfa_offset 
-8 +; CHECK-NEXT: .cfi_offset %a6, -8 +; CHECK-NEXT: .cfi_def_cfa_register %a6 +; CHECK-NEXT: jsr main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/MSP430/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/MSP430/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..2fdb01005bb2 --- /dev/null +++ b/llvm/test/CodeGen/MSP430/naked-fn-with-frame-pointer.ll @@ -0,0 +1,27 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple msp430 | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: call #main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: .cfi_startproc +; CHECK-NEXT: ; %bb.0: +; CHECK-NEXT: push r4 +; CHECK-NEXT: .cfi_def_cfa_offset 4 +; CHECK-NEXT: .cfi_offset r4, -4 +; CHECK-NEXT: mov r1, r4 +; CHECK-NEXT: .cfi_def_cfa_register r4 +; CHECK-NEXT: call #main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/Mips/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/Mips/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..a3820da8b221 --- /dev/null +++ b/llvm/test/CodeGen/Mips/naked-fn-with-frame-pointer.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple mips | FileCheck %s -check-prefixes=CHECK-32-BE +; RUN: llc < %s -mtriple mipsel | FileCheck %s -check-prefixes=CHECK-32-LE +; RUN: llc < %s -mtriple mips64 | FileCheck %s -check-prefixes=CHECK-64-BE +; RUN: llc < %s -mtriple mips64el | FileCheck %s -check-prefixes=CHECK-64-LE + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-BE-LABEL: naked: +; CHECK-32-BE: # %bb.0: +; CHECK-32-BE-NEXT: jal main +; CHECK-32-BE-NEXT: nop +; +; CHECK-32-LE-LABEL: naked: +; CHECK-32-LE: # %bb.0: +; CHECK-32-LE-NEXT: jal main +; CHECK-32-LE-NEXT: nop +; +; CHECK-64-BE-LABEL: naked: +; CHECK-64-BE: # %bb.0: +; CHECK-64-BE-NEXT: jal main +; CHECK-64-BE-NEXT: nop +; +; CHECK-64-LE-LABEL: naked: +; CHECK-64-LE: # %bb.0: +; CHECK-64-LE-NEXT: jal main +; CHECK-64-LE-NEXT: nop + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-BE-LABEL: normal: +; CHECK-32-BE: # %bb.0: +; CHECK-32-BE-NEXT: addiu $sp, $sp, -24 +; CHECK-32-BE-NEXT: .cfi_def_cfa_offset 24 +; CHECK-32-BE-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; CHECK-32-BE-NEXT: sw $fp, 16($sp) # 4-byte Folded Spill +; CHECK-32-BE-NEXT: .cfi_offset 31, -4 +; CHECK-32-BE-NEXT: .cfi_offset 30, -8 +; CHECK-32-BE-NEXT: move $fp, $sp +; CHECK-32-BE-NEXT: .cfi_def_cfa_register 30 +; CHECK-32-BE-NEXT: jal main +; CHECK-32-BE-NEXT: nop +; +; CHECK-32-LE-LABEL: normal: +; CHECK-32-LE: # %bb.0: +; CHECK-32-LE-NEXT: addiu $sp, $sp, -24 +; CHECK-32-LE-NEXT: .cfi_def_cfa_offset 24 +; CHECK-32-LE-NEXT: sw $ra, 20($sp) # 4-byte Folded Spill +; CHECK-32-LE-NEXT: sw $fp, 16($sp) # 4-byte Folded Spill +; CHECK-32-LE-NEXT: .cfi_offset 31, -4 +; CHECK-32-LE-NEXT: .cfi_offset 30, -8 +; CHECK-32-LE-NEXT: move $fp, $sp +; CHECK-32-LE-NEXT: .cfi_def_cfa_register 30 +; CHECK-32-LE-NEXT: jal main +; CHECK-32-LE-NEXT: nop +; +; CHECK-64-BE-LABEL: normal: +; CHECK-64-BE: # %bb.0: +; CHECK-64-BE-NEXT: daddiu $sp, $sp, -16 +; 
CHECK-64-BE-NEXT: .cfi_def_cfa_offset 16 +; CHECK-64-BE-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; CHECK-64-BE-NEXT: sd $fp, 0($sp) # 8-byte Folded Spill +; CHECK-64-BE-NEXT: .cfi_offset 31, -8 +; CHECK-64-BE-NEXT: .cfi_offset 30, -16 +; CHECK-64-BE-NEXT: move $fp, $sp +; CHECK-64-BE-NEXT: .cfi_def_cfa_register 30 +; CHECK-64-BE-NEXT: jal main +; CHECK-64-BE-NEXT: nop +; +; CHECK-64-LE-LABEL: normal: +; CHECK-64-LE: # %bb.0: +; CHECK-64-LE-NEXT: daddiu $sp, $sp, -16 +; CHECK-64-LE-NEXT: .cfi_def_cfa_offset 16 +; CHECK-64-LE-NEXT: sd $ra, 8($sp) # 8-byte Folded Spill +; CHECK-64-LE-NEXT: sd $fp, 0($sp) # 8-byte Folded Spill +; CHECK-64-LE-NEXT: .cfi_offset 31, -8 +; CHECK-64-LE-NEXT: .cfi_offset 30, -16 +; CHECK-64-LE-NEXT: move $fp, $sp +; CHECK-64-LE-NEXT: .cfi_def_cfa_register 30 +; CHECK-64-LE-NEXT: jal main +; CHECK-64-LE-NEXT: nop + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/NVPTX/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/NVPTX/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..a1f0577c2218 --- /dev/null +++ b/llvm/test/CodeGen/NVPTX/naked-fn-with-frame-pointer.ll @@ -0,0 +1,73 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple nvptx | FileCheck %s -check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple nvptx64 | FileCheck %s -check-prefixes=CHECK-64 + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-LABEL: naked( +; CHECK-32: { +; CHECK-32-EMPTY: +; CHECK-32-EMPTY: +; CHECK-32-NEXT: // %bb.0: +; CHECK-32-NEXT: { // callseq 0, 0 +; CHECK-32-NEXT: call.uni +; CHECK-32-NEXT: main, +; CHECK-32-NEXT: ( +; CHECK-32-NEXT: ); +; CHECK-32-NEXT: } // callseq 0 +; CHECK-32-NEXT: // begin inline asm +; CHECK-32-NEXT: exit; +; CHECK-32-NEXT: // end inline asm +; +; CHECK-64-LABEL: naked( +; CHECK-64: { +; CHECK-64-EMPTY: +; CHECK-64-EMPTY: +; CHECK-64-NEXT: // %bb.0: +; CHECK-64-NEXT: { // callseq 0, 0 +; CHECK-64-NEXT: call.uni +; CHECK-64-NEXT: main, +; CHECK-64-NEXT: ( +; CHECK-64-NEXT: ); +; CHECK-64-NEXT: } // callseq 0 +; CHECK-64-NEXT: // begin inline asm +; CHECK-64-NEXT: exit; +; CHECK-64-NEXT: // end inline asm + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-LABEL: normal( +; CHECK-32: { +; CHECK-32-EMPTY: +; CHECK-32-EMPTY: +; CHECK-32-NEXT: // %bb.0: +; CHECK-32-NEXT: { // callseq 1, 0 +; CHECK-32-NEXT: call.uni +; CHECK-32-NEXT: main, +; CHECK-32-NEXT: ( +; CHECK-32-NEXT: ); +; CHECK-32-NEXT: } // callseq 1 +; CHECK-32-NEXT: // begin inline asm +; CHECK-32-NEXT: exit; +; CHECK-32-NEXT: // end inline asm +; +; CHECK-64-LABEL: normal( +; CHECK-64: { +; CHECK-64-EMPTY: +; CHECK-64-EMPTY: +; CHECK-64-NEXT: // %bb.0: +; CHECK-64-NEXT: { // callseq 1, 0 +; CHECK-64-NEXT: call.uni +; CHECK-64-NEXT: main, +; CHECK-64-NEXT: ( +; CHECK-64-NEXT: ); +; CHECK-64-NEXT: } // callseq 1 +; CHECK-64-NEXT: // begin inline asm +; CHECK-64-NEXT: exit; +; CHECK-64-NEXT: // end inline asm + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/PowerPC/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/PowerPC/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..59b1044084c6 --- /dev/null +++ b/llvm/test/CodeGen/PowerPC/naked-fn-with-frame-pointer.ll @@ -0,0 +1,87 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple powerpc | FileCheck %s 
-check-prefixes=CHECK-32-BE +; RUN: llc < %s -mtriple powerpcle | FileCheck %s -check-prefixes=CHECK-32-LE +; RUN: llc < %s -mtriple powerpc64 | FileCheck %s -check-prefixes=CHECK-64-BE +; RUN: llc < %s -mtriple powerpc64le | FileCheck %s -check-prefixes=CHECK-64-LE + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-BE-LABEL: naked: +; CHECK-32-BE: # %bb.0: +; CHECK-32-BE-NEXT: bl main +; +; CHECK-32-LE-LABEL: naked: +; CHECK-32-LE: # %bb.0: +; CHECK-32-LE-NEXT: bl main +; +; CHECK-64-BE-LABEL: naked: +; CHECK-64-BE: # %bb.0: +; CHECK-64-BE-NEXT: bl main +; CHECK-64-BE-NEXT: nop +; +; CHECK-64-LE-LABEL: naked: +; CHECK-64-LE: # %bb.0: +; CHECK-64-LE-NEXT: bl main +; CHECK-64-LE-NEXT: nop + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-BE-LABEL: normal: +; CHECK-32-BE: # %bb.0: +; CHECK-32-BE-NEXT: mflr 0 +; CHECK-32-BE-NEXT: stwu 1, -16(1) +; CHECK-32-BE-NEXT: stw 31, 12(1) +; CHECK-32-BE-NEXT: stw 0, 20(1) +; CHECK-32-BE-NEXT: .cfi_def_cfa_offset 16 +; CHECK-32-BE-NEXT: .cfi_offset r31, -4 +; CHECK-32-BE-NEXT: .cfi_offset lr, 4 +; CHECK-32-BE-NEXT: mr 31, 1 +; CHECK-32-BE-NEXT: .cfi_def_cfa_register r31 +; CHECK-32-BE-NEXT: bl main +; +; CHECK-32-LE-LABEL: normal: +; CHECK-32-LE: # %bb.0: +; CHECK-32-LE-NEXT: mflr 0 +; CHECK-32-LE-NEXT: stwu 1, -16(1) +; CHECK-32-LE-NEXT: stw 31, 12(1) +; CHECK-32-LE-NEXT: stw 0, 20(1) +; CHECK-32-LE-NEXT: .cfi_def_cfa_offset 16 +; CHECK-32-LE-NEXT: .cfi_offset r31, -4 +; CHECK-32-LE-NEXT: .cfi_offset lr, 4 +; CHECK-32-LE-NEXT: mr 31, 1 +; CHECK-32-LE-NEXT: .cfi_def_cfa_register r31 +; CHECK-32-LE-NEXT: bl main +; +; CHECK-64-BE-LABEL: normal: +; CHECK-64-BE: # %bb.0: +; CHECK-64-BE-NEXT: mflr 0 +; CHECK-64-BE-NEXT: std 31, -8(1) +; CHECK-64-BE-NEXT: stdu 1, -128(1) +; CHECK-64-BE-NEXT: std 0, 144(1) +; CHECK-64-BE-NEXT: .cfi_def_cfa_offset 128 +; CHECK-64-BE-NEXT: .cfi_offset r31, -8 +; CHECK-64-BE-NEXT: .cfi_offset lr, 16 +; CHECK-64-BE-NEXT: mr 31, 1 +; CHECK-64-BE-NEXT: .cfi_def_cfa_register r31 +; CHECK-64-BE-NEXT: bl main +; CHECK-64-BE-NEXT: nop +; +; CHECK-64-LE-LABEL: normal: +; CHECK-64-LE: # %bb.0: +; CHECK-64-LE-NEXT: mflr 0 +; CHECK-64-LE-NEXT: std 31, -8(1) +; CHECK-64-LE-NEXT: stdu 1, -48(1) +; CHECK-64-LE-NEXT: std 0, 64(1) +; CHECK-64-LE-NEXT: .cfi_def_cfa_offset 48 +; CHECK-64-LE-NEXT: .cfi_offset r31, -8 +; CHECK-64-LE-NEXT: .cfi_offset lr, 16 +; CHECK-64-LE-NEXT: mr 31, 1 +; CHECK-64-LE-NEXT: .cfi_def_cfa_register r31 +; CHECK-64-LE-NEXT: bl main +; CHECK-64-LE-NEXT: nop + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/RISCV/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/RISCV/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..de87b10d3873 --- /dev/null +++ b/llvm/test/CodeGen/RISCV/naked-fn-with-frame-pointer.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple riscv32 | FileCheck %s -check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple riscv64 | FileCheck %s -check-prefixes=CHECK-64 + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-LABEL: naked: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: call main +; +; CHECK-64-LABEL: naked: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: call main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-LABEL: normal: +; CHECK-32: # %bb.0: +; 
CHECK-32-NEXT: addi sp, sp, -16 +; CHECK-32-NEXT: .cfi_def_cfa_offset 16 +; CHECK-32-NEXT: sw ra, 12(sp) # 4-byte Folded Spill +; CHECK-32-NEXT: sw s0, 8(sp) # 4-byte Folded Spill +; CHECK-32-NEXT: .cfi_offset ra, -4 +; CHECK-32-NEXT: .cfi_offset s0, -8 +; CHECK-32-NEXT: addi s0, sp, 16 +; CHECK-32-NEXT: .cfi_def_cfa s0, 0 +; CHECK-32-NEXT: call main +; +; CHECK-64-LABEL: normal: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: addi sp, sp, -16 +; CHECK-64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-64-NEXT: sd ra, 8(sp) # 8-byte Folded Spill +; CHECK-64-NEXT: sd s0, 0(sp) # 8-byte Folded Spill +; CHECK-64-NEXT: .cfi_offset ra, -8 +; CHECK-64-NEXT: .cfi_offset s0, -16 +; CHECK-64-NEXT: addi s0, sp, 16 +; CHECK-64-NEXT: .cfi_def_cfa s0, 0 +; CHECK-64-NEXT: call main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/SPARC/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/SPARC/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..af97c573625b --- /dev/null +++ b/llvm/test/CodeGen/SPARC/naked-fn-with-frame-pointer.ll @@ -0,0 +1,45 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple sparc | FileCheck %s -check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple sparc64 | FileCheck %s -check-prefixes=CHECK-64 + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-LABEL: naked: +; CHECK-32: .cfi_startproc +; CHECK-32-NEXT: ! %bb.0: +; CHECK-32-NEXT: call main +; CHECK-32-NEXT: nop +; +; CHECK-64-LABEL: naked: +; CHECK-64: .cfi_startproc +; CHECK-64-NEXT: ! %bb.0: +; CHECK-64-NEXT: call main +; CHECK-64-NEXT: nop + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-LABEL: normal: +; CHECK-32: .cfi_startproc +; CHECK-32-NEXT: ! %bb.0: +; CHECK-32-NEXT: save %sp, -96, %sp +; CHECK-32-NEXT: .cfi_def_cfa_register %fp +; CHECK-32-NEXT: .cfi_window_save +; CHECK-32-NEXT: .cfi_register %o7, %i7 +; CHECK-32-NEXT: call main +; CHECK-32-NEXT: nop +; +; CHECK-64-LABEL: normal: +; CHECK-64: .cfi_startproc +; CHECK-64-NEXT: ! 
%bb.0: +; CHECK-64-NEXT: save %sp, -176, %sp +; CHECK-64-NEXT: .cfi_def_cfa_register %fp +; CHECK-64-NEXT: .cfi_window_save +; CHECK-64-NEXT: .cfi_register %o7, %i7 +; CHECK-64-NEXT: call main +; CHECK-64-NEXT: nop + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/SystemZ/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/SystemZ/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..3eb396e40442 --- /dev/null +++ b/llvm/test/CodeGen/SystemZ/naked-fn-with-frame-pointer.ll @@ -0,0 +1,28 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple s390x | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: # %bb.0: +; CHECK-NEXT: brasl %r14, main@PLT + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: # %bb.0: +; CHECK-NEXT: stmg %r11, %r15, 88(%r15) +; CHECK-NEXT: .cfi_offset %r11, -72 +; CHECK-NEXT: .cfi_offset %r14, -48 +; CHECK-NEXT: .cfi_offset %r15, -40 +; CHECK-NEXT: aghi %r15, -160 +; CHECK-NEXT: .cfi_def_cfa_offset 320 +; CHECK-NEXT: lgr %r11, %r15 +; CHECK-NEXT: .cfi_def_cfa_register %r11 +; CHECK-NEXT: brasl %r14, main@PLT + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/VE/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/VE/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..3b88bea46c4d --- /dev/null +++ b/llvm/test/CodeGen/VE/naked-fn-with-frame-pointer.ll @@ -0,0 +1,41 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple ve | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: # %bb.0: +; CHECK-NEXT: lea %s0, main@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, main@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: # %bb.0: +; CHECK-NEXT: st %s9, (, %s11) +; CHECK-NEXT: st %s10, 8(, %s11) +; CHECK-NEXT: or %s9, 0, %s11 +; CHECK-NEXT: lea %s11, -240(, %s11) +; CHECK-NEXT: brge.l.t %s11, %s8, .LBB1_2 +; CHECK-NEXT: # %bb.1: +; CHECK-NEXT: ld %s61, 24(, %s14) +; CHECK-NEXT: or %s62, 0, %s0 +; CHECK-NEXT: lea %s63, 315 +; CHECK-NEXT: shm.l %s63, (%s61) +; CHECK-NEXT: shm.l %s8, 8(%s61) +; CHECK-NEXT: shm.l %s11, 16(%s61) +; CHECK-NEXT: monc +; CHECK-NEXT: or %s0, 0, %s62 +; CHECK-NEXT: .LBB1_2: +; CHECK-NEXT: lea %s0, main@lo +; CHECK-NEXT: and %s0, %s0, (32)0 +; CHECK-NEXT: lea.sl %s12, main@hi(, %s0) +; CHECK-NEXT: bsic %s10, (, %s12) + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/WebAssembly/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/WebAssembly/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..fcd42e8cbfb9 --- /dev/null +++ b/llvm/test/CodeGen/WebAssembly/naked-fn-with-frame-pointer.ll @@ -0,0 +1,37 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple wasm32 | FileCheck %s -check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple wasm64 | FileCheck %s -check-prefixes=CHECK-64 + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-LABEL: naked: +; 
CHECK-32: .functype naked () -> () +; CHECK-32-NEXT: # %bb.0: +; CHECK-32-NEXT: call main +; CHECK-32-NEXT: unreachable +; +; CHECK-64-LABEL: naked: +; CHECK-64: .functype naked () -> () +; CHECK-64-NEXT: # %bb.0: +; CHECK-64-NEXT: call main +; CHECK-64-NEXT: unreachable + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-LABEL: normal: +; CHECK-32: .functype normal () -> () +; CHECK-32-NEXT: # %bb.0: +; CHECK-32-NEXT: call main +; CHECK-32-NEXT: unreachable +; +; CHECK-64-LABEL: normal: +; CHECK-64: .functype normal () -> () +; CHECK-64-NEXT: # %bb.0: +; CHECK-64-NEXT: call main +; CHECK-64-NEXT: unreachable + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/X86/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/X86/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..37756009fa7d --- /dev/null +++ b/llvm/test/CodeGen/X86/naked-fn-with-frame-pointer.ll @@ -0,0 +1,39 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -mtriple i386 | FileCheck %s -check-prefixes=CHECK-32 +; RUN: llc < %s -mtriple x86_64 | FileCheck %s -check-prefixes=CHECK-64 + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-32-LABEL: naked: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: calll main +; +; CHECK-64-LABEL: naked: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: callq main + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-32-LABEL: normal: +; CHECK-32: # %bb.0: +; CHECK-32-NEXT: pushl %ebp +; CHECK-32-NEXT: .cfi_def_cfa_offset 8 +; CHECK-32-NEXT: .cfi_offset %ebp, -8 +; CHECK-32-NEXT: movl %esp, %ebp +; CHECK-32-NEXT: .cfi_def_cfa_register %ebp +; CHECK-32-NEXT: calll main +; +; CHECK-64-LABEL: normal: +; CHECK-64: # %bb.0: +; CHECK-64-NEXT: pushq %rbp +; CHECK-64-NEXT: .cfi_def_cfa_offset 16 +; CHECK-64-NEXT: .cfi_offset %rbp, -16 +; CHECK-64-NEXT: movq %rsp, %rbp +; CHECK-64-NEXT: .cfi_def_cfa_register %rbp +; CHECK-64-NEXT: callq main + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/XCore/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/XCore/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..429a78108a7b --- /dev/null +++ b/llvm/test/CodeGen/XCore/naked-fn-with-frame-pointer.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march xcore | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: # %bb.0: +; CHECK-NEXT: bl main +; CHECK-NEXT: .cc_bottom naked.function + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: # %bb.0: +; CHECK-NEXT: entsp 2 +; CHECK-NEXT: .cfi_def_cfa_offset 8 +; CHECK-NEXT: .cfi_offset 15, 0 +; CHECK-NEXT: stw r10, sp[1] # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset 10, -4 +; CHECK-NEXT: ldaw r10, sp[0] +; CHECK-NEXT: .cfi_def_cfa_register 10 +; CHECK-NEXT: extsp 1 +; CHECK-NEXT: bl main +; CHECK-NEXT: ldaw sp, sp[1] +; CHECK-NEXT: .cc_bottom normal.function + call void @main() + unreachable +} diff --git a/llvm/test/CodeGen/Xtensa/naked-fn-with-frame-pointer.ll b/llvm/test/CodeGen/Xtensa/naked-fn-with-frame-pointer.ll new file mode 100644 index 000000000000..020fcc4f6dae --- /dev/null +++ 
b/llvm/test/CodeGen/Xtensa/naked-fn-with-frame-pointer.ll @@ -0,0 +1,31 @@ +; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5 +; RUN: llc < %s -march xtensa | FileCheck %s -check-prefixes=CHECK + +declare dso_local void @main() + +define dso_local void @naked() naked "frame-pointer"="all" { +; CHECK-LABEL: naked: +; CHECK: # %bb.0: +; CHECK-NEXT: l32r a8, {{\.?LCPI[0-9]+_[0-9]+}} +; CHECK-NEXT: callx0 a8 + call void @main() + unreachable +} + +define dso_local void @normal() "frame-pointer"="all" { +; CHECK-LABEL: normal: +; CHECK: # %bb.0: +; CHECK-NEXT: addi a8, a1, -16 +; CHECK-NEXT: or a1, a8, a8 +; CHECK-NEXT: .cfi_def_cfa_offset 16 +; CHECK-NEXT: s32i a0, a1, 4 # 4-byte Folded Spill +; CHECK-NEXT: s32i a15, a1, 0 # 4-byte Folded Spill +; CHECK-NEXT: .cfi_offset a0, -4 +; CHECK-NEXT: .cfi_offset a15, -8 +; CHECK-NEXT: or a15, a1, a1 +; CHECK-NEXT: .cfi_def_cfa_register a15 +; CHECK-NEXT: l32r a8, {{\.?LCPI[0-9]+_[0-9]+}} +; CHECK-NEXT: callx0 a8 + call void @main() + unreachable +} diff --git a/llvm/unittests/CodeGen/MFCommon.inc b/llvm/unittests/CodeGen/MFCommon.inc index 5d5720c3162d..749c5780fbac 100644 --- a/llvm/unittests/CodeGen/MFCommon.inc +++ b/llvm/unittests/CodeGen/MFCommon.inc @@ -14,7 +14,9 @@ public: MachineBasicBlock &MBB) const override {} void emitEpilogue(MachineFunction &MF, MachineBasicBlock &MBB) const override {} - bool hasFP(const MachineFunction &MF) const override { return false; } + +protected: + bool hasFPImpl(const MachineFunction &MF) const override { return false; } }; static TargetRegisterClass *const BogusRegisterClasses[] = {nullptr}; -- GitLab From 5d08625347a5467d463ff4377816709e262edb59 Mon Sep 17 00:00:00 2001 From: Timm Baeder Date: Fri, 18 Oct 2024 07:38:22 +0200 Subject: [PATCH 326/329] [clang][bytecode] Activate pointers in Init{,Pop} (#112832) --- clang/lib/AST/ByteCode/Interp.h | 2 ++ clang/test/AST/ByteCode/placement-new.cpp | 11 +++++++++++ 2 files changed, 13 insertions(+) diff --git a/clang/lib/AST/ByteCode/Interp.h b/clang/lib/AST/ByteCode/Interp.h index a1a92562cc5e..f034bde30903 100644 --- a/clang/lib/AST/ByteCode/Interp.h +++ b/clang/lib/AST/ByteCode/Interp.h @@ -1841,6 +1841,7 @@ bool Init(InterpState &S, CodePtr OpPC) { assert(false); return false; } + Ptr.activate(); Ptr.initialize(); new (&Ptr.deref()) T(Value); return true; @@ -1852,6 +1853,7 @@ bool InitPop(InterpState &S, CodePtr OpPC) { const Pointer &Ptr = S.Stk.pop(); if (!CheckInit(S, OpPC, Ptr)) return false; + Ptr.activate(); Ptr.initialize(); new (&Ptr.deref()) T(Value); return true; diff --git a/clang/test/AST/ByteCode/placement-new.cpp b/clang/test/AST/ByteCode/placement-new.cpp index 6bd83f2372ea..8e6d802e9329 100644 --- a/clang/test/AST/ByteCode/placement-new.cpp +++ b/clang/test/AST/ByteCode/placement-new.cpp @@ -300,3 +300,14 @@ namespace UsedToCrash { } int alloc1 = (alloc(), 0); } + +constexpr bool change_union_member() { + union U { + int a; + int b; + }; + U u = {.a = 1}; + std::construct_at(&u.b, 2); + return u.b == 2; +} +static_assert(change_union_member()); -- GitLab From f225b0779992bf8698d08534e256363595903c43 Mon Sep 17 00:00:00 2001 From: Matt Arsenault Date: Fri, 18 Oct 2024 09:53:46 +0400 Subject: [PATCH 327/329] Utils: Preserve address space for global_ctors (#112532) --- llvm/lib/Transforms/Utils/CtorUtils.cpp | 6 +++--- .../GlobalOpt/ctor-list-preserve-addrspace.ll | 19 +++++++++++++++++++ 2 files changed, 22 insertions(+), 3 deletions(-) create mode 100644 
llvm/test/Transforms/GlobalOpt/ctor-list-preserve-addrspace.ll diff --git a/llvm/lib/Transforms/Utils/CtorUtils.cpp b/llvm/lib/Transforms/Utils/CtorUtils.cpp index 507729bc5ebc..968446c4eee1 100644 --- a/llvm/lib/Transforms/Utils/CtorUtils.cpp +++ b/llvm/lib/Transforms/Utils/CtorUtils.cpp @@ -45,9 +45,9 @@ static void removeGlobalCtors(GlobalVariable *GCL, const BitVector &CtorsToRemov } // Create the new global and insert it next to the existing list. - GlobalVariable *NGV = - new GlobalVariable(CA->getType(), GCL->isConstant(), GCL->getLinkage(), - CA, "", GCL->getThreadLocalMode()); + GlobalVariable *NGV = new GlobalVariable( + CA->getType(), GCL->isConstant(), GCL->getLinkage(), CA, "", + GCL->getThreadLocalMode(), GCL->getAddressSpace()); GCL->getParent()->insertGlobalVariable(GCL->getIterator(), NGV); NGV->takeName(GCL); diff --git a/llvm/test/Transforms/GlobalOpt/ctor-list-preserve-addrspace.ll b/llvm/test/Transforms/GlobalOpt/ctor-list-preserve-addrspace.ll new file mode 100644 index 000000000000..3f2f041b90e7 --- /dev/null +++ b/llvm/test/Transforms/GlobalOpt/ctor-list-preserve-addrspace.ll @@ -0,0 +1,19 @@ +; NOTE: Assertions have been autogenerated by utils/update_test_checks.py UTC_ARGS: --check-globals all --version 5 +; RUN: opt -S -passes=globalopt < %s | FileCheck %s + +; Make sure the address space of global_ctors is preserved + +%ini = type { i32, ptr, ptr } + +@llvm.global_ctors = appending addrspace(1) global [1 x %ini] [%ini { i32 65534, ptr @ctor1, ptr null }] + +;. +; CHECK: @llvm.global_ctors = appending addrspace(1) global [0 x %ini] zeroinitializer +;. +define void @ctor1() { +; CHECK-LABEL: define void @ctor1() local_unnamed_addr { +; CHECK-NEXT: ret void +; + ret void +} + -- GitLab From 6bb63002fca8a7cfa9ff8ffd86da4c2ca3d98a3b Mon Sep 17 00:00:00 2001 From: Daniil Kovalev Date: Fri, 18 Oct 2024 08:58:26 +0300 Subject: [PATCH 328/329] [PAC] Fix address discrimination for type info vtable pointers (#102199) In #99726, `-fptrauth-type-info-vtable-pointer-discrimination` was introduced, which is intended to enable type and address discrimination for type_info vtable pointers. However, some codegen logic for actually enabling address discrimination was missing. This patch addresses the issue. 
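As a rough sketch of what combined type and address discrimination means (illustrative only, not code from this patch; the struct name and the discriminator constant are stand-ins), the vtable pointer of a type_info object behaves as if it carried Clang's pointer-auth qualifier, which only compiles with the pointer-auth extension enabled (e.g. arm64e):

    #include <ptrauth.h>

    // Signed with the process-independent data key; the signature mixes a
    // per-type constant discriminator with the address of the field itself,
    // so the storage address of the _ZTI global has to be known at the time
    // its initializer is signed.
    struct TypeInfoSketch {
      void *__ptrauth(ptrauth_key_process_independent_data,
                      /*address discriminated*/ 1,
                      /*constant discriminator, illustrative*/ 0xb1ea) vptr;
    };

This is why BuildTypeInfo below creates the _ZTI global up front and passes it into BuildVTablePointer as the storage address, rather than building the initializer fields first.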
Fixes #101716 --- clang/lib/CodeGen/ItaniumCXXABI.cpp | 37 +- ...child-inheritted-from-parent-in-comdat.cpp | 2 +- .../inlined-key-function.cpp | 2 +- .../parent-and-child-in-comdats.cpp | 4 +- .../parent-vtable-in-comdat.cpp | 4 +- .../simple-vtable-definition.cpp | 2 +- .../RelativeVTablesABI/type-info.cpp | 4 +- clang/test/CodeGenCXX/armv7k.cpp | 6 +- .../CodeGenCXX/dynamic-cast-address-space.cpp | 4 +- clang/test/CodeGenCXX/exceptions-no-rtti.cpp | 2 +- .../CodeGenCXX/implicit-record-visibility.cpp | 2 +- ...default-visibility-export-mapping-rtti.cpp | 480 +++++++++--------- clang/test/CodeGenCXX/modules-vtable.cppm | 12 +- clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp | 4 +- .../CodeGenCXX/ptrauth-type-info-vtable.cpp | 7 +- ...rauth-vtable-virtual-inheritance-thunk.cpp | 26 +- clang/test/CodeGenCXX/rtti-linkage.cpp | 64 +-- clang/test/CodeGenCXX/rtti-visibility.cpp | 6 +- clang/test/CodeGenCXX/symbol-partition.cpp | 2 +- clang/test/CodeGenCXX/type_visibility.cpp | 36 +- .../typeinfo-with-address-space.cpp | 4 +- .../test/CodeGenCXX/visibility-ms-compat.cpp | 12 +- .../CodeGenCXX/vtable-align-address-space.cpp | 2 +- clang/test/CodeGenCXX/vtable-align.cpp | 4 +- .../vtable-available-externally.cpp | 2 +- .../CodeGenCXX/vtable-key-function-arm.cpp | 24 +- .../CodeGenCXX/vtable-key-function-ios.cpp | 16 +- .../vtable-key-function-win-comdat.cpp | 6 +- .../test/CodeGenCXX/weak-extern-typeinfo.cpp | 14 +- .../CodeGenCXX/windows-itanium-type-info.cpp | 2 +- clang/test/CodeGenObjCXX/rtti.mm | 9 +- clang/test/Modules/pr97313.cppm | 6 +- clang/test/SemaCXX/typeid-ref.cpp | 2 +- .../AArch64/ptrauth-type-info-vptr-discr.ll | 21 + 34 files changed, 429 insertions(+), 401 deletions(-) create mode 100644 llvm/test/CodeGen/AArch64/ptrauth-type-info-vptr-discr.ll diff --git a/clang/lib/CodeGen/ItaniumCXXABI.cpp b/clang/lib/CodeGen/ItaniumCXXABI.cpp index 6c2a6f9ba66f..89f945752382 100644 --- a/clang/lib/CodeGen/ItaniumCXXABI.cpp +++ b/clang/lib/CodeGen/ItaniumCXXABI.cpp @@ -3437,7 +3437,7 @@ class ItaniumRTTIBuilder { llvm::Constant *GetAddrOfExternalRTTIDescriptor(QualType Ty); /// BuildVTablePointer - Build the vtable pointer for the given type. - void BuildVTablePointer(const Type *Ty); + void BuildVTablePointer(const Type *Ty, llvm::Constant *StorageAddress); /// BuildSIClassTypeInfo - Build an abi::__si_class_type_info, used for single /// inheritance, according to the Itanium C++ ABI, 2.9.5p6b. @@ -3834,7 +3834,8 @@ static bool CanUseSingleInheritance(const CXXRecordDecl *RD) { return true; } -void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty) { +void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty, + llvm::Constant *StorageAddress) { // abi::__class_type_info. static const char * const ClassTypeInfo = "_ZTVN10__cxxabiv117__class_type_infoE"; @@ -3981,9 +3982,12 @@ void ItaniumRTTIBuilder::BuildVTablePointer(const Type *Ty) { VTable, Two); } - if (auto &Schema = CGM.getCodeGenOpts().PointerAuth.CXXTypeInfoVTablePointer) - VTable = CGM.getConstantSignedPointer(VTable, Schema, nullptr, GlobalDecl(), - QualType(Ty, 0)); + if (const auto &Schema = + CGM.getCodeGenOpts().PointerAuth.CXXTypeInfoVTablePointer) + VTable = CGM.getConstantSignedPointer( + VTable, Schema, + Schema.isAddressDiscriminated() ? 
StorageAddress : nullptr, + GlobalDecl(), QualType(Ty, 0)); Fields.push_back(VTable); } @@ -4099,8 +4103,18 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo( llvm::GlobalVariable::LinkageTypes Linkage, llvm::GlobalValue::VisibilityTypes Visibility, llvm::GlobalValue::DLLStorageClassTypes DLLStorageClass) { + SmallString<256> Name; + llvm::raw_svector_ostream Out(Name); + CGM.getCXXABI().getMangleContext().mangleCXXRTTI(Ty, Out); + llvm::Module &M = CGM.getModule(); + llvm::GlobalVariable *OldGV = M.getNamedGlobal(Name); + // int8 is an arbitrary type to be replaced later with replaceInitializer. + llvm::GlobalVariable *GV = + new llvm::GlobalVariable(M, CGM.Int8Ty, /*isConstant=*/true, Linkage, + /*Initializer=*/nullptr, Name); + // Add the vtable pointer. - BuildVTablePointer(cast(Ty)); + BuildVTablePointer(cast(Ty), GV); // And the name. llvm::GlobalVariable *TypeName = GetAddrOfTypeName(Ty, Linkage); @@ -4218,16 +4232,7 @@ llvm::Constant *ItaniumRTTIBuilder::BuildTypeInfo( llvm_unreachable("HLSL doesn't support RTTI"); } - llvm::Constant *Init = llvm::ConstantStruct::getAnon(Fields); - - SmallString<256> Name; - llvm::raw_svector_ostream Out(Name); - CGM.getCXXABI().getMangleContext().mangleCXXRTTI(Ty, Out); - llvm::Module &M = CGM.getModule(); - llvm::GlobalVariable *OldGV = M.getNamedGlobal(Name); - llvm::GlobalVariable *GV = - new llvm::GlobalVariable(M, Init->getType(), - /*isConstant=*/true, Linkage, Init, Name); + GV->replaceInitializer(llvm::ConstantStruct::getAnon(Fields)); // Export the typeinfo in the same circumstances as the vtable is exported. auto GVDLLStorageClass = DLLStorageClass; diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/child-inheritted-from-parent-in-comdat.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/child-inheritted-from-parent-in-comdat.cpp index bb86d459b02e..e6a945618bad 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/child-inheritted-from-parent-in-comdat.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/child-inheritted-from-parent-in-comdat.cpp @@ -4,8 +4,8 @@ // RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm -fhalf-no-semantic-interposition | FileCheck %s // The inline function is emitted in each module with the same comdat -// CHECK: $_ZTS1A = comdat any // CHECK: $_ZTI1A = comdat any +// CHECK: $_ZTS1A = comdat any // CHECK: $_ZTI1B.rtti_proxy = comdat any // The VTable is emitted everywhere used diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/inlined-key-function.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/inlined-key-function.cpp index d5d9a85d4e22..70f8289e9df3 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/inlined-key-function.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/inlined-key-function.cpp @@ -4,8 +4,8 @@ // RUN: %clang_cc1 %s -triple=aarch64-unknown-fuchsia -O1 -o - -emit-llvm | FileCheck %s // CHECK: $_ZTV1A = comdat any -// CHECK: $_ZTS1A = comdat any // CHECK: $_ZTI1A = comdat any +// CHECK: $_ZTS1A = comdat any // CHECK: $_ZTI1A.rtti_proxy = comdat any // The VTable is linkonce_odr and in a comdat here bc it’s key function is inline defined. 
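// Note on the churn in the RTTI tests that follow: BuildTypeInfo now
// allocates the _ZTI global first (its address feeds the signing schema)
// and only afterwards emits the _ZTS name string, so the two globals swap
// their relative order in the output module and the CHECK lines swap too.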
diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-and-child-in-comdats.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-and-child-in-comdats.cpp index a033ac41868f..c1b9a9398219 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-and-child-in-comdats.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-and-child-in-comdats.cpp @@ -8,12 +8,12 @@ // CHECK: $_ZN1A3fooEv = comdat any // CHECK: $_ZN1B3fooEv = comdat any // CHECK: $_ZTV1A = comdat any -// CHECK: $_ZTS1A = comdat any // CHECK: $_ZTI1A = comdat any +// CHECK: $_ZTS1A = comdat any // CHECK: $_ZTI1A.rtti_proxy = comdat any // CHECK: $_ZTV1B = comdat any -// CHECK: $_ZTS1B = comdat any // CHECK: $_ZTI1B = comdat any +// CHECK: $_ZTS1B = comdat any // CHECK: $_ZTI1B.rtti_proxy = comdat any // Both the vtables for A and B are emitted and in their own comdats. diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp index 341c53146d47..d6eda793cc5b 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/parent-vtable-in-comdat.cpp @@ -7,17 +7,17 @@ // A::foo() has a comdat since it is an inline function // CHECK: $_ZN1A3fooEv = comdat any // CHECK: $_ZTV1A = comdat any +// CHECK: $_ZTI1A = comdat any // CHECK: $_ZTS1A = comdat any // The VTable for A has its own comdat section bc it has no key function -// CHECK: $_ZTI1A = comdat any // CHECK: $_ZTI1A.rtti_proxy = comdat any // The VTable for A is emitted here and in a comdat section since it has no key function, and is used in this module when creating an instance of A. // CHECK: @_ZTV1A.local = linkonce_odr hidden unnamed_addr constant { [3 x i32] } { [3 x i32] [i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1A.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1A3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32)] }, comdat($_ZTV1A), align 4 +// CHECK: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, comdat, align 8 // CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // CHECK: @_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00", comdat, align 1 -// CHECK: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, comdat, align 8 // CHECK: @_ZTI1A.rtti_proxy = linkonce_odr hidden unnamed_addr constant ptr @_ZTI1A, comdat // CHECK: @_ZTV1A = linkonce_odr unnamed_addr alias { [3 x i32] }, ptr @_ZTV1A.local diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp index ad8018ee1767..9dcb1c30e562 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/simple-vtable-definition.cpp @@ -9,9 +9,9 @@ // The vtable definition itself is private so we can take relative references to // it. The vtable symbol will be exposed through a public alias. 
// CHECK: @_ZTV1A.local = internal unnamed_addr constant { [3 x i32] } { [3 x i32] [i32 0, i32 trunc (i64 sub (i64 ptrtoint (ptr @_ZTI1A.rtti_proxy to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32), i32 trunc (i64 sub (i64 ptrtoint (ptr dso_local_equivalent @_ZN1A3fooEv to i64), i64 ptrtoint (ptr getelementptr inbounds ({ [3 x i32] }, ptr @_ZTV1A.local, i32 0, i32 0, i32 2) to i64)) to i32)] }, align 4 +// CHECK: @_ZTI1A ={{.*}} constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, align 8 // CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // CHECK: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1 -// CHECK: @_ZTI1A ={{.*}} constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, align 8 // The rtti should be in a comdat // CHECK: @_ZTI1A.rtti_proxy = {{.*}}comdat diff --git a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp index fc5ee5096433..c471e5dbd7b3 100644 --- a/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp +++ b/clang/test/CodeGenCXX/RelativeVTablesABI/type-info.cpp @@ -5,12 +5,12 @@ // CHECK: $_ZTI1A.rtti_proxy = comdat any // CHECK: $_ZTI1B.rtti_proxy = comdat any +// CHECK: @_ZTI1A ={{.*}} constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, align 8 // CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // CHECK: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1 -// CHECK: @_ZTI1A ={{.*}} constant { ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 8), ptr @_ZTS1A }, align 8 +// CHECK: @_ZTI1B ={{.*}} constant { ptr, ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i32 8), ptr @_ZTS1B, ptr @_ZTI1A }, align 8 // CHECK: @_ZTVN10__cxxabiv120__si_class_type_infoE = external global [0 x ptr] // CHECK: @_ZTS1B ={{.*}} constant [3 x i8] c"1B\00", align 1 -// CHECK: @_ZTI1B ={{.*}} constant { ptr, ptr, ptr } { ptr getelementptr inbounds (i8, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i32 8), ptr @_ZTS1B, ptr @_ZTI1A }, align 8 // CHECK: @_ZTI1A.rtti_proxy = linkonce_odr hidden unnamed_addr constant ptr @_ZTI1A, comdat // CHECK: @_ZTI1B.rtti_proxy = linkonce_odr hidden unnamed_addr constant ptr @_ZTI1B, comdat diff --git a/clang/test/CodeGenCXX/armv7k.cpp b/clang/test/CodeGenCXX/armv7k.cpp index a4a243c162ea..7aa9fd7944cf 100644 --- a/clang/test/CodeGenCXX/armv7k.cpp +++ b/clang/test/CodeGenCXX/armv7k.cpp @@ -50,17 +50,17 @@ namespace test2 { struct __attribute__((visibility("hidden"))) B {}; const std::type_info &b0 = typeid(B); - // CHECK-GLOBALS: @_ZTSN5test21BE = linkonce_odr hidden constant // CHECK-GLOBALS: @_ZTIN5test21BE = linkonce_odr hidden constant { {{.*}}, ptr @_ZTSN5test21BE } + // CHECK-GLOBALS: @_ZTSN5test21BE = linkonce_odr hidden constant const std::type_info &b1 = typeid(B*); - // CHECK-GLOBALS: @_ZTSPN5test21BE = linkonce_odr hidden constant // CHECK-GLOBALS: @_ZTIPN5test21BE = linkonce_odr hidden constant { {{.*}}, ptr @_ZTSPN5test21BE, i32 0, ptr @_ZTIN5test21BE + // CHECK-GLOBALS: @_ZTSPN5test21BE = linkonce_odr hidden constant struct C {}; const std::type_info &c0 = typeid(C); - // CHECK-GLOBALS: @_ZTSN5test21CE = linkonce_odr constant [11 x i8] c"N5test21CE\00" // CHECK-GLOBALS: 
@_ZTIN5test21CE = linkonce_odr constant { {{.*}}, ptr @_ZTSN5test21CE } + // CHECK-GLOBALS: @_ZTSN5test21CE = linkonce_odr constant [11 x i8] c"N5test21CE\00" } // va_list should be based on "char *" rather than "ptr". diff --git a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp index d0c87d9dfda5..271d9ede79d0 100644 --- a/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp +++ b/clang/test/CodeGenCXX/dynamic-cast-address-space.cpp @@ -10,17 +10,17 @@ B fail; // CHECK: @_ZTV1B = linkonce_odr unnamed_addr addrspace(1) constant { [3 x ptr addrspace(1)] } { [3 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1B, ptr addrspace(1) addrspacecast (ptr @_ZN1A1fEv to ptr addrspace(1))] }, comdat, align 8 // CHECK: @fail = addrspace(1) global { ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2) }, align 8 // CHECK: @_ZTI1A = external addrspace(1) constant ptr addrspace(1) +// CHECK: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8 // CHECK: @_ZTVN10__cxxabiv120__si_class_type_infoE = external addrspace(1) global [0 x ptr addrspace(1)] // CHECK: @_ZTS1B = linkonce_odr addrspace(1) constant [3 x i8] c"1B\00", comdat, align 1 -// CHECK: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8 // CHECK: @__oclc_ABI_version = weak_odr hidden local_unnamed_addr addrspace(4) constant i32 500 //. // WITH-NONZERO-DEFAULT-AS: @_ZTV1B = linkonce_odr unnamed_addr addrspace(1) constant { [3 x ptr addrspace(1)] } { [3 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1B, ptr addrspace(1) addrspacecast (ptr addrspace(4) @_ZN1A1fEv to ptr addrspace(1))] }, comdat, align 8 // WITH-NONZERO-DEFAULT-AS: @fail = addrspace(1) global { ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds inrange(-16, 8) ({ [3 x ptr addrspace(1)] }, ptr addrspace(1) @_ZTV1B, i32 0, i32 0, i32 2) }, align 8 // WITH-NONZERO-DEFAULT-AS: @_ZTI1A = external addrspace(1) constant ptr addrspace(1) +// WITH-NONZERO-DEFAULT-AS: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8 // WITH-NONZERO-DEFAULT-AS: @_ZTVN10__cxxabiv120__si_class_type_infoE = external addrspace(1) global [0 x ptr addrspace(1)] // WITH-NONZERO-DEFAULT-AS: @_ZTS1B = linkonce_odr addrspace(1) constant [3 x i8] c"1B\00", comdat, align 1 -// WITH-NONZERO-DEFAULT-AS: @_ZTI1B = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1B, ptr addrspace(1) @_ZTI1A }, comdat, align 8 //. 
// CHECK-LABEL: define dso_local noundef nonnull align 8 dereferenceable(8) ptr @_Z1fP1A( // CHECK-SAME: ptr noundef [[A:%.*]]) #[[ATTR0:[0-9]+]] personality ptr @__gxx_personality_v0 { diff --git a/clang/test/CodeGenCXX/exceptions-no-rtti.cpp b/clang/test/CodeGenCXX/exceptions-no-rtti.cpp index 7c73285b948f..a3d969665bdc 100644 --- a/clang/test/CodeGenCXX/exceptions-no-rtti.cpp +++ b/clang/test/CodeGenCXX/exceptions-no-rtti.cpp @@ -3,8 +3,8 @@ // CHECK: @_ZTIN5test11AE = linkonce_odr constant // CHECK: @_ZTIN5test11BE = linkonce_odr constant // CHECK: @_ZTIN5test11CE = linkonce_odr constant -// CHECK: @_ZTIN5test11DE = linkonce_odr constant // CHECK: @_ZTIPN5test11DE = linkonce_odr constant {{.*}} @_ZTIN5test11DE +// CHECK: @_ZTIN5test11DE = linkonce_odr constant // PR6974: this shouldn't crash namespace test0 { diff --git a/clang/test/CodeGenCXX/implicit-record-visibility.cpp b/clang/test/CodeGenCXX/implicit-record-visibility.cpp index ef388c7b8316..84ad822702d3 100644 --- a/clang/test/CodeGenCXX/implicit-record-visibility.cpp +++ b/clang/test/CodeGenCXX/implicit-record-visibility.cpp @@ -7,6 +7,6 @@ // under -fvisibility=hidden the type of function f, due to its va_list (aka // __builtin_va_list, aka __va_list_tag (*)[1]) parameter would be hidden: -// CHECK: @_ZTSFvP13__va_list_tagE = linkonce_odr constant // CHECK: @_ZTIFvP13__va_list_tagE = linkonce_odr constant +// CHECK: @_ZTSFvP13__va_list_tagE = linkonce_odr constant void f(va_list) { (void)typeid(f); } diff --git a/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-rtti.cpp b/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-rtti.cpp index 1af105e915e6..2fc0a6a4ee60 100644 --- a/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-rtti.cpp +++ b/clang/test/CodeGenCXX/mdefault-visibility-export-mapping-rtti.cpp @@ -16,20 +16,20 @@ // C is an incomplete class type, so any direct or indirect pointer types should have // internal linkage, as should the type info for C itself. 
struct C; +// CHECK: @_ZTIP1C = internal constant // CHECK: @_ZTSP1C = internal constant -// CHECK: @_ZTS1C = internal constant // CHECK: @_ZTI1C = internal constant -// CHECK: @_ZTIP1C = internal constant -// CHECK: @_ZTSPP1C = internal constant +// CHECK: @_ZTS1C = internal constant // CHECK: @_ZTIPP1C = internal constant +// CHECK: @_ZTSPP1C = internal constant struct __attribute__((type_visibility("default"))) D; +// CHECK: @_ZTIP1D = internal constant // CHECK: @_ZTSP1D = internal constant -// CHECK: @_ZTS1D = internal constant // CHECK: @_ZTI1D = internal constant -// CHECK: @_ZTIP1D = internal constant -// CHECK: @_ZTSPP1D = internal constant +// CHECK: @_ZTS1D = internal constant // CHECK: @_ZTIPP1D = internal constant +// CHECK: @_ZTSPP1D = internal constant void __attribute__((visibility("default"))) tfunc() { (void)typeid(C *); @@ -46,12 +46,12 @@ void s::foo() {} // UNSPECIFIED-DEF: @_ZTV1s = unnamed_addr constant // UNSPECIFIED-HID: @_ZTV1s = hidden unnamed_addr constant // UNSPECIFIED-EXP: @_ZTV1s = dllexport unnamed_addr constant -// UNSPECIFIED-DEF: @_ZTS1s = constant -// UNSPECIFIED-HID: @_ZTS1s = hidden constant -// UNSPECIFIED-EXP: @_ZTS1s = dllexport constant // UNSPECIFIED-DEF: @_ZTI1s = constant // UNSPECIFIED-HID: @_ZTI1s = hidden constant // UNSPECIFIED-EXP: @_ZTI1s = dllexport constant +// UNSPECIFIED-DEF: @_ZTS1s = constant +// UNSPECIFIED-HID: @_ZTS1s = hidden constant +// UNSPECIFIED-EXP: @_ZTS1s = dllexport constant // explicit default visibility RTTI & vtable struct __attribute__((type_visibility("default"))) t { @@ -61,12 +61,12 @@ void t::foo() {} // EXPLICIT-DEF: @_ZTV1t = unnamed_addr constant // EXPLICIT-HID: @_ZTV1t = hidden unnamed_addr constant // EXPLICIT-EXP: @_ZTV1t = dllexport unnamed_addr constant -// EXPLICIT-DEF: @_ZTS1t = constant -// EXPLICIT-HID: @_ZTS1t = hidden constant -// EXPLICIT-EXP: @_ZTS1t = dllexport constant // EXPLICIT-DEF: @_ZTI1t = constant // EXPLICIT-HID: @_ZTI1t = hidden constant // EXPLICIT-EXP: @_ZTI1t = dllexport constant +// EXPLICIT-DEF: @_ZTS1t = constant +// EXPLICIT-HID: @_ZTS1t = hidden constant +// EXPLICIT-EXP: @_ZTS1t = dllexport constant #ifdef FUNDAMENTAL_IS_EXPLICIT #define TYPE_VIS __attribute__((type_visibility("default"))) @@ -86,511 +86,511 @@ __fundamental_type_info::~__fundamental_type_info() {} // __cxxabiv1::__fundamental_type_info // FUND-DEF: @_ZTVN10__cxxabiv123__fundamental_type_infoE = unnamed_addr constant -// FUND-DEF: @_ZTSN10__cxxabiv123__fundamental_type_infoE = constant // FUND-DEF: @_ZTIN10__cxxabiv123__fundamental_type_infoE = constant +// FUND-DEF: @_ZTSN10__cxxabiv123__fundamental_type_infoE = constant // FUND-HID: @_ZTVN10__cxxabiv123__fundamental_type_infoE = hidden unnamed_addr constant -// FUND-HID: @_ZTSN10__cxxabiv123__fundamental_type_infoE = hidden constant // FUND-HID: @_ZTIN10__cxxabiv123__fundamental_type_infoE = hidden constant +// FUND-HID: @_ZTSN10__cxxabiv123__fundamental_type_infoE = hidden constant // FUND-EXP: @_ZTVN10__cxxabiv123__fundamental_type_infoE = dllexport unnamed_addr constant -// FUND-EXP: @_ZTSN10__cxxabiv123__fundamental_type_infoE = dllexport constant // FUND-EXP: @_ZTIN10__cxxabiv123__fundamental_type_infoE = dllexport constant +// FUND-EXP: @_ZTSN10__cxxabiv123__fundamental_type_infoE = dllexport constant // void -// FUND-DEF: @_ZTSv = constant // FUND-DEF: @_ZTIv = constant -// FUND-DEF: @_ZTSPv = constant +// FUND-DEF: @_ZTSv = constant // FUND-DEF: @_ZTIPv = constant -// FUND-DEF: @_ZTSPKv = constant +// FUND-DEF: @_ZTSPv = constant // FUND-DEF: 
@_ZTIPKv = constant -// FUND-HID: @_ZTSv = hidden constant +// FUND-DEF: @_ZTSPKv = constant // FUND-HID: @_ZTIv = hidden constant -// FUND-HID: @_ZTSPv = hidden constant +// FUND-HID: @_ZTSv = hidden constant // FUND-HID: @_ZTIPv = hidden constant -// FUND-HID: @_ZTSPKv = hidden constant +// FUND-HID: @_ZTSPv = hidden constant // FUND-HID: @_ZTIPKv = hidden constant -// FUND-EXP: @_ZTSv = dllexport constant +// FUND-HID: @_ZTSPKv = hidden constant // FUND-EXP: @_ZTIv = dllexport constant -// FUND-EXP: @_ZTSPv = dllexport constant +// FUND-EXP: @_ZTSv = dllexport constant // FUND-EXP: @_ZTIPv = dllexport constant -// FUND-EXP: @_ZTSPKv = dllexport constant +// FUND-EXP: @_ZTSPv = dllexport constant // FUND-EXP: @_ZTIPKv = dllexport constant +// FUND-EXP: @_ZTSPKv = dllexport constant // std::nullptr_t -// FUND-DEF: @_ZTSDn = constant // FUND-DEF: @_ZTIDn = constant -// FUND-DEF: @_ZTSPDn = constant +// FUND-DEF: @_ZTSDn = constant // FUND-DEF: @_ZTIPDn = constant -// FUND-DEF: @_ZTSPKDn = constant +// FUND-DEF: @_ZTSPDn = constant // FUND-DEF: @_ZTIPKDn = constant -// FUND-HID: @_ZTSDn = hidden constant +// FUND-DEF: @_ZTSPKDn = constant // FUND-HID: @_ZTIDn = hidden constant -// FUND-HID: @_ZTSPDn = hidden constant +// FUND-HID: @_ZTSDn = hidden constant // FUND-HID: @_ZTIPDn = hidden constant -// FUND-HID: @_ZTSPKDn = hidden constant +// FUND-HID: @_ZTSPDn = hidden constant // FUND-HID: @_ZTIPKDn = hidden constant -// FUND-EXP: @_ZTSDn = dllexport constant +// FUND-HID: @_ZTSPKDn = hidden constant // FUND-EXP: @_ZTIDn = dllexport constant -// FUND-EXP: @_ZTSPDn = dllexport constant +// FUND-EXP: @_ZTSDn = dllexport constant // FUND-EXP: @_ZTIPDn = dllexport constant -// FUND-EXP: @_ZTSPKDn = dllexport constant +// FUND-EXP: @_ZTSPDn = dllexport constant // FUND-EXP: @_ZTIPKDn = dllexport constant +// FUND-EXP: @_ZTSPKDn = dllexport constant // bool -// FUND-DEF: @_ZTSb = constant // FUND-DEF: @_ZTIb = constant -// FUND-DEF: @_ZTSPb = constant +// FUND-DEF: @_ZTSb = constant // FUND-DEF: @_ZTIPb = constant -// FUND-DEF: @_ZTSPKb = constant +// FUND-DEF: @_ZTSPb = constant // FUND-DEF: @_ZTIPKb = constant -// FUND-HID: @_ZTSb = hidden constant +// FUND-DEF: @_ZTSPKb = constant // FUND-HID: @_ZTIb = hidden constant -// FUND-HID: @_ZTSPb = hidden constant +// FUND-HID: @_ZTSb = hidden constant // FUND-HID: @_ZTIPb = hidden constant -// FUND-HID: @_ZTSPKb = hidden constant +// FUND-HID: @_ZTSPb = hidden constant // FUND-HID: @_ZTIPKb = hidden constant -// FUND-EXP: @_ZTSb = dllexport constant +// FUND-HID: @_ZTSPKb = hidden constant // FUND-EXP: @_ZTIb = dllexport constant -// FUND-EXP: @_ZTSPb = dllexport constant +// FUND-EXP: @_ZTSb = dllexport constant // FUND-EXP: @_ZTIPb = dllexport constant -// FUND-EXP: @_ZTSPKb = dllexport constant +// FUND-EXP: @_ZTSPb = dllexport constant // FUND-EXP: @_ZTIPKb = dllexport constant +// FUND-EXP: @_ZTSPKb = dllexport constant // wchar_t -// FUND-DEF: @_ZTSw = constant // FUND-DEF: @_ZTIw = constant -// FUND-DEF: @_ZTSPw = constant +// FUND-DEF: @_ZTSw = constant // FUND-DEF: @_ZTIPw = constant -// FUND-DEF: @_ZTSPKw = constant +// FUND-DEF: @_ZTSPw = constant // FUND-DEF: @_ZTIPKw = constant -// FUND-HID: @_ZTSw = hidden constant +// FUND-DEF: @_ZTSPKw = constant // FUND-HID: @_ZTIw = hidden constant -// FUND-HID: @_ZTSPw = hidden constant +// FUND-HID: @_ZTSw = hidden constant // FUND-HID: @_ZTIPw = hidden constant -// FUND-HID: @_ZTSPKw = hidden constant +// FUND-HID: @_ZTSPw = hidden constant // FUND-HID: @_ZTIPKw = hidden constant -// FUND-EXP: 
@_ZTSw = dllexport constant +// FUND-HID: @_ZTSPKw = hidden constant // FUND-EXP: @_ZTIw = dllexport constant -// FUND-EXP: @_ZTSPw = dllexport constant +// FUND-EXP: @_ZTSw = dllexport constant // FUND-EXP: @_ZTIPw = dllexport constant -// FUND-EXP: @_ZTSPKw = dllexport constant +// FUND-EXP: @_ZTSPw = dllexport constant // FUND-EXP: @_ZTIPKw = dllexport constant +// FUND-EXP: @_ZTSPKw = dllexport constant // char -// FUND-DEF: @_ZTSc = constant // FUND-DEF: @_ZTIc = constant -// FUND-DEF: @_ZTSPc = constant +// FUND-DEF: @_ZTSc = constant // FUND-DEF: @_ZTIPc = constant -// FUND-DEF: @_ZTSPKc = constant +// FUND-DEF: @_ZTSPc = constant // FUND-DEF: @_ZTIPKc = constant -// FUND-HID: @_ZTSc = hidden constant +// FUND-DEF: @_ZTSPKc = constant // FUND-HID: @_ZTIc = hidden constant -// FUND-HID: @_ZTSPc = hidden constant +// FUND-HID: @_ZTSc = hidden constant // FUND-HID: @_ZTIPc = hidden constant -// FUND-HID: @_ZTSPKc = hidden constant +// FUND-HID: @_ZTSPc = hidden constant // FUND-HID: @_ZTIPKc = hidden constant -// FUND-EXP: @_ZTSc = dllexport constant +// FUND-HID: @_ZTSPKc = hidden constant // FUND-EXP: @_ZTIc = dllexport constant -// FUND-EXP: @_ZTSPc = dllexport constant +// FUND-EXP: @_ZTSc = dllexport constant // FUND-EXP: @_ZTIPc = dllexport constant -// FUND-EXP: @_ZTSPKc = dllexport constant +// FUND-EXP: @_ZTSPc = dllexport constant // FUND-EXP: @_ZTIPKc = dllexport constant +// FUND-EXP: @_ZTSPKc = dllexport constant // unsigned char -// FUND-DEF: @_ZTSh = constant // FUND-DEF: @_ZTIh = constant -// FUND-DEF: @_ZTSPh = constant +// FUND-DEF: @_ZTSh = constant // FUND-DEF: @_ZTIPh = constant -// FUND-DEF: @_ZTSPKh = constant +// FUND-DEF: @_ZTSPh = constant // FUND-DEF: @_ZTIPKh = constant -// FUND-HID: @_ZTSh = hidden constant +// FUND-DEF: @_ZTSPKh = constant // FUND-HID: @_ZTIh = hidden constant -// FUND-HID: @_ZTSPh = hidden constant +// FUND-HID: @_ZTSh = hidden constant // FUND-HID: @_ZTIPh = hidden constant -// FUND-HID: @_ZTSPKh = hidden constant +// FUND-HID: @_ZTSPh = hidden constant // FUND-HID: @_ZTIPKh = hidden constant -// FUND-EXP: @_ZTSh = dllexport constant +// FUND-HID: @_ZTSPKh = hidden constant // FUND-EXP: @_ZTIh = dllexport constant -// FUND-EXP: @_ZTSPh = dllexport constant +// FUND-EXP: @_ZTSh = dllexport constant // FUND-EXP: @_ZTIPh = dllexport constant -// FUND-EXP: @_ZTSPKh = dllexport constant +// FUND-EXP: @_ZTSPh = dllexport constant // FUND-EXP: @_ZTIPKh = dllexport constant +// FUND-EXP: @_ZTSPKh = dllexport constant // signed char -// FUND-DEF: @_ZTSa = constant // FUND-DEF: @_ZTIa = constant -// FUND-DEF: @_ZTSPa = constant +// FUND-DEF: @_ZTSa = constant // FUND-DEF: @_ZTIPa = constant -// FUND-DEF: @_ZTSPKa = constant +// FUND-DEF: @_ZTSPa = constant // FUND-DEF: @_ZTIPKa = constant -// FUND-HID: @_ZTSa = hidden constant +// FUND-DEF: @_ZTSPKa = constant // FUND-HID: @_ZTIa = hidden constant -// FUND-HID: @_ZTSPa = hidden constant +// FUND-HID: @_ZTSa = hidden constant // FUND-HID: @_ZTIPa = hidden constant -// FUND-HID: @_ZTSPKa = hidden constant +// FUND-HID: @_ZTSPa = hidden constant // FUND-HID: @_ZTIPKa = hidden constant -// FUND-EXP: @_ZTSa = dllexport constant +// FUND-HID: @_ZTSPKa = hidden constant // FUND-EXP: @_ZTIa = dllexport constant -// FUND-EXP: @_ZTSPa = dllexport constant +// FUND-EXP: @_ZTSa = dllexport constant // FUND-EXP: @_ZTIPa = dllexport constant -// FUND-EXP: @_ZTSPKa = dllexport constant +// FUND-EXP: @_ZTSPa = dllexport constant // FUND-EXP: @_ZTIPKa = dllexport constant +// FUND-EXP: @_ZTSPKa = dllexport 
constant // short -// FUND-DEF: @_ZTSs = constant // FUND-DEF: @_ZTIs = constant -// FUND-DEF: @_ZTSPs = constant +// FUND-DEF: @_ZTSs = constant // FUND-DEF: @_ZTIPs = constant -// FUND-DEF: @_ZTSPKs = constant +// FUND-DEF: @_ZTSPs = constant // FUND-DEF: @_ZTIPKs = constant -// FUND-HID: @_ZTSs = hidden constant +// FUND-DEF: @_ZTSPKs = constant // FUND-HID: @_ZTIs = hidden constant -// FUND-HID: @_ZTSPs = hidden constant +// FUND-HID: @_ZTSs = hidden constant // FUND-HID: @_ZTIPs = hidden constant -// FUND-HID: @_ZTSPKs = hidden constant +// FUND-HID: @_ZTSPs = hidden constant // FUND-HID: @_ZTIPKs = hidden constant -// FUND-EXP: @_ZTSs = dllexport constant +// FUND-HID: @_ZTSPKs = hidden constant // FUND-EXP: @_ZTIs = dllexport constant -// FUND-EXP: @_ZTSPs = dllexport constant +// FUND-EXP: @_ZTSs = dllexport constant // FUND-EXP: @_ZTIPs = dllexport constant -// FUND-EXP: @_ZTSPKs = dllexport constant +// FUND-EXP: @_ZTSPs = dllexport constant // FUND-EXP: @_ZTIPKs = dllexport constant +// FUND-EXP: @_ZTSPKs = dllexport constant // unsigned short -// FUND-DEF: @_ZTSt = constant // FUND-DEF: @_ZTIt = constant -// FUND-DEF: @_ZTSPt = constant +// FUND-DEF: @_ZTSt = constant // FUND-DEF: @_ZTIPt = constant -// FUND-DEF: @_ZTSPKt = constant +// FUND-DEF: @_ZTSPt = constant // FUND-DEF: @_ZTIPKt = constant -// FUND-HID: @_ZTSt = hidden constant +// FUND-DEF: @_ZTSPKt = constant // FUND-HID: @_ZTIt = hidden constant -// FUND-HID: @_ZTSPt = hidden constant +// FUND-HID: @_ZTSt = hidden constant // FUND-HID: @_ZTIPt = hidden constant -// FUND-HID: @_ZTSPKt = hidden constant +// FUND-HID: @_ZTSPt = hidden constant // FUND-HID: @_ZTIPKt = hidden constant -// FUND-EXP: @_ZTSt = dllexport constant +// FUND-HID: @_ZTSPKt = hidden constant // FUND-EXP: @_ZTIt = dllexport constant -// FUND-EXP: @_ZTSPt = dllexport constant +// FUND-EXP: @_ZTSt = dllexport constant // FUND-EXP: @_ZTIPt = dllexport constant -// FUND-EXP: @_ZTSPKt = dllexport constant +// FUND-EXP: @_ZTSPt = dllexport constant // FUND-EXP: @_ZTIPKt = dllexport constant +// FUND-EXP: @_ZTSPKt = dllexport constant // int -// FUND-DEF: @_ZTSi = constant // FUND-DEF: @_ZTIi = constant -// FUND-DEF: @_ZTSPi = constant +// FUND-DEF: @_ZTSi = constant // FUND-DEF: @_ZTIPi = constant -// FUND-DEF: @_ZTSPKi = constant +// FUND-DEF: @_ZTSPi = constant // FUND-DEF: @_ZTIPKi = constant -// FUND-HID: @_ZTSi = hidden constant +// FUND-DEF: @_ZTSPKi = constant // FUND-HID: @_ZTIi = hidden constant -// FUND-HID: @_ZTSPi = hidden constant +// FUND-HID: @_ZTSi = hidden constant // FUND-HID: @_ZTIPi = hidden constant -// FUND-HID: @_ZTSPKi = hidden constant +// FUND-HID: @_ZTSPi = hidden constant // FUND-HID: @_ZTIPKi = hidden constant -// FUND-EXP: @_ZTSi = dllexport constant +// FUND-HID: @_ZTSPKi = hidden constant // FUND-EXP: @_ZTIi = dllexport constant -// FUND-EXP: @_ZTSPi = dllexport constant +// FUND-EXP: @_ZTSi = dllexport constant // FUND-EXP: @_ZTIPi = dllexport constant -// FUND-EXP: @_ZTSPKi = dllexport constant +// FUND-EXP: @_ZTSPi = dllexport constant // FUND-EXP: @_ZTIPKi = dllexport constant +// FUND-EXP: @_ZTSPKi = dllexport constant // unsigned int -// FUND-DEF: @_ZTSj = constant // FUND-DEF: @_ZTIj = constant -// FUND-DEF: @_ZTSPj = constant +// FUND-DEF: @_ZTSj = constant // FUND-DEF: @_ZTIPj = constant -// FUND-DEF: @_ZTSPKj = constant +// FUND-DEF: @_ZTSPj = constant // FUND-DEF: @_ZTIPKj = constant -// FUND-HID: @_ZTSj = hidden constant +// FUND-DEF: @_ZTSPKj = constant // FUND-HID: @_ZTIj = hidden constant -// FUND-HID: 
@_ZTSPj = hidden constant +// FUND-HID: @_ZTSj = hidden constant // FUND-HID: @_ZTIPj = hidden constant -// FUND-HID: @_ZTSPKj = hidden constant +// FUND-HID: @_ZTSPj = hidden constant // FUND-HID: @_ZTIPKj = hidden constant -// FUND-EXP: @_ZTSj = dllexport constant +// FUND-HID: @_ZTSPKj = hidden constant // FUND-EXP: @_ZTIj = dllexport constant -// FUND-EXP: @_ZTSPj = dllexport constant +// FUND-EXP: @_ZTSj = dllexport constant // FUND-EXP: @_ZTIPj = dllexport constant -// FUND-EXP: @_ZTSPKj = dllexport constant +// FUND-EXP: @_ZTSPj = dllexport constant // FUND-EXP: @_ZTIPKj = dllexport constant +// FUND-EXP: @_ZTSPKj = dllexport constant // long -// FUND-DEF: @_ZTSl = constant // FUND-DEF: @_ZTIl = constant -// FUND-DEF: @_ZTSPl = constant +// FUND-DEF: @_ZTSl = constant // FUND-DEF: @_ZTIPl = constant -// FUND-DEF: @_ZTSPKl = constant +// FUND-DEF: @_ZTSPl = constant // FUND-DEF: @_ZTIPKl = constant -// FUND-HID: @_ZTSl = hidden constant +// FUND-DEF: @_ZTSPKl = constant // FUND-HID: @_ZTIl = hidden constant -// FUND-HID: @_ZTSPl = hidden constant +// FUND-HID: @_ZTSl = hidden constant // FUND-HID: @_ZTIPl = hidden constant -// FUND-HID: @_ZTSPKl = hidden constant +// FUND-HID: @_ZTSPl = hidden constant // FUND-HID: @_ZTIPKl = hidden constant -// FUND-EXP: @_ZTSl = dllexport constant +// FUND-HID: @_ZTSPKl = hidden constant // FUND-EXP: @_ZTIl = dllexport constant -// FUND-EXP: @_ZTSPl = dllexport constant +// FUND-EXP: @_ZTSl = dllexport constant // FUND-EXP: @_ZTIPl = dllexport constant -// FUND-EXP: @_ZTSPKl = dllexport constant +// FUND-EXP: @_ZTSPl = dllexport constant // FUND-EXP: @_ZTIPKl = dllexport constant +// FUND-EXP: @_ZTSPKl = dllexport constant // unsigned long -// FUND-DEF: @_ZTSm = constant // FUND-DEF: @_ZTIm = constant -// FUND-DEF: @_ZTSPm = constant +// FUND-DEF: @_ZTSm = constant // FUND-DEF: @_ZTIPm = constant -// FUND-DEF: @_ZTSPKm = constant +// FUND-DEF: @_ZTSPm = constant // FUND-DEF: @_ZTIPKm = constant -// FUND-HID: @_ZTSm = hidden constant +// FUND-DEF: @_ZTSPKm = constant // FUND-HID: @_ZTIm = hidden constant -// FUND-HID: @_ZTSPm = hidden constant +// FUND-HID: @_ZTSm = hidden constant // FUND-HID: @_ZTIPm = hidden constant -// FUND-HID: @_ZTSPKm = hidden constant +// FUND-HID: @_ZTSPm = hidden constant // FUND-HID: @_ZTIPKm = hidden constant -// FUND-EXP: @_ZTSm = dllexport constant +// FUND-HID: @_ZTSPKm = hidden constant // FUND-EXP: @_ZTIm = dllexport constant -// FUND-EXP: @_ZTSPm = dllexport constant +// FUND-EXP: @_ZTSm = dllexport constant // FUND-EXP: @_ZTIPm = dllexport constant -// FUND-EXP: @_ZTSPKm = dllexport constant +// FUND-EXP: @_ZTSPm = dllexport constant // FUND-EXP: @_ZTIPKm = dllexport constant +// FUND-EXP: @_ZTSPKm = dllexport constant // long long -// FUND-DEF: @_ZTSx = constant // FUND-DEF: @_ZTIx = constant -// FUND-DEF: @_ZTSPx = constant +// FUND-DEF: @_ZTSx = constant // FUND-DEF: @_ZTIPx = constant -// FUND-DEF: @_ZTSPKx = constant +// FUND-DEF: @_ZTSPx = constant // FUND-DEF: @_ZTIPKx = constant -// FUND-HID: @_ZTSx = hidden constant +// FUND-DEF: @_ZTSPKx = constant // FUND-HID: @_ZTIx = hidden constant -// FUND-HID: @_ZTSPx = hidden constant +// FUND-HID: @_ZTSx = hidden constant // FUND-HID: @_ZTIPx = hidden constant -// FUND-HID: @_ZTSPKx = hidden constant +// FUND-HID: @_ZTSPx = hidden constant // FUND-HID: @_ZTIPKx = hidden constant -// FUND-EXP: @_ZTSx = dllexport constant +// FUND-HID: @_ZTSPKx = hidden constant // FUND-EXP: @_ZTIx = dllexport constant -// FUND-EXP: @_ZTSPx = dllexport constant +// FUND-EXP: 
@_ZTSx = dllexport constant // FUND-EXP: @_ZTIPx = dllexport constant -// FUND-EXP: @_ZTSPKx = dllexport constant +// FUND-EXP: @_ZTSPx = dllexport constant // FUND-EXP: @_ZTIPKx = dllexport constant +// FUND-EXP: @_ZTSPKx = dllexport constant // unsigned long long -// FUND-DEF: @_ZTSy = constant // FUND-DEF: @_ZTIy = constant -// FUND-DEF: @_ZTSPy = constant +// FUND-DEF: @_ZTSy = constant // FUND-DEF: @_ZTIPy = constant -// FUND-DEF: @_ZTSPKy = constant +// FUND-DEF: @_ZTSPy = constant // FUND-DEF: @_ZTIPKy = constant -// FUND-HID: @_ZTSy = hidden constant +// FUND-DEF: @_ZTSPKy = constant // FUND-HID: @_ZTIy = hidden constant -// FUND-HID: @_ZTSPy = hidden constant +// FUND-HID: @_ZTSy = hidden constant // FUND-HID: @_ZTIPy = hidden constant -// FUND-HID: @_ZTSPKy = hidden constant +// FUND-HID: @_ZTSPy = hidden constant // FUND-HID: @_ZTIPKy = hidden constant -// FUND-EXP: @_ZTSy = dllexport constant +// FUND-HID: @_ZTSPKy = hidden constant // FUND-EXP: @_ZTIy = dllexport constant -// FUND-EXP: @_ZTSPy = dllexport constant +// FUND-EXP: @_ZTSy = dllexport constant // FUND-EXP: @_ZTIPy = dllexport constant -// FUND-EXP: @_ZTSPKy = dllexport constant +// FUND-EXP: @_ZTSPy = dllexport constant // FUND-EXP: @_ZTIPKy = dllexport constant +// FUND-EXP: @_ZTSPKy = dllexport constant // __int128 -// FUND-DEF: @_ZTSn = constant // FUND-DEF: @_ZTIn = constant -// FUND-DEF: @_ZTSPn = constant +// FUND-DEF: @_ZTSn = constant // FUND-DEF: @_ZTIPn = constant -// FUND-DEF: @_ZTSPKn = constant +// FUND-DEF: @_ZTSPn = constant // FUND-DEF: @_ZTIPKn = constant -// FUND-HID: @_ZTSn = hidden constant +// FUND-DEF: @_ZTSPKn = constant // FUND-HID: @_ZTIn = hidden constant -// FUND-HID: @_ZTSPn = hidden constant +// FUND-HID: @_ZTSn = hidden constant // FUND-HID: @_ZTIPn = hidden constant -// FUND-HID: @_ZTSPKn = hidden constant +// FUND-HID: @_ZTSPn = hidden constant // FUND-HID: @_ZTIPKn = hidden constant -// FUND-EXP: @_ZTSn = dllexport constant +// FUND-HID: @_ZTSPKn = hidden constant // FUND-EXP: @_ZTIn = dllexport constant -// FUND-EXP: @_ZTSPn = dllexport constant +// FUND-EXP: @_ZTSn = dllexport constant // FUND-EXP: @_ZTIPn = dllexport constant -// FUND-EXP: @_ZTSPKn = dllexport constant +// FUND-EXP: @_ZTSPn = dllexport constant // FUND-EXP: @_ZTIPKn = dllexport constant +// FUND-EXP: @_ZTSPKn = dllexport constant // unsigned __int128 -// FUND-DEF: @_ZTSo = constant // FUND-DEF: @_ZTIo = constant -// FUND-DEF: @_ZTSPo = constant +// FUND-DEF: @_ZTSo = constant // FUND-DEF: @_ZTIPo = constant -// FUND-DEF: @_ZTSPKo = constant +// FUND-DEF: @_ZTSPo = constant // FUND-DEF: @_ZTIPKo = constant -// FUND-HID: @_ZTSo = hidden constant +// FUND-DEF: @_ZTSPKo = constant // FUND-HID: @_ZTIo = hidden constant -// FUND-HID: @_ZTSPo = hidden constant +// FUND-HID: @_ZTSo = hidden constant // FUND-HID: @_ZTIPo = hidden constant -// FUND-HID: @_ZTSPKo = hidden constant +// FUND-HID: @_ZTSPo = hidden constant // FUND-HID: @_ZTIPKo = hidden constant -// FUND-EXP: @_ZTSo = dllexport constant +// FUND-HID: @_ZTSPKo = hidden constant // FUND-EXP: @_ZTIo = dllexport constant -// FUND-EXP: @_ZTSPo = dllexport constant +// FUND-EXP: @_ZTSo = dllexport constant // FUND-EXP: @_ZTIPo = dllexport constant -// FUND-EXP: @_ZTSPKo = dllexport constant +// FUND-EXP: @_ZTSPo = dllexport constant // FUND-EXP: @_ZTIPKo = dllexport constant +// FUND-EXP: @_ZTSPKo = dllexport constant // half -// FUND-DEF: @_ZTSDh = constant // FUND-DEF: @_ZTIDh = constant -// FUND-DEF: @_ZTSPDh = constant +// FUND-DEF: @_ZTSDh = constant // 
FUND-DEF: @_ZTIPDh = constant -// FUND-DEF: @_ZTSPKDh = constant +// FUND-DEF: @_ZTSPDh = constant // FUND-DEF: @_ZTIPKDh = constant -// FUND-HID: @_ZTSDh = hidden constant +// FUND-DEF: @_ZTSPKDh = constant // FUND-HID: @_ZTIDh = hidden constant -// FUND-HID: @_ZTSPDh = hidden constant +// FUND-HID: @_ZTSDh = hidden constant // FUND-HID: @_ZTIPDh = hidden constant -// FUND-HID: @_ZTSPKDh = hidden constant +// FUND-HID: @_ZTSPDh = hidden constant // FUND-HID: @_ZTIPKDh = hidden constant -// FUND-EXP: @_ZTSDh = dllexport constant +// FUND-HID: @_ZTSPKDh = hidden constant // FUND-EXP: @_ZTIDh = dllexport constant -// FUND-EXP: @_ZTSPDh = dllexport constant +// FUND-EXP: @_ZTSDh = dllexport constant // FUND-EXP: @_ZTIPDh = dllexport constant -// FUND-EXP: @_ZTSPKDh = dllexport constant +// FUND-EXP: @_ZTSPDh = dllexport constant // FUND-EXP: @_ZTIPKDh = dllexport constant +// FUND-EXP: @_ZTSPKDh = dllexport constant // float -// FUND-DEF: @_ZTSf = constant // FUND-DEF: @_ZTIf = constant -// FUND-DEF: @_ZTSPf = constant +// FUND-DEF: @_ZTSf = constant // FUND-DEF: @_ZTIPf = constant -// FUND-DEF: @_ZTSPKf = constant +// FUND-DEF: @_ZTSPf = constant // FUND-DEF: @_ZTIPKf = constant -// FUND-HID: @_ZTSf = hidden constant +// FUND-DEF: @_ZTSPKf = constant // FUND-HID: @_ZTIf = hidden constant -// FUND-HID: @_ZTSPf = hidden constant +// FUND-HID: @_ZTSf = hidden constant // FUND-HID: @_ZTIPf = hidden constant -// FUND-HID: @_ZTSPKf = hidden constant +// FUND-HID: @_ZTSPf = hidden constant // FUND-HID: @_ZTIPKf = hidden constant -// FUND-EXP: @_ZTSf = dllexport constant +// FUND-HID: @_ZTSPKf = hidden constant // FUND-EXP: @_ZTIf = dllexport constant -// FUND-EXP: @_ZTSPf = dllexport constant +// FUND-EXP: @_ZTSf = dllexport constant // FUND-EXP: @_ZTIPf = dllexport constant -// FUND-EXP: @_ZTSPKf = dllexport constant +// FUND-EXP: @_ZTSPf = dllexport constant // FUND-EXP: @_ZTIPKf = dllexport constant +// FUND-EXP: @_ZTSPKf = dllexport constant // double -// FUND-DEF: @_ZTSd = constant // FUND-DEF: @_ZTId = constant -// FUND-DEF: @_ZTSPd = constant +// FUND-DEF: @_ZTSd = constant // FUND-DEF: @_ZTIPd = constant -// FUND-DEF: @_ZTSPKd = constant +// FUND-DEF: @_ZTSPd = constant // FUND-DEF: @_ZTIPKd = constant -// FUND-HID: @_ZTSd = hidden constant +// FUND-DEF: @_ZTSPKd = constant // FUND-HID: @_ZTId = hidden constant -// FUND-HID: @_ZTSPd = hidden constant +// FUND-HID: @_ZTSd = hidden constant // FUND-HID: @_ZTIPd = hidden constant -// FUND-HID: @_ZTSPKd = hidden constant +// FUND-HID: @_ZTSPd = hidden constant // FUND-HID: @_ZTIPKd = hidden constant -// FUND-EXP: @_ZTSd = dllexport constant +// FUND-HID: @_ZTSPKd = hidden constant // FUND-EXP: @_ZTId = dllexport constant -// FUND-EXP: @_ZTSPd = dllexport constant +// FUND-EXP: @_ZTSd = dllexport constant // FUND-EXP: @_ZTIPd = dllexport constant -// FUND-EXP: @_ZTSPKd = dllexport constant +// FUND-EXP: @_ZTSPd = dllexport constant // FUND-EXP: @_ZTIPKd = dllexport constant +// FUND-EXP: @_ZTSPKd = dllexport constant // long double -// FUND-DEF: @_ZTSe = constant // FUND-DEF: @_ZTIe = constant -// FUND-DEF: @_ZTSPe = constant +// FUND-DEF: @_ZTSe = constant // FUND-DEF: @_ZTIPe = constant -// FUND-DEF: @_ZTSPKe = constant +// FUND-DEF: @_ZTSPe = constant // FUND-DEF: @_ZTIPKe = constant -// FUND-HID: @_ZTSe = hidden constant +// FUND-DEF: @_ZTSPKe = constant // FUND-HID: @_ZTIe = hidden constant -// FUND-HID: @_ZTSPe = hidden constant +// FUND-HID: @_ZTSe = hidden constant // FUND-HID: @_ZTIPe = hidden constant -// FUND-HID: @_ZTSPKe = hidden 
constant +// FUND-HID: @_ZTSPe = hidden constant // FUND-HID: @_ZTIPKe = hidden constant -// FUND-EXP: @_ZTSe = dllexport constant +// FUND-HID: @_ZTSPKe = hidden constant // FUND-EXP: @_ZTIe = dllexport constant -// FUND-EXP: @_ZTSPe = dllexport constant +// FUND-EXP: @_ZTSe = dllexport constant // FUND-EXP: @_ZTIPe = dllexport constant -// FUND-EXP: @_ZTSPKe = dllexport constant +// FUND-EXP: @_ZTSPe = dllexport constant // FUND-EXP: @_ZTIPKe = dllexport constant +// FUND-EXP: @_ZTSPKe = dllexport constant // __ieee128 -// FUND-DEF: @_ZTSu9__ieee128 = constant // FUND-DEF: @_ZTIu9__ieee128 = constant -// FUND-DEF: @_ZTSPu9__ieee128 = constant +// FUND-DEF: @_ZTSu9__ieee128 = constant // FUND-DEF: @_ZTIPu9__ieee128 = constant -// FUND-DEF: @_ZTSPKu9__ieee128 = constant +// FUND-DEF: @_ZTSPu9__ieee128 = constant // FUND-DEF: @_ZTIPKu9__ieee128 = constant -// FUND-HID: @_ZTSu9__ieee128 = hidden constant +// FUND-DEF: @_ZTSPKu9__ieee128 = constant // FUND-HID: @_ZTIu9__ieee128 = hidden constant -// FUND-HID: @_ZTSPu9__ieee128 = hidden constant +// FUND-HID: @_ZTSu9__ieee128 = hidden constant // FUND-HID: @_ZTIPu9__ieee128 = hidden constant -// FUND-HID: @_ZTSPKu9__ieee128 = hidden constant +// FUND-HID: @_ZTSPu9__ieee128 = hidden constant // FUND-HID: @_ZTIPKu9__ieee128 = hidden constant -// FUND-EXP: @_ZTSu9__ieee128 = dllexport constant +// FUND-HID: @_ZTSPKu9__ieee128 = hidden constant // FUND-EXP: @_ZTIu9__ieee128 = dllexport constant -// FUND-EXP: @_ZTSPu9__ieee128 = dllexport constant +// FUND-EXP: @_ZTSu9__ieee128 = dllexport constant // FUND-EXP: @_ZTIPu9__ieee128 = dllexport constant -// FUND-EXP: @_ZTSPKu9__ieee128 = dllexport constant +// FUND-EXP: @_ZTSPu9__ieee128 = dllexport constant // FUND-EXP: @_ZTIPKu9__ieee128 = dllexport constant +// FUND-EXP: @_ZTSPKu9__ieee128 = dllexport constant // char8_t -// FUND-DEF: @_ZTSDu = constant // FUND-DEF: @_ZTIDu = constant -// FUND-DEF: @_ZTSPDu = constant +// FUND-DEF: @_ZTSDu = constant // FUND-DEF: @_ZTIPDu = constant -// FUND-DEF: @_ZTSPKDu = constant +// FUND-DEF: @_ZTSPDu = constant // FUND-DEF: @_ZTIPKDu = constant -// FUND-HID: @_ZTSDu = hidden constant +// FUND-DEF: @_ZTSPKDu = constant // FUND-HID: @_ZTIDu = hidden constant -// FUND-HID: @_ZTSPDu = hidden constant +// FUND-HID: @_ZTSDu = hidden constant // FUND-HID: @_ZTIPDu = hidden constant -// FUND-HID: @_ZTSPKDu = hidden constant +// FUND-HID: @_ZTSPDu = hidden constant // FUND-HID: @_ZTIPKDu = hidden constant -// FUND-EXP: @_ZTSDu = dllexport constant +// FUND-HID: @_ZTSPKDu = hidden constant // FUND-EXP: @_ZTIDu = dllexport constant -// FUND-EXP: @_ZTSPDu = dllexport constant +// FUND-EXP: @_ZTSDu = dllexport constant // FUND-EXP: @_ZTIPDu = dllexport constant -// FUND-EXP: @_ZTSPKDu = dllexport constant +// FUND-EXP: @_ZTSPDu = dllexport constant // FUND-EXP: @_ZTIPKDu = dllexport constant +// FUND-EXP: @_ZTSPKDu = dllexport constant // char16_t -// FUND-DEF: @_ZTSDs = constant // FUND-DEF: @_ZTIDs = constant -// FUND-DEF: @_ZTSPDs = constant +// FUND-DEF: @_ZTSDs = constant // FUND-DEF: @_ZTIPDs = constant -// FUND-DEF: @_ZTSPKDs = constant +// FUND-DEF: @_ZTSPDs = constant // FUND-DEF: @_ZTIPKDs = constant -// FUND-HID: @_ZTSDs = hidden constant +// FUND-DEF: @_ZTSPKDs = constant // FUND-HID: @_ZTIDs = hidden constant -// FUND-HID: @_ZTSPDs = hidden constant +// FUND-HID: @_ZTSDs = hidden constant // FUND-HID: @_ZTIPDs = hidden constant -// FUND-HID: @_ZTSPKDs = hidden constant +// FUND-HID: @_ZTSPDs = hidden constant // FUND-HID: @_ZTIPKDs = hidden constant -// 
FUND-EXP: @_ZTSDs = dllexport constant +// FUND-HID: @_ZTSPKDs = hidden constant // FUND-EXP: @_ZTIDs = dllexport constant -// FUND-EXP: @_ZTSPDs = dllexport constant +// FUND-EXP: @_ZTSDs = dllexport constant // FUND-EXP: @_ZTIPDs = dllexport constant -// FUND-EXP: @_ZTSPKDs = dllexport constant +// FUND-EXP: @_ZTSPDs = dllexport constant // FUND-EXP: @_ZTIPKDs = dllexport constant +// FUND-EXP: @_ZTSPKDs = dllexport constant // char32_t -// FUND-DEF: @_ZTSDi = constant // FUND-DEF: @_ZTIDi = constant -// FUND-DEF: @_ZTSPDi = constant +// FUND-DEF: @_ZTSDi = constant // FUND-DEF: @_ZTIPDi = constant -// FUND-DEF: @_ZTSPKDi = constant +// FUND-DEF: @_ZTSPDi = constant // FUND-DEF: @_ZTIPKDi = constant -// FUND-HID: @_ZTSDi = hidden constant +// FUND-DEF: @_ZTSPKDi = constant // FUND-HID: @_ZTIDi = hidden constant -// FUND-HID: @_ZTSPDi = hidden constant +// FUND-HID: @_ZTSDi = hidden constant // FUND-HID: @_ZTIPDi = hidden constant -// FUND-HID: @_ZTSPKDi = hidden constant +// FUND-HID: @_ZTSPDi = hidden constant // FUND-HID: @_ZTIPKDi = hidden constant -// FUND-EXP: @_ZTSDi = dllexport constant +// FUND-HID: @_ZTSPKDi = hidden constant // FUND-EXP: @_ZTIDi = dllexport constant -// FUND-EXP: @_ZTSPDi = dllexport constant +// FUND-EXP: @_ZTSDi = dllexport constant // FUND-EXP: @_ZTIPDi = dllexport constant -// FUND-EXP: @_ZTSPKDi = dllexport constant +// FUND-EXP: @_ZTSPDi = dllexport constant // FUND-EXP: @_ZTIPKDi = dllexport constant +// FUND-EXP: @_ZTSPKDi = dllexport constant diff --git a/clang/test/CodeGenCXX/modules-vtable.cppm b/clang/test/CodeGenCXX/modules-vtable.cppm index 5cc3504d7262..6589b9f3c5d6 100644 --- a/clang/test/CodeGenCXX/modules-vtable.cppm +++ b/clang/test/CodeGenCXX/modules-vtable.cppm @@ -40,13 +40,13 @@ inline Base::~Base() {} // CHECK: @_ZTVW3Mod4Base = unnamed_addr constant -// CHECK: @_ZTSW3Mod4Base = constant // CHECK: @_ZTIW3Mod4Base = constant +// CHECK: @_ZTSW3Mod4Base = constant // With the new Itanium C++ ABI, the linkage of vtables in modules doesn't need to be linkonce_odr.
// CHECK-INLINE: @_ZTVW3Mod4Base = {{.*}}unnamed_addr constant -// CHECK-INLINE: @_ZTSW3Mod4Base = {{.*}}constant // CHECK-INLINE: @_ZTIW3Mod4Base = {{.*}}constant +// CHECK-INLINE: @_ZTSW3Mod4Base = {{.*}}constant module :private; int private_use() { @@ -61,12 +61,12 @@ int use() { return 43; } -// CHECK-NOT: @_ZTSW3Mod4Base // CHECK-NOT: @_ZTIW3Mod4Base +// CHECK-NOT: @_ZTSW3Mod4Base // CHECK: @_ZTVW3Mod4Base = external -// CHECK-INLINE-NOT: @_ZTSW3Mod4Base // CHECK-INLINE-NOT: @_ZTIW3Mod4Base +// CHECK-INLINE-NOT: @_ZTSW3Mod4Base // CHECK-INLINE: @_ZTVW3Mod4Base = external // Check the case that the declaration of the key function comes from another @@ -86,8 +86,8 @@ int a_use() { } // CHECK: @_ZTVW1M1C = unnamed_addr constant -// CHECK: @_ZTSW1M1C = constant // CHECK: @_ZTIW1M1C = constant +// CHECK: @_ZTSW1M1C = constant //--- M-B.cppm export module M:B; @@ -101,5 +101,5 @@ int b_use() { } // CHECK: @_ZTVW1M1C = external -// CHECK-NOT: @_ZTSW1M1C // CHECK-NOT: @_ZTIW1M1C +// CHECK-NOT: @_ZTSW1M1C diff --git a/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp b/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp index 2b633addd677..b50e0908f9db 100644 --- a/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp +++ b/clang/test/CodeGenCXX/ptrauth-rtti-layout.cpp @@ -5,12 +5,12 @@ struct A { int a; }; +// DARWIN: @_ZTI1A = linkonce_odr hidden constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1A to i64), i64 -9223372036854775808) to ptr) } // DARWIN: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // DARWIN: @_ZTS1A = linkonce_odr hidden constant [3 x i8] c"1A\00" -// DARWIN: @_ZTI1A = linkonce_odr hidden constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1A to i64), i64 -9223372036854775808) to ptr) } +// ELF: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS1A } // ELF: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // ELF: @_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00" -// ELF: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS1A } auto ATI = typeid(A); diff --git a/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp b/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp index 174aeda89d17..f4396e402703 100644 --- a/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp +++ b/clang/test/CodeGenCXX/ptrauth-type-info-vtable.cpp @@ -60,12 +60,13 @@ static_assert(__has_feature(ptrauth_type_info_vtable_pointer_discrimination) == extern "C" int disc_std_type_info = __builtin_ptrauth_string_discriminator("_ZTVSt9type_info"); // CHECK: @_ZTV10TestStruct = unnamed_addr constant { [4 x ptr] } { [4 x ptr] [ptr null, ptr @_ZTI10TestStruct, ptr ptrauth (ptr @_ZN10TestStructD1Ev, i32 0, i64 52216, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV10TestStruct, i32 0, i32 0, i32 2)), ptr ptrauth (ptr @_ZN10TestStructD0Ev, i32 0, i64 39671, ptr getelementptr inbounds ({ [4 x ptr] }, ptr @_ZTV10TestStruct, i32 0, i32 0, i32 3))] }, align 8 -// CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] -// CHECK: @_ZTS10TestStruct = constant [13 x i8] c"10TestStruct\00", align 1 // NODISC: 
@_ZTI10TestStruct = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS10TestStruct }, align 8 -// DISC: @_ZTI10TestStruct = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2, i64 [[STDTYPEINFO_DISC]]), ptr @_ZTS10TestStruct }, align 8 +// DISC: @_ZTI10TestStruct = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2, i64 [[STDTYPEINFO_DISC]], ptr @_ZTI10TestStruct), ptr @_ZTS10TestStruct }, align 8 + +// CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] +// CHECK: @_ZTS10TestStruct = constant [13 x i8] c"10TestStruct\00", align 1 struct TestStruct { virtual ~TestStruct(); diff --git a/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp b/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp index 031bb48608af..b5c15a29eb6b 100644 --- a/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp +++ b/clang/test/CodeGenCXX/ptrauth-vtable-virtual-inheritance-thunk.cpp @@ -94,30 +94,30 @@ // CHECK-SAME: ptr ptrauth (ptr @_ZN1AD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 5)), // CHECK-SAME: ptr ptrauth (ptr @_ZN1AD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1A, i32 0, i32 0, i32 6))] }, align 8 +// CHECK: @_ZTI1A = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS1A }, align 8 + // CHECK: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // CHECK: @_ZTS1A = constant [3 x i8] c"1A\00", align 1 -// CHECK: @_ZTI1A = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS1A }, align 8 +// CHECK: @_ZTI1C = constant { ptr, ptr, i32, i32, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1C, i32 0, i32 1, ptr @_ZTI1B, i64 -6141 }, align 8 // CHECK: @_ZTVN10__cxxabiv121__vmi_class_type_infoE = external global [0 x ptr] // CHECK: @_ZTS1C = constant [3 x i8] c"1C\00", align 1 +// DARWIN: @_ZTI1B = linkonce_odr hidden constant { ptr, ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1B to i64), i64 -9223372036854775808) to ptr), ptr @_ZTI1A }, align 8 +// ELF: @_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), i32 2), ptr @_ZTS1B, ptr @_ZTI1A }, comdat, align 8 + // CHECK: @_ZTVN10__cxxabiv120__si_class_type_infoE = external global [0 x ptr] // DARWIN: @_ZTS1B = linkonce_odr hidden constant [3 x i8] c"1B\00", align 1 // ELF: @_ZTS1B = linkonce_odr constant [3 x i8] c"1B\00", comdat, align 1 -// DARWIN: @_ZTI1B = linkonce_odr hidden constant { ptr, ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), i32 2), ptr inttoptr (i64 add (i64 ptrtoint (ptr @_ZTS1B to i64), i64 -9223372036854775808) to ptr), ptr @_ZTI1A }, align 8 -// ELF: @_ZTI1B = linkonce_odr constant { ptr, ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv120__si_class_type_infoE, i64 2), i32 
2), ptr @_ZTS1B, ptr @_ZTI1A }, comdat, align 8 - -// CHECK: @_ZTI1C = constant { ptr, ptr, i32, i32, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1C, i32 0, i32 1, ptr @_ZTI1B, i64 -6141 }, align 8 +// CHECK: @_ZTI1D = constant { ptr, ptr, i32, i32, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1D, i32 0, i32 1, ptr @_ZTI1B, i64 -6141 }, align 8 // CHECK: @_ZTS1D = constant [3 x i8] c"1D\00", align 1 -// CHECK: @_ZTI1D = constant { ptr, ptr, i32, i32, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1D, i32 0, i32 1, ptr @_ZTI1B, i64 -6141 }, align 8 - // CHECK: @_ZTV1E = unnamed_addr constant { [7 x ptr] } { [7 x ptr] [ptr null, ptr @_ZTI1E, // CHECK-SAME: ptr ptrauth (ptr @_ZN1E1fEv, i32 0, i64 28408, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1E, i32 0, i32 0, i32 2)), // CHECK-SAME: ptr ptrauth (ptr @_ZN1E1gEv, i32 0, i64 22926, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1E, i32 0, i32 0, i32 3)), @@ -125,10 +125,10 @@ // CHECK-SAME: ptr ptrauth (ptr @_ZN1ED1Ev, i32 0, i64 5817, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1E, i32 0, i32 0, i32 5)), // CHECK-SAME: ptr ptrauth (ptr @_ZN1ED0Ev, i32 0, i64 26464, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1E, i32 0, i32 0, i32 6))] }, align 8 -// CHECK: @_ZTS1E = constant [3 x i8] c"1E\00", align 1 - // CHECK: @_ZTI1E = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2), ptr @_ZTS1E }, align 8 +// CHECK: @_ZTS1E = constant [3 x i8] c"1E\00", align 1 + // CHECK: @_ZTC1F0_1C = unnamed_addr constant { [5 x ptr], [11 x ptr] } { [5 x ptr] [ptr inttoptr (i64 16 to ptr), ptr null, ptr @_ZTI1C, // CHECK-SAME: ptr ptrauth (ptr @_ZN1CD1Ev, i32 0, i64 31214, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1F0_1C, i32 0, i32 0, i32 3)), // CHECK-SAME: ptr ptrauth (ptr @_ZN1CD0Ev, i32 0, i64 8507, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1F0_1C, i32 0, i32 0, i32 4))], [11 x ptr] [ptr inttoptr (i64 -16 to ptr), ptr null, ptr null, ptr null, ptr inttoptr (i64 -16 to ptr), ptr @_ZTI1C, @@ -149,10 +149,10 @@ // CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 1, i32 9)), // CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1F8_1D, i32 0, i32 1, i32 10))] }, align 8 -// CHECK: @_ZTS1F = constant [3 x i8] c"1F\00", align 1 - // CHECK: @_ZTI1F = constant { ptr, ptr, i32, i32, ptr, i64, ptr, i64, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1F, i32 3, i32 3, ptr @_ZTI1C, i64 2, ptr @_ZTI1D, i64 2050, ptr @_ZTI1E, i64 -8189 }, align 8 +// CHECK: @_ZTS1F = constant [3 x i8] c"1F\00", align 1 + // CHECK: @_ZTC1G0_1C = unnamed_addr constant { [5 x ptr], [11 x ptr] } { [5 x ptr] [ptr inttoptr (i64 24 to ptr), ptr null, ptr @_ZTI1C, // CHECK-SAME: ptr ptrauth (ptr @_ZN1CD1Ev, i32 0, i64 31214, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, ptr @_ZTC1G0_1C, i32 0, i32 0, i32 3)), // CHECK-SAME: ptr ptrauth (ptr @_ZN1CD0Ev, i32 0, i64 8507, ptr getelementptr inbounds ({ [5 x ptr], [11 x ptr] }, 
ptr @_ZTC1G0_1C, i32 0, i32 0, i32 4))], [11 x ptr] [ptr inttoptr (i64 -24 to ptr), ptr null, ptr null, ptr null, ptr inttoptr (i64 -24 to ptr), ptr @_ZTI1C, @@ -173,10 +173,10 @@ // CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD1Ev, i32 0, i64 2043, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 1, i32 9)), // CHECK-SAME: ptr ptrauth (ptr @_ZTv0_n48_N1DD0Ev, i32 0, i64 63674, ptr getelementptr inbounds ({ [7 x ptr], [11 x ptr] }, ptr @_ZTC1G8_1D, i32 0, i32 1, i32 10))] }, align 8 -// CHECK: @_ZTS1G = constant [3 x i8] c"1G\00", align 1 - // CHECK: @_ZTI1G = constant { ptr, ptr, i32, i32, ptr, i64, ptr, i64, ptr, i64 } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv121__vmi_class_type_infoE, i64 2), i32 2), ptr @_ZTS1G, i32 3, i32 3, ptr @_ZTI1E, i64 -8189, ptr @_ZTI1C, i64 2, ptr @_ZTI1D, i64 2050 }, align 8 +// CHECK: @_ZTS1G = constant [3 x i8] c"1G\00", align 1 + // CHECK: @_ZTV1B = linkonce_odr unnamed_addr constant { [7 x ptr] } { [7 x ptr] [ptr null, ptr @_ZTI1B, // CHECK-SAME: ptr ptrauth (ptr @_ZN1A1fEv, i32 0, i64 55636, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 2)), // CHECK-SAME: ptr ptrauth (ptr @_ZN1A1gEv, i32 0, i64 19402, ptr getelementptr inbounds ({ [7 x ptr] }, ptr @_ZTV1B, i32 0, i32 0, i32 3)), diff --git a/clang/test/CodeGenCXX/rtti-linkage.cpp b/clang/test/CodeGenCXX/rtti-linkage.cpp index ca50a1bc6f01..03e7cdedd346 100644 --- a/clang/test/CodeGenCXX/rtti-linkage.cpp +++ b/clang/test/CodeGenCXX/rtti-linkage.cpp @@ -3,73 +3,73 @@ #include <typeinfo> +// CHECK-BOTH: _ZTIP1C = internal constant // CHECK-BOTH: _ZTSP1C = internal constant -// CHECK-BOTH: _ZTS1C = internal constant // CHECK-BOTH: _ZTI1C = internal constant -// CHECK-BOTH: _ZTIP1C = internal constant -// CHECK-BOTH: _ZTSPP1C = internal constant +// CHECK-BOTH: _ZTS1C = internal constant // CHECK-BOTH: _ZTIPP1C = internal constant -// CHECK-BOTH: _ZTSM1Ci = internal constant +// CHECK-BOTH: _ZTSPP1C = internal constant // CHECK-BOTH: _ZTIM1Ci = internal constant -// CHECK-BOTH: _ZTSPM1Ci = internal constant +// CHECK-BOTH: _ZTSM1Ci = internal constant // CHECK-BOTH: _ZTIPM1Ci = internal constant -// CHECK-BOTH: _ZTSM1CS_ = internal constant +// CHECK-BOTH: _ZTSPM1Ci = internal constant // CHECK-BOTH: _ZTIM1CS_ = internal constant -// CHECK-BOTH: _ZTSM1CPS_ = internal constant +// CHECK-BOTH: _ZTSM1CS_ = internal constant // CHECK-BOTH: _ZTIM1CPS_ = internal constant +// CHECK-BOTH: _ZTSM1CPS_ = internal constant +// CHECK-BOTH: _ZTIM1A1C = internal constant // CHECK-BOTH: _ZTSM1A1C = internal constant -// CHECK: _ZTS1A = linkonce_odr constant -// CHECK-WITH-HIDDEN: _ZTS1A = linkonce_odr hidden constant // CHECK: _ZTI1A = linkonce_odr constant // CHECK-WITH-HIDDEN: _ZTI1A = linkonce_odr hidden constant -// CHECK-BOTH: _ZTIM1A1C = internal constant -// CHECK-BOTH: _ZTSM1AP1C = internal constant +// CHECK: _ZTS1A = linkonce_odr constant +// CHECK-WITH-HIDDEN: _ZTS1A = linkonce_odr hidden constant // CHECK-BOTH: _ZTIM1AP1C = internal constant +// CHECK-BOTH: _ZTSM1AP1C = internal constant // CHECK-WITH-HIDDEN: _ZTSFN12_GLOBAL__N_11DEvE = internal constant -// CHECK-WITH-HIDDEN: @_ZTSPK2T4 = linkonce_odr hidden constant -// CHECK-WITH-HIDDEN: @_ZTS2T4 = linkonce_odr hidden constant -// CHECK-WITH-HIDDEN: @_ZTI2T4 = linkonce_odr hidden constant -// CHECK-WITH-HIDDEN: @_ZTIPK2T4 = linkonce_odr hidden constant -// CHECK-WITH-HIDDEN: @_ZTSZ2t5vE1A = internal constant +// CHECK-WITH-HIDDEN: @_ZTIPK2T4 = linkonce_odr hidden constant +//
CHECK-WITH-HIDDEN: @_ZTSPK2T4 = linkonce_odr hidden constant +// CHECK-WITH-HIDDEN: @_ZTI2T4 = linkonce_odr hidden constant +// CHECK-WITH-HIDDEN: @_ZTS2T4 = linkonce_odr hidden constant // CHECK-WITH-HIDDEN: @_ZTIZ2t5vE1A = internal constant -// CHECK-WITH-HIDDEN: @_ZTSZ2t6vE1A = linkonce_odr hidden constant +// CHECK-WITH-HIDDEN: @_ZTSZ2t5vE1A = internal constant // CHECK-WITH-HIDDEN: @_ZTIZ2t6vE1A = linkonce_odr hidden constant +// CHECK-WITH-HIDDEN: @_ZTSZ2t6vE1A = linkonce_odr hidden constant +// CHECK-WITH-HIDDEN: @_ZTIPZ2t7vE1A = linkonce_odr hidden constant // CHECK-WITH-HIDDEN: @_ZTSPZ2t7vE1A = linkonce_odr hidden constant -// CHECK-WITH-HIDDEN: @_ZTSZ2t7vE1A = linkonce_odr hidden constant // CHECK-WITH-HIDDEN: @_ZTIZ2t7vE1A = linkonce_odr hidden constant -// CHECK-WITH-HIDDEN: @_ZTIPZ2t7vE1A = linkonce_odr hidden constant +// CHECK-WITH-HIDDEN: @_ZTSZ2t7vE1A = linkonce_odr hidden constant -// CHECK: _ZTSN12_GLOBAL__N_11DE = internal constant // CHECK: _ZTIN12_GLOBAL__N_11DE = internal constant -// CHECK: _ZTSPN12_GLOBAL__N_11DE = internal constant +// CHECK: _ZTSN12_GLOBAL__N_11DE = internal constant // CHECK: _ZTIPN12_GLOBAL__N_11DE = internal constant -// CHECK: _ZTSFN12_GLOBAL__N_11DEvE = internal constant +// CHECK: _ZTSPN12_GLOBAL__N_11DE = internal constant // CHECK: _ZTIFN12_GLOBAL__N_11DEvE = internal constant -// CHECK: _ZTSFvN12_GLOBAL__N_11DEE = internal constant +// CHECK: _ZTSFN12_GLOBAL__N_11DEvE = internal constant // CHECK: _ZTIFvN12_GLOBAL__N_11DEE = internal constant +// CHECK: _ZTSFvN12_GLOBAL__N_11DEE = internal constant +// CHECK: _ZTIPFvvE = linkonce_odr constant // CHECK: _ZTSPFvvE = linkonce_odr constant -// CHECK: _ZTSFvvE = linkonce_odr constant // CHECK: _ZTIFvvE = linkonce_odr constant -// CHECK: _ZTIPFvvE = linkonce_odr constant -// CHECK: _ZTSN12_GLOBAL__N_11EE = internal constant +// CHECK: _ZTSFvvE = linkonce_odr constant // CHECK: _ZTIN12_GLOBAL__N_11EE = internal constant -// CHECK: _ZTSA10_i = linkonce_odr constant +// CHECK: _ZTSN12_GLOBAL__N_11EE = internal constant // CHECK: _ZTIA10_i = linkonce_odr constant +// CHECK: _ZTSA10_i = linkonce_odr constant // CHECK: _ZTI1TILj0EE = linkonce_odr constant // CHECK: _ZTI1TILj1EE = weak_odr constant // CHECK: _ZTI1TILj2EE = external constant -// CHECK: _ZTSZ2t5vE1A = internal constant // CHECK: _ZTIZ2t5vE1A = internal constant -// CHECK: _ZTS1B ={{.*}} constant +// CHECK: _ZTSZ2t5vE1A = internal constant // CHECK: _ZTI1B ={{.*}} constant +// CHECK: _ZTS1B ={{.*}} constant // CHECK: _ZTS1F = linkonce_odr constant -// CHECK: _ZTSZ2t6vE1A = linkonce_odr constant // CHECK: _ZTIZ2t6vE1A = linkonce_odr constant +// CHECK: _ZTSZ2t6vE1A = linkonce_odr constant +// CHECK: _ZTIPZ2t7vE1A = linkonce_odr constant // CHECK: _ZTSPZ2t7vE1A = linkonce_odr constant -// CHECK: _ZTSZ2t7vE1A = linkonce_odr constant // CHECK: _ZTIZ2t7vE1A = linkonce_odr constant -// CHECK: _ZTIPZ2t7vE1A = linkonce_odr constant +// CHECK: _ZTSZ2t7vE1A = linkonce_odr constant // CHECK: _ZTIN12_GLOBAL__N_11DE diff --git a/clang/test/CodeGenCXX/rtti-visibility.cpp b/clang/test/CodeGenCXX/rtti-visibility.cpp index 5945be5c73a2..1813fee658c7 100644 --- a/clang/test/CodeGenCXX/rtti-visibility.cpp +++ b/clang/test/CodeGenCXX/rtti-visibility.cpp @@ -6,10 +6,10 @@ namespace Test1 { // A is explicitly marked hidden, so all RTTI data should also be marked hidden. 
- // CHECK-TEST1: @_ZTSN5Test11AE = linkonce_odr hidden constant // CHECK-TEST1: @_ZTIN5Test11AE = linkonce_odr hidden constant - // CHECK-TEST1: @_ZTSPN5Test11AE = linkonce_odr hidden constant + // CHECK-TEST1: @_ZTSN5Test11AE = linkonce_odr hidden constant // CHECK-TEST1: @_ZTIPN5Test11AE = linkonce_odr hidden constant + // CHECK-TEST1: @_ZTSPN5Test11AE = linkonce_odr hidden constant struct __attribute__((visibility("hidden"))) A { }; void f() { @@ -20,8 +20,8 @@ namespace Test1 { namespace Test2 { // A is weak, so its linkage should be linkonce_odr, but not marked hidden. - // CHECK-TEST2: @_ZTSN5Test21AE = linkonce_odr constant // CHECK-TEST2: @_ZTIN5Test21AE = linkonce_odr constant + // CHECK-TEST2: @_ZTSN5Test21AE = linkonce_odr constant struct A { }; void f() { (void)typeid(A); diff --git a/clang/test/CodeGenCXX/symbol-partition.cpp b/clang/test/CodeGenCXX/symbol-partition.cpp index ecc58e2a847d..cefeeac63f01 100644 --- a/clang/test/CodeGenCXX/symbol-partition.cpp +++ b/clang/test/CodeGenCXX/symbol-partition.cpp @@ -2,8 +2,8 @@ // CHECK: @gv = {{.*}}, partition "foo" // CHECK: @_ZTV1S = {{.*}}, partition "foo" -// CHECK: @_ZTS1S = {{.*}}, partition "foo" // CHECK: @_ZTI1S = {{.*}}, partition "foo" +// CHECK: @_ZTS1S = {{.*}}, partition "foo" // CHECK: @_Z5ifuncv = {{.*}}, partition "foo" diff --git a/clang/test/CodeGenCXX/type_visibility.cpp b/clang/test/CodeGenCXX/type_visibility.cpp index 13aafcff0fa1..00833e36944d 100644 --- a/clang/test/CodeGenCXX/type_visibility.cpp +++ b/clang/test/CodeGenCXX/type_visibility.cpp @@ -26,12 +26,12 @@ namespace temp0 { template struct B<A>; // FUNS-LABEL: define weak_odr void @_ZN5temp01BINS_1AEE3fooEv( // VARS: @_ZTVN5temp01BINS_1AEEE = weak_odr unnamed_addr constant - // VARS: @_ZTSN5temp01BINS_1AEEE = weak_odr constant // VARS: @_ZTIN5temp01BINS_1AEEE = weak_odr constant + // VARS: @_ZTSN5temp01BINS_1AEEE = weak_odr constant // FUNS-HIDDEN-LABEL: define weak_odr hidden void @_ZN5temp01BINS_1AEE3fooEv( // VARS-HIDDEN: @_ZTVN5temp01BINS_1AEEE = weak_odr hidden unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5temp01BINS_1AEEE = weak_odr hidden constant // VARS-HIDDEN: @_ZTIN5temp01BINS_1AEEE = weak_odr hidden constant + // VARS-HIDDEN: @_ZTSN5temp01BINS_1AEEE = weak_odr hidden constant } namespace temp1 { @@ -43,12 +43,12 @@ namespace temp1 { template struct B<A>; // FUNS-LABEL: define weak_odr void @_ZN5temp11BINS_1AEE3fooEv( // VARS: @_ZTVN5temp11BINS_1AEEE = weak_odr unnamed_addr constant - // VARS: @_ZTSN5temp11BINS_1AEEE = weak_odr constant // VARS: @_ZTIN5temp11BINS_1AEEE = weak_odr constant + // VARS: @_ZTSN5temp11BINS_1AEEE = weak_odr constant // FUNS-HIDDEN-LABEL: define weak_odr hidden void @_ZN5temp11BINS_1AEE3fooEv( // VARS-HIDDEN: @_ZTVN5temp11BINS_1AEEE = weak_odr unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5temp11BINS_1AEEE = weak_odr constant // VARS-HIDDEN: @_ZTIN5temp11BINS_1AEEE = weak_odr constant + // VARS-HIDDEN: @_ZTSN5temp11BINS_1AEEE = weak_odr constant } namespace temp2 { @@ -60,12 +60,12 @@ namespace temp2 { template struct B<A>; // FUNS-LABEL: define weak_odr void @_ZN5temp21BINS_1AEE3fooEv( // VARS: @_ZTVN5temp21BINS_1AEEE = weak_odr unnamed_addr constant - // VARS: @_ZTSN5temp21BINS_1AEEE = weak_odr constant // VARS: @_ZTIN5temp21BINS_1AEEE = weak_odr constant + // VARS: @_ZTSN5temp21BINS_1AEEE = weak_odr constant // FUNS-HIDDEN-LABEL: define weak_odr hidden void @_ZN5temp21BINS_1AEE3fooEv( // VARS-HIDDEN: @_ZTVN5temp21BINS_1AEEE = weak_odr hidden unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5temp21BINS_1AEEE = weak_odr
hidden constant // VARS-HIDDEN: @_ZTIN5temp21BINS_1AEEE = weak_odr hidden constant + // VARS-HIDDEN: @_ZTSN5temp21BINS_1AEEE = weak_odr hidden constant } namespace temp3 { @@ -77,12 +77,12 @@ namespace temp3 { template struct B<A>; // FUNS-LABEL: define weak_odr hidden void @_ZN5temp31BINS_1AEE3fooEv( // VARS: @_ZTVN5temp31BINS_1AEEE = weak_odr hidden unnamed_addr constant - // VARS: @_ZTSN5temp31BINS_1AEEE = weak_odr hidden constant // VARS: @_ZTIN5temp31BINS_1AEEE = weak_odr hidden constant + // VARS: @_ZTSN5temp31BINS_1AEEE = weak_odr hidden constant // FUNS-HIDDEN-LABEL: define weak_odr hidden void @_ZN5temp31BINS_1AEE3fooEv( // VARS-HIDDEN: @_ZTVN5temp31BINS_1AEEE = weak_odr hidden unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5temp31BINS_1AEEE = weak_odr hidden constant // VARS-HIDDEN: @_ZTIN5temp31BINS_1AEEE = weak_odr hidden constant + // VARS-HIDDEN: @_ZTSN5temp31BINS_1AEEE = weak_odr hidden constant } namespace temp4 { @@ -94,12 +94,12 @@ namespace temp4 { template struct B<A>; // FUNS-LABEL: define weak_odr void @_ZN5temp41BINS_1AEE3fooEv( // VARS: @_ZTVN5temp41BINS_1AEEE = weak_odr hidden unnamed_addr constant - // VARS: @_ZTSN5temp41BINS_1AEEE = weak_odr hidden constant // VARS: @_ZTIN5temp41BINS_1AEEE = weak_odr hidden constant + // VARS: @_ZTSN5temp41BINS_1AEEE = weak_odr hidden constant // FUNS-HIDDEN-LABEL: define weak_odr hidden void @_ZN5temp41BINS_1AEE3fooEv( // VARS-HIDDEN: @_ZTVN5temp41BINS_1AEEE = weak_odr hidden unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5temp41BINS_1AEEE = weak_odr hidden constant // VARS-HIDDEN: @_ZTIN5temp41BINS_1AEEE = weak_odr hidden constant + // VARS-HIDDEN: @_ZTSN5temp41BINS_1AEEE = weak_odr hidden constant } namespace type0 { @@ -110,12 +110,12 @@ namespace type0 { void A::foo() {} // FUNS-LABEL: define void @_ZN5type01A3fooEv( // VARS: @_ZTVN5type01AE = unnamed_addr constant - // VARS: @_ZTSN5type01AE = constant // VARS: @_ZTIN5type01AE = constant + // VARS: @_ZTSN5type01AE = constant // FUNS-HIDDEN-LABEL: define hidden void @_ZN5type01A3fooEv( // VARS-HIDDEN: @_ZTVN5type01AE = unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5type01AE = constant // VARS-HIDDEN: @_ZTIN5type01AE = constant + // VARS-HIDDEN: @_ZTSN5type01AE = constant } namespace type1 { @@ -126,12 +126,12 @@ namespace type1 { void A::foo() {} // FUNS-LABEL: define hidden void @_ZN5type11A3fooEv( // VARS: @_ZTVN5type11AE = unnamed_addr constant - // VARS: @_ZTSN5type11AE = constant // VARS: @_ZTIN5type11AE = constant + // VARS: @_ZTSN5type11AE = constant // FUNS-HIDDEN-LABEL: define hidden void @_ZN5type11A3fooEv( // VARS-HIDDEN: @_ZTVN5type11AE = unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5type11AE = constant // VARS-HIDDEN: @_ZTIN5type11AE = constant + // VARS-HIDDEN: @_ZTSN5type11AE = constant } namespace type2 { @@ -142,12 +142,12 @@ namespace type2 { void A::foo() {} // FUNS-LABEL: define void @_ZN5type21A3fooEv( // VARS: @_ZTVN5type21AE = hidden unnamed_addr constant - // VARS: @_ZTSN5type21AE = hidden constant // VARS: @_ZTIN5type21AE = hidden constant + // VARS: @_ZTSN5type21AE = hidden constant // FUNS-HIDDEN-LABEL: define hidden void @_ZN5type21A3fooEv( // VARS-HIDDEN: @_ZTVN5type21AE = hidden unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5type21AE = hidden constant // VARS-HIDDEN: @_ZTIN5type21AE = hidden constant + // VARS-HIDDEN: @_ZTSN5type21AE = hidden constant } namespace type3 { @@ -158,11 +158,11 @@ namespace type3 { void A::foo() {} // FUNS-LABEL: define void @_ZN5type31A3fooEv( // VARS: @_ZTVN5type31AE = hidden unnamed_addr constant - // VARS:
@_ZTSN5type31AE = hidden constant // VARS: @_ZTIN5type31AE = hidden constant + // VARS: @_ZTSN5type31AE = hidden constant // FUNS-HIDDEN-LABEL: define void @_ZN5type31A3fooEv( // VARS-HIDDEN: @_ZTVN5type31AE = hidden unnamed_addr constant - // VARS-HIDDEN: @_ZTSN5type31AE = hidden constant // VARS-HIDDEN: @_ZTIN5type31AE = hidden constant + // VARS-HIDDEN: @_ZTSN5type31AE = hidden constant } diff --git a/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp b/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp index 60eb8f17f91f..68eb5cb48647 100644 --- a/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp +++ b/clang/test/CodeGenCXX/typeinfo-with-address-space.cpp @@ -15,12 +15,12 @@ class B : A { // NO-AS: @_ZTISt9type_info = external constant ptr // AS: @_ZTIi = external addrspace(1) constant ptr addrspace(1) // NO-AS: @_ZTIi = external constant ptr +// AS: @_ZTI1A = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1A }, comdat, align 8 +// NO-AS: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr @_ZTS1A }, comdat, align 8 // AS: @_ZTVN10__cxxabiv117__class_type_infoE = external addrspace(1) global [0 x ptr addrspace(1)] // NO-AS: @_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr] // AS: @_ZTS1A = linkonce_odr addrspace(1) constant [3 x i8] c"1A\00", comdat, align 1 // NO-AS: @_ZTS1A = linkonce_odr constant [3 x i8] c"1A\00", comdat, align 1 -// AS: @_ZTI1A = linkonce_odr addrspace(1) constant { ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1A }, comdat, align 8 -// NO-AS: @_ZTI1A = linkonce_odr constant { ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr @_ZTS1A }, comdat, align 8 // AS: @_ZTIf = external addrspace(1) constant ptr addrspace(1) // NO-AS: @_ZTIf = external constant ptr diff --git a/clang/test/CodeGenCXX/visibility-ms-compat.cpp b/clang/test/CodeGenCXX/visibility-ms-compat.cpp index 525691358832..0344803909cd 100644 --- a/clang/test/CodeGenCXX/visibility-ms-compat.cpp +++ b/clang/test/CodeGenCXX/visibility-ms-compat.cpp @@ -24,8 +24,8 @@ namespace test0 { // CHECK: declare void @_ZN5test01A3barEv() const std::type_info &ti = typeid(A); - // CHECK-GLOBAL: @_ZTSN5test01AE = linkonce_odr constant // CHECK-GLOBAL: @_ZTIN5test01AE = linkonce_odr constant + // CHECK-GLOBAL: @_ZTSN5test01AE = linkonce_odr constant // CHECK-GLOBAL: @_ZN5test02tiE = hidden constant } @@ -40,8 +40,8 @@ namespace test1 { // CHECK: declare hidden void @_ZN5test11A3barEv() const std::type_info &ti = typeid(A); - // CHECK-GLOBAL: @_ZTSN5test11AE = linkonce_odr hidden constant // CHECK-GLOBAL: @_ZTIN5test11AE = linkonce_odr hidden constant + // CHECK-GLOBAL: @_ZTSN5test11AE = linkonce_odr hidden constant // CHECK-GLOBAL: @_ZN5test12tiE = hidden constant } @@ -56,8 +56,8 @@ namespace test2 { // CHECK: declare void @_ZN5test21A3barEv() const std::type_info &ti = typeid(A); - // CHECK-GLOBAL: @_ZTSN5test21AE = linkonce_odr constant // CHECK-GLOBAL: @_ZTIN5test21AE = linkonce_odr constant + // CHECK-GLOBAL: @_ZTSN5test21AE = linkonce_odr constant // CHECK-GLOBAL: @_ZN5test22tiE = hidden constant } @@ -73,8 +73,8 @@ namespace test3 { // CHECK: 
diff --git a/clang/test/CodeGenCXX/visibility-ms-compat.cpp b/clang/test/CodeGenCXX/visibility-ms-compat.cpp
index 525691358832..0344803909cd 100644
--- a/clang/test/CodeGenCXX/visibility-ms-compat.cpp
+++ b/clang/test/CodeGenCXX/visibility-ms-compat.cpp
@@ -24,8 +24,8 @@ namespace test0 {
   // CHECK: declare void @_ZN5test01A3barEv()
 
   const std::type_info &ti = typeid(A);
-  // CHECK-GLOBAL: @_ZTSN5test01AE = linkonce_odr constant
   // CHECK-GLOBAL: @_ZTIN5test01AE = linkonce_odr constant
+  // CHECK-GLOBAL: @_ZTSN5test01AE = linkonce_odr constant
   // CHECK-GLOBAL: @_ZN5test02tiE = hidden constant
 }
 
@@ -40,8 +40,8 @@ namespace test1 {
   // CHECK: declare hidden void @_ZN5test11A3barEv()
 
   const std::type_info &ti = typeid(A);
-  // CHECK-GLOBAL: @_ZTSN5test11AE = linkonce_odr hidden constant
   // CHECK-GLOBAL: @_ZTIN5test11AE = linkonce_odr hidden constant
+  // CHECK-GLOBAL: @_ZTSN5test11AE = linkonce_odr hidden constant
   // CHECK-GLOBAL: @_ZN5test12tiE = hidden constant
 }
 
@@ -56,8 +56,8 @@ namespace test2 {
   // CHECK: declare void @_ZN5test21A3barEv()
 
   const std::type_info &ti = typeid(A);
-  // CHECK-GLOBAL: @_ZTSN5test21AE = linkonce_odr constant
   // CHECK-GLOBAL: @_ZTIN5test21AE = linkonce_odr constant
+  // CHECK-GLOBAL: @_ZTSN5test21AE = linkonce_odr constant
   // CHECK-GLOBAL: @_ZN5test22tiE = hidden constant
 }
 
@@ -73,8 +73,8 @@ namespace test3 {
   // CHECK: declare void @_ZN5test31BINS_1AEE3barEv()
 
   const std::type_info &ti = typeid(B<A>);
-  // CHECK-GLOBAL: @_ZTSN5test31BINS_1AEEE = linkonce_odr constant
   // CHECK-GLOBAL: @_ZTIN5test31BINS_1AEEE = linkonce_odr constant
+  // CHECK-GLOBAL: @_ZTSN5test31BINS_1AEEE = linkonce_odr constant
 }
 
@@ -89,8 +89,8 @@ namespace test4 {
   // CHECK: declare void @_ZN5test41BINS_1AEE3barEv()
 
   const std::type_info &ti = typeid(B<A>);
-  // CHECK-GLOBAL: @_ZTSN5test41BINS_1AEEE = linkonce_odr constant
   // CHECK-GLOBAL: @_ZTIN5test41BINS_1AEEE = linkonce_odr constant
+  // CHECK-GLOBAL: @_ZTSN5test41BINS_1AEEE = linkonce_odr constant
 }
 
@@ -105,6 +105,6 @@ namespace test5 {
   // CHECK: declare hidden void @_ZN5test51BINS_1AEE3barEv()
 
   const std::type_info &ti = typeid(B<A>);
-  // CHECK-GLOBAL: @_ZTSN5test51BINS_1AEEE = linkonce_odr hidden constant
   // CHECK-GLOBAL: @_ZTIN5test51BINS_1AEEE = linkonce_odr hidden constant
+  // CHECK-GLOBAL: @_ZTSN5test51BINS_1AEEE = linkonce_odr hidden constant
 }
diff --git a/clang/test/CodeGenCXX/vtable-align-address-space.cpp b/clang/test/CodeGenCXX/vtable-align-address-space.cpp
index 5eac0bd75dc5..5eccf0a0d77d 100644
--- a/clang/test/CodeGenCXX/vtable-align-address-space.cpp
+++ b/clang/test/CodeGenCXX/vtable-align-address-space.cpp
@@ -9,5 +9,5 @@ struct A {
 void A::f() {}
 
 // CHECK: @_ZTV1A ={{.*}} unnamed_addr addrspace(1) constant { [5 x ptr addrspace(1)] } { [5 x ptr addrspace(1)] [ptr addrspace(1) null, ptr addrspace(1) @_ZTI1A, ptr addrspace(1) addrspacecast (ptr @_ZN1A1fEv to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @_ZN1A1gEv to ptr addrspace(1)), ptr addrspace(1) addrspacecast (ptr @_ZN1A1hEv to ptr addrspace(1))]
-// CHECK: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1
 // CHECK: @_ZTI1A ={{.*}} addrspace(1) constant { ptr addrspace(1), ptr addrspace(1) } { ptr addrspace(1) getelementptr inbounds (ptr addrspace(1), ptr addrspace(1) @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr addrspace(1) @_ZTS1A }, align 8
+// CHECK: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1
diff --git a/clang/test/CodeGenCXX/vtable-align.cpp b/clang/test/CodeGenCXX/vtable-align.cpp
index fb8ff1a582ec..f1d5e09b9730 100644
--- a/clang/test/CodeGenCXX/vtable-align.cpp
+++ b/clang/test/CodeGenCXX/vtable-align.cpp
@@ -10,8 +10,8 @@ struct A {
 void A::f() {}
 
 // CHECK-32: @_ZTV1A ={{.*}} unnamed_addr constant { [5 x ptr] } { [5 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A1fEv, ptr @_ZN1A1gEv, ptr @_ZN1A1hEv] }, align 4
-// CHECK-32: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1
 // CHECK-32: @_ZTI1A ={{.*}} constant { ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i32 2), ptr @_ZTS1A }, align 4
+// CHECK-32: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1
 // CHECK-64: @_ZTV1A ={{.*}} unnamed_addr constant { [5 x ptr] } { [5 x ptr] [ptr null, ptr @_ZTI1A, ptr @_ZN1A1fEv, ptr @_ZN1A1gEv, ptr @_ZN1A1hEv] }, align 8
-// CHECK-64: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1
 // CHECK-64: @_ZTI1A ={{.*}} constant { ptr, ptr } { ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), ptr @_ZTS1A }, align 8
+// CHECK-64: @_ZTS1A ={{.*}} constant [3 x i8] c"1A\00", align 1
diff --git a/clang/test/CodeGenCXX/vtable-available-externally.cpp b/clang/test/CodeGenCXX/vtable-available-externally.cpp
index ab105260bc75..4415e24f3f1c 100644
--- a/clang/test/CodeGenCXX/vtable-available-externally.cpp
+++ b/clang/test/CodeGenCXX/vtable-available-externally.cpp
@@ -49,8 +49,8 @@ void g() {
 
 // This tests mainly that the typeinfo and typename constants have their linkage
 // updated correctly.
-// CHECK-TEST2: @_ZTSN5Test21AE ={{.*}} constant
 // CHECK-TEST2: @_ZTIN5Test21AE ={{.*}} constant
+// CHECK-TEST2: @_ZTSN5Test21AE ={{.*}} constant
 // CHECK-TEST2: @_ZTVN5Test21AE ={{.*}} unnamed_addr constant
 namespace Test2 {
 struct A {
diff --git a/clang/test/CodeGenCXX/vtable-key-function-arm.cpp b/clang/test/CodeGenCXX/vtable-key-function-arm.cpp
index a054fd87c8ea..83889bf9f8db 100644
--- a/clang/test/CodeGenCXX/vtable-key-function-arm.cpp
+++ b/clang/test/CodeGenCXX/vtable-key-function-arm.cpp
@@ -90,8 +90,8 @@ struct Test2a {
 // V-table should be defined with strong linkage.
 Test2a::Test2a() { use(typeid(Test2a)); }
 // CHECK: @_ZTV6Test2a ={{.*}} unnamed_addr constant
-// CHECK-LATE: @_ZTS6Test2a ={{.*}} constant
 // CHECK-LATE: @_ZTI6Test2a ={{.*}} constant
+// CHECK-LATE: @_ZTS6Test2a ={{.*}} constant
 
 // 'bar' becomes the key function when 'foo' is defined inline.
 void Test2a::bar() {}
 
@@ -111,8 +111,8 @@ void Test2b::bar() {}
 
 // V-table should be defined with strong linkage.
 Test2b::Test2b() { use(typeid(Test2b)); }
 // CHECK: @_ZTV6Test2b ={{.*}} unnamed_addr constant
-// CHECK-LATE: @_ZTS6Test2b ={{.*}} constant
 // CHECK-LATE: @_ZTI6Test2b ={{.*}} constant
+// CHECK-LATE: @_ZTS6Test2b ={{.*}} constant
 
 inline void Test2b::foo() {}
 
@@ -131,8 +131,8 @@ inline void Test2c::foo() {}
 
 // V-table should be defined with strong linkage.
 Test2c::Test2c() { use(typeid(Test2c)); }
 // CHECK: @_ZTV6Test2c ={{.*}} unnamed_addr constant
-// CHECK: @_ZTS6Test2c ={{.*}} constant
 // CHECK: @_ZTI6Test2c ={{.*}} constant
+// CHECK: @_ZTS6Test2c ={{.*}} constant
 
 /*** Test3a ******************************************************************/
 
@@ -145,8 +145,8 @@ struct Test3a {
 // V-table should be defined with weak linkage.
 Test3a::Test3a() { use(typeid(Test3a)); }
 // CHECK: @_ZTV6Test3a = linkonce_odr unnamed_addr constant
-// CHECK-LATE: @_ZTS6Test3a = linkonce_odr constant
 // CHECK-LATE: @_ZTI6Test3a = linkonce_odr constant
+// CHECK-LATE: @_ZTS6Test3a = linkonce_odr constant
 
 // There ceases to be a key function after these declarations.
 inline void Test3a::bar() {}
 
@@ -166,8 +166,8 @@ inline void Test3b::bar() {}
 
 // V-table should be defined with weak linkage.
 Test3b::Test3b() { use(typeid(Test3b)); }
 // CHECK: @_ZTV6Test3b = linkonce_odr unnamed_addr constant
-// CHECK-LATE: @_ZTS6Test3b = linkonce_odr constant
 // CHECK-LATE: @_ZTI6Test3b = linkonce_odr constant
+// CHECK-LATE: @_ZTS6Test3b = linkonce_odr constant
 
 inline void Test3b::foo() {}
 
@@ -186,8 +186,8 @@ inline void Test3c::foo() {}
 
 // V-table should be defined with weak linkage.
 Test3c::Test3c() { use(typeid(Test3c)); }
 // CHECK: @_ZTV6Test3c = linkonce_odr unnamed_addr constant
-// CHECK: @_ZTS6Test3c = linkonce_odr constant
 // CHECK: @_ZTI6Test3c = linkonce_odr constant
+// CHECK: @_ZTS6Test3c = linkonce_odr constant
 
 /*** Test4a ******************************************************************/
 
@@ -200,8 +200,8 @@ template <class T> struct Test4a {
 // V-table should be defined with weak linkage.
 template <> Test4a<int>::Test4a() { use(typeid(Test4a<int>)); }
 // CHECK: @_ZTV6Test4aIiE = linkonce_odr unnamed_addr constant
-// CHECK: @_ZTS6Test4aIiE = linkonce_odr constant
 // CHECK: @_ZTI6Test4aIiE = linkonce_odr constant
+// CHECK: @_ZTS6Test4aIiE = linkonce_odr constant
 
 // There ceases to be a key function after these declarations.
 template <> inline void Test4a<int>::bar() {}
 
@@ -221,8 +221,8 @@ template <> inline void Test4b<int>::bar() {}
 
 // V-table should be defined with weak linkage.
 template <> Test4b<int>::Test4b() { use(typeid(Test4b<int>)); }
 // CHECK: @_ZTV6Test4bIiE = linkonce_odr unnamed_addr constant
-// CHECK: @_ZTS6Test4bIiE = linkonce_odr constant
 // CHECK: @_ZTI6Test4bIiE = linkonce_odr constant
+// CHECK: @_ZTS6Test4bIiE = linkonce_odr constant
 
 template <> inline void Test4b<int>::foo() {}
 
@@ -241,8 +241,8 @@ template <> inline void Test4c<int>::foo() {}
 
 // V-table should be defined with weak linkage.
 template <> Test4c<int>::Test4c() { use(typeid(Test4c<int>)); }
 // CHECK: @_ZTV6Test4cIiE = linkonce_odr unnamed_addr constant
-// CHECK: @_ZTS6Test4cIiE = linkonce_odr constant
 // CHECK: @_ZTI6Test4cIiE = linkonce_odr constant
+// CHECK: @_ZTS6Test4cIiE = linkonce_odr constant
 
 /*** Test5a ******************************************************************/
 
@@ -258,8 +258,8 @@ template <> inline void Test5a<int>::foo();
 
 // V-table should be defined with weak linkage.
 template <> Test5a<int>::Test5a() { use(typeid(Test5a<int>)); }
 // CHECK: @_ZTV6Test5aIiE = linkonce_odr unnamed_addr constant
-// CHECK: @_ZTS6Test5aIiE = linkonce_odr constant
 // CHECK: @_ZTI6Test5aIiE = linkonce_odr constant
+// CHECK: @_ZTS6Test5aIiE = linkonce_odr constant
 
 // There ceases to be a key function after these declarations.
 template <> inline void Test5a<int>::bar() {}
 
@@ -280,8 +280,8 @@ template <> inline void Test5b<int>::bar() {}
 
 // V-table should be defined with weak linkage.
 template <> Test5b<int>::Test5b() { use(typeid(Test5b<int>)); }
 // CHECK: @_ZTV6Test5bIiE = linkonce_odr unnamed_addr constant
-// CHECK: @_ZTS6Test5bIiE = linkonce_odr constant
 // CHECK: @_ZTI6Test5bIiE = linkonce_odr constant
+// CHECK: @_ZTS6Test5bIiE = linkonce_odr constant
 
 template <> inline void Test5a<int>::foo();
 template <> inline void Test5b<int>::foo() {}
 
@@ -303,5 +303,5 @@ template <> inline void Test5c<int>::foo() {}
 
 // V-table should be defined with weak linkage.
 template <> Test5c<int>::Test5c() { use(typeid(Test5c<int>)); }
 // CHECK: @_ZTV6Test5cIiE = linkonce_odr unnamed_addr constant
-// CHECK: @_ZTS6Test5cIiE = linkonce_odr constant
 // CHECK: @_ZTI6Test5cIiE = linkonce_odr constant
+// CHECK: @_ZTS6Test5cIiE = linkonce_odr constant
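The arm and ios key-function tests in this file and the next both revolve around the Itanium/ARM "key function" rule, which the in-test comments ('bar' becomes the key function...) assume. As a quick standalone refresher, not part of the patch:

  // The key function is the first non-pure, non-inline virtual member
  // function; its home TU emits the strong vtable, typeinfo and name.
  struct HasKey {
    virtual void foo();    // declared, defined out of line => key function
    virtual void bar() {}  // inline: cannot be the key function
  };
  void HasKey::foo() {}    // this TU owns _ZTV6HasKey/_ZTI6HasKey/_ZTS6HasKey

  struct NoKey {
    virtual void baz() {}  // every virtual is inline => no key function;
  };                       // vtable and RTTI are linkonce_odr wherever used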
diff --git a/clang/test/CodeGenCXX/vtable-key-function-ios.cpp b/clang/test/CodeGenCXX/vtable-key-function-ios.cpp
index ff2793ad51f9..43abfb62c73a 100644
--- a/clang/test/CodeGenCXX/vtable-key-function-ios.cpp
+++ b/clang/test/CodeGenCXX/vtable-key-function-ios.cpp
@@ -63,8 +63,8 @@ struct Test1a {
 // V-table needs to be defined weakly.
 Test1a::Test1a() { use(typeid(Test1a)); }
 // CHECK: @_ZTV6Test1a = linkonce_odr {{(dso_local )?}}unnamed_addr constant
-// CHECK-LATE: @_ZTS6Test1a = linkonce_odr {{(dso_local )?}}constant
 // CHECK-LATE: @_ZTI6Test1a = linkonce_odr {{(dso_local )?}}constant
+// CHECK-LATE: @_ZTS6Test1a = linkonce_odr {{(dso_local )?}}constant
 
 // This defines the key function.
 inline void Test1a::foo() {}
 
@@ -83,8 +83,8 @@ inline void Test1b::foo() {}
 
 // V-table should be defined weakly..
 Test1b::Test1b() { use(typeid(Test1b)); }
 // CHECK: @_ZTV6Test1b = linkonce_odr {{(dso_local )?}}unnamed_addr constant
-// CHECK: @_ZTS6Test1b = linkonce_odr {{(dso_local )?}}constant
 // CHECK: @_ZTI6Test1b = linkonce_odr {{(dso_local )?}}constant
+// CHECK: @_ZTS6Test1b = linkonce_odr {{(dso_local )?}}constant
 
 /*** Test2a ******************************************************************/
 
@@ -97,8 +97,8 @@ struct Test2a {
 // V-table should be defined with weak linkage.
 Test2a::Test2a() { use(typeid(Test2a)); }
 // CHECK: @_ZTV6Test2a = linkonce_odr {{(dso_local )?}}unnamed_addr constant
-// CHECK-LATE: @_ZTS6Test2a = linkonce_odr {{(dso_local )?}}constant
 // CHECK-LATE: @_ZTI6Test2a = linkonce_odr {{(dso_local )?}}constant
+// CHECK-LATE: @_ZTS6Test2a = linkonce_odr {{(dso_local )?}}constant
 
 void Test2a::bar() {}
 inline void Test2a::foo() {}
 
@@ -116,8 +116,8 @@ void Test2b::bar() {}
 
 // V-table should be defined with weak linkage.
 Test2b::Test2b() { use(typeid(Test2b)); }
 // CHECK: @_ZTV6Test2b = linkonce_odr {{(dso_local )?}}unnamed_addr constant
-// CHECK-LATE: @_ZTS6Test2b = linkonce_odr {{(dso_local )?}}constant
 // CHECK-LATE: @_ZTI6Test2b = linkonce_odr {{(dso_local )?}}constant
+// CHECK-LATE: @_ZTS6Test2b = linkonce_odr {{(dso_local )?}}constant
 
 inline void Test2b::foo() {}
 
@@ -135,8 +135,8 @@ inline void Test2c::foo() {}
 
 // V-table should be defined with weak linkage.
 Test2c::Test2c() { use(typeid(Test2c)); }
 // CHECK: @_ZTV6Test2c = linkonce_odr {{(dso_local )?}}unnamed_addr constant
-// CHECK: @_ZTS6Test2c = linkonce_odr {{(dso_local )?}}constant
 // CHECK: @_ZTI6Test2c = linkonce_odr {{(dso_local )?}}constant
+// CHECK: @_ZTS6Test2c = linkonce_odr {{(dso_local )?}}constant
 
 /*** Test3a ******************************************************************/
 
@@ -149,8 +149,8 @@ struct Test3a {
 // V-table should be defined with weak linkage.
 Test3a::Test3a() { use(typeid(Test3a)); }
 // CHECK: @_ZTV6Test3a = linkonce_odr {{(dso_local )?}}unnamed_addr constant
-// CHECK-LATE: @_ZTS6Test3a = linkonce_odr {{(dso_local )?}}constant
 // CHECK-LATE: @_ZTI6Test3a = linkonce_odr {{(dso_local )?}}constant
+// CHECK-LATE: @_ZTS6Test3a = linkonce_odr {{(dso_local )?}}constant
 
 // This defines the key function.
 inline void Test3a::bar() {}
 
@@ -169,8 +169,8 @@ inline void Test3b::bar() {}
 
 // V-table should be defined with weak linkage.
 Test3b::Test3b() { use(typeid(Test3b)); }
 // CHECK: @_ZTV6Test3b = linkonce_odr {{(dso_local )?}}unnamed_addr constant
-// CHECK-LATE: @_ZTS6Test3b = linkonce_odr {{(dso_local )?}}constant
 // CHECK-LATE: @_ZTI6Test3b = linkonce_odr {{(dso_local )?}}constant
+// CHECK-LATE: @_ZTS6Test3b = linkonce_odr {{(dso_local )?}}constant
 
 // This defines the key function.
 inline void Test3b::foo() {}
 
@@ -190,5 +190,5 @@ inline void Test3c::foo() {}
 
 // V-table should be defined with weak linkage.
 Test3c::Test3c() { use(typeid(Test3c)); }
 // CHECK: @_ZTV6Test3c = linkonce_odr {{(dso_local )?}}unnamed_addr constant
-// CHECK: @_ZTS6Test3c = linkonce_odr {{(dso_local )?}}constant
 // CHECK: @_ZTI6Test3c = linkonce_odr {{(dso_local )?}}constant
+// CHECK: @_ZTS6Test3c = linkonce_odr {{(dso_local )?}}constant
diff --git a/clang/test/CodeGenCXX/vtable-key-function-win-comdat.cpp b/clang/test/CodeGenCXX/vtable-key-function-win-comdat.cpp
index dd4fd9f8754a..b3de2f634999 100644
--- a/clang/test/CodeGenCXX/vtable-key-function-win-comdat.cpp
+++ b/clang/test/CodeGenCXX/vtable-key-function-win-comdat.cpp
@@ -15,11 +15,11 @@ Test1a::Test1a() { use(typeid(Test1a)); }
 inline void Test1a::foo() {}
 
 // CHECK: $_ZTV6Test1a = comdat any
-// CHECK: $_ZTS6Test1a = comdat any
 // CHECK: $_ZTI6Test1a = comdat any
-// CHECK-NOT: $_ZTS6Test1a.1 = comdat any
+// CHECK: $_ZTS6Test1a = comdat any
 // CHECK-NOT: $_ZTI6Test1a.1 = comdat any
+// CHECK-NOT: $_ZTS6Test1a.1 = comdat any
 
 // CHECK: @_ZTV6Test1a = linkonce_odr dso_local unnamed_addr constant {{.*}} ptr @_ZTI6Test1a
-// CHECK: @_ZTS6Test1a = linkonce_odr dso_local constant
 // CHECK: @_ZTI6Test1a = linkonce_odr dso_local constant {{.*}} ptr @_ZTS6Test1a
+// CHECK: @_ZTS6Test1a = linkonce_odr dso_local constant
diff --git a/clang/test/CodeGenCXX/weak-extern-typeinfo.cpp b/clang/test/CodeGenCXX/weak-extern-typeinfo.cpp
index 932d36f4abbd..8c948d16c90e 100644
--- a/clang/test/CodeGenCXX/weak-extern-typeinfo.cpp
+++ b/clang/test/CodeGenCXX/weak-extern-typeinfo.cpp
@@ -30,17 +30,17 @@ class V2 : public virtual V1 {
 void V1::foo() { }
 void V2::foo() { }
 
-// CHECK: @_ZTS1A = weak_odr {{(dso_local |hidden )?}}constant
 // CHECK: @_ZTI1A = weak_odr {{(dso_local |hidden )?}}constant
-// CHECK: @_ZTS1B = weak_odr {{(dso_local |hidden )?}}constant
+// CHECK: @_ZTS1A = weak_odr {{(dso_local |hidden )?}}constant
 // CHECK: @_ZTI1B = weak_odr {{(dso_local |hidden )?}}constant
+// CHECK: @_ZTS1B = weak_odr {{(dso_local |hidden )?}}constant
+// CHECK: @_ZTI1C = weak_odr {{(dso_local |hidden )?}}constant
 // CHECK: @_ZTS1C = weak_odr {{(dso_local |hidden )?}}constant
-// CHECK: @_ZTS2T1 = linkonce_odr {{(dso_local |hidden )?}}constant
 // CHECK: @_ZTI2T1 = linkonce_odr {{(dso_local |hidden )?}}constant
-// CHECK: @_ZTS1T = linkonce_odr {{(dso_local |hidden )?}}constant
+// CHECK: @_ZTS2T1 = linkonce_odr {{(dso_local |hidden )?}}constant
 // CHECK: @_ZTI1T = linkonce_odr {{(dso_local |hidden )?}}constant
-// CHECK: @_ZTI1C = weak_odr {{(dso_local |hidden )?}}constant
-// CHECK: @_ZTS2V1 = weak_odr {{(dso_local |hidden )?}}constant
+// CHECK: @_ZTS1T = linkonce_odr {{(dso_local |hidden )?}}constant
 // CHECK: @_ZTI2V1 = weak_odr {{(dso_local |hidden )?}}constant
-// CHECK: @_ZTS2V2 = weak_odr {{(dso_local |hidden )?}}constant
+// CHECK: @_ZTS2V1 = weak_odr {{(dso_local |hidden )?}}constant
 // CHECK: @_ZTI2V2 = weak_odr {{(dso_local |hidden )?}}constant
+// CHECK: @_ZTS2V2 = weak_odr {{(dso_local |hidden )?}}constant
diff --git a/clang/test/CodeGenCXX/windows-itanium-type-info.cpp b/clang/test/CodeGenCXX/windows-itanium-type-info.cpp
index 20bd78df5098..95b7b3a4b29e 100644
--- a/clang/test/CodeGenCXX/windows-itanium-type-info.cpp
+++ b/clang/test/CodeGenCXX/windows-itanium-type-info.cpp
@@ -33,8 +33,8 @@ void f() {
 // CHECK-DAG: @_ZTI4base = external dllimport constant
 
-// CHECK-EH-IMPORT: @_ZTS4base = linkonce_odr dso_local constant
 // CHECK-EH-IMPORT: @_ZTI4base = linkonce_odr dso_local constant
+// CHECK-EH-IMPORT: @_ZTS4base = linkonce_odr dso_local constant
 
 struct __declspec(dllimport) gatekeeper {};
 
 struct zuul : gatekeeper {
diff --git a/clang/test/CodeGenObjCXX/rtti.mm b/clang/test/CodeGenObjCXX/rtti.mm
index ee3df349af18..2fc6f8722f43 100644
--- a/clang/test/CodeGenObjCXX/rtti.mm
+++ b/clang/test/CodeGenObjCXX/rtti.mm
@@ -4,19 +4,20 @@
 
 namespace std { class type_info; }
 
-// CHECK: @_ZTI1A = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv117__class_type_infoE{{.*}}@_ZTS1A
 @interface A
 @end
 
-// CHECK: @_ZTI1B = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv120__si_class_type_infoE{{.*}}@_ZTS1B{{.*}}@_ZTI1A
 @interface B : A
 @end
 
 // CHECK: @_ZTIP1B = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv119__pointer_type_infoE{{.*}}@_ZTSP1B{{.*}}, i32 0, {{.*}}@_ZTI1B
-// CHECK: @_ZTI11objc_object = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv117__class_type_infoE{{.*}}@_ZTS11objc_object
+// CHECK: @_ZTI1B = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv120__si_class_type_infoE{{.*}}@_ZTS1B{{.*}}@_ZTI1A
+// CHECK: @_ZTI1A = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv117__class_type_infoE{{.*}}@_ZTS1A
+
 // CHECK: @_ZTIP11objc_object = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv119__pointer_type_infoE{{.*}}@_ZTSP11objc_object{{.*}}@_ZTI11objc_object
-// CHECK: @_ZTI10objc_class = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv117__class_type_infoE{{.*}}@_ZTS10objc_class
+// CHECK: @_ZTI11objc_object = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv117__class_type_infoE{{.*}}@_ZTS11objc_object
 // CHECK: @_ZTIP10objc_class = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv119__pointer_type_infoE{{.*}}@_ZTSP10objc_class{{.*}}@_ZTI10objc_class
+// CHECK: @_ZTI10objc_class = linkonce_odr constant {{.*}}@_ZTVN10__cxxabiv117__class_type_infoE{{.*}}@_ZTS10objc_class
 
 @protocol P;
diff --git a/clang/test/Modules/pr97313.cppm b/clang/test/Modules/pr97313.cppm
index ebbd0ee4e2c6..32c7112da091 100644
--- a/clang/test/Modules/pr97313.cppm
+++ b/clang/test/Modules/pr97313.cppm
@@ -107,12 +107,12 @@ auto v6 = new Template<NonTemplate>();
 // CHECK: @_ZTVW3Mod11NonTemplate = {{.*}}external
 // CHECK: @_ZTVW3Mod8TemplateIcE = {{.*}}external
 // CHECK: @_ZTVW3Mod8TemplateIjE = {{.*}}weak_odr
-// CHECK: @_ZTSW3Mod8TemplateIjE = {{.*}}weak_odr
 // CHECK: @_ZTIW3Mod8TemplateIjE = {{.*}}weak_odr
+// CHECK: @_ZTSW3Mod8TemplateIjE = {{.*}}weak_odr
 // CHECK: @_ZTVW3Mod8TemplateIdE = {{.*}}external
 // CHECK: @_ZTVW3Mod8TemplateIiE = {{.*}}linkonce_odr
-// CHECK: @_ZTSW3Mod8TemplateIiE = {{.*}}linkonce_odr
 // CHECK: @_ZTIW3Mod8TemplateIiE = {{.*}}linkonce_odr
+// CHECK: @_ZTSW3Mod8TemplateIiE = {{.*}}linkonce_odr
 // CHECK: @_ZTVW3Mod8TemplateIS_11NonTemplateE = {{.*}}linkonce_odr
-// CHECK: @_ZTSW3Mod8TemplateIS_11NonTemplateE = {{.*}}linkonce_odr
 // CHECK: @_ZTIW3Mod8TemplateIS_11NonTemplateE = {{.*}}linkonce_odr
+// CHECK: @_ZTSW3Mod8TemplateIS_11NonTemplateE = {{.*}}linkonce_odr
diff --git a/clang/test/SemaCXX/typeid-ref.cpp b/clang/test/SemaCXX/typeid-ref.cpp
index f788b04077ec..025816c42512 100644
--- a/clang/test/SemaCXX/typeid-ref.cpp
+++ b/clang/test/SemaCXX/typeid-ref.cpp
@@ -6,7 +6,7 @@ namespace std {
 struct X { };
 
 void f() {
-  // CHECK: @_ZTS1X = linkonce_odr {{(dso_local |hidden )?}}constant
   // CHECK: @_ZTI1X = linkonce_odr {{(dso_local |hidden )?}}constant
+  // CHECK: @_ZTS1X = linkonce_odr {{(dso_local |hidden )?}}constant
   (void)typeid(X&);
 }
diff --git a/llvm/test/CodeGen/AArch64/ptrauth-type-info-vptr-discr.ll b/llvm/test/CodeGen/AArch64/ptrauth-type-info-vptr-discr.ll
new file mode 100644
index 000000000000..fbd777911aec
--- /dev/null
+++ b/llvm/test/CodeGen/AArch64/ptrauth-type-info-vptr-discr.ll
@@ -0,0 +1,21 @@
+; RUN: llc -mtriple aarch64-linux-gnu   -mattr=+pauth -filetype=asm -o - %s | FileCheck --check-prefix=ELF   %s
+; RUN: llc -mtriple aarch64-apple-darwin -mattr=+pauth -filetype=asm -o - %s | FileCheck --check-prefix=MACHO %s
+
+; ELF-LABEL: _ZTI10Disc:
+; ELF-NEXT:  .xword  (_ZTVN10__cxxabiv117__class_type_infoE+16)@AUTH(da,45546,addr)
+; ELF-LABEL: _ZTI10NoDisc:
+; ELF-NEXT:  .xword  (_ZTVN10__cxxabiv117__class_type_infoE+16)@AUTH(da,45546)
+
+; MACHO-LABEL: __ZTI10Disc:
+; MACHO-NEXT:  .quad  (__ZTVN10__cxxabiv117__class_type_infoE+16)@AUTH(da,45546,addr)
+; MACHO-LABEL: __ZTI10NoDisc:
+; MACHO-NEXT:  .quad  (__ZTVN10__cxxabiv117__class_type_infoE+16)@AUTH(da,45546)
+
+
+@_ZTI10Disc = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2, i64 45546, ptr @_ZTI10Disc), ptr @_ZTS10Disc }, align 8
+@_ZTS10Disc = constant [4 x i8] c"Disc", align 1
+
+@_ZTI10NoDisc = constant { ptr, ptr } { ptr ptrauth (ptr getelementptr inbounds (ptr, ptr @_ZTVN10__cxxabiv117__class_type_infoE, i64 2), i32 2, i64 45546), ptr @_ZTS10NoDisc }, align 8
+@_ZTS10NoDisc = constant [6 x i8] c"NoDisc", align 1
+
+@_ZTVN10__cxxabiv117__class_type_infoE = external global [0 x ptr]
-- 
GitLab

From 9d469b5988bfb1c2e99533f863b1f9eb5b0c58b7 Mon Sep 17 00:00:00 2001
From: Roger Ferrer Ibáñez
Date: Fri, 18 Oct 2024 08:06:47 +0200
Subject: [PATCH 329/329] [RISCV] Implement trampolines for rv64 (#96309)

This implementation is based on what the X86 target does, but it emits
the instructions that GCC emits for rv64.

---------

Co-authored-by: Pengcheng Wang
---
 llvm/lib/Target/RISCV/RISCVISelLowering.cpp | 131 ++++++++++++++++++++
 llvm/lib/Target/RISCV/RISCVISelLowering.h   |   3 +
 llvm/test/CodeGen/RISCV/rv64-trampoline.ll  |  80 ++++++++++++
 3 files changed, 214 insertions(+)
 create mode 100644 llvm/test/CodeGen/RISCV/rv64-trampoline.ll
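Before the lowering itself, it may help to recall how a frontend is meant to use the two trampoline intrinsics this patch implements. The sketch below is illustrative only and is not part of the patch; emitTrampoline, Buf, NestedFn and StaticChain are invented names:

  #include "llvm/IR/IRBuilder.h"
  #include "llvm/IR/Intrinsics.h"

  using namespace llvm;

  // Fills Buf (a sufficiently large, suitably aligned buffer; the RV64
  // lowering below uses 32 bytes, and the new test allocas [32 x i8],
  // align 8) and returns the callable pointer.
  static Value *emitTrampoline(IRBuilder<> &B, Value *Buf, Function *NestedFn,
                               Value *StaticChain) {
    // llvm.init.trampoline writes the code part plus the two data slots.
    B.CreateIntrinsic(Intrinsic::init_trampoline, {},
                      {Buf, NestedFn, StaticChain});
    // llvm.adjust.trampoline yields the address to call; on RV64 it is Buf
    // itself, since lowerADJUST_TRAMPOLINE below just returns its operand.
    return B.CreateIntrinsic(Intrinsic::adjust_trampoline, {}, {Buf});
  }

Calling the returned pointer executes the freshly written code, which jumps to NestedFn with the static chain left in t2 (x7), exactly where the emitted trampoline instructions put it.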
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
index 952072c26739..fa157ca48db2 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.cpp
@@ -37,6 +37,8 @@
 #include "llvm/IR/Instructions.h"
 #include "llvm/IR/IntrinsicsRISCV.h"
 #include "llvm/IR/PatternMatch.h"
+#include "llvm/MC/MCCodeEmitter.h"
+#include "llvm/MC/MCInstBuilder.h"
 #include "llvm/Support/CommandLine.h"
 #include "llvm/Support/Debug.h"
 #include "llvm/Support/ErrorHandling.h"
@@ -625,6 +627,11 @@ RISCVTargetLowering::RISCVTargetLowering(const TargetMachine &TM,
   setOperationAction(ISD::READSTEADYCOUNTER, MVT::i64,
                      Subtarget.is64Bit() ? Legal : Custom);
 
+  if (Subtarget.is64Bit()) {
+    setOperationAction(ISD::INIT_TRAMPOLINE, MVT::Other, Custom);
+    setOperationAction(ISD::ADJUST_TRAMPOLINE, MVT::Other, Custom);
+  }
+
   setOperationAction({ISD::TRAP, ISD::DEBUGTRAP}, MVT::Other, Legal);
   setOperationAction(ISD::INTRINSIC_WO_CHAIN, MVT::Other, Custom);
   if (Subtarget.is64Bit())
@@ -7402,6 +7409,10 @@ SDValue RISCVTargetLowering::LowerOperation(SDValue Op,
     return emitFlushICache(DAG, Op.getOperand(0), Op.getOperand(1),
                            Op.getOperand(2), Flags, DL);
   }
+  case ISD::INIT_TRAMPOLINE:
+    return lowerINIT_TRAMPOLINE(Op, DAG);
+  case ISD::ADJUST_TRAMPOLINE:
+    return lowerADJUST_TRAMPOLINE(Op, DAG);
   }
 }
 
@@ -7417,6 +7428,126 @@ SDValue RISCVTargetLowering::emitFlushICache(SelectionDAG &DAG, SDValue InChain,
   return CallResult.second;
 }
 
+SDValue RISCVTargetLowering::lowerINIT_TRAMPOLINE(SDValue Op,
+                                                  SelectionDAG &DAG) const {
+  if (!Subtarget.is64Bit())
+    llvm::report_fatal_error("Trampolines only implemented for RV64");
+
+  // Create an MCCodeEmitter to encode instructions.
+  TargetLoweringObjectFile *TLO = getTargetMachine().getObjFileLowering();
+  assert(TLO);
+  MCContext &MCCtx = TLO->getContext();
+
+  std::unique_ptr<MCCodeEmitter> CodeEmitter(
+      createRISCVMCCodeEmitter(*getTargetMachine().getMCInstrInfo(), MCCtx));
+
+  SDValue Root = Op.getOperand(0);
+  SDValue Trmp = Op.getOperand(1); // trampoline
+  SDLoc dl(Op);
+
+  const Value *TrmpAddr = cast<SrcValueSDNode>(Op.getOperand(4))->getValue();
+
+  // We store in the trampoline buffer the following instructions and data.
+  // Offset:
+  //      0: auipc   t2, 0
+  //      4: ld      t0, 24(t2)
+  //      8: ld      t2, 16(t2)
+  //     12: jalr    t0
+  //     16: <StaticChainOffset>
+  //     24: <FunctionAddressOffset>
+  //     32:
+
+  constexpr unsigned StaticChainOffset = 16;
+  constexpr unsigned FunctionAddressOffset = 24;
+
+  const MCSubtargetInfo *STI = getTargetMachine().getMCSubtargetInfo();
+  assert(STI);
+  auto GetEncoding = [&](const MCInst &MC) {
+    SmallVector<char, 32> CB;
+    SmallVector<MCFixup> Fixups;
+    CodeEmitter->encodeInstruction(MC, CB, Fixups, *STI);
+    uint32_t Encoding = support::endian::read32le(CB.data());
+    return Encoding;
+  };
+
+  SDValue OutChains[6];
+
+  uint32_t Encodings[] = {
+      // auipc t2, 0
+      // Loads the current PC into t2.
+      GetEncoding(MCInstBuilder(RISCV::AUIPC).addReg(RISCV::X7).addImm(0)),
+      // ld t0, 24(t2)
+      // Loads the function address into t0. Note that we are using offsets
+      // pc-relative to the first instruction of the trampoline.
+      GetEncoding(
+          MCInstBuilder(RISCV::LD).addReg(RISCV::X5).addReg(RISCV::X7).addImm(
+              FunctionAddressOffset)),
+      // ld t2, 16(t2)
+      // Load the value of the static chain.
+      GetEncoding(
+          MCInstBuilder(RISCV::LD).addReg(RISCV::X7).addReg(RISCV::X7).addImm(
+              StaticChainOffset)),
+      // jalr t0
+      // Jump to the function.
+      GetEncoding(MCInstBuilder(RISCV::JALR)
+                      .addReg(RISCV::X0)
+                      .addReg(RISCV::X5)
+                      .addImm(0))};
+
+  // Store encoded instructions.
+  for (auto [Idx, Encoding] : llvm::enumerate(Encodings)) {
+    SDValue Addr = Idx > 0 ? DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                                         DAG.getConstant(Idx * 4, dl, MVT::i64))
+                           : Trmp;
+    OutChains[Idx] = DAG.getTruncStore(
+        Root, dl, DAG.getConstant(Encoding, dl, MVT::i64), Addr,
+        MachinePointerInfo(TrmpAddr, Idx * 4), MVT::i32);
+  }
+
+  // Now store the variable part of the trampoline.
+  SDValue FunctionAddress = Op.getOperand(2);
+  SDValue StaticChain = Op.getOperand(3);
+
+  // Store the given static chain and function pointer in the trampoline buffer.
+  struct OffsetValuePair {
+    const unsigned Offset;
+    const SDValue Value;
+    SDValue Addr = SDValue(); // Used to cache the address.
+  } OffsetValues[] = {
+      {StaticChainOffset, StaticChain},
+      {FunctionAddressOffset, FunctionAddress},
+  };
+  for (auto [Idx, OffsetValue] : llvm::enumerate(OffsetValues)) {
+    SDValue Addr =
+        DAG.getNode(ISD::ADD, dl, MVT::i64, Trmp,
+                    DAG.getConstant(OffsetValue.Offset, dl, MVT::i64));
+    OffsetValue.Addr = Addr;
+    OutChains[Idx + 4] =
+        DAG.getStore(Root, dl, OffsetValue.Value, Addr,
+                     MachinePointerInfo(TrmpAddr, OffsetValue.Offset));
+  }
+
+  SDValue StoreToken = DAG.getNode(ISD::TokenFactor, dl, MVT::Other, OutChains);
+
+  // The end of instructions of trampoline is the same as the static chain
+  // address that we computed earlier.
+  SDValue EndOfTrmp = OffsetValues[0].Addr;
+
+  // Call clear cache on the trampoline instructions.
+  SDValue Chain = DAG.getNode(ISD::CLEAR_CACHE, dl, MVT::Other, StoreToken,
+                              Trmp, EndOfTrmp);
+
+  return Chain;
+}
+
+SDValue RISCVTargetLowering::lowerADJUST_TRAMPOLINE(SDValue Op,
+                                                    SelectionDAG &DAG) const {
+  if (!Subtarget.is64Bit())
+    llvm::report_fatal_error("Trampolines only implemented for RV64");
+
+  return Op.getOperand(0);
+}
+
 static SDValue getTargetNode(GlobalAddressSDNode *N, const SDLoc &DL, EVT Ty,
                              SelectionDAG &DAG, unsigned Flags) {
   return DAG.getTargetGlobalAddress(N->getGlobal(), DL, Ty, 0, Flags);
diff --git a/llvm/lib/Target/RISCV/RISCVISelLowering.h b/llvm/lib/Target/RISCV/RISCVISelLowering.h
index 3864d58a129e..c37494479553 100644
--- a/llvm/lib/Target/RISCV/RISCVISelLowering.h
+++ b/llvm/lib/Target/RISCV/RISCVISelLowering.h
@@ -992,6 +992,9 @@ private:
   SDValue expandUnalignedRVVLoad(SDValue Op, SelectionDAG &DAG) const;
   SDValue expandUnalignedRVVStore(SDValue Op, SelectionDAG &DAG) const;
 
+  SDValue lowerINIT_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+  SDValue lowerADJUST_TRAMPOLINE(SDValue Op, SelectionDAG &DAG) const;
+
   bool isEligibleForTailCallOptimization(
       CCState &CCInfo, CallLoweringInfo &CLI, MachineFunction &MF,
       const SmallVector<CCValAssign> &ArgLocs) const;
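The fixed and variable parts of the buffer that lowerINIT_TRAMPOLINE writes can be pictured as a plain struct. This is an illustrative sketch with invented names, not code from the patch:

  #include <cstdint>

  // Byte-for-byte picture of the 32-byte RV64 trampoline built above.
  struct Rv64Trampoline {
    uint32_t auipc_t2;      // offset  0: auipc t2, 0  (t2 = buffer address)
    uint32_t ld_t0_24_t2;   // offset  4: ld t0, 24(t2) (load function address)
    uint32_t ld_t2_16_t2;   // offset  8: ld t2, 16(t2) (load static chain)
    uint32_t jalr_t0;       // offset 12: jalr t0       (jump to the function)
    uint64_t static_chain;  // offset 16: StaticChainOffset slot
    uint64_t function_addr; // offset 24: FunctionAddressOffset slot
  };
  static_assert(sizeof(Rv64Trampoline) == 32, "matches the 32-byte buffer");

Note that the code part ends at offset 16, which is why the lowering reuses the cached static-chain slot address as EndOfTrmp for the CLEAR_CACHE node.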
diff --git a/llvm/test/CodeGen/RISCV/rv64-trampoline.ll b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
new file mode 100644
index 000000000000..ba1840632650
--- /dev/null
+++ b/llvm/test/CodeGen/RISCV/rv64-trampoline.ll
@@ -0,0 +1,80 @@
+; NOTE: Assertions have been autogenerated by utils/update_llc_test_checks.py UTC_ARGS: --version 5
+; RUN: llc -mtriple=riscv64 -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV64 %s
+; RUN: llc -mtriple=riscv64-unknown-linux-gnu -verify-machineinstrs < %s \
+; RUN:   | FileCheck -check-prefix=RV64-LINUX %s
+
+declare void @llvm.init.trampoline(ptr, ptr, ptr)
+declare ptr @llvm.adjust.trampoline(ptr)
+declare i64 @f(ptr nest, i64)
+
+define i64 @test0(i64 %n, ptr %p) nounwind {
+; RV64-LABEL: test0:
+; RV64:       # %bb.0:
+; RV64-NEXT:    addi sp, sp, -64
+; RV64-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
+; RV64-NEXT:    mv s0, a0
+; RV64-NEXT:    lui a0, %hi(f)
+; RV64-NEXT:    addi a0, a0, %lo(f)
+; RV64-NEXT:    sd a0, 32(sp)
+; RV64-NEXT:    li a0, 919
+; RV64-NEXT:    lui a2, %hi(.LCPI0_0)
+; RV64-NEXT:    ld a2, %lo(.LCPI0_0)(a2)
+; RV64-NEXT:    lui a3, 6203
+; RV64-NEXT:    addi a3, a3, 643
+; RV64-NEXT:    sw a0, 8(sp)
+; RV64-NEXT:    sw a3, 12(sp)
+; RV64-NEXT:    sd a2, 16(sp)
+; RV64-NEXT:    sd a1, 24(sp)
+; RV64-NEXT:    addi a1, sp, 24
+; RV64-NEXT:    addi a0, sp, 8
+; RV64-NEXT:    addi s1, sp, 8
+; RV64-NEXT:    call __clear_cache
+; RV64-NEXT:    mv a0, s0
+; RV64-NEXT:    jalr s1
+; RV64-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
+; RV64-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
+; RV64-NEXT:    addi sp, sp, 64
+; RV64-NEXT:    ret
+;
+; RV64-LINUX-LABEL: test0:
+; RV64-LINUX:       # %bb.0:
+; RV64-LINUX-NEXT:    addi sp, sp, -64
+; RV64-LINUX-NEXT:    sd ra, 56(sp) # 8-byte Folded Spill
+; RV64-LINUX-NEXT:    sd s0, 48(sp) # 8-byte Folded Spill
+; RV64-LINUX-NEXT:    sd s1, 40(sp) # 8-byte Folded Spill
+; RV64-LINUX-NEXT:    mv s0, a0
+; RV64-LINUX-NEXT:    lui a0, %hi(f)
+; RV64-LINUX-NEXT:    addi a0, a0, %lo(f)
+; RV64-LINUX-NEXT:    sd a0, 32(sp)
+; RV64-LINUX-NEXT:    li a0, 919
+; RV64-LINUX-NEXT:    lui a2, %hi(.LCPI0_0)
+; RV64-LINUX-NEXT:    ld a2, %lo(.LCPI0_0)(a2)
+; RV64-LINUX-NEXT:    lui a3, 6203
+; RV64-LINUX-NEXT:    addi a3, a3, 643
+; RV64-LINUX-NEXT:    sw a0, 8(sp)
+; RV64-LINUX-NEXT:    sw a3, 12(sp)
+; RV64-LINUX-NEXT:    sd a2, 16(sp)
+; RV64-LINUX-NEXT:    sd a1, 24(sp)
+; RV64-LINUX-NEXT:    addi a1, sp, 24
+; RV64-LINUX-NEXT:    addi a0, sp, 8
+; RV64-LINUX-NEXT:    addi s1, sp, 8
+; RV64-LINUX-NEXT:    li a2, 0
+; RV64-LINUX-NEXT:    call __riscv_flush_icache
+; RV64-LINUX-NEXT:    mv a0, s0
+; RV64-LINUX-NEXT:    jalr s1
+; RV64-LINUX-NEXT:    ld ra, 56(sp) # 8-byte Folded Reload
+; RV64-LINUX-NEXT:    ld s0, 48(sp) # 8-byte Folded Reload
+; RV64-LINUX-NEXT:    ld s1, 40(sp) # 8-byte Folded Reload
+; RV64-LINUX-NEXT:    addi sp, sp, 64
+; RV64-LINUX-NEXT:    ret
+  %alloca = alloca [32 x i8], align 8
+  call void @llvm.init.trampoline(ptr %alloca, ptr @f, ptr %p)
+  %tramp = call ptr @llvm.adjust.trampoline(ptr %alloca)
+  %ret = call i64 %tramp(i64 %n)
+  ret i64 %ret
+
+}
-- 
GitLab
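As a closing sanity check, illustrative only and not part of the patch: the magic constants visible in the generated assembly above are exactly the instruction words that GetEncoding computes in lowerINIT_TRAMPOLINE. "li a0, 919" materializes the auipc, "lui a3, 6203 / addi a3, a3, 643" the first ld, and the .LCPI0_0 constant-pool entry presumably packs the remaining two words:

  #include <cstdint>

  constexpr uint32_t kAuipcT2 = 0x00000397; // auipc t2, 0
  constexpr uint32_t kLdT0    = 0x0183B283; // ld t0, 24(t2)
  constexpr uint32_t kLdT2    = 0x0103B383; // ld t2, 16(t2)
  constexpr uint32_t kJalrT0  = 0x00028067; // jalr t0

  static_assert(kAuipcT2 == 919, "matches 'li a0, 919' in the test");
  static_assert(kLdT0 == (6203u << 12) + 643,
                "matches 'lui a3, 6203; addi a3, a3, 643'");
  // The two remaining words would then appear little-endian in the pool as
  // 0x000280670103B383 (jalr in the high half, the second ld in the low).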